In [2]:
# Install a global "ignore" filter so that no Python warning is shown in the
# cell outputs below.
# NOTE(review): this also hides deprecation/future warnings emitted by numpy
# and pandas, so API changes will not be visible when the notebook is re-run.
from warnings import filterwarnings
filterwarnings('ignore')

### Hello to NumPy

In [3]:
# Import numpy library
import numpy as np

In [4]:
# One-dimensional array built from a flat list of integers
values_1d = [1, 2, 3, 4, 5]
arr1d = np.array(values_1d)
print("Array 1: ", arr1d)

# Two-dimensional (3x3) array built from a list of equal-length rows
rows_2d = [
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
]
arr2d = np.array(rows_2d)
print("Array 2: \n", arr2d)

Array 1:  [1 2 3 4 5]
Array 2: 
 [[1 2 3]
 [4 5 6]
 [7 8 9]]


In [5]:
# Report the dimensions of the 2D array and its total number of elements
arr2d_shape = arr2d.shape
arr2d_size = arr2d.size
print("Shape of arr2d:", arr2d_shape)
print("Number of elements in arr2d:", arr2d_size)

Shape of arr2d: (3, 3)
Number of elements in arr2d: 9


In [6]:
# Turn the flat 5-element array into a column: 5 rows, 1 column
column_shape = (5, 1)
reshaped_arr = arr1d.reshape(column_shape)
print("Reshaped array: ", reshaped_arr)
print("New shape of array: \n", reshaped_arr.shape)

Reshaped array:  [[1]
 [2]
 [3]
 [4]
 [5]]
New shape of array: 
 (5, 1)


In [7]:
# Pick a single element by its zero-based position
position = 2
element_at_index_2 = arr1d[position]
print('Element value at position 2: ', element_at_index_2)

Element value at position 2:  3


In [8]:
# Build a True/False mask (one flag per element) and use it to keep only
# the elements that are greater than 3
mask = arr1d > 3
filtered_arr = arr1d[mask]

print('Mask: ', mask)
print("Filtered array: ", filtered_arr)

Mask:  [False False False  True  True]
Filtered array:  [4 5]


In [9]:
# Take the elements at positions 1, 2 and 3 (the stop index is excluded)
start, stop = 1, 4
sliced_arr = arr1d[start:stop]
print("Sliced array: ", sliced_arr)

Sliced array:  [2 3 4]


In [10]:
# Arithmetic with a scalar is applied to every element of the array:
# add 2 to each element, and multiply each element by 3
added_arr = arr1d + 2
multiplied_arr = arr1d * 3

print("Added array: ", added_arr)
print("Multiplied array: ", multiplied_arr)

Added array:  [3 4 5 6 7]
Multiplied array:  [ 3  6  9 12 15]


In [11]:
# Multiply two 2x2 matrices (row-by-column product, not element-wise)
mat1 = np.array([
    [1, 2],
    [3, 4],
])
mat2 = np.array([
    [5, 6],
    [7, 8],
])
mat_result = np.dot(mat1, mat2)
print("Matrix multiplication: \n", mat_result)

Matrix multiplication: 
 [[19 22]
 [43 50]]


In [12]:
# Descriptive statistics of the 1D array: arithmetic mean and
# standard deviation (np.std default, i.e. population std with ddof=0)
mean_val, std_dev = np.mean(arr1d), np.std(arr1d)
print(f'Mean: {mean_val} | Standard deviation: {std_dev}')

Mean: 3.0 | Standard deviation: 1.4142135623730951


In [13]:
# Generate random numbers
# A seeded Generator makes the draw reproducible: restarting the kernel and
# running all cells yields the same matrix every time, instead of a new one
# on each run. It also leaves numpy's global random state untouched.
SEED = 42
rng = np.random.default_rng(SEED)

# 3x3 array of floats drawn uniformly from the half-open interval [0, 1)
random_arr = rng.random((3, 3))
print("Random array: \n", random_arr)

Random array: 
 [[0.50463149 0.06576615 0.84940436]
 [0.47487184 0.77274575 0.44405602]
 [0.30932647 0.51942196 0.0179538 ]]


### Hello to pandas

In [14]:
# Import pandas
import pandas as pd

In [15]:
# Build a small DataFrame by hand: each dictionary key becomes a column name
# and each list holds that column's values, one entry per row
data = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, 30, 35],
    'Score': [90, 85, 88],
}
df = pd.DataFrame(data=data)
df

Unnamed: 0,Name,Age,Score
0,Alice,25,90
1,Bob,30,85
2,Charlie,35,88


In [16]:
# Load the trips dataset from the Excel workbook located next to the notebook
trips_data = pd.read_excel('./MLA_P1_D1.xlsx')

# Preview the first five rows (five is the default for head())
trips_data.head()

Unnamed: 0,Salary,City,Age,Vacation_preferences,Transport_preferences,Family members,Target
0,196000,Krasnodar,25,Shopping,Car,1,New York
1,152000,Ekaterinburg,60,Shopping,Plain,2,London
2,83000,Tomsk,49,Architecture,Train,1,Sydney
3,146000,Krasnodar,41,Architecture,Car,1,New York
4,59000,Krasnodar,58,Architecture,Car,2,Sydney


In [17]:
# Display basic information about data: the index range, each column's name,
# non-null count and dtype, plus the approximate memory usage
trips_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Salary                 40 non-null     int64 
 1   City                   40 non-null     object
 2   Age                    40 non-null     int64 
 3   Vacation_preferences   40 non-null     object
 4   Transport_preferences  40 non-null     object
 5   Family members         40 non-null     int64 
 6   Target                 40 non-null     object
dtypes: int64(3), object(4)
memory usage: 2.3+ KB


In [18]:
# Summary statistics for numeric columns: count, mean, standard deviation,
# minimum, quartiles (25% / 50% / 75%) and maximum
trips_data.describe()

Unnamed: 0,Salary,Age,Family members
count,40.0,40.0,40.0
mean,105300.0,53.45,2.325
std,54762.049371,11.331712,1.327954
min,12000.0,25.0,1.0
25%,63000.0,48.75,1.0
50%,83000.0,55.0,2.0
75%,155000.0,60.0,3.0
max,216000.0,75.0,6.0


In [19]:
# Checking for missing values: isnull() flags every missing cell and sum()
# counts the flags per column
print('Null values: \n', trips_data.isnull().sum(), '\n')

# Dropping rows with missing values (returns a new frame; trips_data itself
# is left unchanged)
df_cleaned = trips_data.dropna()

# Filling missing values with a specific value (also returns a new frame).
# This operates on trips_data, the frame inspected above, rather than on the
# small hand-made `df` from the earlier DataFrame example.
df_filled = trips_data.fillna(0)

Null values: 
 Salary                   0
City                     0
Age                      0
Vacation_preferences     0
Transport_preferences    0
Family members           0
Target                   0
dtype: int64 



In [20]:
# Pull one column out of the frame as a Series
ages = trips_data['Age']
print("Age column: \n", ages.head(5), '\n')

# Keep only the rows whose Age is below 30, using a boolean row mask
is_under_30 = trips_data['Age'] < 30
young_people = trips_data[is_under_30]
print("Young people: \n", young_people.head(5))

Age column: 
 0    25
1    60
2    49
3    41
4    58
Name: Age, dtype: int64 

Young people: 
     Salary       City  Age Vacation_preferences Transport_preferences  \
0   196000  Krasnodar   25             Shopping                   Car   
20  196000  Krasnodar   25             Shopping                   Car   

    Family members    Target  
0                1  New York  
20               1    Sydney  


In [21]:
# Add a column by assigning to a new label; here it simply duplicates
# the existing 'Family members' column
new_column = 'Family members 2'
trips_data[new_column] = trips_data['Family members']
print('Columns after adding new column: ', trips_data.columns)

# Remove the column again, in place, so trips_data ends up with its
# original set of columns
trips_data.drop(columns=new_column, inplace=True)
print('Columns after removing new column: ', trips_data.columns)

Columns after adding new column:  Index(['Salary', 'City', 'Age', 'Vacation_preferences',
       'Transport_preferences', 'Family members', 'Target',
       'Family members 2'],
      dtype='object')
Columns after removing new column:  Index(['Salary', 'City', 'Age', 'Vacation_preferences',
       'Transport_preferences', 'Family members', 'Target'],
      dtype='object')


In [22]:
# Grouping by a column and calculating mean: one row per distinct value of
# 'Family members', holding the average of every numeric column.
# numeric_only=True is required because the frame also contains text columns
# (City, Vacation_preferences, Transport_preferences, Target): pandas >= 2.0
# raises a TypeError when asked to average them, while older versions
# silently dropped them. The explicit flag gives the same result on both.
grouped_data = trips_data.groupby('Family members').mean(numeric_only=True)
grouped_data.head(5)

Unnamed: 0_level_0,Salary,Age
Family members,Unnamed: 1_level_1,Unnamed: 2_level_1
1,115923.076923,46.692308
2,103076.923077,59.076923
3,117333.333333,51.666667
4,68600.0,60.8
5,110500.0,48.0
