In [18]:
import numpy as np
import pandas as pd
data = pd.read_csv("Spain.csv")
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [17]:
dropped = data.drop(columns = ['Salary'], axis = 1)
# method 1 - dropping entire columns
dropped

Unnamed: 0,Country,Age,Purchased
0,France,44.0,0
1,Spain,27.0,1
2,Germany,30.0,0
3,Spain,38.0,0
4,Germany,40.0,1
5,France,35.0,1
6,Spain,,0
7,France,48.0,1
8,Germany,50.0,0
9,France,37.0,1


In [8]:
# method 2 - imputing
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [16]:
from sklearn.impute import SimpleImputer #import library
imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean') #specifiy the way u are going to impute 
dropped_cat = data.drop(columns = ['Purchased','Country'],axis = 1) #dropping categorical because it we will take care of it later
mean_imputed = imputer.fit_transform(dropped_cat)

In [12]:
print(pd.DataFrame(mean_imputed)) #this format of print is only used for readability purpose

           0             1
0  44.000000  72000.000000
1  27.000000  48000.000000
2  30.000000  54000.000000
3  38.000000  61000.000000
4  40.000000  63777.777778
5  35.000000  58000.000000
6  38.777778  52000.000000
7  48.000000  79000.000000
8  50.000000  83000.000000
9  37.000000  67000.000000


In [13]:
data_c = pd.DataFrame(data) 
#creating a copy of original data as we will be 
#transforming the data...and we want to retain the original set incase we mess up
data_c

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [27]:
# METHOD 1 - LABEL ENCODER

from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
data_c['Purchased'] = label_encoder.fit_transform(data_c['Purchased'])
data_c

Unnamed: 0,Country,Age,Salary,Purchased
0,0,44.0,72000.0,0
1,2,27.0,48000.0,1
2,1,30.0,54000.0,0
3,2,38.0,61000.0,0
4,1,40.0,,1
5,0,35.0,58000.0,1
6,2,,52000.0,0
7,0,48.0,79000.0,1
8,1,50.0,83000.0,0
9,0,37.0,67000.0,1


In [28]:
data_c['Country'] = label_encoder.fit_transform(data_c['Country'])
data_c

# now we have a problem. "france" is given 0, "spain" is given 1 and "germany" 
# is given 2 for a machine which only understands numbers. it can interpret 
# germany > spain> france but we know that germany = spain = france, 
# i.e they are equal in value

Unnamed: 0,Country,Age,Salary,Purchased
0,0,44.0,72000.0,0
1,2,27.0,48000.0,1
2,1,30.0,54000.0,0
3,2,38.0,61000.0,0
4,1,40.0,,1
5,0,35.0,58000.0,1
6,2,,52000.0,0
7,0,48.0,79000.0,1
8,1,50.0,83000.0,0
9,0,37.0,67000.0,1


In [24]:
# METHOD 2 - ONE HOT ENCODER solution to the problem that label encoder created

data_o = data.copy(deep = True)
from sklearn.preprocessing import OneHotEncoder #import onehotencoder
from sklearn.compose import ColumnTransformer #importing columntranformer(used to apply encodingto whole columns at once)

ct = ColumnTransformer(
    [('encoder',OneHotEncoder(),[0])],
    remainder='passthrough')#create the transformer, then apply to data next
data_processed = pd.DataFrame(ct.fit_transform(data_o)) #applying the function
data_processed

Unnamed: 0,0,1,2,3,4,5
0,1.0,0.0,0.0,44.0,72000.0,No
1,0.0,0.0,1.0,27.0,48000.0,Yes
2,0.0,1.0,0.0,30.0,54000.0,No
3,0.0,0.0,1.0,38.0,61000.0,No
4,0.0,1.0,0.0,40.0,,Yes
5,1.0,0.0,0.0,35.0,58000.0,Yes
6,0.0,0.0,1.0,,52000.0,No
7,1.0,0.0,0.0,48.0,79000.0,Yes
8,0.0,1.0,0.0,50.0,83000.0,No
9,1.0,0.0,0.0,37.0,67000.0,Yes


In [29]:
#CHALLENGE 1

data_o_1 = data.copy(deep = True)
data_o_1

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [30]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
data_o['Purchased'] = label_encoder.fit_transform(data_o['Purchased'])
data_o

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,0
1,Spain,27.0,48000.0,1
2,Germany,30.0,54000.0,0
3,Spain,38.0,61000.0,0
4,Germany,40.0,,1
5,France,35.0,58000.0,1
6,Spain,,52000.0,0
7,France,48.0,79000.0,1
8,Germany,50.0,83000.0,0
9,France,37.0,67000.0,1


In [31]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
minmax = MinMaxScaler()

data_c

Unnamed: 0,Country,Age,Salary,Purchased
0,0,44.0,72000.0,0
1,2,27.0,48000.0,1
2,1,30.0,54000.0,0
3,2,38.0,61000.0,0
4,1,40.0,,1
5,0,35.0,58000.0,1
6,2,,52000.0,0
7,0,48.0,79000.0,1
8,1,50.0,83000.0,0
9,0,37.0,67000.0,1


In [32]:
data_c['Age'] = data_c['Age'].replace(to_replace = np.nan, value = data_c['Age'].median())
data_c['Salary'] = data_c['Salary'].replace(to_replace = np.nan, value = data_c['Salary'].mean())
data_c

Unnamed: 0,Country,Age,Salary,Purchased
0,0,44.0,72000.0,0
1,2,27.0,48000.0,1
2,1,30.0,54000.0,0
3,2,38.0,61000.0,0
4,1,40.0,63777.777778,1
5,0,35.0,58000.0,1
6,2,38.0,52000.0,0
7,0,48.0,79000.0,1
8,1,50.0,83000.0,0
9,0,37.0,67000.0,1


In [33]:
data_c = data_c.drop(['Country', 'Purchased'], axis = 1)
data_c

Unnamed: 0,Age,Salary
0,44.0,72000.0
1,27.0,48000.0
2,30.0,54000.0
3,38.0,61000.0
4,40.0,63777.777778
5,35.0,58000.0
6,38.0,52000.0
7,48.0,79000.0
8,50.0,83000.0
9,37.0,67000.0


In [34]:
sc_var = scaler.fit_transform(data_c)
print(pd.DataFrame(sc_var))

          0             1
0  0.769734  7.494733e-01
1 -1.699225 -1.438178e+00
2 -1.263526 -8.912655e-01
3 -0.101663 -2.532004e-01
4  0.188803  6.632192e-16
5 -0.537362 -5.266569e-01
6 -0.101663 -1.073570e+00
7  1.350666  1.387538e+00
8  1.641132  1.752147e+00
9 -0.246896  2.937125e-01


In [35]:
no_var = minmax.fit_transform(data_c)
print(pd.DataFrame(no_var))

          0         1
0  0.739130  0.685714
1  0.000000  0.000000
2  0.130435  0.171429
3  0.478261  0.371429
4  0.565217  0.450794
5  0.347826  0.285714
6  0.478261  0.114286
7  0.913043  0.885714
8  1.000000  1.000000
9  0.434783  0.542857
