The first step is to import the necessary modules.

In [57]:
import numpy as np

In [58]:
import pandas as pd

Now we'll read the data file to be preprocessed. It is located in the same folder as this notebook.

In [59]:
# Load the raw dataset; the relative path is resolved against the kernel's working directory.
df = pd.read_csv(filepath_or_buffer="Data.csv")

In [60]:
# Preview the first five rows to check that the file was parsed as expected.
df.head(n=5)

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [61]:
# Count the missing values in each column.
df.isna().sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

The next step is to separate the feature matrix (X) from the target vector (y).

In [62]:
# Feature matrix: the first three columns (Country, Age, Salary) as a NumPy array.
X = df.iloc[:, 0:3].to_numpy()

In [63]:
# Inspect the extracted feature matrix (Country, Age, Salary).
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [64]:
# Target vector: the last column (Purchased) as a NumPy array.
y = df.iloc[:, -1].to_numpy()

In [65]:
# Inspect the target vector (Purchased).
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

After collecting the data, we'll replace the NaN values using the SimpleImputer class. By default, each missing value is
replaced with the mean of its column.

In [66]:
from sklearn.impute import SimpleImputer

# Mean-impute the numeric columns (indices 1 and 2: Age and Salary).
# NOTE(review): the imputer is fitted on all rows before the train/test split,
# so the rows that later become the test set contribute to the column means.
si = SimpleImputer(missing_values=np.nan, strategy="mean")
si.fit(X[:, 1:3])
X[:, 1:3] = si.transform(X[:, 1:3])

In [67]:
# Inspect X again to check the imputed Age and Salary values.
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

Our data contains a categorical column (Country). If we encoded it as the integers 1, 2, 3, a model could treat categories
with larger numbers as more important, or assume an ordering that does not exist. To keep things fair, we use one-hot
encoding: each category gets its own column, holding 1 for rows that belong to that category and 0 otherwise.

In [68]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode column 0 (Country); every other column is passed through unchanged.
ct = ColumnTransformer(
    transformers=[("OneHot", OneHotEncoder(), [0])],
    remainder="passthrough",
)

In [69]:
# Learn the categories in column 0, then replace X with its encoded form.
# X is overwritten, so re-running this cell would encode the already-encoded matrix.
ct.fit(X)
X = ct.transform(X)

In [70]:
# Inspect X after one-hot encoding: the encoded Country columns come first, then Age and Salary.
X

array([[1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 35.0, 58000.0],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

Now, we'll divide our dataset into two parts, the train data and test data.

In [71]:
from sklearn.model_selection import train_test_split

In [72]:
# Hold out 30% of the rows for testing; the fixed seed makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [73]:
# Inspect the training features.
X_train

array([[1.0, 0.0, 0.0, 37.0, 67000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [1.0, 0.0, 0.0, 44.0, 72000.0],
       [1.0, 0.0, 0.0, 35.0, 58000.0]], dtype=object)

In [74]:
# Inspect the held-out test features.
X_test

array([[0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778]], dtype=object)

Looking at the values in any single row, the columns are on very different scales (for example, Salary is in the tens of
thousands while Age is in the tens), so the columns with larger values would dominate. We therefore scale the features so that every attribute contributes equally.

In [75]:
from sklearn.preprocessing import StandardScaler

# Standardize every column to zero mean and unit variance.
# The object-dtype arrays are cast to float first so the scaler receives numeric input.
sc = StandardScaler()

# Learn the per-column mean and standard deviation from the training set only.
X_train = sc.fit_transform(X_train.astype(float))

# Apply the statistics learned from the training set to the test set.
# Calling fit_transform here would re-fit the scaler on the test rows, which
# puts train and test on different scales and leaks test information.
X_test = sc.transform(X_test.astype(float))

In [76]:
# Inspect the standardized training features.
X_train

array([[ 0.8660254 ,  0.        , -0.8660254 , -0.2029809 ,  0.44897083],
       [-1.15470054,  0.        ,  1.15470054, -1.82168936, -1.41706417],
       [-1.15470054,  0.        ,  1.15470054,  0.08478949, -1.0242147 ],
       [ 0.8660254 ,  0.        , -0.8660254 ,  1.5775984 ,  1.62751925],
       [-1.15470054,  0.        ,  1.15470054, -0.04111006, -0.14030338],
       [ 0.8660254 ,  0.        , -0.8660254 ,  0.93011502,  0.94003267],
       [ 0.8660254 ,  0.        , -0.8660254 , -0.52672259, -0.43494049]])