In [213]:
# Performing std Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [214]:
# Load the raw dataset; the path is relative to the current working directory.
df = pd.read_csv("Data.csv")

In [215]:
df.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


### Taking care of missing data

In [216]:
# Feature matrix: every column except the last one ('Purchased').
# .copy() gives X its own data, so the column assignments made in later cells
# do not operate on a slice of df (avoids SettingWithCopyWarning and keeps df
# intact as the raw reference).
X = df.iloc[:, :-1].copy()
# Target vector: the 'Purchased' column.
y = df['Purchased']

In [217]:
from sklearn.impute import SimpleImputer

In [218]:
# Imputer that fills each missing value with the mean of its column.
imputer = SimpleImputer(strategy="mean")

In [219]:
# Learn the means of the numeric columns. Note that this runs on the full
# dataset, i.e. before the train/test split performed further down.
imputer = imputer.fit(X.loc[:, ["Age", "Salary"]])

In [220]:
# Fill the missing Age/Salary entries with the means learned by the imputer,
# then display the completed feature table.
X[["Age", "Salary"]] = imputer.transform(X.loc[:, ["Age", "Salary"]])
X

Unnamed: 0,Country,Age,Salary
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,63777.777778
5,France,35.0,58000.0
6,Spain,38.777778,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


#### Categorical Variable

In [221]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [222]:
# Encoder used in the following cells to turn the Country labels into integer codes.
label_encoder_x = LabelEncoder()

In [223]:
# Learn the distinct country labels. Kept as the cell's last expression so the
# fitted encoder is echoed as the cell output.
label_encoder_x.fit(X.loc[:, "Country"])

LabelEncoder()

In [224]:
# Overwrite the country names with the integer codes assigned by the encoder.
X["Country"] = label_encoder_x.transform(X.loc[:, "Country"])

In [225]:
X

Unnamed: 0,Country,Age,Salary
0,0,44.0,72000.0
1,2,27.0,48000.0
2,1,30.0,54000.0
3,2,38.0,61000.0
4,1,40.0,63777.777778
5,0,35.0,58000.0
6,2,38.777778,52000.0
7,0,48.0,79000.0
8,1,50.0,83000.0
9,0,37.0,67000.0


Everything works well with this process, but there is a problem: assigning 0, 1, 2 to different countries
 implies an ordering (precedence) between them that does not actually exist.

In [226]:
onehotencoder = OneHotEncoder(categorical_features = [0],)

In [227]:
X = onehotencoder.fit_transform(X).toarray()


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [228]:
# Encode the target with its own encoder. Re-fitting label_encoder_x here would
# overwrite the Country classes it learned, so the Country codes could no
# longer be decoded with label_encoder_x.inverse_transform.
label_encoder_y = LabelEncoder()
y = label_encoder_y.fit_transform(y)


In [229]:
X

array([[1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.40000000e+01,
        7.20000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 2.70000000e+01,
        4.80000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 3.00000000e+01,
        5.40000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.80000000e+01,
        6.10000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 4.00000000e+01,
        6.37777778e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.50000000e+01,
        5.80000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.87777778e+01,
        5.20000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.80000000e+01,
        7.90000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 5.00000000e+01,
        8.30000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.70000000e+01,
        6.70000000e+04]])

#### Train test split

In [230]:
from sklearn.model_selection import train_test_split

In [231]:
# Hold out 30% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

#### Feature Scaling

In [232]:
from sklearn.preprocessing import StandardScaler

In [233]:
# Scaler with default settings; it is fitted on the training split in the next cell.
sc = StandardScaler()

In [234]:
# Fit the scaler on the training rows only and scale them in the same step.
# Every column of X_train is scaled, including the one-hot country dummies.
X_train = sc.fit_transform(X_train)

In [235]:
# Scale the test rows with the statistics learned from the training rows
# (transform only, no re-fit), so no test information enters the scaler.
X_test = sc.transform(X_test)

In [236]:
X_train

array([[ 1.15470054, -0.63245553, -0.63245553,  0.8790543 ,  0.8892086 ],
       [ 1.15470054, -0.63245553, -0.63245553,  1.64292217,  1.67822469],
       [-0.8660254 ,  1.58113883, -0.63245553, -1.79448326, -1.1396899 ],
       [ 1.15470054, -0.63245553, -0.63245553, -0.45771448,  0.32562569],
       [-0.8660254 ,  1.58113883, -0.63245553,  0.11518643, -0.03757219],
       [-0.8660254 , -0.63245553,  1.58113883, -0.26674751, -0.35067382],
       [-0.8660254 , -0.63245553,  1.58113883, -0.11821765, -1.36512307]])

In [237]:
X_test

array([[-0.8660254 ,  1.58113883, -0.63245553,  2.02485611,  2.12909102],
       [-0.8660254 , -0.63245553,  1.58113883, -2.36738416, -1.8159894 ],
       [ 1.15470054, -0.63245553, -0.63245553, -0.83964842, -0.68882357]])