In [115]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [116]:
dataset = pd.read_csv('mini_data.csv')

In [117]:
print(dataset)

  Profession   Age   Salary Purchased
0     Doctor  44.0  72000.0        No
1     Lawyer  27.0  48000.0       Yes
2     Banker  30.0  54000.0        No
3     Lawyer  38.0  61000.0        No
4     Banker  40.0      NaN       Yes
5     Doctor  35.0  58000.0       Yes
6     Lawyer   NaN  52000.0        No
7     Doctor  48.0  79000.0       Yes
8     Banker  50.0  83000.0        No
9     Doctor  37.0  67000.0       Yes


In [118]:
X = dataset.iloc[ : , 0:-1].values

In [119]:
print(X)

[['Doctor' 44.0 72000.0]
 ['Lawyer' 27.0 48000.0]
 ['Banker' 30.0 54000.0]
 ['Lawyer' 38.0 61000.0]
 ['Banker' 40.0 nan]
 ['Doctor' 35.0 58000.0]
 ['Lawyer' nan 52000.0]
 ['Doctor' 48.0 79000.0]
 ['Banker' 50.0 83000.0]
 ['Doctor' 37.0 67000.0]]


In [120]:
Y = dataset.iloc[ : , 3].values

In [121]:
print(Y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


In [122]:
from sklearn.impute import SimpleImputer

In [123]:
imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')

In [124]:
fitting = imputer.fit(X[ : , 1:])

In [125]:
transformation = fitting.transform(X[ : , 1:])

In [126]:
X[ : , 1: ] = transformation

In [127]:
print(X)

[['Doctor' 44.0 72000.0]
 ['Lawyer' 27.0 48000.0]
 ['Banker' 30.0 54000.0]
 ['Lawyer' 38.0 61000.0]
 ['Banker' 40.0 63777.77777777778]
 ['Doctor' 35.0 58000.0]
 ['Lawyer' 38.77777777777778 52000.0]
 ['Doctor' 48.0 79000.0]
 ['Banker' 50.0 83000.0]
 ['Doctor' 37.0 67000.0]]


# Encoding categorical variable

## Encoding independent variable

In [128]:
from sklearn.compose import ColumnTransformer

In [129]:
from sklearn.preprocessing import OneHotEncoder

In [130]:
ct = ColumnTransformer(transformers = [('encoder', OneHotEncoder(), [0])], remainder = 'passthrough')

In [131]:
X = np.array(ct.fit_transform(X))

In [132]:
print(X)

[[0.0 1.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [1.0 0.0 0.0 40.0 63777.77777777778]
 [0.0 1.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 48.0 79000.0]
 [1.0 0.0 0.0 50.0 83000.0]
 [0.0 1.0 0.0 37.0 67000.0]]


## Encoding dependent variable

In [133]:
from sklearn.preprocessing import LabelEncoder

In [134]:
le = LabelEncoder()

In [135]:
Y = le.fit_transform(Y)

In [136]:
print(Y)

[0 1 0 0 1 1 0 1 0 1]


# Splitting the data set into training set and test set

In [137]:
from sklearn.model_selection import train_test_split

In [138]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=1)

In [139]:
print(X_train)

[[0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 40.0 63777.77777777778]
 [0.0 1.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 48.0 79000.0]
 [1.0 0.0 0.0 50.0 83000.0]
 [0.0 1.0 0.0 35.0 58000.0]]


In [140]:
print(X_test)

[[1.0 0.0 0.0 30.0 54000.0]
 [0.0 1.0 0.0 37.0 67000.0]]


In [141]:
print(Y_train)

[0 1 0 0 1 1 0 1]


In [142]:
print(Y_test)

[0 1]


# Feature Scaling

In [143]:
from sklearn.preprocessing import StandardScaler

In [144]:
sc = StandardScaler()

In [145]:
X_train[ : , 3: ] = sc.fit_transform(X_train[ : , 3: ])

In [146]:
print(X_train)

[[0.0 0.0 1.0 -0.19159184384578545 -1.0781259408412425]
 [1.0 0.0 0.0 -0.014117293757057777 -0.07013167641635372]
 [0.0 1.0 0.0 0.566708506533324 0.633562432710455]
 [0.0 0.0 1.0 -0.30453019390224867 -0.30786617274297867]
 [0.0 0.0 1.0 -1.9018011447007988 -1.420463615551582]
 [0.0 1.0 0.0 1.1475343068237058 1.232653363453549]
 [1.0 0.0 0.0 1.4379472069688968 1.5749910381638885]
 [0.0 1.0 0.0 -0.7401495441200351 -0.5646194287757332]]


In [147]:
#MeanX_train = np.mean(X_train[ : , 3 ])

In [148]:
#print(MeanX_train)

In [149]:
#StdX_train = np.std(X_train[ : , 3 ])

In [150]:
#print(StdX_train)

In [151]:
#X_trainFirstScalerValue = (X_train[ 0 , 3 ] - MeanX_train) / StdX_train

In [152]:
#print(X_trainFirstScalerValue)

In [153]:
X_test[ : , 3: ] = sc.transform(X_test[ : , 3: ])

In [154]:
print(X_test)

[[1.0 0.0 0.0 -1.4661817944830124 -0.9069571034860727]
 [0.0 1.0 0.0 -0.44973664397484414 0.2056403393225306]]
