## Importing the Libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn

## Importing the data

In [3]:
data = pd.read_csv('data/data.csv')
# Splitting data into X and y data
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

In [6]:
data.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [7]:
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [8]:
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

## Taking care of missing data
### Using sklearn `SimpleImputer`

In [14]:
# Visualizing where there is missing data
data.isnull().sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

In [18]:
from sklearn.impute import SimpleImputer
# Here, I fill in the data using with the  mean value of the particular column 
imputer = SimpleImputer(missing_values = np.nan,
                       strategy = 'mean')

# Filling in the missing observations in the Age and Salary columns
imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])

In [19]:
# Checking if X has been filled
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

## Encoding Categorical Independent Variable Data (X)
We have to convert the `Country` Categorical column form type (String) in to a form that can be passed through the model.  
Done by **OneHotEncoding** the `Country` column which will create three new binary columns linked to the `Country` name. 

In [20]:
data.dtypes

Country       object
Age          float64
Salary       float64
Purchased     object
dtype: object

In [22]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Creating a columntransfomer object
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])],
                      remainder = 'passthrough')

# OneHotEncoding the X data
X = np.array(ct.fit_transform(X))

In [24]:
X

array([[1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 35.0, 58000.0],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

## Encoding the Dependent Variable Data (y)
Encoding the yes and no observations in y data in to 0's and 1's  
Using the `LabelEncoder` from sklearn.

In [25]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

In [26]:
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

## Splitting the dataset into the training and test set

**We split the dataset into training and test sets BEFORE doing Feature Scaling to prevent information leakage from the future test data into the training data during the transformation process**

In [27]:
from sklearn.model_selection import train_test_split

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

In [28]:
X_train

array([[0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 35.0, 58000.0]], dtype=object)

In [29]:
X_test

array([[0.0, 1.0, 0.0, 30.0, 54000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

In [30]:
y_train

array([0, 1, 0, 0, 1, 1, 0, 1])

In [31]:
y_test

array([0, 1])

## Feature Scaling
![](images/feature_scaling.jpg)

### Standardization
Values of features will fall between the ranges of (-3 -> 3)  
**Can be used ubiquitously**
### Normalization
Values of features will fall between the ranges of (0 -> 1)  
**Should be used on data that follows a normal distribution curve**

In [34]:
data.describe()

Unnamed: 0,Age,Salary
count,9.0,9.0
mean,38.777778,63777.777778
std,7.693793,12265.579662
min,27.0,48000.0
25%,35.0,54000.0
50%,38.0,61000.0
75%,44.0,72000.0
max,50.0,83000.0


**We notice that the `std` and `mean` as well as the scale of values in the `Salary` column far exceed those from the `Age` column which means that the effect of the `Salary`  will dominate the effect of the `Age` column.**  
**Therefore, we apply feature scaling to ensure that there is no effect mismatch btwn the two numerical variable data values.**

### Standardizing the data

In [35]:
from sklearn.preprocessing import StandardScaler

# Creating a standardscaler object
sc = StandardScaler()

# Scaling the variables besides the onehotencoded/dummy variables
X_train[:, 3:] = sc.fit_transform(X_train[:, 3:])
X_test[:, 3:] = sc.transform(X_test[:, 3:])

In [36]:
X_train

array([[0.0, 0.0, 1.0, -0.19159184384578545, -1.0781259408412425],
       [0.0, 1.0, 0.0, -0.014117293757057777, -0.07013167641635372],
       [1.0, 0.0, 0.0, 0.566708506533324, 0.633562432710455],
       [0.0, 0.0, 1.0, -0.30453019390224867, -0.30786617274297867],
       [0.0, 0.0, 1.0, -1.9018011447007988, -1.420463615551582],
       [1.0, 0.0, 0.0, 1.1475343068237058, 1.232653363453549],
       [0.0, 1.0, 0.0, 1.4379472069688968, 1.5749910381638885],
       [1.0, 0.0, 0.0, -0.7401495441200351, -0.5646194287757332]],
      dtype=object)

In [37]:
X_test

array([[0.0, 1.0, 0.0, -1.4661817944830124, -0.9069571034860727],
       [1.0, 0.0, 0.0, -0.44973664397484414, 0.2056403393225306]],
      dtype=object)