<h3 align="center">Importing Libraries</h3>

In [140]:
import numpy as np               # Numeric Data Manipulation
import matplotlib.pyplot as plt  # Data visualization
import pandas as pd
from sklearn.preprocessing import Imputer               # Preprocessing of missing data
from sklearn.preprocessing import LabelEncoder          # For catagorical encoding
from sklearn.preprocessing import OneHotEncoder         # For Catagorical Encoding
from sklearn.cross_validation import train_test_split   # For Splitting the Datasets into Test and Train.
from sklearn.preprocessing import StandardScaler        # To scale the features

<h3 align="center">Importing Data</h3>

In [141]:
dataset = pd.read_csv("Data.csv")
dataset.head(5)

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


<h4><code> iloc[Rows, Columns] </code>  ------->   Returns Data Frame </h4>
<h4><code> iloc[Rows, Columns].values </code>  ------->   Returns Data Frame </h4>

In [142]:
# iloc[Rows, Columns]                      Returns Data Frame
# iloc[Rows, Columns].values               Returns the array
X = dataset.iloc[:,:-1].values
Y = dataset.iloc[:,-1:].values

In [143]:
Y

array([['No'],
       ['Yes'],
       ['No'],
       ['No'],
       ['Yes'],
       ['Yes'],
       ['No'],
       ['Yes'],
       ['No'],
       ['Yes']], dtype=object)

<h3 align="center">Handling Missing Data</h3>

<h4> There are few missing values. First we'll take mean of all the column for that respective column for that missing value</h4>

In [144]:
# axis = 0 ---> shows columns
# axis = 1 ---> shows Rows
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
imputer.fit(X[:, 1:3])   # Not the upper bound 1:3 ---> 1 to 2
X[:, 1:3] = imputer.transform(X[:, 1:3]) 
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

<h3 align="center">Handling Catagorical Data</h3>

<h4>First Convert it into Labels via LabelEncoder and then encode into OneHot Encoding Scheme</h4>

In [145]:
label_X = LabelEncoder()
X[:, 0] = label_X.fit_transform(X[:, 0]) # Encoding X
X  # Focus on 1st column
# Problem is one country with high value might get precedence if we put them in equation.

array([[0, 44.0, 72000.0],
       [2, 27.0, 48000.0],
       [1, 30.0, 54000.0],
       [2, 38.0, 61000.0],
       [1, 40.0, 63777.77777777778],
       [0, 35.0, 58000.0],
       [2, 38.77777777777778, 52000.0],
       [0, 48.0, 79000.0],
       [1, 50.0, 83000.0],
       [0, 37.0, 67000.0]], dtype=object)


<h3 align="center">Onehot Encoder For Catagorical Data</h3>

In [146]:
oneHot_X = OneHotEncoder(categorical_features= [0])
X = oneHot_X.fit_transform(X).toarray()
X

array([[  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          4.40000000e+01,   7.20000000e+04],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          2.70000000e+01,   4.80000000e+04],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          3.00000000e+01,   5.40000000e+04],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          3.80000000e+01,   6.10000000e+04],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          4.00000000e+01,   6.37777778e+04],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          3.50000000e+01,   5.80000000e+04],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          3.87777778e+01,   5.20000000e+04],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          4.80000000e+01,   7.90000000e+04],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          5.00000000e+01,   8.30000000e+04],
       [  1.00000000e+00,   0.0000000

In [147]:
# Similarly with Y Column
label_Y = LabelEncoder()
Y = label_Y.fit_transform(Y)             # Encoding Y
Y  # No need for oneHot Encoding. It is only needed if catagorical variables are greater than 2.

  y = column_or_1d(y, warn=True)


array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1], dtype=int64)

<h3 align="center">Splitting Data into Test and Training Dataset</h3>

In [148]:
X_Train, X_Test, Y_Train, Y_Test = train_test_split(X, Y, test_size = 0.3)

<h3 align="center">Feature Scaling</h3>

In [149]:
X_Scale = StandardScaler()
X_Train = X_Scale.fit_transform(X_Train) # Fit the Training data
X_Test = X_Scale.transform(X_Test)       # No need to fit the testing data only transformation will work.

# Note: We don't need to apply feature scaling on dependent variable because it our case it's catagorical variable
# But this could be different in case of regression.