# Data Preprocessing

## Topics:
* Importing Python Library
* Reading Data
* Missing Data
* Dealing with Categorical Data
* Splitting Data
* Normalize Data

## Import Python Library, Read Data

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the raw dataset; "Data.csv" is resolved relative to the current working directory
data = pd.read_csv("Data.csv")

In [3]:
data

Unnamed: 0,City,Age,Smoke,HappinessIndex,Healthy
0,Mumbai,24.0,Yes,241.0,Yes
1,London,80.0,No,928.0,No
2,NewYork,38.0,Yes,,Yes
3,NewYork,22.0,Yes,786.0,Yes
4,NewYork,36.0,Yes,967.0,Yes
5,London,,Yes,665.0,Yes
6,Mumbai,17.0,No,293.0,No
7,NewYork,28.0,No,494.0,Yes
8,Mumbai,45.0,No,707.0,No
9,London,29.0,Yes,599.0,No


In [4]:
# Separate the features X (first four columns) from the target y (fifth column)
X, y = data.iloc[:, :4].values, data.iloc[:, 4].values

In [5]:
X

array([['Mumbai', 24.0, 'Yes', 241.0],
       ['London', 80.0, 'No', 928.0],
       ['NewYork', 38.0, 'Yes', nan],
       ['NewYork', 22.0, 'Yes', 786.0],
       ['NewYork', 36.0, 'Yes', 967.0],
       ['London', nan, 'Yes', 665.0],
       ['Mumbai', 17.0, 'No', 293.0],
       ['NewYork', 28.0, 'No', 494.0],
       ['Mumbai', 45.0, 'No', 707.0],
       ['London', 29.0, 'Yes', 599.0]], dtype=object)

In [6]:
y

array(['Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'No'], dtype=object)

## Missing Data

In [7]:
X

array([['Mumbai', 24.0, 'Yes', 241.0],
       ['London', 80.0, 'No', 928.0],
       ['NewYork', 38.0, 'Yes', nan],
       ['NewYork', 22.0, 'Yes', 786.0],
       ['NewYork', 36.0, 'Yes', 967.0],
       ['London', nan, 'Yes', 665.0],
       ['Mumbai', 17.0, 'No', 293.0],
       ['NewYork', 28.0, 'No', 494.0],
       ['Mumbai', 45.0, 'No', 707.0],
       ['London', 29.0, 'Yes', 599.0]], dtype=object)

In [8]:
from sklearn.preprocessing import Imputer
#to deal with empty data

In [9]:
Imputer?
#to check help

In [10]:
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)

In [11]:
X[:, 1:2] = imputer.fit_transform(X[:, 1:2]) # column 1 (Age)
X[:, 3:4] = imputer.fit_transform(X[:, 3:4]) # column 3 (HappinessIndex)

# fit - take the input data and compute the fill value using the specified strategy (e.g. mean)
# transform - replace the missing entries with the computed value
# fit + transform = fit_transform

In [12]:
X

array([['Mumbai', 24.0, 'Yes', 241.0],
       ['London', 80.0, 'No', 928.0],
       ['NewYork', 38.0, 'Yes', 631.1111111111111],
       ['NewYork', 22.0, 'Yes', 786.0],
       ['NewYork', 36.0, 'Yes', 967.0],
       ['London', 35.44444444444444, 'Yes', 665.0],
       ['Mumbai', 17.0, 'No', 293.0],
       ['NewYork', 28.0, 'No', 494.0],
       ['Mumbai', 45.0, 'No', 707.0],
       ['London', 29.0, 'Yes', 599.0]], dtype=object)

# Categorical Data

In [13]:
X

array([['Mumbai', 24.0, 'Yes', 241.0],
       ['London', 80.0, 'No', 928.0],
       ['NewYork', 38.0, 'Yes', 631.1111111111111],
       ['NewYork', 22.0, 'Yes', 786.0],
       ['NewYork', 36.0, 'Yes', 967.0],
       ['London', 35.44444444444444, 'Yes', 665.0],
       ['Mumbai', 17.0, 'No', 293.0],
       ['NewYork', 28.0, 'No', 494.0],
       ['Mumbai', 45.0, 'No', 707.0],
       ['London', 29.0, 'Yes', 599.0]], dtype=object)

In [14]:
y

array(['Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'No'], dtype=object)

### Converting Categorical Data
Strategy 1:
> Yes -> 1, No -> 0  
> London -> 0, Mumbai -> 1, NewYork -> 2 (LabelEncoder assigns codes in sorted order)

When more than 2 levels are available in a single feature:  
For first feature, use LabelEncoder  
  
For second feature, use OneHotEncoder

| Mumbai | London | New York |
|--------|--------|----------|
| 1      | 0      | 0        |
| 0      | 1      | 0        |
| 0      | 0      | 1        |


> LabelEncoder -- Converts categorical data into numeric data
> OneHotEncoder -- Converts into matrix format with values 0 and 1

In [15]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [16]:
# One encoder for the text columns of X and a separate one for the target y
le_x = LabelEncoder()
le_y = LabelEncoder()

In [17]:
# Encode the two text columns of X as integers: column 0 (City) and column 2 (Smoke).
# NOTE: le_x is re-fitted by the second call, so after this cell it only holds the
# classes of column 2; the City mapping can no longer be recovered from it.
X[:, 0] = le_x.fit_transform(X[:, 0])
X[:, 2] = le_x.fit_transform(X[:, 2])

# Encode the target labels as integers
y = le_y.fit_transform(y)

In [18]:
X

array([[1, 24.0, 1, 241.0],
       [0, 80.0, 0, 928.0],
       [2, 38.0, 1, 631.1111111111111],
       [2, 22.0, 1, 786.0],
       [2, 36.0, 1, 967.0],
       [0, 35.44444444444444, 1, 665.0],
       [1, 17.0, 0, 293.0],
       [2, 28.0, 0, 494.0],
       [1, 45.0, 0, 707.0],
       [0, 29.0, 1, 599.0]], dtype=object)

In [19]:
y

array([1, 0, 1, 1, 1, 1, 0, 1, 0, 0], dtype=int64)

In [20]:
ohe = OneHotEncoder(categorical_features=[0])

In [21]:
X = ohe.fit_transform(X).toarray()
# tranformed 'City' into matrix
# i.e. transformed 1 value into 3 values
# i.e. tranformed 4 features into 6 features

In [22]:
X
# first 3 values represent City
# rest of the values represent the rest of the features

array([[   0.        ,    1.        ,    0.        ,   24.        ,
           1.        ,  241.        ],
       [   1.        ,    0.        ,    0.        ,   80.        ,
           0.        ,  928.        ],
       [   0.        ,    0.        ,    1.        ,   38.        ,
           1.        ,  631.11111111],
       [   0.        ,    0.        ,    1.        ,   22.        ,
           1.        ,  786.        ],
       [   0.        ,    0.        ,    1.        ,   36.        ,
           1.        ,  967.        ],
       [   1.        ,    0.        ,    0.        ,   35.44444444,
           1.        ,  665.        ],
       [   0.        ,    1.        ,    0.        ,   17.        ,
           0.        ,  293.        ],
       [   0.        ,    0.        ,    1.        ,   28.        ,
           0.        ,  494.        ],
       [   0.        ,    1.        ,    0.        ,   45.        ,
           0.        ,  707.        ],
       [   1.        ,    0.        ,

# Splitting Data in Training and Testing Set

Machine Learning = Training + Testing

In [23]:
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import train_test_split

In [24]:
# Hold out 30% of the rows for testing. A fixed random_state makes the shuffled
# split reproducible across kernel restarts (Restart & Run All gives the same rows).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [27]:
X_train

array([[   1.        ,    0.        ,    0.        ,   35.44444444,
           1.        ,  665.        ],
       [   0.        ,    0.        ,    1.        ,   28.        ,
           0.        ,  494.        ],
       [   0.        ,    0.        ,    1.        ,   36.        ,
           1.        ,  967.        ],
       [   0.        ,    0.        ,    1.        ,   38.        ,
           1.        ,  631.11111111],
       [   0.        ,    1.        ,    0.        ,   45.        ,
           0.        ,  707.        ],
       [   1.        ,    0.        ,    0.        ,   80.        ,
           0.        ,  928.        ],
       [   0.        ,    1.        ,    0.        ,   17.        ,
           0.        ,  293.        ]])

In [28]:
y_train

array([1, 1, 1, 1, 0, 0, 0], dtype=int64)

In [29]:
X_test

array([[   1.,    0.,    0.,   29.,    1.,  599.],
       [   0.,    1.,    0.,   24.,    1.,  241.],
       [   0.,    0.,    1.,   22.,    1.,  786.]])

In [30]:
y_test

array([0, 1, 1], dtype=int64)

# Normalize Data

In [31]:
X_train

array([[   1.        ,    0.        ,    0.        ,   35.44444444,
           1.        ,  665.        ],
       [   0.        ,    0.        ,    1.        ,   28.        ,
           0.        ,  494.        ],
       [   0.        ,    0.        ,    1.        ,   36.        ,
           1.        ,  967.        ],
       [   0.        ,    0.        ,    1.        ,   38.        ,
           1.        ,  631.11111111],
       [   0.        ,    1.        ,    0.        ,   45.        ,
           0.        ,  707.        ],
       [   1.        ,    0.        ,    0.        ,   80.        ,
           0.        ,  928.        ],
       [   0.        ,    1.        ,    0.        ,   17.        ,
           0.        ,  293.        ]])

In [32]:
from sklearn.preprocessing import StandardScaler

In [34]:
sc_x = StandardScaler()
# StandardScaler standardizes each column: (X - Xmean) / Xstd
# (min-max scaling, (X - Xmin) / (Xmax - Xmin), is a different technique and is not what is applied here)

In [35]:
# Learn the per-column mean/std from the training split only,
# then apply that same scaling to both the training and the test split
sc_x.fit(X_train)
X_train, X_test = sc_x.transform(X_train), sc_x.transform(X_test)

In [36]:
X_train

array([[ 1.58113883, -0.63245553, -0.8660254 , -0.24488912,  1.15470054,
        -0.01978826],
       [-0.63245553, -0.63245553,  1.15470054, -0.65216925, -0.8660254 ,
        -0.80642621],
       [-0.63245553, -0.63245553,  1.15470054, -0.21449508,  1.15470054,
         1.36947878],
       [-0.63245553, -0.63245553,  1.15470054, -0.10507654,  1.15470054,
        -0.17568467],
       [-0.63245553,  1.58113883, -0.8660254 ,  0.27788836, -0.8660254 ,
         0.17342107],
       [ 1.58113883, -0.63245553, -0.8660254 ,  2.19271284, -0.8660254 ,
         1.19007012],
       [-0.63245553,  1.58113883, -0.8660254 , -1.25397122, -0.8660254 ,
        -1.73107083]])

In [37]:
X_test

array([[ 1.58113883, -0.63245553, -0.8660254 , -0.59745997,  1.15470054,
        -0.32340291],
       [-0.63245553,  1.58113883, -0.8660254 , -0.87100633,  1.15470054,
        -1.97028237],
       [-0.63245553, -0.63245553,  1.15470054, -0.98042487,  1.15470054,
         0.5368386 ]])