# Data Preprocessing Tools

## 1) Importing Libraries

In [49]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## 2) Importing data set

In [50]:
# Location of the raw CSV file that is read into a DataFrame
csv_path = "C:\\Users\\kava2\\Documents\\Udemy\\MachineLearningA_Z\\Datasets\\Data.csv"
dataset = pd.read_csv(csv_path)

# iloc selects by integer position: [rows, columns].
# x takes every row and every column except the last one (the features);
# y takes every row of the last column only (the target).
# .values turns the pandas objects into NumPy arrays.
x = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [51]:
# Show the feature matrix (every column of the dataset except the last one)
print(x)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [52]:
# Show the target vector (the last column of the dataset)
print(y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


## 3) Taking Care of Missing Data

In [53]:
# Fill each missing numeric entry with the mean of its column, using scikit-learn

from sklearn.impute import SimpleImputer

# np.nan is the marker treated as "missing"; 'mean' replaces it with the column average
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)

# Columns 1 and 2 are the numeric ones: learn their means and fill the gaps in a single call
x[:, 1:3] = imputer.fit_transform(x[:, 1:3])

# NOTE(review): the means are computed over all rows of x, i.e. before the
# train/test split performed later in this notebook.

# Inspect the imputed matrix
print(x)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


## 4) Encoding Categorical Data Using One-Hot Encoding (Binary Code)

### Encoding the Independent Variable

In [54]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical column at index 0.
# remainder='passthrough' keeps all other columns in addition to the encoded one.
# sparse_threshold=0 makes ColumnTransformer always return a dense array: with the
# default threshold (0.3) a sufficiently sparse one-hot result would come back as a
# SciPy sparse matrix, which np.array() cannot turn into a regular 2-D array and
# which would break the column slicing done in later cells.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)

# Fit the encoder and transform x in one step; keep the result as a NumPy array
x = np.array(ct.fit_transform(x))

print(x)


[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


### Encoding the Dependent Variable

In [55]:
from sklearn.preprocessing import LabelEncoder

# Learn the distinct labels present in y, then replace each label by its integer code
le = LabelEncoder()
le.fit(y)
y = le.transform(y)

print(y)

[0 1 0 0 1 1 0 1 0 1]


## 5) Splitting the Data set into Test and Train set

Feature Scaling should always be done after splitting the data into train and test. This is because performing feature scaling on the data set before split can cause some information leakage from the Test data set and this might affect the accuracy of the model during testing. Test data is supposed to be unseen by the model and hence, should not be used for scaling.

In [56]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state fixes the seed so the split is repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

# Check all four resulting arrays, in the order: train features, test features, train target, test target
for split in (x_train, x_test, y_train, y_test):
    print(split)

[[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]
[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]
[0 1 0 0 1 1 0 1]
[0 1]


## 6) Feature Scaling

Feature scaling (also called Data Normalization) is performed to bring all the input features onto a comparable range of values. Some ML objective functions require that all features lie within a normalized range. Also, optimization algorithms such as Gradient Descent converge faster when the features are scaled.

There are two techniques in Feature Scaling.

1) Feature Standardization: $x_{std} = \dfrac{x - \text{mean}(x)}{\text{SD}(x)}$
   - The result has mean 0 and standard deviation 1; most values will fall roughly between -3 and +3

2) Feature Normalization: $x_{norm} = \dfrac{x - \min(x)}{\max(x) - \min(x)}$
   - All values will take values between 0 and 1
   
Standardization works almost always, while Normalization is recommended when the data contains features with a Normal Distribution. Hence, it is recommended to go with Standardization.

In [None]:
from sklearn.preprocessing import StandardScaler

# Scaling would make the one-hot encoded categorical columns lose their interpretability,
# so only the columns from index 3 onward (the ones after the encoded columns) are scaled.
numeric_cols = slice(3, None)

# Standard Scaler object; its statistics are learned from the training rows only
sc = StandardScaler()
sc.fit(x_train[:, numeric_cols])

# Apply the training-set statistics to both splits
x_train[:, numeric_cols] = sc.transform(x_train[:, numeric_cols])
x_test[:, numeric_cols] = sc.transform(x_test[:, numeric_cols])

In [59]:
# Check the training features after scaling (columns from index 3 onward were standardized)
print(x_train)

[[0.0 0.0 1.0 -0.19159184384578545 -1.0781259408412425]
 [0.0 1.0 0.0 -0.014117293757057777 -0.07013167641635372]
 [1.0 0.0 0.0 0.566708506533324 0.633562432710455]
 [0.0 0.0 1.0 -0.30453019390224867 -0.30786617274297867]
 [0.0 0.0 1.0 -1.9018011447007988 -1.420463615551582]
 [1.0 0.0 0.0 1.1475343068237058 1.232653363453549]
 [0.0 1.0 0.0 1.4379472069688968 1.5749910381638885]
 [1.0 0.0 0.0 -0.7401495441200351 -0.5646194287757332]]


In [60]:
# Check the test features, transformed with the scaler that was fitted on the training split
print(x_test)

[[0.0 1.0 0.0 -1.4661817944830124 -0.9069571034860727]
 [1.0 0.0 0.0 -0.44973664397484414 0.2056403393225306]]
