# Data Preprocessing Tools

## Import libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [2]:
# Load the CSV (relative path, resolved against the notebook's working directory) into a DataFrame.
dataset = pd.read_csv('Data.csv')
# Bare last expression: the notebook renders the DataFrame as a table.
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


- Independent variables (features): the columns that you are going to use to make a prediction.
- Dependent variable: the value that you are trying to predict.

In [3]:
# `.values` converts the pandas selection into a NumPy array, which strips
# the row and column labels.

# Independent variables (the features): every column except the last one.
X = dataset.iloc[:, :-1].values
# Dependent variable (the prediction target): the last column only.
y = dataset.iloc[:, -1].values

In [4]:
# Show the feature matrix: one row per sample (country, age, salary).
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [5]:
# Show the target vector: one 'Yes'/'No' label per row of X.
print(y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


## Take care of missing data

Looking at the X matrix, we see that there is missing data in the fifth row (third column, salary) and in the seventh row (second column, age).

In [6]:
# Print X again to locate the missing entries, which show up as nan.
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


Missing data can cause problems in a machine learning model. To handle it, we will use a scikit-learn class that fills each missing value with the average of its column.

In [7]:
from sklearn.impute import SimpleImputer

# SimpleImputer fills in missing entries column by column.
#   - missing_values=np.nan: entries equal to NaN are treated as missing.
#   - strategy='mean': each missing entry is replaced by the mean of its column.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Only columns 1 and 2 (age, salary) are numeric; column 0 holds the country
# names and cannot be averaged, so it is left out of the slice.
# fit_transform first learns the column means (fit) and then fills the gaps
# (transform); the result is written back into the same columns of X.
# NOTE(review): the means are computed from every row of X, including rows
# that the later train/test split places in the test set.
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

In [8]:
print(X) # The two missing values were replaced by the means of their respective columns

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


## Encode categorical data
The dataset in this folder has a categorical feature (the names of countries). Be careful: if you simply assign a number to each categorical value, the model could learn an ordering between the countries, which is an incorrect assumption. A better choice is one-hot encoding, which gives each possible category its own column and represents every row as a binary vector: a '1' in the column of the category that applies and a '0' in the others.

In this case, 'France', 'Germany', and 'Spain' will be converted into three columns whose values indicate whether that country is the one selected for the row. A similar approach is needed for the dependent variable.

### Encoding Independent Variable

In [9]:
# Use ColumnTransformer to apply one-hot (binary vector) encoding to the
# country column only.
#
# ColumnTransformer arguments:
#   1. transformers: list of (name, transformer, columns) tuples. Here the
#      step named 'encoder' applies OneHotEncoder to column index 0.
#   2. remainder='passthrough': keep the columns that are not transformed
#      instead of dropping them; they follow the encoded columns in the output.
#   3. sparse_threshold=0: always return a dense array. With the default
#      threshold the result can be a SciPy sparse matrix when the encoded
#      data is mostly zeros (e.g. many categories), and wrapping that in
#      np.array() would not produce a regular 2-D array.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
# Fit the encoder and transform X in one step, storing the result as a NumPy array.
# NOTE: this overwrites X, so re-running the cell would encode the already
# encoded first column again; re-run from the top of the notebook instead.
X = np.array(ct.fit_transform(X))

In [10]:
# After encoding, the first three columns are the one-hot country indicators,
# followed by the age and salary columns.
print(X)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


### Encoding Dependent Variable
Do the same as before, but now with a label encoder, which replaces each label with an integer.

In [11]:
from sklearn.preprocessing import LabelEncoder

# LabelEncoder maps every distinct label to an integer code.
# fit() learns the set of labels present in y and returns the encoder itself,
# so the fitted encoder can be stored directly in `lt`.
lt = LabelEncoder().fit(y)
# transform() replaces each label by its code ('No' -> 0, 'Yes' -> 1 for this
# data); the result is saved back into y as a NumPy array.
y = np.array(lt.transform(y))

In [12]:
# y now holds integer codes instead of the 'No'/'Yes' strings.
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

## Splitting the dataset into the training set and test set

In [13]:
from sklearn.model_selection import train_test_split

# train_test_split arguments:
#   1. X: the feature matrix
#   2. y: the dependent variable
#   3. test_size: fraction of the rows that goes into the test set (0.2 = 20%)
#   4. random_state: fixed seed, so the split matches the one in the video;
#      without it the rows are divided randomly on every run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [14]:
# Training features: 8 of the 10 rows.
print(X_train)

[[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]


In [15]:
# Training labels, one per row of X_train.
print(y_train)

[0 1 0 0 1 1 0 1]


In [16]:
# Test features: the remaining 2 rows.
print(X_test)

[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [17]:
# Test labels, one per row of X_test.
print(y_test)

[0 1]


## Feature Scaling
Feature scaling must be applied after splitting the dataset, because the test set is meant only for testing your model and should not influence the preprocessing in any way, shape, or form. Feature scaling computes statistics (the mean and standard deviation) from the data it is fitted on; if it were fitted on the whole dataset before the split, those statistics would include the test rows and cause information leakage. So the scaler is fitted on the training set only, and the test set is then transformed with those same training statistics.

The main two feature scaling techniques:
1. Standardization: $x_{\text{stand}} = \dfrac{x - \operatorname{mean}(x)}{\operatorname{std}(x)}$ → puts values roughly in $[-3, 3]$ (except for some outliers)
2. Normalization: $x_{\text{norm}} = \dfrac{x - \min(x)}{\max(x) - \min(x)}$ → puts values in $[0, 1]$

- Normalization is recommended when most of the features follow a normal distribution.
- Standardization works well all the time.

In [19]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()  # the default settings are enough here

# The one-hot encoded columns (indices 0-2) are left unscaled: scaling them
# would lose some information, and their values already lie within [-3, 3].
# Only the numerical columns are scaled: index 3 onwards, i.e. the last two
# columns (age and salary).

# fit() computes the mean and standard deviation from the training rows only.
scaler.fit(X_train[:, 3:])
# transform() applies those statistics and the result is written back in place.
X_train[:, 3:] = scaler.transform(X_train[:, 3:])
# In production the model sees new data, so the test set MUST be transformed
# with the same mean/deviation learned from the training set (no re-fitting).
X_test[:, 3:] = scaler.transform(X_test[:, 3:])

In [20]:
# Training features after scaling: the last two columns are standardized,
# the one-hot columns are unchanged.
print(X_train)

[[0.0 0.0 1.0 -0.19159184384578545 -1.0781259408412425]
 [0.0 1.0 0.0 -0.014117293757057777 -0.07013167641635372]
 [1.0 0.0 0.0 0.566708506533324 0.633562432710455]
 [0.0 0.0 1.0 -0.30453019390224867 -0.30786617274297867]
 [0.0 0.0 1.0 -1.9018011447007988 -1.420463615551582]
 [1.0 0.0 0.0 1.1475343068237058 1.232653363453549]
 [0.0 1.0 0.0 1.4379472069688968 1.5749910381638885]
 [1.0 0.0 0.0 -0.7401495441200351 -0.5646194287757332]]


In [21]:
# Test features, scaled with the scaler that was fitted on the training set.
print(X_test)

[[0.0 1.0 0.0 -1.4661817944830124 -0.9069571034860727]
 [1.0 0.0 0.0 -0.44973664397484414 0.2056403393225306]]
