# Data Preprocessing Tools

## Importing the libraries

In [19]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Importing the dataset

In [20]:
# Load the raw table. Judging by the matrices printed further down, the columns
# are: country, age, salary and the purchase decision (last column).
dataset = pd.read_csv('Data.csv')

# Convenience views, one per column. Each view reads its own column index:
# 0 = country, 1 = age, 2 = salary, last = purchased.
countries = dataset.iloc[:, 0].values
age = dataset.iloc[:, 1].values
salary = dataset.iloc[:, 2].values
purchased = dataset.iloc[:, -1].values

# Feature matrix: every column except the last one.
x = dataset.iloc[:, :-1].values
# Target vector: the last column only.
y = dataset.iloc[:, -1].values

In [21]:
# Inspect the raw feature matrix (country, age, salary); the nan entries in age and salary are still present here.
print(x)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [22]:
# Inspect the raw target vector: one 'Yes'/'No' string per row.
print(y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


## Taking care of missing data

In [23]:
from sklearn.impute import SimpleImputer

# Replace every NaN in the numeric columns (indices 1 and 2) with the mean of its own column.
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
# fit_transform learns the per-column means and applies them in a single call.
x[:, 1:3] = imputer.fit_transform(x[:, 1:3])

In [24]:
# Check the imputation: the previously missing age and salary now hold the column means (about 38.78 and 63777.78).
print(x)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


## Encoding categorical data

### Encoding the Independent Variable

In [25]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode column 0 (the country). Every other column is passed through
# untouched and ends up to the right of the new indicator columns.
ct = ColumnTransformer(
    remainder='passthrough',
    transformers=[
        ('encoder', OneHotEncoder(), [0]),
    ],
)
# x is overwritten with the encoded matrix, so run this cell once per fresh x.
x = np.array(ct.fit_transform(x))

In [26]:
# Each new leading column is the one-hot indicator of one country
"""
OneHotEncoder orders the categories alphabetically, so:
First column corresponds to France
Second column corresponds to Germany
Third column corresponds to Spain
(compare with the rows printed before encoding: 'Spain' rows become 0 0 1 and 'Germany' rows become 0 1 0)
The last two columns are the passed-through age and salary
-------------------
If more countries were in our table, then we would have more columns
"""
print(x)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


### Encoding the Dependent Variable

In [27]:
from sklearn.preprocessing import LabelEncoder

# Turn the 'No'/'Yes' strings of the target into integer class labels
# (with this data: 'No' -> 0, 'Yes' -> 1).
le = LabelEncoder()
y = le.fit_transform(y)

In [28]:
"""
LabelEncoder maps each distinct value stored in y to an integer label: here 'No' -> 0 and 'Yes' -> 1.
We could instead have used OneHotEncoder inside a "ColumnTransformer", the same process as we did with x.
However, LabelEncoder is used here just to show a different method of encoding.
"""
print(y)

[0 1 0 0 1 1 0 1 0 1]


## Splitting the dataset into the Training set and Test set

In [29]:
# NOTE(review): empty placeholder string. It only echoes '\n\n' as the cell output;
# consider removing this cell or moving any notes about the split into a markdown cell.
"""

"""

'\n\n'

In [30]:
from sklearn.model_selection import train_test_split

# train_test_split returns a (train, test) pair for every array passed in:
# two inputs (x, y) give four outputs, three inputs would give six.
#
# test_size=0.2 keeps 80% of the rows for training and holds out 20% for
# testing (a commonly recommended ratio); random_state pins the shuffle so the
# same split is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:
# The first three columns are the one-hot encoding of the country (one column per country).
# The following column is the age.
# The final one is the salary.
# (Kept as comments: a trailing bare string would be echoed as the cell's output.)
print(x_train)

[[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]


In [None]:
# Column layout of the held-out rows: the one-hot encoded country comes first,
# then the age and then the salary.
# (Kept as comments: a trailing bare string would be echoed as the cell's output.)
print(x_test)

[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [None]:
# These 'purchased' decisions correspond, row by row, to the 8 rows of x_train.
# (Kept as a comment: a trailing bare string would be echoed as the cell's output.)
print(y_train)

[0 1 0 0 1 1 0 1]


In [34]:
# The 'purchased' labels of the 2 held-out rows in x_test.
print(y_test)

[0 1]


## Feature Scaling

In [None]:
"""
We do Feature Scaling after splitting the dataset, so that the scaling parameters are computed from the training set only.
Feature scaling means that we rescale all of the features in our dataset so that they end up on a comparable scale.
-> We do this to prevent one feature from dominating the ML training

Feature scaling is computed per feature from statistics of that feature (for example its mean and standard deviation).
Techniques:
-> Standardisation
We subtract the mean of the feature from each value and then divide by the standard deviation of the feature:
x_stand = (x - mean(x)) / std(x)
-> Normalisation
We subtract the minimum of the feature from each value and then divide by the difference between the maximum and the minimum of the feature:
x_norm = (x - min(x)) / (max(x) - min(x))
"""

"\nWe do Feature Scaling after splitting the dataset.\nFeature scaling means that we will be 'forcing' all of our values in our dataset to be in the same range [normalizing]\n-> We do this to prevent one feature to dominate the ML training\n\nFeature scaling in reality obtain the standard deviation to actually do the scaling.\n"

In [None]:
from sklearn.preprocessing import StandardScaler

# StandardScaler rescales each feature to a mean of 0 and a standard deviation of 1.
standardScaler = StandardScaler()

# Only columns 3 onwards (age and salary) are scaled; columns 0-2 hold the
# one-hot country indicators and are left untouched.
#
# fit:       compute the mean and standard deviation of each feature.
# transform: apply those values to scale the data.
# The fit must use the training data only: the training set defines the
# statistical parameters, which are what is "learned" during scaling.
x_train[:, 3:] = standardScaler.fit_transform(x_train[:, 3:])

# Reuse the parameters computed from the training data to scale the test set.
# Computing new parameters from the test data would be data leakage
# (information from the test set influencing the training process), leading to
# overly optimistic and incorrect evaluations.
x_test[:, 3:] = standardScaler.transform(x_test[:, 3:])

# [!] Always use fit_transform() on the training data.
# [!] Always use transform() on validation/test/new data.
# (These notes are comments rather than bare strings so that the cell does not
# echo a stray string as its output.)

In [37]:
# After scaling: the last two columns (age, salary) are standardised, the one-hot country columns are unchanged.
print(x_train)

[[0.0 0.0 1.0 -0.19159184384578545 -1.0781259408412425]
 [0.0 1.0 0.0 -0.014117293757057777 -0.07013167641635372]
 [1.0 0.0 0.0 0.566708506533324 0.633562432710455]
 [0.0 0.0 1.0 -0.30453019390224867 -0.30786617274297867]
 [0.0 0.0 1.0 -1.9018011447007988 -1.420463615551582]
 [1.0 0.0 0.0 1.1475343068237058 1.232653363453549]
 [0.0 1.0 0.0 1.4379472069688968 1.5749910381638885]
 [1.0 0.0 0.0 -0.7401495441200351 -0.5646194287757332]]


In [38]:
# Test rows scaled with the parameters fitted on the training set; the one-hot country columns are unchanged.
print(x_test)

[[0.0 1.0 0.0 -1.4661817944830124 -0.9069571034860727]
 [1.0 0.0 0.0 -0.44973664397484414 0.2056403393225306]]
