# Data Preprocessing

## Importing the libraries

In [2]:
import pandas as pd
import numpy as np
# `plt` is the conventional alias for the pyplot plotting interface. Importing the
# top-level `matplotlib` package under that name would leave plt.plot / plt.subplots
# undefined, so import the pyplot submodule explicitly.
import matplotlib.pyplot as plt

## Importing the dataset

In [3]:
# Load the raw dataset from "Data.csv" (path is relative to the notebook's working directory).
data_set = pd.read_csv("Data.csv")
# Bare last expression so the notebook renders the loaded frame.
data_set

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [4]:
# Feature matrix: every column except the last one, converted to a NumPy array.
x = data_set.iloc[:, :-1].to_numpy()
x

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [5]:
# Target vector: the last column only, converted to a NumPy array.
y = data_set.iloc[:, -1].to_numpy()
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

## Taking care of missing data (using pandas)

In [6]:
# True if at least one cell anywhere in the frame is missing.
data_set.isna().to_numpy().any()

True

In [7]:
# Number of missing cells in each column.
data_set.isna().sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

In [8]:
# Summarize column dtypes and non-null counts to see which columns have missing values.
data_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    10 non-null     object 
 1   Age        9 non-null      float64
 2   Salary     9 non-null      float64
 3   Purchased  10 non-null     object 
dtypes: float64(2), object(2)
memory usage: 448.0+ bytes


In [9]:
# Mean of the Age column (missing entries are skipped by pandas' default skipna behaviour).
data_set.loc[:, "Age"].mean()

38.77777777777778

In [10]:
# Preview of the frame with each missing numeric value replaced by its column mean.
# numeric_only=True restricts the mean to the numeric columns: the frame also holds
# object columns (Country, Purchased), and on pandas >= 2.0 a bare DataFrame.mean()
# raises a TypeError for them, whereas older versions silently dropped them.
# The result is only displayed, not assigned, so data_set itself still contains the NaNs.
data_set.fillna(data_set.mean(numeric_only=True))

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.777778,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.777778,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


## Taking care of missing data (using sklearn)

In [11]:
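# Note: this remark describes the one-hot encoding step performed later in the notebook, not imputation.
# One-hot encoding converts categorical variables into numerical variables
# (each categorical column is split into multiple binary columns, forming vectors).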

In [12]:
from sklearn.impute import SimpleImputer
# Imputer that replaces NaN entries with the mean of their column.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

In [13]:
# Columns 1 and 2 of x hold Age and Salary; learn their means and fill the gaps in one call.
x[:, 1:3] = imputer.fit_transform(x[:, 1:3])

In [14]:
# Show the imputed Age and Salary columns; the former NaN entries now hold the column means.
x[:, 1:3]

array([[44.0, 72000.0],
       [27.0, 48000.0],
       [30.0, 54000.0],
       [38.0, 61000.0],
       [40.0, 63777.77777777778],
       [35.0, 58000.0],
       [38.77777777777778, 52000.0],
       [48.0, 79000.0],
       [50.0, 83000.0],
       [37.0, 67000.0]], dtype=object)

## Encoding Categorical Data

In [15]:
# Encoding the Independent Variable

In [16]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# One-hot encode column 0 of x (Country) and pass the remaining columns through unchanged.
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
# Preview only: the result is displayed but not stored, so x is unchanged by this cell.
ct.fit_transform(x)

array([[1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 35.0, 58000.0],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

In [17]:
# Replace x with its encoded form: three dummy columns followed by Age and Salary.
# NOTE(review): not idempotent - x is overwritten, so re-running this cell would
# one-hot encode column 0 of the already-encoded matrix. Re-run the notebook from the top instead.
x = np.array(ct.fit_transform(x))
x

array([[1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 35.0, 58000.0],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

In [18]:
# Encoding the Dependent Variable

In [19]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# Replace the 'No'/'Yes' labels in y with integer codes (the displayed result maps 'No' to 0 and 'Yes' to 1).
y = le.fit_transform(y)

In [20]:
# Show the integer-encoded target vector.
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

## Splitting data into training set and testing set.

In [21]:
# Do we have to apply feature scaling before or after splitting?
# Answer - After splitting the data into the training and test sets.
# Reason - scaling before the split would compute the mean and standard deviation from all rows, leaking information from the test set, which you are not supposed to have access to until training is done.

# training set - where you're going to train your machine learning model on existing observations.
# test set - where you're going to evaluate the performance of your model on new observations.

# feature scaling - simply consists of scaling all your variables/features so that they all take values on the same scale.
# We do this to prevent one feature from dominating the others, which would otherwise be neglected by the machine learning model.

In [22]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows (2 of the 10) as the test set; random_state=1 fixes the shuffle so the split is reproducible.
x_train, x_test, y_train, y_test= train_test_split(x, y, test_size=0.2, random_state=1)

In [23]:
# Training features (8 rows): three dummy columns followed by Age and Salary.
print(x_train)

[[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]


In [24]:
# Test features (2 rows), same column layout as x_train.
print(x_test)

[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [25]:
# Encoded targets for the training rows.
print(y_train)

[0 1 0 0 1 1 0 1]


In [26]:
# Encoded targets for the test rows.
print(y_test)

[0 1]


## Feature Scaling

In [27]:
# Two main techniques for feature scaling - 
# Standardisation - (x-mean)/SD
# Normalisation - (x-min(x))/(max(x)-min(x))

# SD (standard deviation) = sqrt(V), where V is the variance of the feature

# Normalization is recommended when most of your features follow a normal distribution; in that case it is a great feature scaling technique.
# Standardization works well all the time.

# Since standardization works in all cases, whereas normalization is recommended only for specific situations where most of your features follow a normal distribution, my ultimate recommendation is standardization.
# Standardization will always do some relevant feature scaling, and this will always improve the training process.

In [28]:
from sklearn.preprocessing import StandardScaler
# Scaler that standardizes each feature as (x - mean) / SD; it is fitted later on the training data.
sc = StandardScaler()

In [29]:
# Do we have to apply feature scaling/standardization to the dummy variables in the matrix of features?
# Answer - No.

# The goal of standardization/feature scaling in general is to have all the values of the features in the same range.
# Standardization transforms our features so that they take values roughly between minus three and plus three.
# The dummy variables already take values between minus three and plus three (because they are equal to either 0 or 1).
# So there is nothing extra to be done here with standardization; it would only make things worse, because it would still transform these values to lie between minus three and plus three.

# If we apply feature scaling to the dummy variables, we will get meaningless numerical values and will be unable to tell which triple of values corresponds to which country.
# So we will totally lose the interpretation.
# Besides, this won't improve training performance.
# Our dummy variables are already in the same scale range as the other variables.

# So don't apply feature scaling to dummy variables. There is no considerable difference that would justify this.

In [32]:
# fit - computes the mean and SD of each of the features.
# transform - applies the formula to transform these values so that they are all on the same scale.

# The machine learning model will be trained on data scaled with a particular scaler.
# The scaler fitted on the training set must also be used when making predictions, so that it is congruent with the way the model was trained.
# We need to apply the same scale that was used on the training set to the test set, so that we get the same transformation and therefore relevant predictions for x_test.
# So here only the transform method (not fit_transform) must be applied to the test set.

# Columns 3 onward are Age and Salary; the dummy columns 0-2 are deliberately left unscaled.
# Fit the scaler on the training rows only, then reuse those training statistics for the test rows.
# NOTE(review): both arrays are overwritten in place, so re-running this cell scales already-scaled data.
x_train[:, 3:] = sc.fit_transform(x_train[:, 3:])
x_test[:, 3:] = sc.transform(x_test[:, 3:])

In [31]:
# Show both feature matrices after scaling: the dummy columns are untouched, Age and Salary are standardized.
print(x_train)
print(x_test)

[[0.0 0.0 1.0 -0.19159184384578545 -1.0781259408412425]
 [0.0 1.0 0.0 -0.014117293757057777 -0.07013167641635372]
 [1.0 0.0 0.0 0.566708506533324 0.633562432710455]
 [0.0 0.0 1.0 -0.30453019390224867 -0.30786617274297867]
 [0.0 0.0 1.0 -1.9018011447007988 -1.420463615551582]
 [1.0 0.0 0.0 1.1475343068237058 1.232653363453549]
 [0.0 1.0 0.0 1.4379472069688968 1.5749910381638885]
 [1.0 0.0 0.0 -0.7401495441200351 -0.5646194287757332]]
[[0.0 1.0 0.0 -1.4661817944830124 -0.9069571034860727]
 [1.0 0.0 0.0 -0.44973664397484414 0.2056403393225306]]
