# <span style="color:orange">Data Preprocessing Tools</span>

### <span style="color:orange">Importing the Libraries</span>

In [1]:
import numpy as np
import pandas as pd

### <span style="color:orange">Importing the dataset</span>

In [2]:
# Read the raw CSV, then separate predictors from the target by column position.
dataset = pd.read_csv('datasets/Data.csv')
# X: every row, every column except the last one.
matrix_of_features = dataset.iloc[:, :-1].to_numpy()
# y: every row, last column only.
dependent_variable = dataset.iloc[:, -1].to_numpy()

In [3]:
# Rich display of the loaded DataFrame so the raw values (including the missing ones) can be inspected.
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [4]:
# Show the repr of the feature array (X); the repr also reports its dtype.
matrix_of_features

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [5]:
# Same array as above, printed with str() formatting instead of the repr.
print(matrix_of_features)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [6]:
# Show the target array (y) before any encoding is applied.
dependent_variable

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

### <span style="color:orange">Taking care of missing data</span>

In [7]:
# Count the missing entries in each column of the raw DataFrame.
print(dataset.isna().sum())

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64


In [8]:
from sklearn.impute import SimpleImputer

# Replace every NaN with the mean of its own column.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# Only columns 1 and 2 are numeric (column 0 holds the country strings), so
# the imputer is fitted and applied on that slice alone. fit_transform learns
# the column means and fills the gaps in a single call; the result is written
# back into matrix_of_features in place.
matrix_of_features[:, 1:3] = imputer.fit_transform(matrix_of_features[:, 1:3])

In [9]:
# Print the feature matrix after imputation to confirm the NaN entries were replaced.
print("the new matrix of features: \n", matrix_of_features)

the new matrix of features: 
 [['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


### <span style="color:orange">Encoding Categorical Data</span>

#### <span style="color:orange">Encoding the Independent variables</span>

In [10]:
# Number of distinct values per column; used to see how many categories the text columns hold.
print(dataset.nunique() )

Country      3
Age          9
Salary       9
Purchased    2
dtype: int64


In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder


# One-hot encode column 0 (the country strings) and pass every other column
# through unchanged. The encoded columns come first in the result, followed by
# the passthrough columns.
#
# sparse_threshold=0 forces a dense ndarray. OneHotEncoder produces a sparse
# matrix, and with the default threshold (0.3) ColumnTransformer may return a
# SciPy sparse matrix when the stacked output is sparse enough (e.g. a column
# with many categories). np.array() on a sparse matrix yields a 0-d object
# array rather than a 2-D matrix, which would break the slicing done later.
#
# NOTE: this cell reassigns matrix_of_features from itself, so it is meant to
# be run once per kernel session; re-running it would encode the already
# encoded first column.
column_transformer = ColumnTransformer(
    transformers=[("encoding", OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
matrix_of_features = np.array(column_transformer.fit_transform(matrix_of_features))

In [12]:
# Print the feature matrix after one-hot encoding; the dummy columns appear first.
print("After O_H_Encoding the col.: \n", matrix_of_features)

After O_H_Encoding the col.: 
 [[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


#### <span style="color:orange">Encoding the Dependent variable</span>

In [13]:
from sklearn.preprocessing import LabelEncoder

# Learn the set of class labels first, then map each label to its integer code.
# The fitted encoder is kept in label_encoder so the mapping can be inspected or
# reversed later.
label_encoder = LabelEncoder().fit(dependent_variable)
dependent_variable = label_encoder.transform(dependent_variable)

In [14]:
# Print the target after label encoding (integer codes instead of strings).
print(dependent_variable)

[0 1 0 0 1 1 0 1 0 1]


### <span style="color:orange">Splitting the data</span>

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set. random_state is fixed so the same
# rows land in each subset on every run.
# train_test_split returns, in order: X_train, X_test, y_train, y_test.
(
    matrix_of_features_train,
    matrix_of_features_test,
    dependent_variable_train,
    dependent_variable_test,
) = train_test_split(
    matrix_of_features,
    dependent_variable,
    test_size=0.2,
    random_state=1,
)

In [16]:
# Print each of the four split arrays under its own heading, in the same order
# they were unpacked.
split_report = (
    ("matrix_of_features_train = \n", matrix_of_features_train),
    ("matrix_of_features_test = \n", matrix_of_features_test),
    ("dependent_variable_train = \n", dependent_variable_train),
    ("dependent_variable_test = \n", dependent_variable_test),
)
for heading, values in split_report:
    print(heading, values)

matrix_of_features_train = 
 [[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]
matrix_of_features_test = 
 [[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]
dependent_variable_train = 
 [0 1 0 0 1 1 0 1]
dependent_variable_test = 
 [0 1]


### <span style="color:orange">Feature Scaling</span>

#### <span style="color:orange">Standardization</span>

In [17]:
from sklearn.preprocessing import StandardScaler

# Learn the per-column mean and standard deviation from the TRAINING rows only.
# Only columns 3 onward are scaled; columns 0-2 are the one-hot dummy columns
# and are left as they are.
standard_scaler = StandardScaler().fit(matrix_of_features_train[:, 3:])

# X_train: apply the learned statistics. The slice is overwritten in place, so
# the unscaled training values are no longer available after this line.
matrix_of_features_train[:, 3:] = standard_scaler.transform(matrix_of_features_train[:, 3:])
print("Scaled matrix of features = \n", matrix_of_features_train)

# X_test: transform only, reusing the training statistics (never re-fit on
# the test set). Also overwritten in place.
matrix_of_features_test[:, 3:] = standard_scaler.transform(matrix_of_features_test[:, 3:])
print("Scaled matrix of features test = \n", matrix_of_features_test)

Scaled matrix of features = 
 [[0.0 0.0 1.0 -0.19159184384578545 -1.0781259408412425]
 [0.0 1.0 0.0 -0.014117293757057777 -0.07013167641635372]
 [1.0 0.0 0.0 0.566708506533324 0.633562432710455]
 [0.0 0.0 1.0 -0.30453019390224867 -0.30786617274297867]
 [0.0 0.0 1.0 -1.9018011447007988 -1.420463615551582]
 [1.0 0.0 0.0 1.1475343068237058 1.232653363453549]
 [0.0 1.0 0.0 1.4379472069688968 1.5749910381638885]
 [1.0 0.0 0.0 -0.7401495441200351 -0.5646194287757332]]
Scaled matrix of features test = 
 [[0.0 1.0 0.0 -1.4661817944830124 -0.9069571034860727]
 [1.0 0.0 0.0 -0.44973664397484414 0.2056403393225306]]


#### <span style="color:orange">Normalization</span>

In [18]:
from sklearn.preprocessing import MinMaxScaler

# Learn the per-column minimum and maximum from the TRAINING rows only
# (columns 3 onward; the one-hot dummy columns 0-2 are not rescaled).
# NOTE: the standardization cell above already overwrote these columns in
# place, so this scaler is fitted on the standardized values, not the raw ones.
normalization_scaler = MinMaxScaler().fit(matrix_of_features_train[:, 3:])

# Rescale the training columns, then the test columns with the same training
# min/max. Both slices are overwritten in place.
matrix_of_features_train[:, 3:] = normalization_scaler.transform(matrix_of_features_train[:, 3:])
matrix_of_features_test[:, 3:] = normalization_scaler.transform(matrix_of_features_test[:, 3:])

print("Scaled matrix of features = \n", matrix_of_features_train)
print("Scaled matrix of features test = \n", matrix_of_features_test)

Scaled matrix of features = 
 [[0.0 0.0 1.0 0.5120772946859904 0.11428571428571427]
 [0.0 1.0 0.0 0.5652173913043478 0.4507936507936508]
 [1.0 0.0 0.0 0.7391304347826086 0.6857142857142856]
 [0.0 0.0 1.0 0.4782608695652174 0.37142857142857133]
 [0.0 0.0 1.0 0.0 0.0]
 [1.0 0.0 0.0 0.9130434782608696 0.8857142857142856]
 [0.0 1.0 0.0 1.0 0.9999999999999998]
 [1.0 0.0 0.0 0.34782608695652173 0.28571428571428564]]
Scaled matrix of features test = 
 [[0.0 1.0 0.0 0.13043478260869562 0.17142857142857137]
 [1.0 0.0 0.0 0.4347826086956522 0.5428571428571427]]


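**In practice, use only one scaling method (standardization or normalization). Both are applied here for illustration only; note that the normalization cell runs on the columns that the standardization cell has already rescaled in place.**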

___