# Data Preprocessing

In [9]:
# Import NumPy and pandas

import numpy as np
import pandas as pd

In [10]:
# Load the dataset from the CSV file and preview its first five rows

df = pd.read_csv(filepath_or_buffer = 'data/data_preprocessing.csv')
df.head(n = 5)

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [11]:
# Build the feature matrix 'X': every column except the last one, as a NumPy array
feature_df = df.iloc[:, :-1]
X = feature_df.values
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [12]:
# Extract the output (target) vector 'y': the last column of the DataFrame,
# selected by position and kept as a pandas Series
y = df.iloc[:, df.shape[1] - 1]
y

0     No
1    Yes
2     No
3     No
4    Yes
5    Yes
6     No
7    Yes
8     No
9    Yes
Name: Purchased, dtype: object

In [13]:
# Instantiate an imputer to fill in the missing data: every NaN in the numeric
# columns is replaced by the mean of its column

# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and removed
# in 0.22; `sklearn.impute.SimpleImputer` is its replacement. It always works
# column-wise, so the old `axis = 0` argument no longer exists.
from sklearn.impute import SimpleImputer

nan_columns = [1,2]

imputer = SimpleImputer(
    missing_values = np.nan,
    strategy = 'mean')

# 'X' has dtype=object (it also holds the country strings), so the numeric slice
# is cast to float explicitly before the imputer is fitted and applied
X[:, nan_columns] = imputer.fit_transform(X[:, nan_columns].astype(float))
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [14]:
# To handle categorical data, the first step is a LabelEncoder:
# it turns the text values into integer codes

from sklearn.preprocessing import LabelEncoder
np.set_printoptions(suppress = True, precision = 2)

categorical_column = 0

label_encoder = LabelEncoder()
X[:, categorical_column] = label_encoder.fit_transform(X[:, categorical_column])
X

array([[0, 44.0, 72000.0],
       [2, 27.0, 48000.0],
       [1, 30.0, 54000.0],
       [2, 38.0, 61000.0],
       [1, 40.0, 63777.77777777778],
       [0, 35.0, 58000.0],
       [2, 38.77777777777778, 52000.0],
       [0, 48.0, 79000.0],
       [1, 50.0, 83000.0],
       [0, 37.0, 67000.0]], dtype=object)

In [15]:
# Now use OneHotEncoder to turn the integer codes into dummy variables

# `OneHotEncoder(categorical_features = ...)` was deprecated in scikit-learn 0.20
# and removed in 0.22; selecting which columns to encode is now the job of
# ColumnTransformer.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

one_hot = ColumnTransformer(
    transformers = [('categorical', OneHotEncoder(categories = 'auto'), [categorical_column])],
    remainder = 'passthrough',  # keep the other columns, appended after the dummy columns
    sparse_threshold = 0)       # always return a dense array instead of a sparse matrix

# 'X' is an object array on input, so the stacked result is cast to float
# to obtain a plain numeric feature matrix
X = np.asarray(one_hot.fit_transform(X), dtype = float)
X

array([[    1.  ,     0.  ,     0.  ,    44.  , 72000.  ],
       [    0.  ,     0.  ,     1.  ,    27.  , 48000.  ],
       [    0.  ,     1.  ,     0.  ,    30.  , 54000.  ],
       [    0.  ,     0.  ,     1.  ,    38.  , 61000.  ],
       [    0.  ,     1.  ,     0.  ,    40.  , 63777.78],
       [    1.  ,     0.  ,     0.  ,    35.  , 58000.  ],
       [    0.  ,     0.  ,     1.  ,    38.78, 52000.  ],
       [    1.  ,     0.  ,     0.  ,    48.  , 79000.  ],
       [    0.  ,     1.  ,     0.  ,    50.  , 83000.  ],
       [    1.  ,     0.  ,     0.  ,    37.  , 67000.  ]])

In [16]:
# When the output vector 'y' is binary, e.g. (Yes, No), a LabelEncoder alone is enough

target_encoder = LabelEncoder()
y = target_encoder.fit_transform(y)
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1], dtype=int64)

In [17]:
# Split X and y into a training set and a test set

from sklearn.model_selection import train_test_split

# The split is random: fixing the seed makes it (and every result derived
# from it) identical on every run of the notebook
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 0)

In [19]:
# Feature Scaling: the scaler is fitted on the training set only, and that same
# fitted transformation is then applied to both the training set and the test set

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

X_train

array([[-1.  ,  1.29, -0.38, -1.65, -1.14],
       [-1.  , -0.77,  2.65, -0.25, -1.33],
       [ 1.  , -0.77, -0.38,  0.58,  0.56],
       [ 1.  , -0.77, -0.38, -0.85, -0.76],
       [ 1.  , -0.77, -0.38, -0.53,  0.08],
       [-1.  ,  1.29, -0.38,  1.54,  1.59],
       [ 1.  , -0.77, -0.38,  1.22,  1.21],
       [-1.  ,  1.29, -0.38, -0.06, -0.22]])

In [None]:
# If the output vector 'y' is binary, there is no need to apply Feature Scaling to it
# Scaling 'y' is worthwhile when it can take very large values (e.g. a continuous target)