# Data Preprocessing

In [1]:
# Importer les librairies
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## importer et vérifier les données

In [2]:
dataset = pd.read_csv('../data/Data_Preprocessing.csv')
dataset.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [3]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
Country      10 non-null object
Age          9 non-null float64
Salary       9 non-null float64
Purchased    10 non-null object
dtypes: float64(2), object(2)
memory usage: 400.0+ bytes


## Préparer les données

### séparer les features de la target

In [4]:
# prendre toutes les lignes de toutes les colonnes comme features
X = dataset.iloc[:, :-1].values

# prendre toutes les lignes de la dernière colonne comme target
y = dataset.iloc[:, -1].values

In [5]:
cols = ['Pays','Age','Salary']
pd.DataFrame(X,columns=cols)

Unnamed: 0,Pays,Age,Salary
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,
5,France,35.0,58000.0
6,Spain,,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


In [6]:
cols = ['Purchased']
pd.DataFrame(y,columns=cols)

Unnamed: 0,Purchased
0,No
1,Yes
2,No
3,No
4,Yes
5,Yes
6,No
7,Yes
8,No
9,Yes


### les données manquantes

In [7]:
# Gérer les données manquantes
from sklearn.preprocessing import Imputer

# creation de l'objet
imputer = Imputer(missing_values = 'NaN', strategy = 'mean', axis = 0)
# transfert de la connaissance
imputer.fit(X[:, 1:3])
# récuprération des données transformées
X[:, 1:3] = imputer.transform(X[:, 1:3])

In [8]:
cols = ['Pays','Age','Salary']
pd.DataFrame(X,columns=cols)

Unnamed: 0,Pays,Age,Salary
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,63777.8
5,France,35.0,58000.0
6,Spain,38.7778,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


### les données catégorielles

In [9]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# transformer le pays en donnée numérique
labelencoder_X = LabelEncoder()
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])

# one hot encoding :D du pays
onehotencoder = OneHotEncoder(categorical_features = [0])
X = onehotencoder.fit_transform(X).toarray()

# transformer la target (achat oui/non) en donnée numérique
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)

In [10]:
cols = ['France','Spain','Germany','Age','Salary']
pd.DataFrame(X,columns=cols)

Unnamed: 0,France,Spain,Germany,Age,Salary
0,1.0,0.0,0.0,44.0,72000.0
1,0.0,0.0,1.0,27.0,48000.0
2,0.0,1.0,0.0,30.0,54000.0
3,0.0,0.0,1.0,38.0,61000.0
4,0.0,1.0,0.0,40.0,63777.777778
5,1.0,0.0,0.0,35.0,58000.0
6,0.0,0.0,1.0,38.777778,52000.0
7,1.0,0.0,0.0,48.0,79000.0
8,0.0,1.0,0.0,50.0,83000.0
9,1.0,0.0,0.0,37.0,67000.0


In [11]:
cols = ['Purchased']
pd.DataFrame(y,columns=cols)

Unnamed: 0,Purchased
0,0
1,1
2,0
3,0
4,1
5,1
6,0
7,1
8,0
9,1


### split training et test set

In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [13]:
print(f'X_train : {X_train.shape} - y_train : {y_train.shape} - X_test : {X_test.shape}  - y_test : {y_test.shape}')

X_train : (8, 5) - y_train : (8,) - X_test : (2, 5)  - y_test : (2,)


### standardiser et normaliser : le feature scaling

In [14]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [15]:
cols = ['France','Spain','Germany','Age','Salary']
pd.DataFrame(X_train,columns=cols)

Unnamed: 0,France,Spain,Germany,Age,Salary
0,-1.0,2.645751,-0.774597,0.263068,0.123815
1,1.0,-0.377964,-0.774597,-0.253501,0.461756
2,-1.0,-0.377964,1.290994,-1.975398,-1.530933
3,-1.0,-0.377964,1.290994,0.052614,-1.11142
4,1.0,-0.377964,-0.774597,1.640585,1.720297
5,-1.0,-0.377964,1.290994,-0.081312,-0.167514
6,1.0,-0.377964,-0.774597,0.951826,0.986148
7,1.0,-0.377964,-0.774597,-0.597881,-0.482149
