# SimpleImputer

In [5]:
from sklearn.impute import SimpleImputer
import numpy as np

In [6]:
X = np.array([[10, 3],
            [0, 4],
            [5, 3],
            [np.nan, 3]])

Remplacement suivant la strategy(mean, median, most_frequent, constant)

In [7]:
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit_transform(X)

array([[10.,  3.],
       [ 0.,  4.],
       [ 5.,  3.],
       [ 5.,  3.]])

In [8]:
X_test = np.array([[12, 5],
                    [40, 2],
                     [5, 5],
                    [np.nan, np.nan]])

In [9]:
imputer.transform(X_test) #transform(X_test) :remplace les valeurs manquantes par les moyennes calculées sur X_train

array([[12.  ,  5.  ],
       [40.  ,  2.  ],
       [ 5.  ,  5.  ],
       [ 5.  ,  3.25]])

==>le calcul du moyenne de X_train & X_test ensemble provoque une fuite d'information du test_set vers le train_set

==> il faut les séparer

# KNNImputer

Remplace toutes les valeurs manquantes par des valeurs des plus proches voisins

In [10]:
from sklearn.impute import KNNImputer

In [11]:
X = np.array([[10, 3],
            [0, 4],
            [5, 3],
            [np.nan, 3]])

In [12]:
imputer = KNNImputer(n_neighbors=1)
imputer.fit_transform(X)

array([[10.,  3.],
       [ 0.,  4.],
       [ 5.,  3.],
       [10.,  3.]])

# MissingIndicator

variable boléenne qui indique l'absence de valeurs dans le dataset

In [13]:
from sklearn.impute import MissingIndicator

In [14]:
# let's take X as Titanic dataset
X = np.array([[1, 100],
            [2, 30],
            [3, 15],
            [np.nan, np.nan]])

In [15]:
imputer = MissingIndicator()
imputer.fit_transform(X) # indique l'absence de valeurs dans le dataset ou non

array([[False, False],
       [False, False],
       [False, False],
       [ True,  True]])

make_union: créer tel colonne + traiter plusieurs données parallélement puis concaténer dans un seul tableau

In [16]:
from sklearn.pipeline import make_union  

pipeline = make_union(SimpleImputer(strategy='constant', fill_value=-99),
                      MissingIndicator())
#SimpleImputer: nous donne la classe d'un passager et prix de son ticket
#MissingIndicator: nous donne soit un passager soit un membre d'équipage
pipeline.fit_transform(X) 

array([[  1., 100.,   0.,   0.],
       [  2.,  30.,   0.,   0.],
       [  3.,  15.,   0.,   0.],
       [-99., -99.,   1.,   1.]])

# APPLICATION : 
optimiser le nbr de paramétres KNNImputer grace à GridSearchCV

In [17]:
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV

In [18]:
titanic = pd.read_csv('titanic-passengers.csv', sep=';')


In [19]:
X=titanic[['Pclass', 'Age']]
y=titanic['Survived']

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.30, random_state=42)

In [22]:
model = make_pipeline(KNNImputer(), SGDClassifier())

In [23]:
params = {
    'knnimputer__n_neighbors': [1,2,3,4]
}

In [27]:
grid = GridSearchCV(model, params, cv=5)

In [28]:
grid.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('knnimputer', KNNImputer()),
                                       ('sgdclassifier', SGDClassifier())]),
             param_grid={'knnimputer__n_neighbors': [1, 2, 3, 4]})

In [29]:
grid.best_params_

{'knnimputer__n_neighbors': 2}