<a href="https://colab.research.google.com/github/MachineLearnia/Python-Machine-Learning/blob/master/Bonus%20%3A%20Sklearn%20Imputers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imputation avec Sklearn

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.linear_model import SGDClassifier

  import pandas.util.testing as tm


## 1. SimpleImputer

In [0]:
from sklearn.impute import SimpleImputer

In [3]:
# Toy training matrix: the last row is missing its first feature.
X = np.array([
    [10, 3],
    [0, 4],
    [5, 3],
    [np.nan, 3],
])

# Fill each NaN with a per-column statistic (here the mean) learned during fit.
imputer = SimpleImputer(
    strategy='mean',
    missing_values=np.nan,
)

# Learn the column means on X and return the completed matrix.
imputer.fit_transform(X)

array([[10.,  3.],
       [ 0.,  4.],
       [ 5.,  3.],
       [ 5.,  3.]])

In [4]:
# Unseen samples; the last row has both features missing.
X_test = np.array([
    [12, 5],
    [40, 2],
    [5, 5],
    [np.nan, np.nan],
])

# Only transform here: the fill values are the ones learned by the earlier
# fit, not statistics recomputed from X_test.
imputer.transform(X_test)

array([[12.  ,  5.  ],
       [40.  ,  2.  ],
       [ 5.  ,  5.  ],
       [ 5.  ,  3.25]])

## 2. KNNImputer

In [0]:
from sklearn.impute import KNNImputer

In [8]:
# Toy training matrix: the last row is missing its first feature.
X = np.array([
    [1, 100],
    [2, 30],
    [3, 15],
    [np.nan, 20],
])

# With a single neighbour, the missing entry is taken from the one closest
# sample rather than averaged over several.
imputer = KNNImputer(n_neighbors=1)

imputer.fit_transform(X)

array([[  1., 100.],
       [  2.,  30.],
       [  3.,  15.],
       [  3.,  20.]])

In [9]:
# A single query sample whose first feature is unknown.
X_test = np.array([
    [np.nan, 35],
])

# Impute using the already-fitted imputer; nothing is refit on X_test.
imputer.transform(X_test)

array([[ 2., 35.]])

## 3. MissingIndicator

In [0]:
from sklearn.impute import MissingIndicator
from sklearn.pipeline import make_union

In [11]:
# Toy matrix whose last row is entirely missing.
X = np.array([
    [1, 100],
    [2, 30],
    [3, 15],
    [np.nan, np.nan],
])

# Produce a boolean mask flagging the missing entries.
MissingIndicator().fit_transform(X)

array([[False, False],
       [False, False],
       [False, False],
       [ True,  True]])

In [12]:
# Run both transformers on the same input and concatenate their outputs
# column-wise: the constant-filled features followed by the missing-value mask.
pipeline = make_union(
    SimpleImputer(fill_value=-99, strategy='constant'),
    MissingIndicator(),
)

pipeline.fit_transform(X)

array([[  1., 100.,   0.,   0.],
       [  2.,  30.,   0.,   0.],
       [  3.,  15.,   0.,   0.],
       [-99., -99.,   1.,   1.]])

## 4. Application

In [0]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV, train_test_split

In [0]:
# Load seaborn's Titanic example dataset.
titanic = sns.load_dataset('titanic')

# Target and the two predictors used for this demo.
y = titanic['survived']
X = titanic[['pclass', 'age']]

# Hold out a test split; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
)

In [15]:
# Impute missing values first, then classify. SGDClassifier shuffles the
# training data, so its seed is pinned (matching the seed used for the
# train/test split) to make the search result reproducible across runs.
model = make_pipeline(KNNImputer(), SGDClassifier(random_state=0))

# make_pipeline names each step after its lowercased class name, hence the
# 'knnimputer__' prefix used to reach the imputer's hyper-parameter.
params = {'knnimputer__n_neighbors': [1, 2, 3, 4]}

# 5-fold cross-validated search over the number of neighbours.
grid = GridSearchCV(model, param_grid=params, cv=5)

grid.fit(X_train, y_train)
grid.best_params_

{'knnimputer__n_neighbors': 4}