# SKLEARN IMPUTER

Remplace les valeurs manquantes (désignées par le paramètre `missing_values`) par une valeur de remplacement choisie via le paramètre `strategy` :

- Strategy:
    - mean
    - median
    - most_frequent
    - constant

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import SGDClassifier

## SimpleImputer

In [4]:
from sklearn.impute import SimpleImputer

In [5]:
# Toy dataset: 4 samples x 2 features; only the first feature of the
# last sample is missing.
X = np.array(
    [
        [10, 3],
        [0, 4],
        [5, 3],
        [np.nan, 3],
    ]
)

In [8]:
# Check the dimensions of the toy array before imputing.
X.shape

(4, 2)

In [10]:
# Mean strategy: every NaN is replaced by the mean of the observed
# values found in the same column.
imputer = SimpleImputer(strategy="mean", missing_values=np.nan)

imputer.fit_transform(X)

array([[10.,  3.],
       [ 0.,  4.],
       [ 5.,  3.],
       [ 5.,  3.]])

In [13]:
# Same toy dataset, but the last sample is now missing both features.
X = np.array(
    [
        [10, 3],
        [0, 4],
        [5, 3],
        [np.nan, np.nan],
    ]
)

In [14]:
# Mean strategy again: the means are computed column by column, so a row
# that is missing everywhere receives a different fill value in each column.
imputer = SimpleImputer(strategy="mean", missing_values=np.nan)

imputer.fit_transform(X)

array([[10.        ,  3.        ],
       [ 0.        ,  4.        ],
       [ 5.        ,  3.        ],
       [ 5.        ,  3.33333333]])

## KNNImputer

In [15]:
from sklearn.impute import KNNImputer

In [16]:
# Toy dataset for KNN imputation: the last sample lacks its first feature,
# so its neighbours can only be found through the second feature.
X = np.array(
    [
        [10, 100],
        [18, 4],
        [5, 3],
        [np.nan, 1],
    ]
)

In [18]:
# Display the array to see where the NaN sits before imputation.
X

array([[ 10., 100.],
       [ 18.,   4.],
       [  5.,   3.],
       [ nan,   1.]])

In [17]:
# With a single neighbour, the missing entry is copied from the closest
# sample (closeness being measured on the features that are present).
imputer = KNNImputer(n_neighbors=1)
imputer.fit_transform(X)

array([[ 10., 100.],
       [ 18.,   4.],
       [  5.,   3.],
       [  5.,   1.]])

## MissingIndicator

In [29]:
from sklearn.impute import MissingIndicator
from sklearn.pipeline import make_union

In [26]:
# Toy dataset whose last sample is entirely missing.
X = np.array(
    [
        [10, 100],
        [18, 4],
        [5, 3],
        [np.nan, np.nan],
    ]
)

In [27]:
# Display the array to see which entries are NaN.
X

array([[ 10., 100.],
       [ 18.,   4.],
       [  5.,   3.],
       [ nan,  nan]])

In [28]:
# Build a boolean mask of the data: True where a value is missing, False elsewhere.
MissingIndicator().fit_transform(X)

array([[False, False],
       [False, False],
       [False, False],
       [ True,  True]])

In [31]:
# Place side by side the imputed features (NaN replaced by the sentinel -99)
# and the missing-value mask, so the "was missing" information is kept
# as extra columns next to the filled-in data.
pipeline = make_union(
    SimpleImputer(strategy="constant", fill_value=-99),
    MissingIndicator(),
)
pipeline.fit_transform(X)

array([[ 10., 100.,   0.,   0.],
       [ 18.,   4.,   0.,   0.],
       [  5.,   3.,   0.,   0.],
       [-99., -99.,   1.,   1.]])

## APPLICATION

In [36]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.linear_model import SGDClassifier
import seaborn as sns

In [34]:
# Load seaborn's Titanic dataset and keep two predictors and the target.
# NOTE(review): "age" is presumably the column containing missing values,
# which is why an imputer is needed - confirm with titanic.isna().sum().
titanic = sns.load_dataset("titanic")
y = titanic["survived"]
X = titanic[["pclass", "age"]]

In [35]:
# Hold out 20% of the samples for testing; the fixed seed makes the split
# identical from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [37]:
# Two-step pipeline: fill the missing values with KNN imputation, then fit a
# linear classifier trained by stochastic gradient descent.
# SGDClassifier shuffles the training data with its own random generator;
# seeding it (like the train/test split) makes the cross-validation scores,
# and therefore the selected n_neighbors, reproducible between runs.
model = make_pipeline(KNNImputer(), SGDClassifier(random_state=42))

In [38]:
# Hyperparameter grid explored by the grid search. The key follows the
# "<step name>__<parameter>" convention of scikit-learn pipelines.
# Each candidate appears once: a duplicated value would only repeat the
# same cross-validated fits without adding information.
params = {
    "knnimputer__n_neighbors": [1, 2, 3],
}

In [39]:
# Exhaustive search over `params`, each candidate being scored with
# 5-fold cross-validation.
grid = GridSearchCV(
    estimator=model,
    param_grid=params,
    cv=5,
)

In [41]:
# Run the cross-validated search on the training split only; the test split is not used here.
grid.fit(X_train,y_train)

In [42]:
# Parameter combination that obtained the best mean cross-validation score.
grid.best_params_

{'knnimputer__n_neighbors': 3}