# Imputers


                                            PREPROCESSING
                        
                        
                        
    
                       Imputation: Replace missing datas with statistical values.
    
                     
                                           sklearn.impute


 
 
- Replace all missing values with a statistical value.
- Mean
- median
- most_frequent
- constant

In [7]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd 
import seaborn as sns
from sklearn.linear_model import SGDClassifier

# SimpleImputer

### Replace NaN by the Mean.

In [26]:
from sklearn.impute import SimpleImputer

In [27]:
x_train=np.array([[10,3],[0,4],[5,3],[np.nan,3]])

In [28]:
imputer=SimpleImputer(missing_values=np.nan,strategy='mean')

In [29]:
imputer.fit_transform(x_train)

array([[10.,  3.],
       [ 0.,  4.],
       [ 5.,  3.],
       [ 5.,  3.]])

- For the test set we use the mean of the train set. 

In [30]:
X_test=np.array([[12,5],[40,2],[5,5],[np.nan,np.nan]])
imputer.transform(X_test)

array([[12.  ,  5.  ],
       [40.  ,  2.  ],
       [ 5.  ,  5.  ],
       [ 5.  ,  3.25]])

# KNNImputer
### Replace your missing values by closest ones
- Imputation for completing missing values using k-Nearest Neighbors.

- Each sample’s missing values are imputed using the mean value from n_neighbors nearest neighbors found in the training set. Two samples are close if the features that neither is missing are close.

In [31]:
from sklearn.impute import KNNImputer


x=np.array([[1,100],[2,30],[3,15],[np.nan,20]])


imputer=KNNImputer(n_neighbors=1)

imputer.fit_transform(x)

array([[  1., 100.],
       [  2.,  30.],
       [  3.,  15.],
       [  3.,  20.]])

# MissingIndicator

### Position of missing values by BOOLEAN.

In [32]:
from sklearn.impute import MissingIndicator

In [39]:
x=np.array([[1,100],[2,30],[3,15],[np.nan,np.nan]])
x

array([[  1., 100.],
       [  2.,  30.],
       [  3.,  15.],
       [ nan,  nan]])

In [35]:
MissingIndicator().fit_transform(x)

array([[False, False],
       [False, False],
       [False, False],
       [ True,  True]])

### Let's say that "NaN" are not unknown values but represent a certain kind of variables.
   #### NaN may represents  "super cars".



In [38]:
from sklearn.pipeline import make_union

pipeline = make_union(SimpleImputer(strategy='constant',fill_value=-99),
                     MissingIndicator())

pipeline.fit_transform(x)

array([[  1., 100.,   0.,   0.],
       [  2.,  30.,   0.,   0.],
       [  3.,  15.,   0.,   0.],
       [-99., -99.,   1.,   1.]])


- We replaced the NaN values with -99.
- Then, because we said before that the NaN value can be (“Super Cars”), we created a table on the right showing us by a binary representation the position of our information.
- In our case: 1 represents "Super Cars" and 0 represents all other 'Normal' cars.

# Example with Pipeline and GridSearchCV


In [57]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
import seaborn as sns
from sklearn.model_selection import train_test_split

titanic=sns.load_dataset('titanic')
titanic.head()

y = titanic['survived']
X = titanic[['pclass','age']]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)


In [58]:
model=make_pipeline(KNNImputer(),SGDClassifier())

In [59]:
params={
    'knnimputer__n_neighbors':[1,2,3,4]
}

In [60]:
from sklearn.model_selection import GridSearchCV

grid=GridSearchCV(model,param_grid=params,cv=5)

In [61]:
grid.fit(X_train,y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('knnimputer',
                                        KNNImputer(add_indicator=False,
                                                   copy=True,
                                                   metric='nan_euclidean',
                                                   missing_values=nan,
                                                   n_neighbors=5,
                                                   weights='uniform')),
                                       ('sgdclassifier',
                                        SGDClassifier(alpha=0.0001,
                                                      average=False,
                                                      class_weight=None,
                                                      early_stopping=False,
                                                      epsilon=0.1, eta0=0.0,
                           

In [62]:
grid.best_params_

{'knnimputer__n_neighbors': 3}