# ScikitLearn for Missing Values Imputation
## ScikitLearn provides a class to make most of common data imputation techniques
 1)Mean Median Imputation for numerical values
 2)Arbitrary value imputation for both categorical and numerical values
 3)Most frequent category for categorical variables
 4)Missing category imputation 
 5)Adding missing indicator
 6)Automatic imputation of best imputation technique

## 1)Mean Median Imputation for numerical values

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
house=pd.read_csv('house_loan.csv',usecols=['OverallQual', 'TotalBsmtSF', '1stFlrSF', 'GrLivArea', 'WoodDeckSF',
    'BsmtUnfSF', 'LotFrontage', 'MasVnrArea', 'GarageYrBlt', 'SalePrice'])
house.head()

Unnamed: 0,LotFrontage,OverallQual,MasVnrArea,BsmtUnfSF,TotalBsmtSF,1stFlrSF,GrLivArea,GarageYrBlt,WoodDeckSF,SalePrice
0,65.0,7,196.0,150,856,856,1710,2003.0,0,208500
1,80.0,6,0.0,284,1262,1262,1262,1976.0,298,181500
2,68.0,7,162.0,434,920,920,1786,2001.0,0,223500
3,60.0,7,0.0,540,756,961,1717,1998.0,0,140000
4,84.0,8,350.0,490,1145,1145,2198,2000.0,192,250000


In [5]:
house.shape

(1460, 10)

In [6]:
house.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   LotFrontage  1201 non-null   float64
 1   OverallQual  1460 non-null   int64  
 2   MasVnrArea   1452 non-null   float64
 3   BsmtUnfSF    1460 non-null   int64  
 4   TotalBsmtSF  1460 non-null   int64  
 5   1stFlrSF     1460 non-null   int64  
 6   GrLivArea    1460 non-null   int64  
 7   GarageYrBlt  1379 non-null   float64
 8   WoodDeckSF   1460 non-null   int64  
 9   SalePrice    1460 non-null   int64  
dtypes: float64(3), int64(7)
memory usage: 114.2 KB


In [7]:
house.isnull().sum()

LotFrontage    259
OverallQual      0
MasVnrArea       8
BsmtUnfSF        0
TotalBsmtSF      0
1stFlrSF         0
GrLivArea        0
GarageYrBlt     81
WoodDeckSF       0
SalePrice        0
dtype: int64

In [None]:
## features LotFrontage ,GarageYrBlt,MasVnrArea contains missing data

In [10]:
house.columns

Index(['LotFrontage', 'OverallQual', 'MasVnrArea', 'BsmtUnfSF', 'TotalBsmtSF',
       '1stFlrSF', 'GrLivArea', 'GarageYrBlt', 'WoodDeckSF', 'SalePrice'],
      dtype='object')

In [11]:

X=house.iloc[:,:-1]
y=house.iloc[:,-1]

In [12]:
X.head()

Unnamed: 0,LotFrontage,OverallQual,MasVnrArea,BsmtUnfSF,TotalBsmtSF,1stFlrSF,GrLivArea,GarageYrBlt,WoodDeckSF
0,65.0,7,196.0,150,856,856,1710,2003.0,0
1,80.0,6,0.0,284,1262,1262,1262,1976.0,298
2,68.0,7,162.0,434,920,920,1786,2001.0,0
3,60.0,7,0.0,540,756,961,1717,1998.0,0
4,84.0,8,350.0,490,1145,1145,2198,2000.0,192


In [13]:
y

0       208500
1       181500
2       223500
3       140000
4       250000
         ...  
1455    175000
1456    210000
1457    266500
1458    142125
1459    147500
Name: SalePrice, Length: 1460, dtype: int64

In [14]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=0)

In [15]:
X_train.shape, X_test.shape

((1022, 9), (438, 9))

In [16]:
y_train.shape,y_test.shape

((1022,), (438,))

In [17]:
X_train.isnull().mean()

LotFrontage    0.184932
OverallQual    0.000000
MasVnrArea     0.004892
BsmtUnfSF      0.000000
TotalBsmtSF    0.000000
1stFlrSF       0.000000
GrLivArea      0.000000
GarageYrBlt    0.052838
WoodDeckSF     0.000000
dtype: float64

In [18]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [19]:
imputer = SimpleImputer(strategy='median')
imputer.fit(X_train)

SimpleImputer(strategy='median')

In [20]:
# we can look at the learnt medians like this:
imputer.statistics_

array([  69. ,    6. ,    0. ,  486.5,  992. , 1095. , 1479. , 1979. ,
          0. ])

In [21]:
X_train.median()

LotFrontage      69.0
OverallQual       6.0
MasVnrArea        0.0
BsmtUnfSF       486.5
TotalBsmtSF     992.0
1stFlrSF       1095.0
GrLivArea      1479.0
GarageYrBlt    1979.0
WoodDeckSF        0.0
dtype: float64

In [None]:
## Note that SimpleImputer learns the medians of ALL the variables in the dataset, those with NA and those without NA.

In [23]:

# impute the train and test set
# NOTE: the data is returned as a numpy array!!!
X_train=imputer.transform(X_train)
X_test=imputer.transform(X_test)

In [26]:
pd.DataFrame(X_train,columns=X.columns).head()

Unnamed: 0,LotFrontage,OverallQual,MasVnrArea,BsmtUnfSF,TotalBsmtSF,1stFlrSF,GrLivArea,GarageYrBlt,WoodDeckSF
0,69.0,7.0,573.0,318.0,1057.0,1057.0,2034.0,1998.0,576.0
1,69.0,6.0,0.0,288.0,1291.0,1291.0,1291.0,1996.0,307.0
2,50.0,5.0,0.0,162.0,858.0,858.0,858.0,1979.0,117.0
3,60.0,6.0,0.0,356.0,560.0,698.0,1258.0,1939.0,0.0
4,60.0,2.0,0.0,0.0,290.0,438.0,438.0,1930.0,0.0


 ## 1a) Different imputation techniques on different features
 We need to use another class called ColumnTransformer.
 
 
 ColumnTransformer(transformers, *, remainder='drop', sparse_threshold=0.3, n_jobs=None, transformer_weights=None, verbose=False)
 
 We also use Pipeine
 
 Pipeline(steps : list )
      List of (name, transform) tuples (implementing fit/transform) that are chained, in the order in which they are chained,    with the last object an estimator.
      
      ex:
      
      pipe = Pipeline([('scaler', StandardScaler()), ('svc', SVC())])
      The pipeline can be used as any other estimator
      
      

In [45]:
## create list for feartures to be imputed with different techniques

features_mean = ['LotFrontage']
features_median = ['MasVnrArea', 'GarageYrBlt']

## create pipeline one for mean and another for median with name and estimators

pipe_mean=Pipeline([('imputer', SimpleImputer(strategy='mean'))])
pipe_median=Pipeline([('imputer', SimpleImputer(strategy='median'))])

## use this lists and pipeline in columntransformer


ct = ColumnTransformer([
                                   ('mean_imputer', pipe_mean, features_mean),
                                   ('median_imputer', pipe_median, features_median)
                                 ], remainder='passthrough')


# remainder = 'passthrough' indicates that we want to retain ALL the columns in the dataset
            # otherwise only those specified in the imputing steps will be kept

In [46]:
ct.fit(X_train)

ColumnTransformer(remainder='passthrough',
                  transformers=[('mean_imputer',
                                 Pipeline(steps=[('imputer', SimpleImputer())]),
                                 ['LotFrontage']),
                                ('median_imputer',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median'))]),
                                 ['MasVnrArea', 'GarageYrBlt'])])

In [47]:
ct.transformers

[('mean_imputer',
  Pipeline(steps=[('imputer', SimpleImputer())]),
  ['LotFrontage']),
 ('median_imputer',
  Pipeline(steps=[('imputer', SimpleImputer(strategy='median'))]),
  ['MasVnrArea', 'GarageYrBlt'])]

In [48]:
ct.named_transformers_['mean_imputer'].named_steps['imputer'].statistics_

array([241.53131115])

In [49]:
ct.named_transformers_['median_imputer'].named_steps['imputer'].statistics_

array([   0., 1977.])

In [50]:
X_train[features_mean].mean()
X_train[features_median].median()

MasVnrArea        0.0
GarageYrBlt    1977.0
dtype: float64

In [51]:

# and now we can impute the data and check if it worked
X_train = ct.transform(X_train)
np.mean(np.isnan(X_train))

0.0

In [55]:
X_train  ## this returns numpy array . to put this in dataframe, be careful of column order

array([[ 999.,  573., 1998., ..., 1057., 2034.,  576.],
       [ 999.,    0., 1996., ..., 1291., 1291.,  307.],
       [  50.,    0.,  999., ...,  858.,  858.,  117.],
       ...,
       [  68.,    0., 1978., ..., 1318., 1902.,    0.],
       [ 999.,   18., 2003., ..., 1557., 1557.,  143.],
       [  58.,   30., 1998., ..., 1195., 1839.,    0.]])

## 2)Arbitrary value imputation for both categorical and numerical values

In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [28]:
house1=pd.read_csv('house_loan.csv',usecols=['OverallQual', 'TotalBsmtSF', '1stFlrSF', 'GrLivArea', 'WoodDeckSF',
    'BsmtUnfSF', 'LotFrontage', 'MasVnrArea', 'GarageYrBlt', 'SalePrice'])
house1.head()

Unnamed: 0,LotFrontage,OverallQual,MasVnrArea,BsmtUnfSF,TotalBsmtSF,1stFlrSF,GrLivArea,GarageYrBlt,WoodDeckSF,SalePrice
0,65.0,7,196.0,150,856,856,1710,2003.0,0,208500
1,80.0,6,0.0,284,1262,1262,1262,1976.0,298,181500
2,68.0,7,162.0,434,920,920,1786,2001.0,0,223500
3,60.0,7,0.0,540,756,961,1717,1998.0,0,140000
4,84.0,8,350.0,490,1145,1145,2198,2000.0,192,250000


In [29]:
house1.isnull().sum()

LotFrontage    259
OverallQual      0
MasVnrArea       8
BsmtUnfSF        0
TotalBsmtSF      0
1stFlrSF         0
GrLivArea        0
GarageYrBlt     81
WoodDeckSF       0
SalePrice        0
dtype: int64

In [30]:

X=house1.iloc[:,:-1]
y=house1.iloc[:,-1]
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=0)

In [31]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [32]:
imputer = SimpleImputer(strategy='constant', fill_value = 999)
imputer.fit(X_train)

SimpleImputer(fill_value=999, strategy='constant')

In [33]:
imputer.statistics_

array([999., 999., 999., 999., 999., 999., 999., 999., 999.])

In [34]:
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

X_train

array([[ 999.,    7.,  573., ..., 2034., 1998.,  576.],
       [ 999.,    6.,    0., ..., 1291., 1996.,  307.],
       [  50.,    5.,    0., ...,  858.,  999.,  117.],
       ...,
       [  68.,    6.,    0., ..., 1902., 1978.,    0.],
       [ 999.,    7.,   18., ..., 1557., 2003.,  143.],
       [  58.,    7.,   30., ..., 1839., 1998.,    0.]])

In [36]:
X_train = pd.DataFrame(X_train, columns=X.columns)
X_train

Unnamed: 0,LotFrontage,OverallQual,MasVnrArea,BsmtUnfSF,TotalBsmtSF,1stFlrSF,GrLivArea,GarageYrBlt,WoodDeckSF
0,999.0,7.0,573.0,318.0,1057.0,1057.0,2034.0,1998.0,576.0
1,999.0,6.0,0.0,288.0,1291.0,1291.0,1291.0,1996.0,307.0
2,50.0,5.0,0.0,162.0,858.0,858.0,858.0,999.0,117.0
3,60.0,6.0,0.0,356.0,560.0,698.0,1258.0,1939.0,0.0
4,60.0,2.0,0.0,0.0,290.0,438.0,438.0,1930.0,0.0
...,...,...,...,...,...,...,...,...,...
1017,82.0,8.0,673.0,89.0,1252.0,1268.0,2365.0,1999.0,0.0
1018,60.0,4.0,0.0,625.0,1067.0,1067.0,1067.0,1996.0,290.0
1019,68.0,6.0,0.0,0.0,0.0,1318.0,1902.0,1978.0,0.0
1020,999.0,7.0,18.0,1374.0,1374.0,1557.0,1557.0,2003.0,143.0


## 3)Most frequent category for categorical variables 

In [37]:
## imputer = SimpleImputer(strategy='most_frequent')


## 4)Missing category imputation 

In [38]:
## imputer = SimpleImputer(strategy='constant', fill_value = 'Missing')

## 5)Adding missing indicator 

In [39]:
## from sklearn.impute import SimpleImputer, MissingIndicator
## indicator = MissingIndicator(error_on_new=True, features='missing-only')
