# Missing value imputation using Sckit learn

# Different strategy for different variables(numerical & categorical) with sckit learn

In [7]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [8]:
train=pd.read_csv("C:\\Users\\Junaid Ahmed\\Downloads\\train.csv")
test=pd.read_csv("C:\\Users\\Junaid Ahmed\\Downloads\\test.csv")
print("shape of train df=",train.shape)
print("shape of test df=",test.shape)

shape of train df= (1460, 81)
shape of test df= (1459, 80)


In [11]:
X_train=train.drop(columns="SalePrice",axis=1)
Y_train=train["SalePrice"]
X_test=test.copy()
print("the shape of X_train ",X_train.shape)
print("the shape of Y_train",Y_train.shape)
print("the shape of X_test",X_test.shape)

the shape of X_train  (1460, 80)
the shape of Y_train (1460,)
the shape of X_test (1459, 80)


In [46]:
isnull_sum=X_train .isnull().sum()
isnull_sum

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MiscVal            0
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
Length: 80, dtype: int64

In [13]:
num_var=X_train.select_dtypes(include=["int64","float64"]).columns

In [14]:
num_var_miss=[var for var in num_var if isnull_sum[var]>0]

In [15]:
num_var_miss

['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

In [16]:
cat_var=X_train.select_dtypes(include=["O"]).columns
cat_var_miss=[var for var in cat_var if isnull_sum[var]>0]
cat_var_miss

['Alley',
 'MasVnrType',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Electrical',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PoolQC',
 'Fence',
 'MiscFeature']

In [18]:
num_var_mean=['LotFrontage']
num_var_median=['MasVnrArea', 'GarageYrBlt']
cat_var_mode=['Alley',
 'MasVnrType',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Electrical',
 'FireplaceQu']
cat_var_missing=['GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PoolQC',
 'Fence',
 'MiscFeature']

In [28]:
num_var_mean_imputer=Pipeline(steps=[("imputer",SimpleImputer(strategy="mean"))])
num_var_median_imputer=Pipeline(steps=[("imputer",SimpleImputer(strategy="median"))])
cat_var_mode_imputer=Pipeline(steps=[("imputer",SimpleImputer(strategy="most_frequent"))])
cat_var_missing_imputer=Pipeline(steps=[("imputer",SimpleImputer(strategy="constant",fill_value="missing"))])

In [29]:
preprocessor=ColumnTransformer(transformers=[("mean_imputer",num_var_mean_imputer,num_var_mean),
                               ("median_imputer",num_var_median_imputer,num_var_median),
                               ("mode_imputer",cat_var_mode_imputer,cat_var_mode),
                               ("missing_imputer",cat_var_missing_imputer,cat_var_missing)])

In [30]:
preprocessor.fit(X_train)

In [31]:
preprocessor.transform

<bound method ColumnTransformer.transform of ColumnTransformer(transformers=[('mean_imputer',
                                 Pipeline(steps=[('imputer', SimpleImputer())]),
                                 ['LotFrontage']),
                                ('median_imputer',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median'))]),
                                 ['MasVnrArea', 'GarageYrBlt']),
                                ('mode_imputer',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent'))]),
                                 ['Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond',
                                  'BsmtExposure', 'BsmtFinType1',
                                  'BsmtFinType2', 'Electrical',
                                  'FireplaceQu']),
                                ('missing_im

In [32]:
preprocessor.named_transformers_['mean_imputer'].named_steps['imputer'].statistics_

array([70.04995837])

In [33]:
train['LotFrontage'].mean()

70.04995836802665

In [34]:
preprocessor.named_transformers_['mode_imputer'].named_steps['imputer'].statistics_

array(['Grvl', 'BrkFace', 'TA', 'TA', 'No', 'Unf', 'Unf', 'SBrkr', 'Gd'],
      dtype=object)

In [35]:
X_train_clean=preprocessor.transform(X_train)
X_test_clean=preprocessor.transform(X_test)

In [36]:
preprocessor.transformers_

[('mean_imputer',
  Pipeline(steps=[('imputer', SimpleImputer())]),
  ['LotFrontage']),
 ('median_imputer',
  Pipeline(steps=[('imputer', SimpleImputer(strategy='median'))]),
  ['MasVnrArea', 'GarageYrBlt']),
 ('mode_imputer',
  Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent'))]),
  ['Alley',
   'MasVnrType',
   'BsmtQual',
   'BsmtCond',
   'BsmtExposure',
   'BsmtFinType1',
   'BsmtFinType2',
   'Electrical',
   'FireplaceQu']),
 ('missing_imputer',
  Pipeline(steps=[('imputer',
                   SimpleImputer(fill_value='missing', strategy='constant'))]),
  ['GarageType',
   'GarageFinish',
   'GarageQual',
   'GarageCond',
   'PoolQC',
   'Fence',
   'MiscFeature']),
 ('remainder',
  'drop',
  [0,
   1,
   2,
   4,
   5,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   21,
   22,
   23,
   24,
   27,
   28,
   29,
   34,
   36,
   37,
   38,
   39,
   40,
   41,
   43,
   44,
   45,
   46,
   47,
   48,
   49,
   

In [37]:
X_test_clean_miiss_var=pd.DataFrame(X_test_clean,columns=num_var_mean+num_var_median+cat_var_mode+cat_var_missing)

In [38]:
X_test_clean_miiss_var.head()

Unnamed: 0,LotFrontage,MasVnrArea,GarageYrBlt,Alley,MasVnrType,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,Electrical,FireplaceQu,GarageType,GarageFinish,GarageQual,GarageCond,PoolQC,Fence,MiscFeature
0,80.0,0.0,1961.0,Grvl,BrkFace,TA,TA,No,Rec,LwQ,SBrkr,Gd,Attchd,Unf,TA,TA,missing,MnPrv,missing
1,81.0,108.0,1958.0,Grvl,BrkFace,TA,TA,No,ALQ,Unf,SBrkr,Gd,Attchd,Unf,TA,TA,missing,missing,Gar2
2,74.0,0.0,1997.0,Grvl,BrkFace,Gd,TA,No,GLQ,Unf,SBrkr,TA,Attchd,Fin,TA,TA,missing,MnPrv,missing
3,78.0,20.0,1998.0,Grvl,BrkFace,TA,TA,No,GLQ,Unf,SBrkr,Gd,Attchd,Fin,TA,TA,missing,missing,missing
4,43.0,0.0,1992.0,Grvl,BrkFace,Gd,TA,No,ALQ,Unf,SBrkr,Gd,Attchd,RFn,TA,TA,missing,missing,missing


In [39]:
X_test_clean_miiss_var.isnull().sum().sum()

0

In [40]:
train['Alley'].value_counts()

Alley
Grvl    50
Pave    41
Name: count, dtype: int64

In [41]:
X_test_clean_miiss_var['Alley'].value_counts()

Alley
Grvl    1422
Pave      37
Name: count, dtype: int64

In [42]:
X_test_clean_miiss_var['MiscFeature'].value_counts()

MiscFeature
missing    1408
Shed         46
Gar2          3
Othr          2
Name: count, dtype: int64