# Feature Engine for Missing Values Imputation
## AN OPEN SOURCE PYTHON PACKAGE TO CREATE REPRODUCIBLE FEATURE ENGINEERING STEPS AND SMOOTH MODEL DEPLOYMENT
Feature-engine includes transformers for:

Missing value imputation

Categorical variable encoding

Outlier capping

Discretisation

Numerical variable transformation

 1)Mean Median Imputation for numerical values
 2)Arbitrary value imputation for both categorical and numerical values
 3)End of distribution imputation
 4)Most frequent category for categorical variables
 5)Missing category imputation 
 6)Random sample imputation
 7)Missing Indicator
 
 https://feature-engine.readthedocs.io/en/latest/

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
house=pd.read_csv('house_loan.csv',usecols=['BsmtQual', 'FireplaceQu', 'LotFrontage', 'MasVnrArea', 'GarageYrBlt',
    'SalePrice'])
house.head()

Unnamed: 0,LotFrontage,MasVnrArea,BsmtQual,FireplaceQu,GarageYrBlt,SalePrice
0,65.0,196.0,Gd,,2003.0,208500
1,80.0,0.0,Gd,TA,1976.0,181500
2,68.0,162.0,Gd,TA,2001.0,223500
3,60.0,0.0,TA,Gd,1998.0,140000
4,84.0,350.0,Gd,TA,2000.0,250000


In [3]:
house.head()

Unnamed: 0,LotFrontage,MasVnrArea,BsmtQual,FireplaceQu,GarageYrBlt,SalePrice
0,65.0,196.0,Gd,,2003.0,208500
1,80.0,0.0,Gd,TA,1976.0,181500
2,68.0,162.0,Gd,TA,2001.0,223500
3,60.0,0.0,TA,Gd,1998.0,140000
4,84.0,350.0,Gd,TA,2000.0,250000


In [4]:
house.isnull().sum()

LotFrontage    259
MasVnrArea       8
BsmtQual        37
FireplaceQu    690
GarageYrBlt     81
SalePrice        0
dtype: int64

In [5]:
house.columns

Index(['LotFrontage', 'MasVnrArea', 'BsmtQual', 'FireplaceQu', 'GarageYrBlt',
       'SalePrice'],
      dtype='object')

In [6]:

X=house.iloc[:,:-1]
y=house.iloc[:,-1]

In [7]:
X.head()

Unnamed: 0,LotFrontage,MasVnrArea,BsmtQual,FireplaceQu,GarageYrBlt
0,65.0,196.0,Gd,,2003.0
1,80.0,0.0,Gd,TA,1976.0
2,68.0,162.0,Gd,TA,2001.0
3,60.0,0.0,TA,Gd,1998.0
4,84.0,350.0,Gd,TA,2000.0


In [8]:
y

0       208500
1       181500
2       223500
3       140000
4       250000
         ...  
1455    175000
1456    210000
1457    266500
1458    142125
1459    147500
Name: SalePrice, Length: 1460, dtype: int64

In [9]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=0)

In [10]:
X_train.shape, X_test.shape

((1022, 5), (438, 5))

In [11]:
X_train.isnull().mean()

LotFrontage    0.184932
MasVnrArea     0.004892
BsmtQual       0.023483
FireplaceQu    0.467710
GarageYrBlt    0.052838
dtype: float64

In [15]:
from feature_engine import missing_data_imputers as mdi

## 1)Mean Median Imputation for numerical values

## Feature-Engine captures the numerical variables automatically

In [18]:
imputer=mdi.MeanMedianImputer(imputation_method='mean')

In [19]:
imputer.fit(X_train)

MeanMedianImputer(imputation_method='mean',
                  variables=['LotFrontage', 'MasVnrArea', 'GarageYrBlt'])

In [20]:
imputer.variables ## Feature-Engine captures the numerical variables automatically¶

['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

In [22]:
imputer.imputer_dict_ ## mean assigned to each variable

{'LotFrontage': 69.66866746698679,
 'MasVnrArea': 103.55358898721731,
 'GarageYrBlt': 1978.0123966942149}

In [24]:
tr=imputer.transform(X_train) # feature-engine returns a dataframe

In [25]:
tr.head()

Unnamed: 0,LotFrontage,MasVnrArea,BsmtQual,FireplaceQu,GarageYrBlt
64,69.668667,573.0,Gd,,1998.0
682,69.668667,0.0,Gd,Gd,1996.0
960,50.0,0.0,TA,,1978.012397
1384,60.0,0.0,TA,,1939.0
1100,60.0,0.0,TA,,1930.0


In [26]:
tr[imputer.variables].isnull().mean()

LotFrontage    0.0
MasVnrArea     0.0
GarageYrBlt    0.0
dtype: float64

In [27]:
## no more NA values

## Feature-Engine allows you to specify variable groups easily

In [28]:
imputer = mdi.MeanMedianImputer(imputation_method='mean',
                                variables=['LotFrontage', 'MasVnrArea'])

imputer.fit(X_train)

MeanMedianImputer(imputation_method='mean',
                  variables=['LotFrontage', 'MasVnrArea'])

In [29]:
imputer.variables
imputer.imputer_dict_

{'LotFrontage': 69.66866746698679, 'MasVnrArea': 103.55358898721731}

In [30]:
tr=imputer.transform(X_train) # feature-engine returns a dataframe

In [31]:
tr[imputer.variables].isnull().mean()

LotFrontage    0.0
MasVnrArea     0.0
dtype: float64

## Feature-Engine can be used with the Scikit-learn pipeline

In [34]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

pipe = Pipeline([
    ('median_imputer', mdi.MeanMedianImputer(imputation_method='median',
                                             variables = ['LotFrontage', 'GarageYrBlt'])),
     
    ('mean_imputer', mdi.MeanMedianImputer(imputation_method='mean',
                                          variables = ['MasVnrArea'])),
     ])

In [35]:
pipe.fit(X_train)

Pipeline(steps=[('median_imputer',
                 MeanMedianImputer(variables=['LotFrontage', 'GarageYrBlt'])),
                ('mean_imputer',
                 MeanMedianImputer(imputation_method='mean',
                                   variables=['MasVnrArea']))])

In [36]:
pipe.named_steps['median_imputer'].imputer_dict_

{'LotFrontage': 69.0, 'GarageYrBlt': 1979.0}

In [37]:
pipe.named_steps['mean_imputer'].imputer_dict_

{'MasVnrArea': 103.55358898721731}

In [41]:
tr=pipe.transform(X_train)
X_train.head()

Unnamed: 0,LotFrontage,MasVnrArea,BsmtQual,FireplaceQu,GarageYrBlt
64,,573.0,Gd,,1998.0
682,,0.0,Gd,Gd,1996.0
960,50.0,0.0,TA,,
1384,60.0,0.0,TA,,1939.0
1100,60.0,0.0,TA,,1930.0


In [42]:
tr.isnull().sum()

LotFrontage      0
MasVnrArea       0
BsmtQual        24
FireplaceQu    478
GarageYrBlt      0
dtype: int64

In [None]:
## There are no more null values for the 3 imputed numerical variables.

## 2)Arbitrary value imputation for both categorical and numerical values 

In [None]:
## imputer = mdi.ArbitraryNumberImputer(arbitrary_number = -999)
## imputer = mdi.ArbitraryNumberImputer(arbitrary_number = -999, variables=['LotFrontage', 'MasVnrArea'])
## imputer.fit(X_train)
## imputer.variables
## imputer.arbitrary_number
## tmp = imputer.transform(X_train)
## tmp[imputer.variables].isnull().mean()
## pipe = Pipeline([
    ('imputer_999', mdi.ArbitraryNumberImputer(arbitrary_number = -999,
                                             variables = ['LotFrontage', 'MasVnrArea'])),
     
    ('imputer_minus1', mdi.ArbitraryNumberImputer(arbitrary_number = -1,
                                          variables = ['GarageYrBlt'])),
     ])

## pipe.fit(X_train)
## pipe.named_steps['imputer_999'].arbitrary_number
## pipe.named_steps['imputer_minus1'].arbitrary_number
## tmp = pipe.transform(X_train)
## tmp.isnull().mean()

## 3)End of distribution(End of tail) imputation

In [None]:


## distribution Can take ‘gaussian’, ‘skewed’ or ‘max’.
## gaussian: the imputer will use the Gaussian limits to find the values to replace missing data.
##skewed: the imputer will use the IQR limits to find the values to replace missing data.
##max: the imputer will use the maximum values to replace missing data. Note that if ‘max’ is passed, the parameter ‘tail’ is ignored.


In [None]:
## imputer = mdi.EndTailImputer(distribution='gaussian', tail='right') 
## imputer.fit(X_train)
## imputer.variables
## imputer.imputer_dict_  ( the o/p of this is calcuted as follows)
## X_train[imputer.variables].mean() + 3 * X_train[imputer.variables].std() ## this is how the values are calculated
## tmp = imputer.transform(X_train)
## tmp.head()
## tmp[imputer.variables].isnull().mean()
## imputer = mdi.EndTailImputer(distribution='skewed', tail='left',
                             variables=['LotFrontage', 'MasVnrArea'])
## pipe = Pipeline([
    ('imputer_skewed', mdi.EndTailImputer(distribution='skewed', tail='right',
                                          variables=['GarageYrBlt', 'MasVnrArea'])),

    ('imputer_gaussian', mdi.EndTailImputer(distribution='gaussian', tail='right',
                                            variables=['LotFrontage'])),
])
## pipe.fit(X_train)
## pipe.named_steps['imputer_skewed'].imputer_dict_
## pipe.named_steps['imputer_gaussian'].imputer_dict_
## tmp = pipe.transform(X_train)
## tmp.isnull().mean()


## 4)Most frequent category for categorical variables

### Feature-Engine captures the categorical variables automatically

In [None]:
## imputer = mdi.CategoricalVariableImputer(imputation_method='frequent')
## imputer.fit(X_train) ## automatically takes the categorical variables
## imputer.variables
## imputer.imputer_dict_
## X_train[imputer.variables].mode() ### let's check those values agains the train data

## tmp = imputer.transform(X_train)
## tmp.head()
## tmp[imputer.variables].isnull().mean()
## imputer = mdi.CategoricalVariableImputer(
    imputation_method='frequent', variables=['BsmtQual'])

## imputer.fit(X_train)
## imputer.variables
## imputer.imputer_dict_
# feature-engine returns a dataframe

## tmp = imputer.transform(X_train)

# let's check null values are gone
## tmp[imputer.variables].isnull().mean()

## 5)Missing category imputation 

In [44]:
##Feature-Engine captures the categorical variables automatically

imputer = mdi.CategoricalVariableImputer()

## or we can mention the categorical variables

## imputer = mdi.CategoricalVariableImputer(variables=['BsmtQual'])


## imputer.fit(X_train)
## imputer.variables
## tmp = imputer.transform(X_train)
## tmp.head()
## tmp[imputer.variables].isnull().mean()
## pipe = Pipeline([
    ('imputer_mode', mdi.CategoricalVariableImputer(imputation_method='frequent', variables=['BsmtQual'])),
    ('imputer_missing', mdi.CategoricalVariableImputer(variables=['FireplaceQu'])),
])
## pipe.fit(X_train)
## pipe.named_steps['imputer_mode'].variables
## pipe.named_steps['imputer_missing'].variables
## tmp = pipe.transform(X_train)
## tmp.isnull().mean()


BsmtQual    0.0
dtype: float64

## 6)Random sample imputation

In [None]:
##imputer = mdi.RandomSampleImputer(random_state = 29)
## imputer.fit(X_train)
## imputer.variables
## tmp = imputer.transform(X_train)
## tmp.head()
## tmp[imputer.variables].isnull().mean()

##  7)Missing Indicator
 

In [None]:
## imputer = mdi.AddMissingIndicator() ##Feature-Engine's missing indicator selects all variables by default

## imputer = mdi.AddMissingIndicator(variables=['BsmtQual', 'FireplaceQu', 'LotFrontage']) ## select a few variables


## imputer.fit(X_train)
## imputer.variables
## tmp = imputer.transform(X_train)
## tmp.head()


## Pipeline
### These are the steps we will concatenate

1) Add Missing Indicator to all variables
2) Median Imputation to numerical variables
3) Missing category imputation to categorical variables

In [None]:
pipe = Pipeline([
    ('missing_ind', mdi.AddMissingIndicator()),
    
    ('imputer_mode', mdi.CategoricalVariableImputer(
        imputation_method='frequent', variables=['FireplaceQu', 'BsmtQual'])),
    
    ('imputer_median', mdi.MeanMedianImputer(imputation_method='median',
                                             variables=['LotFrontage', 'MasVnrArea', 'GarageYrBlt'])),
])

In [None]:
pipe.fit(X_train)
# inspect the separate steps
pipe.named_steps['missing_ind'].variables
pipe.named_steps['imputer_mode'].imputer_dict_
pipe.named_steps['imputer_median'].imputer_dict_

tmp = pipe.transform(X_train)
tmp.isnull().mean()