In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# from feature-engine
from feature_engine import missing_data_imputers as mdi

In [None]:
# Load only the columns used in this notebook:
# five features plus the SalePrice target.
cols_to_use = [
    'BsmtQual',
    'FireplaceQu',
    'LotFrontage',
    'MasVnrArea',
    'GarageYrBlt',
    'SalePrice',
]

data = pd.read_csv(filepath_or_buffer='../houseprice.csv', usecols=cols_to_use)

# preview the first rows
data.head(n=5)

In [None]:
# fraction of missing values in each column of the loaded data
data.isna().mean()

In [None]:
# splitting the datasets
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [None]:
# separate into training and testing set

# Build the feature list without mutating cols_to_use: list.remove() raises
# ValueError when this cell is run a second time, because 'SalePrice' has
# already been removed from the list by the first run.
feature_cols = [col for col in cols_to_use if col != 'SalePrice']

X_train, X_test, y_train, y_test = train_test_split(
    data[feature_cols],   # predictors only
    data['SalePrice'],    # target
    test_size=0.3,        # hold out 30% of the rows for testing
    random_state=0,       # fixed seed so the split is reproducible
)
X_train.shape, X_test.shape

### Feature-engine captures the categorical variables automatically

In [None]:
# call the categorical imputer from feature-engine;
# no variables are passed here, so the imputer finds them itself when fitted

imputer = mdi.CategoricalVariableImputer()

In [None]:
# fitting the imputer on the training set

imputer.fit(X_train)

In [None]:
# the imputer found the categorical variables, which it will fill
# with the 'Missing' label (not with the most frequent category)

imputer.variables

**The imputer will replace missing data in the categorical variables with the label 'Missing'.**

In [None]:
# feature-engine returns a dataframe;
# keep the transformed training set in tmp and preview its first rows

tmp = imputer.transform(X_train)
tmp.head()

In [None]:
# checking that the imputed (categorical) variables don't contain NA any more

tmp.loc[:, imputer.variables].isna().mean()

## Feature-engine makes it easy to specify variable groups

In [None]:
# imputation over 1 of the 2 categorical variables

imputer = mdi.CategoricalVariableImputer(variables=['BsmtQual'])

imputer.fit(X_train)

In [None]:
# the imputer uses only the variable we indicated (BsmtQual),
# instead of every categorical variable in the data

imputer.variables

In [None]:
# confirm it on the data: after transforming, BsmtQual has no NA left,
# while FireplaceQu (not passed to this imputer) keeps its missing values

imputer.transform(X_train)[['BsmtQual', 'FireplaceQu']].isnull().mean()

## Feature-engine with the Scikit-learn pipeline

In [None]:
# fraction of NA in each variable of the training set
# (BsmtQual and FireplaceQu are the categorical ones)

X_train.isna().mean()

- BsmtQual: 0.023 of values missing ==> frequent category imputation
- FireplaceQu: 0.46 of values missing ==> missing category imputation

In [None]:
# Two imputation steps, each restricted to a single variable:
#   imputer_mode    -> FrequentCategoryImputer on BsmtQual
#   imputer_missing -> CategoricalVariableImputer on FireplaceQu
pipe = Pipeline(steps=[
    (
        'imputer_mode',
        mdi.FrequentCategoryImputer(variables=['BsmtQual']),
    ),
    (
        'imputer_missing',
        mdi.CategoricalVariableImputer(variables=['FireplaceQu']),
    ),
])

In [None]:
# fit the pipeline on the training set
pipe.fit(X_train)

In [None]:
# variables assigned to the 'imputer_mode' (FrequentCategoryImputer) step
pipe.named_steps['imputer_mode'].variables

In [None]:
# variables assigned to the 'imputer_missing' (CategoricalVariableImputer) step
pipe.named_steps['imputer_missing'].variables

In [None]:
# run the training set through both imputation steps
tmp = pipe.transform(X_train)

# NA fraction per column after the pipeline: BsmtQual and FireplaceQu
# should now be at 0; columns not passed to either imputer are not handled
tmp.isna().mean()