In [None]:
#libraries needed
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [None]:
# Splitting the datasets
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [None]:
# from feature-engine
from feature_engine import missing_data_imputers as mdi

In [None]:
# Read only the columns needed for this demo: five predictors plus the target

cols_to_use = [
    'BsmtQual',
    'FireplaceQu',
    'LotFrontage',
    'MasVnrArea',
    'GarageYrBlt',
    'SalePrice',  # target; dropped from this list before the train/test split
]

data = pd.read_csv('../houseprice.csv', usecols=cols_to_use)
data.head()

In [None]:
# Fraction of missing values in each loaded column
data.isna().mean()

In [None]:
# Split into training and test sets

# Drop the target from the feature list. Rebinding to a filtered copy (instead
# of calling list.remove) keeps this cell safe to re-run: list.remove raises
# ValueError once 'SalePrice' has already been removed.
cols_to_use = [col for col in cols_to_use if col != 'SalePrice']

# 70/30 split; random_state is fixed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(data[cols_to_use],
                                                    data['SalePrice'],
                                                    test_size=0.3,
                                                    random_state=0)
X_train.shape, X_test.shape

### Feature-Engine captures the numerical variables automatically

In [None]:
# Median imputer created without a `variables` argument; per the section
# heading, feature-engine then selects the numerical variables on its own.
imputer = mdi.MeanMedianImputer(imputation_method='median')

In [None]:
# Fit the imputer on the training set only

imputer.fit(X_train)

In [None]:
# Columns the imputer targets (presumably the numerical ones detected during fit)
imputer.variables

In [None]:
# Fitted attribute; presumably maps each target column to its learned median
imputer.imputer_dict_

In [None]:
# transform() returns a pandas DataFrame, so .head() can be called on it directly

tmp = imputer.transform(X_train)
tmp.head()

In [None]:
# Fraction of nulls left in the columns the imputer handled
tmp[imputer.variables].isna().mean()

## Feature-Engine allows you to specify variable groups easily

In [None]:
# Mean imputation restricted to an explicit list of columns
imputer = mdi.MeanMedianImputer(
    variables=['LotFrontage', 'MasVnrArea'],
    imputation_method='mean',
)

# Fit on the training set
imputer.fit(X_train)

In [None]:
# The columns passed explicitly when the imputer was created
imputer.variables

In [None]:
# Fitted attribute; presumably maps each target column to its learned mean
imputer.imputer_dict_

In [None]:
# Training-set means of the same columns, for comparison with imputer_dict_ above
X_train[imputer.variables].mean()

In [None]:
# transform() hands back a DataFrame
tmp = imputer.transform(X_train)

# Fraction of nulls remaining in the imputed columns
tmp[imputer.variables].isna().mean()

## Feature-Engine within a Scikit-learn pipeline

In [None]:
# Two imputation steps chained in one scikit-learn Pipeline:
# median for LotFrontage and GarageYrBlt, mean for MasVnrArea.
pipe = Pipeline(steps=[
    (
        'median_imputer',
        mdi.MeanMedianImputer(
            variables=['LotFrontage', 'GarageYrBlt'],
            imputation_method='median',
        ),
    ),
    (
        'mean_imputer',
        mdi.MeanMedianImputer(
            variables=['MasVnrArea'],
            imputation_method='mean',
        ),
    ),
])

In [None]:
# Fit all pipeline steps on the training set
pipe.fit(X_train)

In [None]:
# Fitted values of the 'median_imputer' step (presumably the learned medians)
pipe.named_steps['median_imputer'].imputer_dict_

In [None]:
# Fitted values of the 'mean_imputer' step (presumably the learned mean)
pipe.named_steps['mean_imputer'].imputer_dict_

In [None]:
# Run the training set through both imputation steps
tmp = pipe.transform(X_train)

# Fraction of nulls per column after the pipeline. Only the columns listed in
# the two steps are imputed; columns named in neither step (BsmtQual,
# FireplaceQu) are presumably left untouched, so nulls in them would remain.
tmp.isna().mean()