In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

# to split the datasets
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# from feature-engine
from feature_engine import missing_data_imputers as mdi

### Feature Engine - Mean/Median Imputation

In [2]:
# Feature columns only; the target ('SalePrice') is requested separately at load
# time, so this list never needs to be edited after the read.
cols_to_use = ['BsmtQual', 'FireplaceQu', 'LotFrontage', 'MasVnrArea', 'GarageYrBlt']

# read just the selected features plus the target
data = pd.read_csv('data/housing.csv', usecols=cols_to_use + ['SalePrice'])

# hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    data[cols_to_use], data['SalePrice'], random_state=0, test_size=0.3
)
print(x_train.shape, x_test.shape)
x_train.dtypes

(1022, 5) (438, 5)


BsmtQual        object
FireplaceQu     object
LotFrontage    float64
MasVnrArea     float64
GarageYrBlt    float64
dtype: object

#### Feature Engine automatically detects the numerical features

In [3]:
# No `variables` argument is given, so the imputer selects the columns itself.
imputer = mdi.MeanMedianImputer(imputation_method="median").fit(x_train)
imputer

MeanMedianImputer(variables=['LotFrontage', 'MasVnrArea', 'GarageYrBlt'])

In [4]:
# columns the imputer will fill; none were passed explicitly, so these were auto-selected
imputer.variables

['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

In [5]:
# per-variable fill value learned by fit() (the median of each column in x_train)
imputer.imputer_dict_

{'LotFrontage': 69.0, 'MasVnrArea': 0.0, 'GarageYrBlt': 1979.0}

In [6]:
# Unlike sklearn transformers, feature-engine's transform() returns a pandas
# DataFrame, so the column names and the row index are kept.
tmp = imputer.transform(x_train)
tmp.head()

Unnamed: 0,BsmtQual,FireplaceQu,LotFrontage,MasVnrArea,GarageYrBlt
64,Gd,,69.0,573.0,1998.0
682,Gd,Gd,69.0,0.0,1996.0
960,TA,,50.0,0.0,1979.0
1384,TA,,60.0,0.0,1939.0
1100,TA,,60.0,0.0,1930.0


In [7]:
# fraction of missing values remaining in each imputed column (0.0 means none left)
tmp[imputer.variables].isnull().mean()

LotFrontage    0.0
MasVnrArea     0.0
GarageYrBlt    0.0
dtype: float64

#### Feature Engine also allows you to select which features to impute

In [8]:
# Rebuild the train/test split (same seed and test fraction as the earlier split)
# so this section starts from un-imputed data.
x_train, x_test, y_train, y_test = train_test_split(
    data[cols_to_use], data['SalePrice'], random_state=0, test_size=0.3
)
print(x_train.shape, x_test.shape)
x_train.dtypes

(1022, 5) (438, 5)


BsmtQual        object
FireplaceQu     object
LotFrontage    float64
MasVnrArea     float64
GarageYrBlt    float64
dtype: object

In [11]:
# Restrict imputation to an explicit list of columns, this time using the mean.
imputer = mdi.MeanMedianImputer(
    variables=["LotFrontage", "MasVnrArea"],
    imputation_method="mean",
)

In [12]:
# learn the mean of each listed variable from the training set only
imputer.fit(x_train)

MeanMedianImputer(imputation_method='mean',
                  variables=['LotFrontage', 'MasVnrArea'])

In [13]:
# fitted means, keyed by variable name
imputer.imputer_dict_

{'LotFrontage': 69.66866746698679, 'MasVnrArea': 103.55358898721731}

In [14]:
# Only LotFrontage and MasVnrArea are filled; columns outside `variables`
# (e.g. GarageYrBlt) keep their missing values.
tmp = imputer.transform(x_train)
tmp.head()

Unnamed: 0,BsmtQual,FireplaceQu,LotFrontage,MasVnrArea,GarageYrBlt
64,Gd,,69.668667,573.0,1998.0
682,Gd,Gd,69.668667,0.0,1996.0
960,TA,,50.0,0.0,
1384,TA,,60.0,0.0,1939.0
1100,TA,,60.0,0.0,1930.0


#### Feature Engine can be used along with sklearn pipelines

In [15]:
# Rebuild the train/test split (same seed and test fraction as before) so the
# pipeline below is fitted on un-imputed data.
x_train, x_test, y_train, y_test = train_test_split(
    data[cols_to_use], data['SalePrice'], random_state=0, test_size=0.3
)
print(x_train.shape, x_test.shape)
x_train.dtypes

(1022, 5) (438, 5)


BsmtQual        object
FireplaceQu     object
LotFrontage    float64
MasVnrArea     float64
GarageYrBlt    float64
dtype: object

In [16]:
# Two imputers chained in one sklearn Pipeline; each step is limited to its own
# columns (median for LotFrontage/GarageYrBlt, mean for MasVnrArea).
pipeline = Pipeline([
    (
        "median_imputer",
        mdi.MeanMedianImputer(
            imputation_method="median",
            variables=["LotFrontage", "GarageYrBlt"],
        ),
    ),
    (
        "mean_imputer",
        mdi.MeanMedianImputer(
            imputation_method="mean",
            variables=["MasVnrArea"],
        ),
    ),
]).fit(x_train)
pipeline

Pipeline(steps=[('median_imputer',
                 MeanMedianImputer(variables=['LotFrontage', 'GarageYrBlt'])),
                ('mean_imputer',
                 MeanMedianImputer(imputation_method='mean',
                                   variables=['MasVnrArea']))])

In [17]:
# fill values learned by the pipeline's median step
pipeline.named_steps["median_imputer"].imputer_dict_

{'LotFrontage': 69.0, 'GarageYrBlt': 1979.0}

In [18]:
# fill value learned by the pipeline's mean step
pipeline.named_steps["mean_imputer"].imputer_dict_

{'MasVnrArea': 103.55358898721731}

In [19]:
# apply both imputation steps, in order, to the training data
tmp = pipeline.transform(x_train)
tmp.head()

Unnamed: 0,BsmtQual,FireplaceQu,LotFrontage,MasVnrArea,GarageYrBlt
64,Gd,,69.0,573.0,1998.0
682,Gd,Gd,69.0,0.0,1996.0
960,TA,,50.0,0.0,1979.0
1384,TA,,60.0,0.0,1939.0
1100,TA,,60.0,0.0,1930.0


### Feature Engine - Arbitrary Value Imputation

In [21]:
# Reload the data for this section.
# Feature columns only; the target ('SalePrice') is requested separately at load
# time, so this list never needs to be edited after the read.
cols_to_use = ['BsmtQual', 'FireplaceQu', 'LotFrontage', 'MasVnrArea', 'GarageYrBlt']

# read just the selected features plus the target
data = pd.read_csv('data/housing.csv', usecols=cols_to_use + ['SalePrice'])

# hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    data[cols_to_use], data['SalePrice'], random_state=0, test_size=0.3
)
print(x_train.shape, x_test.shape)
x_train.dtypes

(1022, 5) (438, 5)


BsmtQual        object
FireplaceQu     object
LotFrontage    float64
MasVnrArea     float64
GarageYrBlt    float64
dtype: object

In [22]:
# Fill missing values with a fixed sentinel; no `variables` given, so the
# imputer selects the columns itself.
imputer = mdi.ArbitraryNumberImputer(arbitrary_number=-999).fit(x_train)
imputer

ArbitraryNumberImputer(arbitrary_number=-999,
                       variables=['LotFrontage', 'MasVnrArea', 'GarageYrBlt'])

In [24]:
# the auto-selected variables (none were passed) and the constant used to fill them
imputer.variables, imputer.arbitrary_number

(['LotFrontage', 'MasVnrArea', 'GarageYrBlt'], -999)

In [25]:
# missing values in the selected numeric columns become -999;
# the categorical columns (e.g. FireplaceQu) keep their missing values
tmp = imputer.transform(x_train)
tmp.head()

Unnamed: 0,BsmtQual,FireplaceQu,LotFrontage,MasVnrArea,GarageYrBlt
64,Gd,,-999.0,573.0,1998.0
682,Gd,Gd,-999.0,0.0,1996.0
960,TA,,50.0,0.0,-999.0
1384,TA,,60.0,0.0,1939.0
1100,TA,,60.0,0.0,1930.0


In [27]:
# Using the imputer inside an sklearn pipeline: start again from a fresh split
# (same seed and test fraction as before).
x_train, x_test, y_train, y_test = train_test_split(
    data[cols_to_use], data['SalePrice'], random_state=0, test_size=0.3
)
print(x_train.shape, x_test.shape)

(1022, 5) (438, 5)


In [28]:
# Different sentinel per column group: -999 for LotFrontage/MasVnrArea and
# -1 for GarageYrBlt, chained in one sklearn Pipeline.
pipeline = Pipeline([
    (
        "imputer_999",
        mdi.ArbitraryNumberImputer(
            arbitrary_number=-999,
            variables=['LotFrontage', 'MasVnrArea'],
        ),
    ),
    (
        "imputer_minus1",
        mdi.ArbitraryNumberImputer(
            arbitrary_number=-1,
            variables=['GarageYrBlt'],
        ),
    ),
]).fit(x_train)
pipeline

Pipeline(steps=[('imputer_999',
                 ArbitraryNumberImputer(arbitrary_number=-999,
                                        variables=['LotFrontage',
                                                   'MasVnrArea'])),
                ('imputer_minus1',
                 ArbitraryNumberImputer(arbitrary_number=-1,
                                        variables=['GarageYrBlt']))])

In [29]:
# inspect the fitted step that fills LotFrontage and MasVnrArea with -999
pipeline.named_steps["imputer_999"]

ArbitraryNumberImputer(arbitrary_number=-999,
                       variables=['LotFrontage', 'MasVnrArea'])

In [30]:
# inspect the fitted step that fills GarageYrBlt with -1
pipeline.named_steps["imputer_minus1"]

ArbitraryNumberImputer(arbitrary_number=-1, variables=['GarageYrBlt'])

In [None]:
# 