In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

# to split the datasets
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# from feature-engine
from feature_engine import missing_data_imputers as mdi

### Feature Engine - Mean/Median Imputation

In [2]:
# Read only the five predictors plus the target from the housing CSV.
cols_to_use = [
    'BsmtQual',
    'FireplaceQu',
    'LotFrontage',
    'MasVnrArea',
    'GarageYrBlt',
    'SalePrice',
]
data = pd.read_csv('data/housing.csv', usecols=cols_to_use)

# From here on the list names predictors only; the target is passed separately.
cols_to_use.remove('SalePrice')

# Hold out 30% of the rows for testing, with a fixed random_state.
x_train, x_test, y_train, y_test = train_test_split(
    data.loc[:, cols_to_use], data.loc[:, 'SalePrice'], test_size=0.3, random_state=0
)
print(x_train.shape, x_test.shape)
x_train.dtypes

(1022, 5) (438, 5)


BsmtQual        object
FireplaceQu     object
LotFrontage    float64
MasVnrArea     float64
GarageYrBlt    float64
dtype: object

#### Feature Engine automatically detects the numerical features

In [3]:
# No `variables` argument is passed, so the imputer selects the columns itself
# during fit (the recorded repr lists the three float columns).
imputer = mdi.MeanMedianImputer(imputation_method="median")
imputer.fit(x_train)

MeanMedianImputer(variables=['LotFrontage', 'MasVnrArea', 'GarageYrBlt'])

In [4]:
# columns the fitted imputer will fill
imputer.variables

['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

In [5]:
# replacement value learned for each column from x_train (the median here)
imputer.imputer_dict_

{'LotFrontage': 69.0, 'MasVnrArea': 0.0, 'GarageYrBlt': 1979.0}

In [6]:
# feature-engine, unlike sklearn, returns a DataFrame from transform;
# the result is kept in `tmp`, so x_train itself is not reassigned
tmp = imputer.transform(x_train)
tmp.head()

Unnamed: 0,BsmtQual,FireplaceQu,LotFrontage,MasVnrArea,GarageYrBlt
64,Gd,,69.0,573.0,1998.0
682,Gd,Gd,69.0,0.0,1996.0
960,TA,,50.0,0.0,1979.0
1384,TA,,60.0,0.0,1939.0
1100,TA,,60.0,0.0,1930.0


In [7]:
# fraction of missing values remaining in each imputed column
tmp[imputer.variables].isnull().mean()

LotFrontage    0.0
MasVnrArea     0.0
GarageYrBlt    0.0
dtype: float64

#### Feature Engine also allows you to group features 

In [8]:
# Rebuild the 70/30 split with the same seed for this section.
x_train, x_test, y_train, y_test = train_test_split(
    data.loc[:, cols_to_use], data.loc[:, 'SalePrice'], test_size=0.3, random_state=0
)
print(x_train.shape, x_test.shape)
x_train.dtypes

(1022, 5) (438, 5)


BsmtQual        object
FireplaceQu     object
LotFrontage    float64
MasVnrArea     float64
GarageYrBlt    float64
dtype: object

In [11]:
# Mean imputation restricted to two named columns; GarageYrBlt is not listed,
# so this imputer does not touch it.
imputer = mdi.MeanMedianImputer(
    imputation_method="mean", 
    variables=["LotFrontage", "MasVnrArea"]
)

In [12]:
# learn the replacement values from the training set only
imputer.fit(x_train)

MeanMedianImputer(imputation_method='mean',
                  variables=['LotFrontage', 'MasVnrArea'])

In [13]:
# mean of each selected column, as learned during fit
imputer.imputer_dict_

{'LotFrontage': 69.66866746698679, 'MasVnrArea': 103.55358898721731}

In [14]:
# only the two listed columns are filled; the remaining columns keep their NaNs
tmp = imputer.transform(x_train)
tmp.head()

Unnamed: 0,BsmtQual,FireplaceQu,LotFrontage,MasVnrArea,GarageYrBlt
64,Gd,,69.668667,573.0,1998.0
682,Gd,Gd,69.668667,0.0,1996.0
960,TA,,50.0,0.0,
1384,TA,,60.0,0.0,1939.0
1100,TA,,60.0,0.0,1930.0


#### Feature Engine can be used along with sklearn pipelines

In [15]:
# Rebuild the 70/30 split with the same seed before the pipeline example.
x_train, x_test, y_train, y_test = train_test_split(
    data.loc[:, cols_to_use], data.loc[:, 'SalePrice'], test_size=0.3, random_state=0
)
print(x_train.shape, x_test.shape)
x_train.dtypes

(1022, 5) (438, 5)


BsmtQual        object
FireplaceQu     object
LotFrontage    float64
MasVnrArea     float64
GarageYrBlt    float64
dtype: object

In [16]:
# Chain two imputers: median for LotFrontage/GarageYrBlt, then mean for MasVnrArea.
pipeline = Pipeline([
    (
        "median_imputer",
        mdi.MeanMedianImputer(
            imputation_method="median",
            variables=["LotFrontage", "GarageYrBlt"],
        ),
    ),
    (
        "mean_imputer",
        mdi.MeanMedianImputer(
            imputation_method="mean",
            variables=["MasVnrArea"],
        ),
    ),
])

pipeline.fit(x_train)

Pipeline(steps=[('median_imputer',
                 MeanMedianImputer(variables=['LotFrontage', 'GarageYrBlt'])),
                ('mean_imputer',
                 MeanMedianImputer(imputation_method='mean',
                                   variables=['MasVnrArea']))])

In [17]:
# medians learned by the first pipeline step
pipeline.named_steps["median_imputer"].imputer_dict_

{'LotFrontage': 69.0, 'GarageYrBlt': 1979.0}

In [18]:
# mean learned by the second pipeline step
pipeline.named_steps["mean_imputer"].imputer_dict_

{'MasVnrArea': 103.55358898721731}

In [19]:
# run both imputation steps in order and preview the result
tmp = pipeline.transform(x_train)
tmp.head()

Unnamed: 0,BsmtQual,FireplaceQu,LotFrontage,MasVnrArea,GarageYrBlt
64,Gd,,69.0,573.0,1998.0
682,Gd,Gd,69.0,0.0,1996.0
960,TA,,50.0,0.0,1979.0
1384,TA,,60.0,0.0,1939.0
1100,TA,,60.0,0.0,1930.0


### Feature Engine - Arbitrary Value Imputation

In [21]:
# Reload the same five predictors and the target for the arbitrary-value section.
cols_to_use = [
    'BsmtQual',
    'FireplaceQu',
    'LotFrontage',
    'MasVnrArea',
    'GarageYrBlt',
    'SalePrice',
]
data = pd.read_csv('data/housing.csv', usecols=cols_to_use)

# Drop the target name so the list can be used to select the feature columns.
cols_to_use.remove('SalePrice')

# 70/30 train/test split with a fixed random_state.
x_train, x_test, y_train, y_test = train_test_split(
    data.loc[:, cols_to_use], data.loc[:, 'SalePrice'], test_size=0.3, random_state=0
)
print(x_train.shape, x_test.shape)
x_train.dtypes

(1022, 5) (438, 5)


BsmtQual        object
FireplaceQu     object
LotFrontage    float64
MasVnrArea     float64
GarageYrBlt    float64
dtype: object

In [22]:
# Replace missing values with the sentinel -999; no `variables` are given, so
# the columns are chosen by the imputer during fit.
imputer = mdi.ArbitraryNumberImputer(arbitrary_number=-999)
imputer.fit(x_train)

ArbitraryNumberImputer(arbitrary_number=-999,
                       variables=['LotFrontage', 'MasVnrArea', 'GarageYrBlt'])

In [24]:
# selected columns and the sentinel value, shown together as a tuple
imputer.variables, imputer.arbitrary_number

(['LotFrontage', 'MasVnrArea', 'GarageYrBlt'], -999)

In [25]:
# missing numeric entries now show up as -999.0 in the preview
tmp = imputer.transform(x_train)
tmp.head()

Unnamed: 0,BsmtQual,FireplaceQu,LotFrontage,MasVnrArea,GarageYrBlt
64,Gd,,-999.0,573.0,1998.0
682,Gd,Gd,-999.0,0.0,1996.0
960,TA,,50.0,0.0,-999.0
1384,TA,,60.0,0.0,1939.0
1100,TA,,60.0,0.0,1930.0


In [27]:
# Same split again, this time as input to an sklearn pipeline.
x_train, x_test, y_train, y_test = train_test_split(
    data.loc[:, cols_to_use], data.loc[:, 'SalePrice'], test_size=0.3, random_state=0
)
print(x_train.shape, x_test.shape)

(1022, 5) (438, 5)


In [28]:
# Two arbitrary-number imputers in sequence: -999 for LotFrontage and
# MasVnrArea, then -1 for GarageYrBlt.
pipeline = Pipeline([
    (
        "imputer_999",
        mdi.ArbitraryNumberImputer(
            arbitrary_number=-999,
            variables=['LotFrontage', 'MasVnrArea'],
        ),
    ),
    (
        "imputer_minus1",
        mdi.ArbitraryNumberImputer(
            arbitrary_number=-1,
            variables=['GarageYrBlt'],
        ),
    ),
])
pipeline.fit(x_train)

Pipeline(steps=[('imputer_999',
                 ArbitraryNumberImputer(arbitrary_number=-999,
                                        variables=['LotFrontage',
                                                   'MasVnrArea'])),
                ('imputer_minus1',
                 ArbitraryNumberImputer(arbitrary_number=-1,
                                        variables=['GarageYrBlt']))])

In [29]:
# inspect the first step (the -999 imputer)
pipeline.named_steps["imputer_999"]

ArbitraryNumberImputer(arbitrary_number=-999,
                       variables=['LotFrontage', 'MasVnrArea'])

In [30]:
# inspect the second step (the -1 imputer)
pipeline.named_steps["imputer_minus1"]

ArbitraryNumberImputer(arbitrary_number=-1, variables=['GarageYrBlt'])

In [31]:
# feature_engine returns a DataFrame; each column carries the sentinel of the
# step that was configured for it
tmp = pipeline.transform(x_train)
tmp.head()

Unnamed: 0,BsmtQual,FireplaceQu,LotFrontage,MasVnrArea,GarageYrBlt
64,Gd,,-999.0,573.0,1998.0
682,Gd,Gd,-999.0,0.0,1996.0
960,TA,,50.0,0.0,-1.0
1384,TA,,60.0,0.0,1939.0
1100,TA,,60.0,0.0,1930.0


### Feature Engine - End of Tail Distribution Imputation

In [32]:
# Reload the same five predictors and the target for the end-of-tail section.
cols_to_use = [
    'BsmtQual',
    'FireplaceQu',
    'LotFrontage',
    'MasVnrArea',
    'GarageYrBlt',
    'SalePrice',
]
data = pd.read_csv('data/housing.csv', usecols=cols_to_use)

# Keep predictors only in the list; the target is handed to the splitter separately.
cols_to_use.remove('SalePrice')

# 70/30 train/test split with a fixed random_state.
x_train, x_test, y_train, y_test = train_test_split(
    data.loc[:, cols_to_use], data.loc[:, 'SalePrice'], test_size=0.3, random_state=0
)
print(x_train.shape, x_test.shape)
x_train.dtypes

(1022, 5) (438, 5)


BsmtQual        object
FireplaceQu     object
LotFrontage    float64
MasVnrArea     float64
GarageYrBlt    float64
dtype: object

In [34]:
# End-of-tail imputation, one step per assumed distribution shape:
# "skewed" for GarageYrBlt/MasVnrArea, "gaussian" for LotFrontage.
# Both steps use the right tail.
pipeline = Pipeline([
    (
        "imputer_skewed",
        mdi.EndTailImputer(
            distribution="skewed",
            tail="right",
            variables=['GarageYrBlt', 'MasVnrArea'],
        ),
    ),
    (
        "imputer_gaussian",
        mdi.EndTailImputer(
            distribution="gaussian",
            tail="right",
            variables=["LotFrontage"],
        ),
    ),
])

pipeline.fit(x_train)

Pipeline(steps=[('imputer_skewed',
                 EndTailImputer(distribution='skewed',
                                variables=['GarageYrBlt', 'MasVnrArea'])),
                ('imputer_gaussian',
                 EndTailImputer(variables=['LotFrontage']))])

In [35]:
# inspect the step configured with distribution="skewed"
pipeline.named_steps["imputer_skewed"]

EndTailImputer(distribution='skewed', variables=['GarageYrBlt', 'MasVnrArea'])

In [36]:
# apply both end-of-tail steps and preview the imputed frame
tmp = pipeline.transform(x_train)
tmp.head()

Unnamed: 0,BsmtQual,FireplaceQu,LotFrontage,MasVnrArea,GarageYrBlt
64,Gd,,138.90222,573.0,1998.0
682,Gd,Gd,138.90222,0.0,1996.0
960,TA,,50.0,0.0,2121.0
1384,TA,,60.0,0.0,1939.0
1100,TA,,60.0,0.0,1930.0


In [39]:
# the two categorical columns were not part of the pipeline, so only the
# numeric columns are expected to be free of NaN here
tmp.isnull().mean()

BsmtQual       0.023483
FireplaceQu    0.467710
LotFrontage    0.000000
MasVnrArea     0.000000
GarageYrBlt    0.000000
dtype: float64

### Feature Engine - Frequent Category Imputation


In [42]:
# let's load the dataset with a selected group of variables

cols_to_use = [
    'BsmtQual', 'FireplaceQu', 'LotFrontage', 'MasVnrArea', 'GarageYrBlt',
    'SalePrice'
]

data = pd.read_csv('data/housing.csv', usecols=cols_to_use)

# let's separate into training and testing set
# first drop the target from the feature list
cols_to_use.remove('SalePrice')
# split (70/30, fixed random_state)
x_train, x_test, y_train, y_test = train_test_split(
    data[cols_to_use],
    data['SalePrice'],
    test_size=0.3,
    random_state=0
)
print(x_train.shape, x_test.shape)
# A bare `x_train.dtypes` used to sit here; only the last expression of a cell
# is displayed, so it showed nothing and has been removed.
# share of missing values per column, before any imputation
print(x_train.isnull().mean())

(1022, 5) (438, 5)
BsmtQual       0.023483
FireplaceQu    0.467710
LotFrontage    0.184932
MasVnrArea     0.004892
GarageYrBlt    0.052838
dtype: float64


In [44]:
# Neither step is given `variables`: each imputer picks its own columns during
# fit (numerical for the first step, categorical for the second).
pipeline = Pipeline([
    ("numerical_imputer", mdi.MeanMedianImputer(imputation_method="mean")),
    # imputation_method="frequent" fills with the most frequent category
    (
        "categorical_imputer",
        mdi.CategoricalVariableImputer(imputation_method="frequent"),
    ),
])
pipeline.fit(x_train)

Pipeline(steps=[('numerical_imputer',
                 MeanMedianImputer(imputation_method='mean',
                                   variables=['LotFrontage', 'MasVnrArea',
                                              'GarageYrBlt'])),
                ('categorical_imputer',
                 CategoricalVariableImputer(imputation_method='frequent',
                                            variables=['BsmtQual',
                                                       'FireplaceQu']))])

In [45]:
# inspect the numerical step and the columns it selected
pipeline.named_steps["numerical_imputer"]    

MeanMedianImputer(imputation_method='mean',
                  variables=['LotFrontage', 'MasVnrArea', 'GarageYrBlt'])

In [47]:
# category used as the replacement for each categorical column
pipeline.named_steps["categorical_imputer"].imputer_dict_

{'BsmtQual': 'TA', 'FireplaceQu': 'Gd'}

In [48]:
# sanity check: compute the mode of the same columns directly with pandas,
# to compare against the imputer's learned values
x_train[pipeline.named_steps["categorical_imputer"].variables].mode()

Unnamed: 0,BsmtQual,FireplaceQu
0,TA,Gd


In [49]:
# apply the numerical and categorical imputation steps and preview the result
tmp = pipeline.transform(x_train)
tmp.head()

Unnamed: 0,BsmtQual,FireplaceQu,LotFrontage,MasVnrArea,GarageYrBlt
64,Gd,Gd,69.668667,573.0,1998.0
682,Gd,Gd,69.668667,0.0,1996.0
960,TA,Gd,50.0,0.0,1978.012397
1384,TA,Gd,60.0,0.0,1939.0
1100,TA,Gd,60.0,0.0,1930.0


In [50]:
# fraction of missing values per column after both steps
tmp.isnull().mean()

BsmtQual       0.0
FireplaceQu    0.0
LotFrontage    0.0
MasVnrArea     0.0
GarageYrBlt    0.0
dtype: float64

### Feature Engine - Missing Category Imputation


In [42]:
# let's load the dataset with a selected group of variables

cols_to_use = [
    'BsmtQual', 'FireplaceQu', 'LotFrontage', 'MasVnrArea', 'GarageYrBlt',
    'SalePrice'
]

data = pd.read_csv('data/housing.csv', usecols=cols_to_use)

# let's separate into training and testing set
# first drop the target from the feature list
cols_to_use.remove('SalePrice')
# split (70/30, fixed random_state)
x_train, x_test, y_train, y_test = train_test_split(
    data[cols_to_use],
    data['SalePrice'],
    test_size=0.3,
    random_state=0
)
print(x_train.shape, x_test.shape)
# A bare `x_train.dtypes` used to sit here; only the last expression of a cell
# is displayed, so it showed nothing and has been removed.
# share of missing values per column, before any imputation
print(x_train.isnull().mean())

(1022, 5) (438, 5)
BsmtQual       0.023483
FireplaceQu    0.467710
LotFrontage    0.184932
MasVnrArea     0.004892
GarageYrBlt    0.052838
dtype: float64


In [51]:
# BsmtQual gets the most frequent category; FireplaceQu relies on the
# imputer's default method, so no imputation_method is passed for it.
pipeline = Pipeline([
    (
        "imputer_mode",
        mdi.CategoricalVariableImputer(
            imputation_method="frequent",
            variables=["BsmtQual"],
        ),
    ),
    # default method: missing-category imputation
    (
        "imputer_missing",
        mdi.CategoricalVariableImputer(variables=["FireplaceQu"]),
    ),
])
pipeline.fit(x_train)

Pipeline(steps=[('imputer_mode',
                 CategoricalVariableImputer(imputation_method='frequent',
                                            variables=['BsmtQual'])),
                ('imputer_missing',
                 CategoricalVariableImputer(variables=['FireplaceQu']))])

In [52]:
# inspect the step that uses the default imputation method
pipeline.named_steps["imputer_missing"]

CategoricalVariableImputer(variables=['FireplaceQu'])

In [53]:
# only the two categorical columns are handled by this pipeline
tmp = pipeline.transform(x_train)
tmp.head()

Unnamed: 0,BsmtQual,FireplaceQu,LotFrontage,MasVnrArea,GarageYrBlt
64,Gd,Missing,,573.0,1998.0
682,Gd,Gd,,0.0,1996.0
960,TA,Missing,50.0,0.0,
1384,TA,Missing,60.0,0.0,1939.0
1100,TA,Missing,60.0,0.0,1930.0


In [60]:
# numeric columns were not imputed in this section, so they still contain NaN
tmp.isnull().mean()

BsmtQual       0.000000
FireplaceQu    0.000000
LotFrontage    0.184932
MasVnrArea     0.004892
GarageYrBlt    0.052838
dtype: float64

### Feature Engine - Random Sample Imputation


----------------
- https://feature-engine.readthedocs.io/en/latest/imputers/RandomSampleImputer.html

In [75]:
# Reload the same five predictors and the target for the random-sample section.
cols_to_use = [
    'BsmtQual',
    'FireplaceQu',
    'LotFrontage',
    'MasVnrArea',
    'GarageYrBlt',
    'SalePrice',
]
data = pd.read_csv('data/housing.csv', usecols=cols_to_use)

# Keep predictors only in the list; the target is handed to the splitter separately.
cols_to_use.remove('SalePrice')

# 70/30 train/test split with a fixed random_state.
x_train, x_test, y_train, y_test = train_test_split(
    data.loc[:, cols_to_use], data.loc[:, 'SalePrice'], test_size=0.3, random_state=0
)

# Shapes, column dtypes, and the per-column share of missing values.
print(x_train.shape, x_test.shape)
print(x_train.dtypes)
print(x_train.isnull().mean())

(1022, 5) (438, 5)
BsmtQual        object
FireplaceQu     object
LotFrontage    float64
MasVnrArea     float64
GarageYrBlt    float64
dtype: object
BsmtQual       0.023483
FireplaceQu    0.467710
LotFrontage    0.184932
MasVnrArea     0.004892
GarageYrBlt    0.052838
dtype: float64


---------------------
A seed can be set to a pre-defined number and all observations will be replaced in batch. Alternatively, a seed can be set using the values of 1 or more numerical variables. In this case, the observations will be imputed individually, one at a time, using the values of the variables as a seed.

In [76]:
## Feature engine captures all variables by default
# seed="observation" with column names in `random_state`: each row is imputed
# with a seed taken from its own values in those columns.
# NOTE(review): seeding_method="add" presumably combines the two values by
# addition; confirm against the RandomSampleImputer documentation.
imputer = mdi.RandomSampleImputer(
    random_state=["LotFrontage", "MasVnrArea"],
    seed="observation",
    seeding_method="add"
)
imputer.fit(x_train)

RandomSampleImputer(random_state=['LotFrontage', 'MasVnrArea'],
                    seed='observation',
                    variables=['BsmtQual', 'FireplaceQu', 'LotFrontage',
                               'MasVnrArea', 'GarageYrBlt'])

In [77]:
# the imputer keeps a copy of the selected train-set variables in `X_`;
# random samples for imputation are drawn from this stored frame

imputer.X_.head()

Unnamed: 0,BsmtQual,FireplaceQu,LotFrontage,MasVnrArea,GarageYrBlt
64,Gd,,,573.0,1998.0
682,Gd,Gd,,0.0,1996.0
960,TA,,50.0,0.0,
1384,TA,,60.0,0.0,1939.0
1100,TA,,60.0,0.0,1930.0


In [78]:
# feature-engine returns a dataframe; missing entries are replaced by values
# sampled from the stored training data

tmp = imputer.transform(x_train)
tmp.head()

Unnamed: 0,BsmtQual,FireplaceQu,LotFrontage,MasVnrArea,GarageYrBlt
64,Gd,TA,37.0,573.0,1998.0
682,Gd,Gd,60.0,0.0,1996.0
960,TA,TA,50.0,0.0,2001.0
1384,TA,TA,60.0,0.0,1939.0
1100,TA,TA,60.0,0.0,1930.0


### Feature Engine - Missing Indicator


In [79]:
# Reload the same five predictors and the target for the missing-indicator section.
cols_to_use = [
    'BsmtQual',
    'FireplaceQu',
    'LotFrontage',
    'MasVnrArea',
    'GarageYrBlt',
    'SalePrice',
]
data = pd.read_csv('data/housing.csv', usecols=cols_to_use)

# Drop the target name so the list can be used to select the feature columns.
cols_to_use.remove('SalePrice')

# 70/30 train/test split with a fixed random_state.
x_train, x_test, y_train, y_test = train_test_split(
    data.loc[:, cols_to_use], data.loc[:, 'SalePrice'], test_size=0.3, random_state=0
)

# Shapes, column dtypes, and the per-column share of missing values.
print(x_train.shape, x_test.shape)
print(x_train.dtypes)
print(x_train.isnull().mean())

(1022, 5) (438, 5)
BsmtQual        object
FireplaceQu     object
LotFrontage    float64
MasVnrArea     float64
GarageYrBlt    float64
dtype: object
BsmtQual       0.023483
FireplaceQu    0.467710
LotFrontage    0.184932
MasVnrArea     0.004892
GarageYrBlt    0.052838
dtype: float64



-----------------------
Let's try to use the missing indicator along with the other imputations 

- Add Missing Indicator to all variables
- Median Imputation to numerical variables
- Frequent category imputation to categorical variables

In [80]:
# Order matters: the indicator columns are added first, while the NaNs are
# still present, and the imputers then fill the original columns.
pipeline = Pipeline([
    # add missing-value indicator columns
    ("missing_indicator", mdi.AddMissingIndicator()),
    # median for the three numerical columns (named explicitly)
    (
        "imputer_numerical",
        mdi.MeanMedianImputer(
            imputation_method="median",
            variables=['LotFrontage', 'MasVnrArea', 'GarageYrBlt'],
        ),
    ),
    # most frequent category for the two categorical columns
    (
        "imputer_categorical",
        mdi.CategoricalVariableImputer(
            imputation_method="frequent",
            variables=['FireplaceQu', 'BsmtQual'],
        ),
    ),
])
pipeline.fit(x_train)

Pipeline(steps=[('missing_indicator', AddMissingIndicator()),
                ('imputer_numerical',
                 MeanMedianImputer(variables=['LotFrontage', 'MasVnrArea',
                                              'GarageYrBlt'])),
                ('imputer_categorical',
                 CategoricalVariableImputer(imputation_method='frequent',
                                            variables=['FireplaceQu',
                                                       'BsmtQual']))])

In [95]:
# `variables` only echoes the constructor argument, which was left unset here,
# so displaying it shows nothing. The columns that actually receive an
# indicator are determined during fit and stored in `variables_`.
pipeline.named_steps["missing_indicator"].variables_

In [96]:
# apply indicator + imputation steps, then check the remaining share of NaN
# in every column, including the added `_na` indicator columns
tmp = pipeline.transform(x_train)
tmp.isnull().mean()

BsmtQual          0.0
FireplaceQu       0.0
LotFrontage       0.0
MasVnrArea        0.0
GarageYrBlt       0.0
BsmtQual_na       0.0
FireplaceQu_na    0.0
LotFrontage_na    0.0
MasVnrArea_na     0.0
GarageYrBlt_na    0.0
dtype: float64

In [97]:
# preview: original columns (now imputed) alongside their `_na` indicator flags
tmp.head()

Unnamed: 0,BsmtQual,FireplaceQu,LotFrontage,MasVnrArea,GarageYrBlt,BsmtQual_na,FireplaceQu_na,LotFrontage_na,MasVnrArea_na,GarageYrBlt_na
64,Gd,Gd,69.0,573.0,1998.0,0,1,1,0,0
682,Gd,Gd,69.0,0.0,1996.0,0,0,1,0,0
960,TA,Gd,50.0,0.0,1979.0,0,1,0,0,1
1384,TA,Gd,60.0,0.0,1939.0,0,1,0,0,0
1100,TA,Gd,60.0,0.0,1930.0,0,1,0,0,0
