In [1]:
pip install feature-engine

Collecting feature-engine
  Downloading feature_engine-1.6.2-py2.py3-none-any.whl (328 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/328.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━[0m [32m174.1/328.9 kB[0m [31m5.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m328.9/328.9 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: feature-engine
Successfully installed feature-engine-1.6.2


**MEAN_MEDIAN IMPUTAION**

In [2]:
import pandas as pd

# to split the datasets
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# from feature-engine
from feature_engine.imputation import MeanMedianImputer

In [3]:
cols_to_use = [
    "BsmtQual",
    "FireplaceQu",
    "LotFrontage",
    "MasVnrArea",
    "GarageYrBlt",
    "SalePrice",
]

In [4]:
data = pd.read_csv("House_price.csv",usecols=cols_to_use)

In [5]:
data.head()

Unnamed: 0,LotFrontage,MasVnrArea,BsmtQual,FireplaceQu,GarageYrBlt,SalePrice
0,65.0,196.0,Gd,,2003.0,208500
1,80.0,0.0,Gd,TA,1976.0,181500
2,68.0,162.0,Gd,TA,2001.0,223500
3,60.0,0.0,TA,Gd,1998.0,140000
4,84.0,350.0,Gd,TA,2000.0,250000


In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    data.drop("SalePrice", axis=1),
    data["SalePrice"],
    test_size=0.3,
    random_state=0,
)
X_train.shape, X_test.shape

((1022, 5), (438, 5))

In [7]:
y_train.shape, y_test.shape

((1022,), (438,))

In [8]:
X_train.isnull().mean()

LotFrontage    0.184932
MasVnrArea     0.004892
BsmtQual       0.023483
FireplaceQu    0.467710
GarageYrBlt    0.052838
dtype: float64

**Capture numerical variables automatically**

In [9]:
imputer = MeanMedianImputer(imputation_method="median")

In [10]:
imputer.fit(X_train)

In [11]:
imputer.variables_

['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

In [12]:
imputer.imputer_dict_

{'LotFrontage': 69.0, 'MasVnrArea': 0.0, 'GarageYrBlt': 1979.0}

In [13]:
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

X_train.head()

Unnamed: 0,LotFrontage,MasVnrArea,BsmtQual,FireplaceQu,GarageYrBlt
64,69.0,573.0,Gd,,1998.0
682,69.0,0.0,Gd,Gd,1996.0
960,50.0,0.0,TA,,1979.0
1384,60.0,0.0,TA,,1939.0
1100,60.0,0.0,TA,,1930.0


In [15]:
X_train[imputer.variables_].isnull().mean()

LotFrontage    0.0
MasVnrArea     0.0
GarageYrBlt    0.0
dtype: float64

**Specify variable groups**

In [16]:
imputer = MeanMedianImputer(
    imputation_method="mean",
    variables=["LotFrontage", "MasVnrArea"]
)

In [17]:
imputer.fit(X_train)

In [18]:
imputer.variables_

['LotFrontage', 'MasVnrArea']

In [19]:
imputer.imputer_dict_

{'LotFrontage': 69.54500978473581, 'MasVnrArea': 103.04696673189824}

In [20]:
X_train[imputer.variables_].mean()

LotFrontage     69.545010
MasVnrArea     103.046967
dtype: float64

In [21]:
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

X_train[imputer.variables_].isnull().mean()

LotFrontage    0.0
MasVnrArea     0.0
dtype: float64

**Pipeline**

In [22]:
pipe = Pipeline([
    ("median_imputer",MeanMedianImputer(imputation_method="median",variables=["LotFrontage", "GarageYrBlt"])),
    ("mean_imputer",MeanMedianImputer(imputation_method="mean",variables=["MasVnrArea"]))
])

In [23]:
pipe.fit(X_train)

In [24]:
pipe.named_steps["median_imputer"].imputer_dict_

{'LotFrontage': 69.0, 'GarageYrBlt': 1979.0}

In [25]:
pipe.named_steps["mean_imputer"].imputer_dict_

{'MasVnrArea': 103.04696673189824}

In [26]:
X_train = pipe.transform(X_train)
X_test = pipe.transform(X_test)
X_train.isnull().mean()

LotFrontage    0.000000
MasVnrArea     0.000000
BsmtQual       0.023483
FireplaceQu    0.467710
GarageYrBlt    0.000000
dtype: float64

**ARBITARY IMPUTATION**

In [27]:
import pandas as pd
import matplotlib.pyplot as plt

# to split the datasets
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# from feature-engine
from feature_engine.imputation import ArbitraryNumberImputer

In [28]:
cols_to_use1 = [
    "BsmtQual",
    "FireplaceQu",
    "LotFrontage",
    "MasVnrArea",
    "GarageYrBlt",
    "SalePrice",
]


In [29]:
data1 = pd.read_csv("House_price.csv",usecols=cols_to_use1)

In [30]:
data1.head()

Unnamed: 0,LotFrontage,MasVnrArea,BsmtQual,FireplaceQu,GarageYrBlt,SalePrice
0,65.0,196.0,Gd,,2003.0,208500
1,80.0,0.0,Gd,TA,1976.0,181500
2,68.0,162.0,Gd,TA,2001.0,223500
3,60.0,0.0,TA,Gd,1998.0,140000
4,84.0,350.0,Gd,TA,2000.0,250000


In [31]:
x_train,x_test,y_train,y_test=train_test_split(
    data1.drop("SalePrice",axis=1),
    data1["SalePrice"],
    test_size=0.3,
    random_state=0
)

In [32]:
x_train.shape, x_test.shape

((1022, 5), (438, 5))

In [33]:
x_train.isnull().mean()

LotFrontage    0.184932
MasVnrArea     0.004892
BsmtQual       0.023483
FireplaceQu    0.467710
GarageYrBlt    0.052838
dtype: float64

In [34]:
imputer = ArbitraryNumberImputer(arbitrary_number=-999)

In [35]:
imputer.fit(X_train)

In [36]:
imputer.variables_

['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

In [37]:
imputer.arbitrary_number

-999

In [39]:
x_train = imputer.transform(x_train)
x_test = imputer.transform(x_test)
x_train.head()

Unnamed: 0,LotFrontage,MasVnrArea,BsmtQual,FireplaceQu,GarageYrBlt
64,-999.0,573.0,Gd,,1998.0
682,-999.0,0.0,Gd,Gd,1996.0
960,50.0,0.0,TA,,-999.0
1384,60.0,0.0,TA,,1939.0
1100,60.0,0.0,TA,,1930.0


In [41]:
x_train.isnull().sum()

LotFrontage      0
MasVnrArea       0
BsmtQual        24
FireplaceQu    478
GarageYrBlt      0
dtype: int64

In [None]:
#Impute different variables with different numbers
#imputer = ArbitraryNumberImputer(
#    imputer_dict={
#        "LotFrontage": -999,
#        "MasVnrArea": -999,
#        "GarageYrBlt": -1})

#imputer.fit(X_train)

**Frequent Category Imputation**

In [47]:
import pandas as pd

# to split the datasets
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# from feature-engine
from feature_engine.imputation import CategoricalImputer

In [42]:
cols_to_use2 = [
    "BsmtQual",
    "FireplaceQu",
    "LotFrontage",
    "MasVnrArea",
    "GarageYrBlt",
    "SalePrice",
]

In [43]:
data2 = pd.read_csv("House_price.csv",usecols=cols_to_use2)

In [44]:
data2.head()

Unnamed: 0,LotFrontage,MasVnrArea,BsmtQual,FireplaceQu,GarageYrBlt,SalePrice
0,65.0,196.0,Gd,,2003.0,208500
1,80.0,0.0,Gd,TA,1976.0,181500
2,68.0,162.0,Gd,TA,2001.0,223500
3,60.0,0.0,TA,Gd,1998.0,140000
4,84.0,350.0,Gd,TA,2000.0,250000


In [45]:
X_train, X_test, y_train, y_test = train_test_split(
    data.drop("SalePrice", axis=1),
    data["SalePrice"],
    test_size=0.3,
    random_state=0
)
X_train.shape, X_test.shape

((1022, 5), (438, 5))

In [46]:
X_train.isnull().mean()

LotFrontage    0.184932
MasVnrArea     0.004892
BsmtQual       0.023483
FireplaceQu    0.467710
GarageYrBlt    0.052838
dtype: float64

In [48]:
imputer = CategoricalImputer(imputation_method="frequent")

In [49]:
imputer.fit(X_train)

In [50]:
imputer.variables_

['BsmtQual', 'FireplaceQu']

In [51]:
imputer.imputer_dict_

{'BsmtQual': 'TA', 'FireplaceQu': 'Gd'}

In [52]:
X_train[imputer.variables_].mode()

Unnamed: 0,BsmtQual,FireplaceQu
0,TA,Gd


In [53]:
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

X_train.head()

Unnamed: 0,LotFrontage,MasVnrArea,BsmtQual,FireplaceQu,GarageYrBlt
64,,573.0,Gd,Gd,1998.0
682,,0.0,Gd,Gd,1996.0
960,50.0,0.0,TA,Gd,
1384,60.0,0.0,TA,Gd,1939.0
1100,60.0,0.0,TA,Gd,1930.0


In [54]:
X_train[imputer.variables_].isnull().mean()

BsmtQual       0.0
FireplaceQu    0.0
dtype: float64

**Missing Category Imputation**

By default it performs imputation with a string missing.



In [None]:
#imputer = CategoricalImputer()
#imputer.fit(X_train)

Pipeline

In [57]:
cols_to_use3 = [
    "BsmtQual",
    "FireplaceQu",
    "LotFrontage",
    "MasVnrArea",
    "GarageYrBlt",
    "SalePrice",
]

In [58]:
data4 = pd.read_csv("House_price.csv", usecols=cols_to_use3)
data4.head()

Unnamed: 0,LotFrontage,MasVnrArea,BsmtQual,FireplaceQu,GarageYrBlt,SalePrice
0,65.0,196.0,Gd,,2003.0,208500
1,80.0,0.0,Gd,TA,1976.0,181500
2,68.0,162.0,Gd,TA,2001.0,223500
3,60.0,0.0,TA,Gd,1998.0,140000
4,84.0,350.0,Gd,TA,2000.0,250000


In [59]:
X_train, X_test, y_train, y_test = train_test_split(
    data4.drop("SalePrice", axis=1),
    data4["SalePrice"],
    test_size=0.3,
    random_state=0,
)

X_train.shape, X_test.shape

((1022, 5), (438, 5))

In [60]:
X_train.isnull().mean()

LotFrontage    0.184932
MasVnrArea     0.004892
BsmtQual       0.023483
FireplaceQu    0.467710
GarageYrBlt    0.052838
dtype: float64

In [61]:
pipe =Pipeline([
    ("impute_mode",CategoricalImputer(imputation_method="frequent",variables=["BsmtQual"])),
    ("impute_missing",CategoricalImputer(variables=["FireplaceQu"]))
])

In [62]:
pipe.fit(X_train)

In [64]:
pipe.named_steps["impute_mode"].variables

['BsmtQual']

In [65]:
pipe.named_steps["impute_missing"].variables

['FireplaceQu']

In [66]:
X_train = pipe.transform(X_train)
X_test = pipe.transform(X_test)

# let's check null values are gone
X_train.isnull().mean()

LotFrontage    0.184932
MasVnrArea     0.004892
BsmtQual       0.000000
FireplaceQu    0.000000
GarageYrBlt    0.052838
dtype: float64

**Missing Indicator **

In [67]:
import pandas as pd

# to split the datasets
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# from feature-engine
from feature_engine.imputation import (
    AddMissingIndicator,
    MeanMedianImputer,
    CategoricalImputer,
)

In [68]:
cols_to_use5 = [
    "BsmtQual",
    "FireplaceQu",
    "LotFrontage",
    "MasVnrArea",
    "GarageYrBlt",
    "SalePrice",
]

In [70]:
data5 = pd.read_csv("House_price.csv", usecols=cols_to_use5)

data5.head()

Unnamed: 0,LotFrontage,MasVnrArea,BsmtQual,FireplaceQu,GarageYrBlt,SalePrice
0,65.0,196.0,Gd,,2003.0,208500
1,80.0,0.0,Gd,TA,1976.0,181500
2,68.0,162.0,Gd,TA,2001.0,223500
3,60.0,0.0,TA,Gd,1998.0,140000
4,84.0,350.0,Gd,TA,2000.0,250000


In [71]:
data5.isnull().sum()

LotFrontage    259
MasVnrArea       8
BsmtQual        37
FireplaceQu    690
GarageYrBlt     81
SalePrice        0
dtype: int64

In [72]:
X_train, X_test, y_train, y_test = train_test_split(
    data5.drop("SalePrice", axis=1),
    data5["SalePrice"],
    test_size=0.3,
    random_state=0,
)

X_train.shape, X_test.shape

((1022, 5), (438, 5))

In [73]:
X_train.isnull().mean()

LotFrontage    0.184932
MasVnrArea     0.004892
BsmtQual       0.023483
FireplaceQu    0.467710
GarageYrBlt    0.052838
dtype: float64

In [74]:
imputer = AddMissingIndicator(missing_only=True)

In [75]:
imputer.fit(X_train)

In [76]:
imputer.variables_

['LotFrontage', 'MasVnrArea', 'BsmtQual', 'FireplaceQu', 'GarageYrBlt']

In [77]:
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)
X_train.head()

Unnamed: 0,LotFrontage,MasVnrArea,BsmtQual,FireplaceQu,GarageYrBlt,LotFrontage_na,MasVnrArea_na,BsmtQual_na,FireplaceQu_na,GarageYrBlt_na
64,,573.0,Gd,,1998.0,1,0,0,1,0
682,,0.0,Gd,Gd,1996.0,1,0,0,0,0
960,50.0,0.0,TA,,,0,0,0,1,1
1384,60.0,0.0,TA,,1939.0,0,0,0,1,0
1100,60.0,0.0,TA,,1930.0,0,0,0,1,0


In [78]:
X_train.isnull().mean()

LotFrontage       0.184932
MasVnrArea        0.004892
BsmtQual          0.023483
FireplaceQu       0.467710
GarageYrBlt       0.052838
LotFrontage_na    0.000000
MasVnrArea_na     0.000000
BsmtQual_na       0.000000
FireplaceQu_na    0.000000
GarageYrBlt_na    0.000000
dtype: float64

In [79]:
pipe = Pipeline([
    ("missing_ind",AddMissingIndicator()),
    ("mode_impute",CategoricalImputer(imputation_method="frequent",variables=["FireplaceQu", "BsmtQual"])),
    ("mean_impute",MeanMedianImputer(imputation_method="median",variables=["LotFrontage", "MasVnrArea", "GarageYrBlt"]))
])

In [80]:
pipe.fit(X_train)

In [81]:
pipe.named_steps["missing_ind"].variables_

['LotFrontage', 'MasVnrArea', 'BsmtQual', 'FireplaceQu', 'GarageYrBlt']

In [83]:
pipe.named_steps["mode_impute"].imputer_dict_

{'FireplaceQu': 'Gd', 'BsmtQual': 'TA'}

In [84]:
pipe.named_steps["mean_impute"].imputer_dict_

{'LotFrontage': 69.0, 'MasVnrArea': 0.0, 'GarageYrBlt': 1979.0}

In [85]:
X_train= pipe.transform(X_train)
X_test= pipe.transform(X_test)
X_train.isnull().mean()

LotFrontage       0.0
MasVnrArea        0.0
BsmtQual          0.0
FireplaceQu       0.0
GarageYrBlt       0.0
LotFrontage_na    0.0
MasVnrArea_na     0.0
BsmtQual_na       0.0
FireplaceQu_na    0.0
GarageYrBlt_na    0.0
dtype: float64