In [1]:
# handle datasets
import pandas as pd
import numpy as np
# plotting
import matplotlib.pyplot as plt
# save the pipeline
import joblib
# from Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, Binarizer
# from feature-engine
from feature_engine.imputation import (
    AddMissingIndicator,
    MeanMedianImputer,
    CategoricalImputer,
)
from feature_engine.encoding import (
    RareLabelEncoder,
    OrdinalEncoder,
)
from feature_engine.transformation import (
    LogTransformer,
    YeoJohnsonTransformer,
)
from feature_engine.selection import DropFeatures
from feature_engine.wrappers import SklearnTransformerWrapper

# Show every column when a DataFrame is displayed (None = no column limit).
# Use the public `pd.set_option`; `pd.pandas` is only an accidental attribute.
pd.set_option('display.max_columns', None)

In [2]:
# Read the training CSV into a DataFrame and report its (rows, columns) shape.
data = pd.read_csv(filepath_or_buffer='../datasets/train.csv')
print(data.shape)

(1460, 81)


In [3]:
# Split into train and test sets (10% held out, fixed seed for reproducibility).
# 'Id' and 'SalePrice' are removed from the predictors; 'SalePrice' is the target.
features = data.drop(columns=['Id', 'SalePrice'])
target = data['SalePrice']

X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.1, random_state=0
)

print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

(1314, 79) (146, 79)
(1314,) (146,)


In [4]:
# Put the target on the natural-log scale in both splits.
y_train, y_test = np.log(y_train), np.log(y_test)

In [5]:
# Categorical variables: every object-dtype column of the raw data, with
# MSSubClass appended explicitly.
cat_vars = [
    col for col in data.columns if data[col].dtype == 'O'
] + ['MSSubClass']

# Cast all of them to object dtype in both splits.
X_train[cat_vars] = X_train[cat_vars].astype('O')
X_test[cat_vars] = X_test[cat_vars].astype('O')

print(len(cat_vars))

44


In [6]:
# Categorical variables with at least one missing value in the train set.
cat_vars_with_na = [col for col in cat_vars if X_train[col].isnull().any()]

In [7]:
# Fraction of missing values per categorical variable, measured on the train set.
na_fraction = X_train[cat_vars_with_na].isnull().mean()

# More than 10% missing: NaNs will be replaced by a 'missing' string label.
with_string_missing = [
    col for col in cat_vars_with_na if na_fraction[col] > 0.1
]
# 10% missing or less: NaNs will be replaced by the most frequent category.
with_frequent_category = [
    col for col in cat_vars_with_na if na_fraction[col] <= 0.1
]

In [8]:
# Imputer that replaces missing values with a new label ("Missing") in the
# variables selected for the 'missing' strategy.
cat_imputer_missing = CategoricalImputer(
    variables=with_string_missing,
    imputation_method='missing',
)
# Fit the imputer on the train set only.
cat_imputer_missing.fit(X_train)
# Replacement value learned/stored for each variable.
cat_imputer_missing.imputer_dict_

{'Alley': 'Missing',
 'MasVnrType': 'Missing',
 'FireplaceQu': 'Missing',
 'PoolQC': 'Missing',
 'Fence': 'Missing',
 'MiscFeature': 'Missing'}

In [9]:
# Apply the fitted imputer to both splits (NA -> "Missing").
X_train, X_test = (
    cat_imputer_missing.transform(X_train),
    cat_imputer_missing.transform(X_test),
)

In [10]:
# Imputer that replaces missing values with the most frequent category in the
# variables selected for the 'frequent' strategy.
cat_imputer_frequent = CategoricalImputer(
    variables=with_frequent_category,
    imputation_method='frequent',
)

# Fit the imputer on the train set only.
cat_imputer_frequent.fit(X_train)

# Most frequent category stored for each variable.
cat_imputer_frequent.imputer_dict_

{'BsmtQual': 'TA',
 'BsmtCond': 'TA',
 'BsmtExposure': 'No',
 'BsmtFinType1': 'Unf',
 'BsmtFinType2': 'Unf',
 'Electrical': 'SBrkr',
 'GarageType': 'Attchd',
 'GarageFinish': 'Unf',
 'GarageQual': 'TA',
 'GarageCond': 'TA'}

In [11]:
# Apply the fitted imputer to both splits (NA -> most frequent category).
X_train, X_test = (
    cat_imputer_frequent.transform(X_train),
    cat_imputer_frequent.transform(X_test),
)

In [12]:
# Sanity check: count the NaNs left in the imputed categorical variables.
X_train[cat_vars_with_na].isna().sum()

Alley           0
MasVnrType      0
BsmtQual        0
BsmtCond        0
BsmtExposure    0
BsmtFinType1    0
BsmtFinType2    0
Electrical      0
FireplaceQu     0
GarageType      0
GarageFinish    0
GarageQual      0
GarageCond      0
PoolQC          0
Fence           0
MiscFeature     0
dtype: int64

In [13]:
# Numerical variables: every train-set column that is neither categorical
# nor named 'SalePrice'.
non_numerical = set(cat_vars) | {'SalePrice'}
num_vars = [col for col in X_train.columns if col not in non_numerical]
print(len(num_vars))

35


In [14]:
# Numerical variables with at least one missing value in the train set.
vars_with_na = [col for col in num_vars if X_train[col].isnull().any()]

# Fraction of missing values in each of those variables.
X_train[vars_with_na].isnull().mean()

LotFrontage    0.177321
MasVnrArea     0.004566
GarageYrBlt    0.056317
dtype: float64

In [15]:
# Add a "<variable>_na" indicator column for each numerical variable with NaNs.
missing_ind = AddMissingIndicator(variables=vars_with_na)
missing_ind.fit(X_train)

X_train, X_test = (
    missing_ind.transform(X_train),
    missing_ind.transform(X_test),
)

# Peek at the newly added indicator columns.
indicator_cols = ['LotFrontage_na', 'MasVnrArea_na', 'GarageYrBlt_na']
X_train[indicator_cols].head()

Unnamed: 0,LotFrontage_na,MasVnrArea_na,GarageYrBlt_na
930,0,0,0
656,0,0,0
45,0,0,0
1348,1,0,0
55,0,0,0


In [16]:
# After flagging the NaNs, fill them with the variable's mean.
mean_imputer = MeanMedianImputer(
    variables=vars_with_na,
    imputation_method='mean',
)

# Learn the means from the train set only.
mean_imputer.fit(X_train)

# Mean stored for each variable.
mean_imputer.imputer_dict_

{'LotFrontage': 69.87974098057354,
 'MasVnrArea': 103.7974006116208,
 'GarageYrBlt': 1978.2959677419356}

In [17]:
# Fill the numerical NaNs in both splits with the train-set means.
X_train, X_test = (
    mean_imputer.transform(X_train),
    mean_imputer.transform(X_test),
)

In [18]:
# Sanity check: fraction of NaNs left in the imputed numerical variables.
X_train[vars_with_na].isna().mean()

LotFrontage    0.0
MasVnrArea     0.0
GarageYrBlt    0.0
dtype: float64

In [19]:
def elapsed_years(df, var):
    """Return a copy of ``df`` where ``var`` is replaced by ``YrSold - var``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'YrSold'`` and ``var``.
    var : str
        Name of the year column to convert into elapsed years.

    Returns
    -------
    pandas.DataFrame
        A new DataFrame; the input is left untouched, so calling the
        function again on the same input always gives the same result.
    """
    # Work on a copy: mutating the caller's frame made repeated calls on the
    # same object flip the column back and forth between year and elapsed time.
    df = df.copy()
    df[var] = df['YrSold'] - df[var]
    return df

# Convert each year column into "YrSold minus that year" in both splits.
year_vars = ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']
for var in year_vars:
    X_train, X_test = elapsed_years(X_train, var), elapsed_years(X_test, var)

In [20]:
# Remove the YrSold column from both splits.
drop_features = DropFeatures(features_to_drop=['YrSold'])

X_train, X_test = (
    drop_features.fit_transform(X_train),
    drop_features.transform(X_test),
)

In [21]:
# Numerical transformations
#
# Log transform: fitted on the train set, then applied to both splits.
log_vars = ['LotFrontage', '1stFlrSF', 'GrLivArea']
log_transform = LogTransformer(variables=log_vars)

X_train, X_test = (
    log_transform.fit_transform(X_train),
    log_transform.transform(X_test),
)

In [22]:
# Yeo-Johnson transform of LotArea: fitted on the train set, applied to both splits.
yeo_transformer = YeoJohnsonTransformer(variables=['LotArea'])

X_train, X_test = (
    yeo_transformer.fit_transform(X_train),
    yeo_transformer.transform(X_test),
)

# Lambda learned for each transformed variable.
yeo_transformer.lambda_dict_

{'LotArea': 0.017755558882009546}

In [23]:
# Skewed variables that will be binarized; preview them before the transform.
skewed = [
    'BsmtFinSF2',
    'LowQualFinSF',
    'EnclosedPorch',
    '3SsnPorch',
    'ScreenPorch',
    'MiscVal',
]

X_train[skewed].head()

Unnamed: 0,BsmtFinSF2,LowQualFinSF,EnclosedPorch,3SsnPorch,ScreenPorch,MiscVal
930,0,0,0,0,0,0
656,0,0,0,0,0,0
45,0,0,0,0,0,0
1348,0,0,0,0,0,0
55,0,0,0,407,0,0


In [24]:
# Wrap scikit-learn's Binarizer (threshold 0) so that it is applied only to the
# skewed variables, turning them into 0/1 flags.
binarizer = SklearnTransformerWrapper(
    variables=skewed,
    transformer=Binarizer(threshold=0),
)

X_train, X_test = (
    binarizer.fit_transform(X_train),
    binarizer.transform(X_test),
)

# Same preview as before, now binarized.
X_train[skewed].head()

Unnamed: 0,BsmtFinSF2,LowQualFinSF,EnclosedPorch,3SsnPorch,ScreenPorch,MiscVal
930,0,0,0,0,0,0
656,0,0,0,0,0,0
45,0,0,0,0,0,0
1348,0,0,0,0,0,0
55,0,0,0,1,0,0


In [25]:
# Specific mappings for Categorical Variables
#
# Quality-like categorical variables are replaced by hand-written integer
# codes. 'Missing' and 'NA' are coded as 0 in every mapping.


def _map_ordinal(df, var, mapping):
    """Return ``df[var]`` mapped through ``mapping``.

    Raises
    ------
    ValueError
        If a non-null label of ``df[var]`` has no entry in ``mapping``.
        ``Series.map`` would otherwise silently turn such labels into NaN
        (which is also what happens if this cell is run twice).
    """
    mapped = df[var].map(mapping)
    unmapped = df.loc[mapped.isnull() & df[var].notnull(), var].unique()
    if len(unmapped) > 0:
        raise ValueError(
            f"{var}: labels without an ordinal mapping: {list(unmapped)}"
        )
    return mapped


qual_mappings = {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5, 'Missing': 0,
                 'NA': 0}
qual_vars = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC',
             'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond']

exposure_mappings = {'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4, 'Missing': 0, 'NA': 0}

# Fix: 'Rec' used to be 2, duplicating 'LwQ' and leaving level 3 unused, which
# merged two distinct finish levels into one code.
finish_mappings = {'Missing': 0, 'NA': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3,
                   'BLQ': 4, 'ALQ': 5, 'GLQ': 6}
# Backward-compatible alias for the original, misspelled variable name.
fisish_mappings = finish_mappings
finish_vars = ['BsmtFinType1', 'BsmtFinType2']

garage_mappings = {'Missing': 0, 'NA': 0, 'Unf': 1, 'RFn': 2, 'Fin': 3}

fence_mappings = {'Missing': 0, 'NA': 0, 'MnWw': 1, 'GdWo': 2, 'MnPrv': 3,
                  'GdPrv': 4}

# (variable, mapping) pairs, in the same order the variables were originally
# processed; one loop replaces five copy-pasted stanzas.
ordinal_plan = (
    [(name, qual_mappings) for name in qual_vars]
    + [('BsmtExposure', exposure_mappings)]
    + [(name, finish_mappings) for name in finish_vars]
    + [('GarageFinish', garage_mappings), ('Fence', fence_mappings)]
)

for var, mapping in ordinal_plan:
    X_train[var] = _map_ordinal(X_train, var, mapping)
    X_test[var] = _map_ordinal(X_test, var, mapping)

# From here on, qual_vars lists every variable encoded with a manual mapping.
qual_vars = qual_vars + finish_vars + ['BsmtExposure', 'GarageFinish', 'Fence']

In [32]:
# Remove rare labels
# Remaining categorical variables: those that were not given a manual mapping.
manually_mapped = set(qual_vars)
cat_others = [col for col in cat_vars if col not in manually_mapped]

print(len(cat_others))
print(cat_others)

30
['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Foundation', 'Heating', 'CentralAir', 'Electrical', 'Functional', 'GarageType', 'PavedDrive', 'PoolQC', 'MiscFeature', 'SaleType', 'SaleCondition', 'MSSubClass']


In [34]:
# Make sure MSSubClass is object-typed before the categorical encoders run.
# The cast is applied to BOTH splits: casting only the train set would let the
# two frames diverge in dtype for the same column.
X_train['MSSubClass'] = X_train['MSSubClass'].astype('O')
X_test['MSSubClass'] = X_test['MSSubClass'].astype('O')

In [35]:
# Rare-label encoder for the remaining categorical variables
# (tol=0.01 and n_categories=1 are passed through to the encoder unchanged).
rare_encoder = RareLabelEncoder(
    variables=cat_others,
    tol=0.01,
    n_categories=1,
)

# Learn the label lists from the train set only.
rare_encoder.fit(X_train)

# Labels stored for each variable.
rare_encoder.encoder_dict_

{'MSZoning': ['RL', 'RM', 'FV', 'RH'],
 'Street': ['Pave'],
 'Alley': ['Missing', 'Grvl', 'Pave'],
 'LotShape': ['Reg', 'IR1', 'IR2'],
 'LandContour': ['Lvl', 'Bnk', 'HLS', 'Low'],
 'Utilities': ['AllPub'],
 'LotConfig': ['Inside', 'Corner', 'CulDSac', 'FR2'],
 'LandSlope': ['Gtl', 'Mod'],
 'Neighborhood': ['NAmes',
  'CollgCr',
  'OldTown',
  'Edwards',
  'Somerst',
  'NridgHt',
  'Gilbert',
  'Sawyer',
  'NWAmes',
  'BrkSide',
  'SawyerW',
  'Crawfor',
  'Mitchel',
  'Timber',
  'NoRidge',
  'IDOTRR',
  'ClearCr',
  'SWISU',
  'StoneBr',
  'Blmngtn',
  'MeadowV',
  'BrDale'],
 'Condition1': ['Norm', 'Feedr', 'Artery', 'RRAn', 'PosN'],
 'Condition2': ['Norm'],
 'BldgType': ['1Fam', 'TwnhsE', 'Duplex', 'Twnhs', '2fmCon'],
 'HouseStyle': ['1Story', '2Story', '1.5Fin', 'SLvl', 'SFoyer'],
 'RoofStyle': ['Gable', 'Hip'],
 'RoofMatl': ['CompShg'],
 'Exterior1st': ['VinylSd',
  'HdBoard',
  'Wd Sdng',
  'MetalSd',
  'Plywood',
  'CemntBd',
  'BrkFace',
  'Stucco',
  'WdShing',
  'AsbShng'],


In [39]:
# Fit on the train set and apply the rare-label grouping to both splits.
X_train, X_test = (
    rare_encoder.fit_transform(X_train),
    rare_encoder.transform(X_test),
)

In [43]:
# Ordinal encoding of the remaining categorical variables.
# The 'ordered' method is fitted together with the target, so y_train is passed.
cat_encoder = OrdinalEncoder(
    variables=cat_others,
    encoding_method='ordered',
)
cat_encoder.fit(X_train, y_train)

# Integer code learned for each label of each variable.
cat_encoder.encoder_dict_

{'MSZoning': {'Rare': 0, 'RM': 1, 'RH': 2, 'RL': 3, 'FV': 4},
 'Street': {'Rare': 0, 'Pave': 1},
 'Alley': {'Grvl': 0, 'Pave': 1, 'Missing': 2},
 'LotShape': {'Reg': 0, 'IR1': 1, 'Rare': 2, 'IR2': 3},
 'LandContour': {'Bnk': 0, 'Lvl': 1, 'Low': 2, 'HLS': 3},
 'Utilities': {'Rare': 0, 'AllPub': 1},
 'LotConfig': {'Inside': 0, 'FR2': 1, 'Corner': 2, 'Rare': 3, 'CulDSac': 4},
 'LandSlope': {'Gtl': 0, 'Mod': 1, 'Rare': 2},
 'Neighborhood': {'IDOTRR': 0,
  'MeadowV': 1,
  'BrDale': 2,
  'Edwards': 3,
  'BrkSide': 4,
  'OldTown': 5,
  'Sawyer': 6,
  'SWISU': 7,
  'NAmes': 8,
  'Mitchel': 9,
  'SawyerW': 10,
  'Rare': 11,
  'NWAmes': 12,
  'Gilbert': 13,
  'Blmngtn': 14,
  'CollgCr': 15,
  'Crawfor': 16,
  'ClearCr': 17,
  'Somerst': 18,
  'Timber': 19,
  'StoneBr': 20,
  'NridgHt': 21,
  'NoRidge': 22},
 'Condition1': {'Artery': 0,
  'Feedr': 1,
  'Norm': 2,
  'RRAn': 3,
  'Rare': 4,
  'PosN': 5},
 'Condition2': {'Rare': 0, 'Norm': 1},
 'BldgType': {'2fmCon': 0, 'Duplex': 1, 'Twnhs': 2, '1Fa

In [44]:
# Replace the category labels by their learned integer codes in both splits.
X_train, X_test = (
    cat_encoder.transform(X_train),
    cat_encoder.transform(X_test),
)

In [46]:
# Feature Scaling
#
# Fit the min-max scaler on the train set only, then scale both splits.
scaler = MinMaxScaler()
scaler.fit(X_train)

# The DataFrames are rebuilt from the scaler output. Pass the original row
# index along with the column names: without `index=` the rows are renumbered
# 0..n-1 and no longer line up with y_train / y_test, which still carry the
# original (shuffled) index, so any index-aligned operation would silently
# pair features with the wrong targets.
X_train = pd.DataFrame(
    scaler.transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_test = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

In [51]:
class MeanImputer:
    """Fill missing values of selected columns with their mean.

    The means are learned in ``fit`` and stored in ``imputer_dict_``
    (column name -> mean); ``transform`` uses them to fill NaNs.

    Parameters
    ----------
    variables : str or list of str
        Column(s) to impute. A single column name is accepted and treated
        as a one-element list.
    """

    def __init__(self, variables) -> None:
        # A bare string would make ``X[self.variables]`` return a Series, whose
        # ``mean()`` is a scalar without ``to_dict`` -- normalise to a list.
        if isinstance(variables, str):
            variables = [variables]
        self.variables = list(variables)

    def fit(self, X, y=None):
        """Learn the mean of each variable from ``X``; ``y`` is ignored.

        Returns ``self`` so calls can be chained.
        """
        self.imputer_dict_ = X[self.variables].mean().to_dict()
        return self

    def transform(self, X):
        """Return a copy of ``X`` with NaNs in the variables replaced by the learned means.

        The input DataFrame is not modified.
        """
        # Copy first: assigning columns on the caller's frame would change the
        # data behind the caller's back.
        X = X.copy()
        for var in self.variables:
            X[var] = X[var].fillna(self.imputer_dict_[var])
        return X

In [52]:
# Try the hand-written imputer on two columns and inspect the learned means.
my_imputer = MeanImputer(
    variables=['Neighborhood', 'MSZoning'],
)
my_imputer.fit(X_train)

my_imputer.imputer_dict_

{'Neighborhood': 0.49197453991974543, 'MSZoning': 0.6767503805175038}

In [54]:
# Run the fitted custom imputer on the train set and keep the returned frame.
temp = my_imputer.transform(X=X_train)