<center style='font-size:40px'><b>Feature Engineering</b></center>

# Data Overview:
## Import Libraries:

In [3]:
import os
import pickle
import warnings
import numpy as np
from util import *
%matplotlib inline
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')
pd.options.display.max_columns = 1000
from tqdm import tqdm_notebook as tqdm

## Read Data:

In [4]:
# Load the preprocessed train/test frames produced by the previous notebook.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../2_Data Preprocessing/output/train_processed.csv',
        '../2_Data Preprocessing/output/test_processed.csv',
    )
)

shape(train, test)

~> [train] has [5m[7m[34m 1,460 [0m rows, and [5m[7m[34m 77 [0m columns.
~> [test ] has [5m[7m[34m 1,459 [0m rows, and [5m[7m[34m 76 [0m columns.


In [5]:
# Stack train on top of test so that statistics can be computed over both at once.
traintest = pd.concat([train, test])

# Keep the identifiers and the target handy before the frames are modified further.
train_ids, test_ids = train['Id'], test['Id']
train_labels        = train['SalePrice']

shape(traintest)

~> [traintest] has [5m[7m[34m 2,919 [0m rows, and [5m[7m[34m 77 [0m columns.


# Feature Generation: 
## Categorical Features:

In [6]:
# Some utility function:
def save_or_load(name, columns=None, prefix=None):
    """Return a list of generated column names, cached on disk as ``{name}.pkl``.

    If ``{name}.pkl`` exists in the working directory, the pickled list is
    returned as-is (it is NOT re-validated against the current columns; delete
    the file to force a rebuild). Otherwise the list is built from ``columns``,
    pickled to ``{name}.pkl`` and returned.

    Parameters
    ----------
    name : str
        Cache name, e.g. ``'onehot_cols'``. Also used to derive the column
        prefix when ``prefix`` is not given: a trailing ``'_cols'`` is replaced
        by ``'_'`` (``'onehot_cols'`` -> ``'onehot_'``), so the names passed by
        this notebook match the ``onehot_*`` / ``freq_*`` / ``target_enc_*`` /
        ``log_*`` / ``sqrt_*`` columns it creates. A name without that suffix
        is used as the prefix unchanged.
    columns : iterable of str, optional
        Column names to filter. Defaults to ``train.columns`` (notebook global).
    prefix : str, optional
        Explicit prefix to match instead of the one derived from ``name``.

    Returns
    -------
    list of str
        The column names that start with the prefix (or the cached list).
    """
    cache_path = f'{name}.pkl'

    # Reuse the cached list when it is there.
    if os.path.isfile(cache_path):
        # NOTE: pickle.load can execute arbitrary code; only load cache files
        # that this notebook wrote itself.
        with open(cache_path, 'rb') as f:
            return pickle.load(f)

    # Matching on the raw name ('onehot_cols') never hits a generated column
    # ('onehot_<var>_<level>'), so turn the '_cols' suffix into the real prefix.
    if prefix is None:
        suffix = '_cols'
        prefix = f'{name[:-len(suffix)]}_' if name.endswith(suffix) else name

    if columns is None:
        columns = train.columns

    # Select only the matching columns.
    list_cols = [col for col in columns if col.startswith(prefix)]

    # Save the list of columns for later use.
    with open(cache_path, 'wb') as f:
        pickle.dump(list_cols, f)
    return list_cols

In [7]:
# Ordinal features: quality/condition grades and small integer counts.
ord_cols = [
    'LandSlope', 'OverallQual', 'OverallCond', 'ExterQual',
    'ExterCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtCond',
    'BsmtQual', 'BsmtFinType2', 'HeatingQC', 'Electrical',
    'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
    'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces',
    'FireplaceQu', 'GarageFinish', 'GarageCars', 'GarageQual',
    'GarageCond', 'PavedDrive', 'PoolQC', 'BedroomAbvGr',
    'KitchenAbvGr',
]

# Nominal (unordered) categorical features.
cat_cols = [
    'MSSubClass', 'MSZoning', 'Alley', 'LotShape',
    'LandContour', 'LotConfig', 'Neighborhood', 'Condition1',
    'Condition2', 'BldgType', 'RoofStyle', 'RoofMatl',
    'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Foundation',
    'Heating', 'GarageType', 'MiscFeature', 'SaleType',
    'SaleCondition', 'HouseStyle', 'Fence', 'CentralAir',
]

# Continuous area / size measurements.
num_cols = [
    'LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1',
    'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF',
    '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'GarageArea',
    'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
    'ScreenPorch',
]

# Year / month features.
date_cols = ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'MoSold', 'YrSold']

### One-Hot Encoding:

In [8]:
# One-hot encode every ordinal and categorical column of train and test.
# Each frame is encoded on its own with drop_first=True, so the two frames can
# end up with different dummy columns when their sets of levels differ.
# The dummy blocks are collected first and concatenated once per frame:
# calling pd.concat on the full frame inside the loop recopies it every time.
train_dummies, test_dummies = [], []
for var in list(ord_cols+cat_cols):
    train_dummies.append(pd.get_dummies(train[var], prefix=f'onehot_{var}', prefix_sep='_', drop_first=True))
    test_dummies.append(pd.get_dummies(test[var], prefix=f'onehot_{var}', prefix_sep='_', drop_first=True))

# Same column order as appending one block at a time.
train = pd.concat([train, *train_dummies], axis=1)
test  = pd.concat([test, *test_dummies], axis=1)

shape(train, test)

~> [train] has [5m[7m[34m 1,460 [0m rows, and [5m[7m[34m 367 [0m columns.
~> [test ] has [5m[7m[34m 1,459 [0m rows, and [5m[7m[34m 351 [0m columns.


In [9]:
# Keep only the columns that train and test have in common.
# The inner join drops SalePrice (it exists only in train), so it is set aside
# first and attached again afterwards.
train_labels = train['SalePrice']
train, test  = train.align(test, join='inner', axis=1)
train        = train.assign(SalePrice=train_labels)

shape(train, test)

~> [train] has [5m[7m[34m 1,460 [0m rows, and [5m[7m[34m 337 [0m columns.
~> [test ] has [5m[7m[34m 1,459 [0m rows, and [5m[7m[34m 336 [0m columns.


After aligning the train and test datasets, a number of columns were dropped: the inner join keeps only the columns that exist in both datasets.

In [10]:
# Load the cached list from 'onehot_cols.pkl' if it exists; otherwise build it
# from train's column names and pickle it for later use.
onehot_cols = save_or_load('onehot_cols')

### Frequency Encoding:

In [11]:
# Frequency encoding: replace each level by its relative frequency, computed on
# train and test combined (traintest).
for col in list(cat_cols+ord_cols):
    # value_counts(normalize=True) returns level -> share of rows (NaN is excluded).
    encoding             = traintest[col].value_counts(normalize=True)
    # Series.map does the lookup in one vectorised pass; a value that is missing
    # from `encoding` becomes NaN instead of raising KeyError as a per-element
    # `encoding[x]` lookup would.
    train[f'freq_{col}'] = train[col].map(encoding)
    test[f'freq_{col}']  = test[col].map(encoding)

shape(train, test)

~> [train] has [5m[7m[34m 1,460 [0m rows, and [5m[7m[34m 390 [0m columns.
~> [test ] has [5m[7m[34m 1,459 [0m rows, and [5m[7m[34m 389 [0m columns.


In [12]:
# Load the cached list from 'freq_cols.pkl' if it exists; otherwise build it
# from train's column names and pickle it for later use.
freq_cols = save_or_load('freq_cols')

### Target Encoding:

In [13]:
# Target encoding: for every categorical/ordinal column, compute per-level
# statistics of SalePrice on train and map them onto both train and test.
# Levels that do not appear in train's groupby result map to NaN.
# NOTE(review): the statistics are computed on the same training rows they are
# written back to, so each train row's feature includes its own target value —
# presumably an out-of-fold scheme is wanted here; confirm before modelling.
for col in list(cat_cols+ord_cols):
    means   = train.groupby(col).SalePrice.mean()
    stds    = train.groupby(col).SalePrice.std()
    skews   = train.groupby(col).SalePrice.skew()
    # NOTE(review): `counts` is computed but is not in the list of stats below,
    # so no count feature is created — confirm whether that is intentional.
    counts  = train.groupby(col).SalePrice.count()
    medians = train.groupby(col).SalePrice.median()
    mins    = train.groupby(col).SalePrice.min()
    maxs    = train.groupby(col).SalePrice.max()
    
    # var2str comes from util (not visible here); presumably it returns the name
    # of the variable passed in ('means', 'stds', ...), giving one distinct
    # column per statistic — TODO confirm. The variable names above are therefore
    # part of the resulting column names and should not be renamed casually.
    for stat in [means, stds, skews, medians, mins, maxs]:
        train[f'target_enc_{var2str(stat)}_{col}'] = train[col].map(stat)
        test[f'target_enc_{var2str(stat)}_{col}']  = test[col].map(stat)

In [14]:
# Load the cached list from 'target_enc_cols.pkl' if it exists; otherwise build
# it from train's column names and pickle it for later use.
target_enc_cols = save_or_load('target_enc_cols')

In [15]:
# Report the row and column counts of both frames after target encoding.
shape(train, test)

~> [train] has [5m[7m[34m 1,460 [0m rows, and [5m[7m[34m 708 [0m columns.
~> [test ] has [5m[7m[34m 1,459 [0m rows, and [5m[7m[34m 707 [0m columns.


<div class='alert alert-info'>
    <p style='font-size:20px;font-weight:bold'>Note:</p>
    <p style='font-size:16px'>We can see that the number of columns increases as we go along. <br>In the next notebook, we'll apply some feature selection techniques to get rid of the redundant features.</p>
</div>

## Numerical Features:
### Log1p / Sqrt:

In [16]:
# Add a log1p and a square-root version of every numerical column to both frames.
for col in num_cols:
    train[f'log_{col}'], train[f'sqrt_{col}'] = np.log1p(train[col]), np.sqrt(train[col])
    test[f'log_{col}'], test[f'sqrt_{col}']   = np.log1p(test[col]), np.sqrt(test[col])

# See the shape of both datasets.
shape(train, test)

~> [train] has [5m[7m[34m 1,460 [0m rows, and [5m[7m[34m 742 [0m columns.
~> [test ] has [5m[7m[34m 1,459 [0m rows, and [5m[7m[34m 741 [0m columns.


**Save the names of the columns for later use**

In [17]:
# For each name: load the cached list from '<name>.pkl' if it exists; otherwise
# build it from train's column names and pickle it for later use.
log_cols = save_or_load('log_cols')
sqrt_cols = save_or_load('sqrt_cols')

### Polynomial Features:

In [19]:
from sklearn.preprocessing import PolynomialFeatures

# Degree-3 polynomial expansion of the numerical columns (no constant column).
# The transformer is fitted on train only and then reused as-is for test.
poly_transformer = PolynomialFeatures(degree=3, include_bias=False)
poly_train = poly_transformer.fit_transform(train[num_cols])
poly_test  = poly_transformer.transform(test[num_cols])

shape(poly_train, poly_test)

~> [poly_train] has [5m[7m[34m 1,460 [0m rows, and [5m[7m[34m 1,139 [0m columns.
~> [poly_test] has [5m[7m[34m 1,459 [0m rows, and [5m[7m[34m 1,139 [0m columns.
