## References
> https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard <br>
> https://github.com/trainindata/dmlm-research-and-production/blob/master/Section-2-Machine-Learning-Pipeline-Overview/Machine-Learning-Pipeline-Step2-Feature-Engineering.ipynb

In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler, StandardScaler

from scipy.stats import norm, skew
from scipy.special import boxcox1p

In [2]:
SEED = 42
np.random.seed(SEED)

TARGET = "SalePrice"
TRAIN_SET = "train.csv"
TEST_SET = 'test.csv'

TIME_VARIATIONS = ['year', 'yr']

In [3]:
df_train = pd.read_csv(TRAIN_SET)
df_test = pd.read_csv(TEST_SET)

df_train.set_index('Id', inplace=True)
df_test.set_index('Id', inplace=True)

In [4]:
numerical_variables = [var for var in df_train.columns if df_train[var].dtypes != 'O']
# removing target 
numerical_variables.pop(numerical_variables.index(TARGET))
numerical_variables_num = len(numerical_variables)

time_vars = [var for var in numerical_variables for var2 in TIME_VARIATIONS if var2 in var.lower()]

discrete_vars = [var for var in numerical_variables if len(df_train[var].unique()) < 20 and var not in time_vars]
discrete_vars_num = len(discrete_vars)
discrete_vars.pop(discrete_vars.index("PoolArea"));
discrete_vars_num -= 1

continuous_vars = [var for var in numerical_variables if var not in time_vars+discrete_vars]
continuous_vars_num = len(continuous_vars)

categorical_variables = [var for var in df_train.columns if df_train[var].dtypes == 'O']
categorical_variables_num = len(categorical_variables)

In [5]:
print(f"Number of nulls in features: {df_train.isnull().sum().sum()}")

Number of nulls in features: 6965


In [6]:
null_variables_in_numerics = [var for var in numerical_variables if df_train[var].isnull().sum() > 0]
print(f"Number of null features: {len(null_variables_in_numerics)}")
print(f"Number of nulls in numerical features: {df_train[null_variables_in_numerics].isnull().sum().sum()}")
num_nulls = df_train[numerical_variables].isnull().sum().sort_values(ascending=False)
for row in num_nulls.iteritems():
    print(f"{row[0]}  \t{row[1]} nulls \t{row[1]/df_train.shape[0]:.3f}%")

Number of null features: 3
Number of nulls in numerical features: 348
LotFrontage  	259 nulls 	0.177%
GarageYrBlt  	81 nulls 	0.055%
MasVnrArea  	8 nulls 	0.005%
YrSold  	0 nulls 	0.000%
BsmtFinSF2  	0 nulls 	0.000%
LowQualFinSF  	0 nulls 	0.000%
2ndFlrSF  	0 nulls 	0.000%
1stFlrSF  	0 nulls 	0.000%
TotalBsmtSF  	0 nulls 	0.000%
BsmtUnfSF  	0 nulls 	0.000%
BsmtFinSF1  	0 nulls 	0.000%
BsmtFullBath  	0 nulls 	0.000%
YearRemodAdd  	0 nulls 	0.000%
YearBuilt  	0 nulls 	0.000%
OverallCond  	0 nulls 	0.000%
OverallQual  	0 nulls 	0.000%
LotArea  	0 nulls 	0.000%
GrLivArea  	0 nulls 	0.000%
BsmtHalfBath  	0 nulls 	0.000%
MoSold  	0 nulls 	0.000%
WoodDeckSF  	0 nulls 	0.000%
MiscVal  	0 nulls 	0.000%
PoolArea  	0 nulls 	0.000%
ScreenPorch  	0 nulls 	0.000%
3SsnPorch  	0 nulls 	0.000%
EnclosedPorch  	0 nulls 	0.000%
OpenPorchSF  	0 nulls 	0.000%
GarageArea  	0 nulls 	0.000%
FullBath  	0 nulls 	0.000%
GarageCars  	0 nulls 	0.000%
Fireplaces  	0 nulls 	0.000%
TotRmsAbvGrd  	0 nulls 	0.000%
Kitch

In [7]:
null_variables_in_categoric = [var for var in categorical_variables if df_train[var].isnull().sum() > 0]
print(f"Number of null features: {len(null_variables_in_categoric)}")
print(f"Number of nulls in categorical features: {df_train[null_variables_in_categoric].isnull().sum().sum()}")
num_nulls = df_train[categorical_variables].isnull().sum().sort_values(ascending=False)
for row in num_nulls.iteritems():
    print(f"{row[0]}   \t{row[1]} nulls \t{row[1]/df_train.shape[0]:.3f}%")

Number of null features: 16
Number of nulls in categorical features: 6617
PoolQC   	1453 nulls 	0.995%
MiscFeature   	1406 nulls 	0.963%
Alley   	1369 nulls 	0.938%
Fence   	1179 nulls 	0.808%
FireplaceQu   	690 nulls 	0.473%
GarageCond   	81 nulls 	0.055%
GarageQual   	81 nulls 	0.055%
GarageFinish   	81 nulls 	0.055%
GarageType   	81 nulls 	0.055%
BsmtFinType2   	38 nulls 	0.026%
BsmtExposure   	38 nulls 	0.026%
BsmtFinType1   	37 nulls 	0.025%
BsmtQual   	37 nulls 	0.025%
BsmtCond   	37 nulls 	0.025%
MasVnrType   	8 nulls 	0.005%
Electrical   	1 nulls 	0.001%
Condition2   	0 nulls 	0.000%
Condition1   	0 nulls 	0.000%
Neighborhood   	0 nulls 	0.000%
LandSlope   	0 nulls 	0.000%
BldgType   	0 nulls 	0.000%
LandContour   	0 nulls 	0.000%
LotConfig   	0 nulls 	0.000%
Utilities   	0 nulls 	0.000%
RoofStyle   	0 nulls 	0.000%
LotShape   	0 nulls 	0.000%
Street   	0 nulls 	0.000%
HouseStyle   	0 nulls 	0.000%
SaleCondition   	0 nulls 	0.000%
RoofMatl   	0 nulls 	0.000%
Exterior1st   	0 nu

## Removing the outliers

In [8]:
#Deleting outliers
df_train = df_train.drop(df_train[(df_train['GrLivArea']>4000) & (df_train[TARGET]<300000)].index)

## Missing values

**Categorical variables**

In [9]:
null_variables_in_categoric

['Alley',
 'MasVnrType',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Electrical',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PoolQC',
 'Fence',
 'MiscFeature']

- **Alley** : data description says NA means "no alley access".
- **MasVnrType** : data description says None means "None".
- **BsmtQual** : data description says NA means "No Basement".
- **BsmtCond** : data description says NA means "No Basement".
- **BsmtExposure** : data description says NA means "No Basement".
- **BsmtFinType1** : data description says NA means "No Basement".
- **BsmtFinType2** : data description says NA means "No Basement".
- **FireplaceQu** : data description says NA means "no fireplace"
- **GarageType** : NA means there is "no garage".
- **GarageFinish** : NA means there is "no garage".
- **GarageQual** : NA means there is "no garage".
- **GarageCond** : NA means there is "no garage".
- **PoolQC** : data description says NA means "No  Pool". 
- **Fence** : data description says NA means "no fence".
- **MiscFeature** : data description says NA means "None".
- **Electrical** : only 1 values missing so fill it with its mode won't affect as much.

In [10]:
df_train[null_variables_in_categoric] = df_train[null_variables_in_categoric].fillna('NONE')
df_test[null_variables_in_categoric] = df_test[null_variables_in_categoric].fillna('NONE')


num_nulls = df_train[categorical_variables].isnull().sum().sort_values(ascending=False)
num_nulls

SaleCondition    0
Condition2       0
ExterQual        0
MasVnrType       0
Exterior2nd      0
Exterior1st      0
RoofMatl         0
RoofStyle        0
HouseStyle       0
BldgType         0
Condition1       0
Foundation       0
Neighborhood     0
LandSlope        0
LotConfig        0
Utilities        0
LandContour      0
LotShape         0
Alley            0
Street           0
ExterCond        0
BsmtQual         0
SaleType         0
FireplaceQu      0
MiscFeature      0
Fence            0
PoolQC           0
PavedDrive       0
GarageCond       0
GarageQual       0
GarageFinish     0
GarageType       0
Functional       0
BsmtCond         0
KitchenQual      0
Electrical       0
CentralAir       0
HeatingQC        0
Heating          0
BsmtFinType2     0
BsmtFinType1     0
BsmtExposure     0
MSZoning         0
dtype: int64

In [11]:
df_test[categorical_variables].isnull().sum().sort_values(ascending=False)

MSZoning         4
Utilities        2
Functional       2
SaleType         1
Exterior2nd      1
Exterior1st      1
KitchenQual      1
Alley            0
BldgType         0
MasVnrType       0
RoofMatl         0
RoofStyle        0
HouseStyle       0
Condition2       0
LotShape         0
ExterCond        0
Condition1       0
Neighborhood     0
LandSlope        0
LotConfig        0
Street           0
LandContour      0
ExterQual        0
SaleCondition    0
Foundation       0
GarageType       0
MiscFeature      0
Fence            0
PoolQC           0
PavedDrive       0
GarageCond       0
GarageQual       0
GarageFinish     0
FireplaceQu      0
BsmtCond         0
Electrical       0
CentralAir       0
HeatingQC        0
Heating          0
BsmtFinType2     0
BsmtFinType1     0
BsmtExposure     0
BsmtQual         0
dtype: int64

- **MSZoning** : only 4 values missing so fill it with its mode won't affect as much.
- **Utilities** : only 2 values missing so fill it with its mode won't affect as much.
- **Exterior1st & Exterior2nd** : only 1 values missing so fill them with there mode won't affect as much.
- **KitchenQual** : only 1 values missing so fill it with its mode won't affect as much.
- **Functional** : only 2 values missing so fill it with its mode won't affect as much.
- **SaleType** : only 1 values missing so fill it with its mode won't affect as much.

In [12]:
categorical_features_nulls_filled_with_mode = ['MSZoning', 'Utilities', 'Exterior1st', 'Exterior2nd', 'Electrical', 'KitchenQual', 'Functional', 'SaleType']
for var in categorical_features_nulls_filled_with_mode:
    mode = df_test[var].mode()[0]
    df_test[var] = df_test[var].fillna(mode)

**Numerical variables**

In [13]:
null_variables_in_numerics

['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

- **LotFrontage** : add a binary missing value indicator variable then fill with the "mode".
- **MasVnrArea** : add a binary missing value indicator variable then fill with the "mode".
- **GarageYrBlt, GarageCars & GarageArea** : since there is no grage then fill with "0".
- **BsmtFinSF1, BsmtFinSF2, BsmtUnfSF, TotalBsmtSF, BsmtFullBath & BsmtHalfBath** : since there is no basement then fill with "0".

In [14]:
numerical_features_nulls_filled_with_zero = ['GarageYrBlt']

df_train[numerical_features_nulls_filled_with_zero] = df_train[numerical_features_nulls_filled_with_zero].fillna(0)
df_test[numerical_features_nulls_filled_with_zero] = df_test[numerical_features_nulls_filled_with_zero].fillna(0)


numerical_features_nulls_filled_with_mode = ['LotFrontage', 'MasVnrArea']
for var in numerical_features_nulls_filled_with_mode:
    mode = df_train[var].mode()[0]
    df_train[var+'_na'] = df_train[var].isnull()
    df_train[var] = df_train[var].fillna(mode)
    
    df_test[var+'_na'] = df_test[var].isnull()
    df_test[var] = df_test[var].fillna(mode)
    
    
num_nulls = df_train[numerical_variables].isnull().sum().sort_values(ascending=False)
num_nulls   

YrSold           0
MoSold           0
GrLivArea        0
LowQualFinSF     0
2ndFlrSF         0
1stFlrSF         0
TotalBsmtSF      0
BsmtUnfSF        0
BsmtFinSF2       0
BsmtFinSF1       0
MasVnrArea       0
YearRemodAdd     0
YearBuilt        0
OverallCond      0
OverallQual      0
LotArea          0
LotFrontage      0
BsmtFullBath     0
BsmtHalfBath     0
FullBath         0
WoodDeckSF       0
MiscVal          0
PoolArea         0
ScreenPorch      0
3SsnPorch        0
EnclosedPorch    0
OpenPorchSF      0
GarageArea       0
HalfBath         0
GarageCars       0
GarageYrBlt      0
Fireplaces       0
TotRmsAbvGrd     0
KitchenAbvGr     0
BedroomAbvGr     0
MSSubClass       0
dtype: int64

In [15]:
df_test[numerical_variables].isnull().sum().sort_values(ascending=False)

BsmtHalfBath     2
BsmtFullBath     2
TotalBsmtSF      1
BsmtUnfSF        1
BsmtFinSF2       1
BsmtFinSF1       1
GarageCars       1
GarageArea       1
MasVnrArea       0
YearRemodAdd     0
YearBuilt        0
OverallCond      0
OverallQual      0
LotArea          0
LotFrontage      0
1stFlrSF         0
2ndFlrSF         0
LowQualFinSF     0
GrLivArea        0
YrSold           0
MoSold           0
FullBath         0
HalfBath         0
BedroomAbvGr     0
KitchenAbvGr     0
TotRmsAbvGrd     0
Fireplaces       0
GarageYrBlt      0
WoodDeckSF       0
OpenPorchSF      0
EnclosedPorch    0
3SsnPorch        0
ScreenPorch      0
PoolArea         0
MiscVal          0
MSSubClass       0
dtype: int64

In [16]:
numerical_features_nulls_filled_with_zero_in_test_set = ['GarageCars', 'GarageArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath']
df_test[numerical_features_nulls_filled_with_zero_in_test_set] = df_test[numerical_features_nulls_filled_with_zero_in_test_set].fillna(0)

## Adding Features

- **TimeSold** : the difference between the year the house was sold and the year the house was built.
<!-- - **TimeSold** : the difference between the year the house was sold and the year the house was built. -->

In [17]:
df_train['TimeSold'] = df_train['YrSold'] - df_train['YearBuilt']
df_test['TimeSold'] = df_test['YrSold'] - df_test['YearBuilt']

## Categorical Encoding

In [18]:
def to_ordinal_encoding(train, test, var):
    train = train.copy()
    test = test.copy()

    orderd_labels = train.groupby(var)[TARGET].mean().sort_values().index.tolist()
    ordinal_encoder = OrdinalEncoder(categories=[orderd_labels], handle_unknown='ignore')
    
    train[var] = ordinal_encoder.fit_transform(train[var].values.reshape(-1, 1))
    test[var] = ordinal_encoder.transform(test[var].values.reshape(-1, 1))
    
    return train, test

In [19]:
for var in categorical_variables:
    df_train, df_test = to_ordinal_encoding(df_train, df_test, var)

## Adding Features (again)

In [20]:
# Reference
# https://www.kaggle.com/juliencs/a-study-on-regression-applied-to-the-ames-dataset
train = df_train
test = df_test

train["OverallGrade"] = train["OverallQual"] * train["OverallCond"]
# Overall quality of the garage
train["GarageGrade"] = train["GarageQual"] * train["GarageCond"]
# Overall quality of the exterior
train["ExterGrade"] = train["ExterQual"] * train["ExterCond"]
# Overall kitchen score
train["KitchenScore"] = train["KitchenAbvGr"] * train["KitchenQual"]
# Overall fireplace score
train["FireplaceScore"] = train["Fireplaces"] * train["FireplaceQu"]
# Overall garage score
train["GarageScore"] = train["GarageArea"] * train["GarageQual"]
# Overall pool score
train["PoolScore"] = train["PoolArea"] * train["PoolQC"]
# Total number of bathrooms
train["TotalBath"] = train["BsmtFullBath"] + (0.5 * train["BsmtHalfBath"]) + \
train["FullBath"] + (0.5 * train["HalfBath"])
# Total SF for house (incl. basement)
train["AllSF"] = train["GrLivArea"] + train["TotalBsmtSF"]
# Total SF for 1st + 2nd floors
train["AllFlrsSF"] = train["1stFlrSF"] + train["2ndFlrSF"]
# Total SF for porch
train["AllPorchSF"] = train["OpenPorchSF"] + train["EnclosedPorch"] + \
train["3SsnPorch"] + train["ScreenPorch"]

test["OverallGrade"] = test["OverallQual"] * test["OverallCond"]
# Overall quality of the garage
test["GarageGrade"] = test["GarageQual"] * test["GarageCond"]
# Overall quality of the exterior
test["ExterGrade"] = test["ExterQual"] * test["ExterCond"]
# Overall kitchen score
test["KitchenScore"] = test["KitchenAbvGr"] * test["KitchenQual"]
# Overall fireplace score
test["FireplaceScore"] = test["Fireplaces"] * test["FireplaceQu"]
# Overall garage score
test["GarageScore"] = test["GarageArea"] * test["GarageQual"]
# Overall pool score
test["PoolScore"] = test["PoolArea"] * test["PoolQC"]
# Total number of bathrooms
test["TotalBath"] = test["BsmtFullBath"] + (0.5 * test["BsmtHalfBath"]) + \
test["FullBath"] + (0.5 * test["HalfBath"])
# Total SF for house (incl. basement)
test["AllSF"] = test["GrLivArea"] + test["TotalBsmtSF"]
# Total SF for 1st + 2nd floors
test["AllFlrsSF"] = test["1stFlrSF"] + test["2ndFlrSF"]
# Total SF for porch
test["AllPorchSF"] = test["OpenPorchSF"] + test["EnclosedPorch"] + \
test["3SsnPorch"] + test["ScreenPorch"]

df_train = train
df_test = test

## Feature Transformation & Scaling

In [21]:
def transform_with_log(train, test, features):
    train = train.copy()
    test = test.copy()
    
    for var in features:
        train[var] = np.log(train[var])
        test[var] = np.log(test[var]) 
        
    return train, test

In [22]:
features_transformed_with_log = ['LotFrontage', 'LotArea', '1stFlrSF', 'GrLivArea']
df_train, df_test = transform_with_log(df_train, df_test, features_transformed_with_log)

df_train[TARGET] = np.log(df_train[TARGET])

In [23]:
features = [var for var in df_train.columns if var != 'SalePrice']

In [24]:
def transform_with_MinMax(train, test, features):
    train = train.copy()
    test = test.copy()
    
    scaler = MinMaxScaler()
    scaler.fit(train[features])
    
    train[features] = scaler.transform(train[features])
    test[features] = scaler.transform(test[features])
    
    return train, test

def transform_with_Standard(train, test, features):
    train = train.copy()
    test = test.copy()
    
    scaler = StandardScaler()
    scaler.fit(train[features])
    
    train[features] = scaler.transform(train[features])
    test[features] = scaler.transform(test[features])
    
    return train, test

In [25]:
df_train, df_test = transform_with_MinMax(df_train, df_test, features)

In [26]:
def get_skewed_features(train, numeric_feats, threshold=0.5):
    df_train = train.copy()

    skewed_feats = df_train[numeric_feats].apply(lambda x: skew(x)).sort_values(ascending=False)
    skewness = pd.DataFrame({'Skew' :skewed_feats})

    skewness = skewness[abs(skewness) > threshold]
    print("There are {} skewed numerical features to Box Cox transform".format(skewness.shape[0]))
    
    return skewness.index

def transform_with_box_cox(data, skewed_features, lambda_=0.15):
    data = data.copy()
    lambda_ = lambda_
    
    for feat in skewed_features:
        data[feat] = boxcox1p(data[feat], lambda_)
        
    return data

In [27]:
skewed_features = get_skewed_features(df_train, features)

df_train = transform_with_box_cox(df_train, skewed_features)
df_test = transform_with_box_cox(df_test, skewed_features)

There are 93 skewed numerical features to Box Cox transform


In [28]:
df_train.to_csv('processed_train.csv', index=False)
df_test.to_csv('processed_test.csv', index=True)