In [73]:
# This is Exercise 02 for Lecture 07 (Ames Housing Part III - Creating Features) 
# of the "Data Science" class at Technische Hochschule Rosenheim
# it is based on https://www.kaggle.com/code/ryanholbrook/creating-features

In [74]:
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor
from category_encoders import MEstimateEncoder
%matplotlib inline

In [75]:
# make the plots look nice
sns.set(style="darkgrid")
snscolor='mediumseagreen'
sns.set(font_scale=0.7) 

# The Ames Housing Dataset - Part III

## Load the cleaned data

In [76]:
columns_categorical = ['PID', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt',
       'YearRemodAdd', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'ExterQual',
       'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
       'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC', 'CentralAir',
       'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu',
       'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars',
       'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive',
       'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
       'ScreenPorch', 'PoolArea', 'PoolQC', 'Fence', 'MiscFeature',
       'MiscVal', 'MoSold', 'YrSold', 'SaleType', 'SaleCondition',
       'SalePrice']
columns_discrete = ['YearBuilt', 'YearRemodAdd', 'BsmtFullBath', 'BsmtHalfBath',
                    'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 
                    'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 
                    'MoSold', 'YrSold']
columns_continuous = ['LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 
                      'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', 
                      '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'GarageArea',
                      'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
                      'ScreenPorch', 'PoolArea', 'MiscVal', 'SalePrice']

columns_numeric =  columns_discrete + columns_continuous

In [77]:
# make lists of features
features_numeric = columns_numeric.copy()
features_numeric.remove('SalePrice') # remove target

features_categorical = columns_categorical.copy()

In [78]:
ames = pd.read_pickle("data/AmesHousingClean.pkl")
X = ames.copy()
y = X.pop("SalePrice")

#### Pre-defined Scoring Function

The following function can be used to evaluate the effects of your feature engineering efforts.

In [79]:
# use root mean squared error as scoring method.
# we will score our effort on both XGBoost Regression and Lasso Regression
def compute_score(X, y, text='Baseline'):
    score_xgb = -1 * cross_val_score(XGBRegressor(), X, y, cv=5, scoring="neg_root_mean_squared_error").mean()
    score_lr = -1 * cross_val_score(Lasso(tol=0.1, alpha=5), X, y, cv=5, scoring="neg_root_mean_squared_error").mean()
    print(f'{text:25}: xgb regression={score_xgb:,.0f} and lasso regression={score_lr:,.0f}')

In order to keep our models relatively simple, we will start with 5 features as a baseline and keep adding constructed features to these.

In [80]:
feature_baseline = ['GrLivArea', 'TotalBsmtSF', 'GarageCars', 'YearBuilt', 'FullBath']

In [81]:
# base line score - no hyperparameter tuning, using our baseline features only
compute_score(X[feature_baseline], y)

Baseline                 : xgb regression=33,380 and lasso regression=37,513


## Mathematical Interactions

### Demo

In this exercise, we want to add three new features to our dataframe `X`, which make sense from a domain perspective. In addition, we focus on ratios and sums, as these are particularly hard to XGBoost-based regression models.

Let us create the following features:
* `LivLotRatio`: the ratio of `GrLivArea` to `LotArea`
* `Spaciousness`: the sum of `1stFlrSF` and `2ndFlrSF` divided by `TotRmsAbvGrd`
* `TotalOutsideSF`: the sum of `WoodDeckSF`, `OpenPorchSF`, `EnclosedPorch`, `3SsnPorch`, and `ScreenPorch`

Finally, we evaluate the score on all numeric features plus these three new features and compare it with the base line score.

In [82]:
X['LivLotRatio'] = X.GrLivArea / X.LotArea
X['Spaciousness'] = (X['1stFlrSF'] + X['2ndFlrSF']) / X.TotRmsAbvGrd
X['TotalOutsideSF'] = X.WoodDeckSF + X.OpenPorchSF + X.EnclosedPorch + X['3SsnPorch'] + X.ScreenPorch

In [83]:
features_III1 = feature_baseline + ['LivLotRatio', 'Spaciousness', 'TotalOutsideSF']
compute_score(X[feature_baseline], y)
compute_score(X[features_III1], y, '3 new features')

Baseline                 : xgb regression=33,380 and lasso regression=37,513
3 new features           : xgb regression=31,444 and lasso regression=36,596


Interpretation: as expected, the score for xgb regression improved, and the score for lasso regression as well.

## Creating a Count Feature

We already added a feature `TotalOutsideSF` for the total size of the outdoor areas. Another useful information (based on our domain knowledge) might be, how many different outdoor areas a property has.

### Exercise III.1

* Create a new feature `CountPorches`. A porch does exist, if it's area is larger than 0. Thus, our feature will have values between 0 and 5.
* Again, add it to our feature list and compare with the previous model and the base line model.

In [84]:
#---------- SOLUTION --------
porches = ["WoodDeckSF","OpenPorchSF","EnclosedPorch","3SsnPorch","ScreenPorch"]
X["CountPorches"] = (X[porches]>0).sum(axis=1)
X["CountPorches"] = X[porches].gt(0).sum(axis=1) # same

compute_score(X[feature_baseline], y)
compute_score(X[feature_baseline + ["CountPorches"]], y, "porchCount")

Baseline                 : xgb regression=33,380 and lasso regression=37,513
porchCount               : xgb regression=32,750 and lasso regression=36,980


## Splitting a categorical feature

`MSSubClass` describes the type of the property:

       020: 1-STORY 1946 & NEWER ALL STYLES
       030: 1-STORY 1945 & OLDER
       040: 1-STORY W/FINISHED ATTIC ALL AGES
       045: 1-1/2 STORY - UNFINISHED ALL AGES
       050: 1-1/2 STORY FINISHED ALL AGES
       060: 2-STORY 1946 & NEWER
       070: 2-STORY 1945 & OLDER
       075: 2-1/2 STORY ALL AGES
       080: SPLIT OR MULTI-LEVEL
       085: SPLIT FOYER
       090: DUPLEX - ALL STYLES AND AGES
       120: 1-STORY PUD (Planned Unit Development) - 1946 & NEWER
       150: 1-1/2 STORY PUD - ALL AGES
       160: 2-STORY PUD - 1946 & NEWER
       180: PUD - MULTILEVEL - INCL SPLIT LEV/FOYER
       190: 2 FAMILY CONVERSION - ALL STYLES AND AGES
       
Let's break this down into meaningfull groups: 020, 030, 040, 045, 050, 120, 150 describe `one_story` buildings; 060, 070, 075, 160 are `two_story` building; 080, 085, 180 are `split_level` building; and 090, 190 are `duplex` buildings. 

### Exercise III.2

* Break down `MSSubClass` into the four types give, und one-hot encode the result.
* Again, add it to our feature list and compare with the previous models (incl. the base-line model)

In [85]:
#---------- SOLUTION ----------
def break_down(type:str) -> str:
    if type in ["020", "030", "040", "045", "050", "120", "150"]:
        return "one_story"
    elif type in ["060", "070", "075", "160"]:
        return "two_story"
    elif type in ["080", "085", "180"]:
        return "split_level"
    elif type in ["090", "190"]:
        return "duplex"
    else:
        return ""

X["MSSubClass"] = X["MSSubClass"].apply(lambda x: break_down(x))
X = pd.get_dummies(X, columns=["MSSubClass"], drop_first=True)
print(X.columns)
compute_score(X[feature_baseline], y)
compute_score(X[feature_baseline + ["CountPorches"]], y, "porchCount")
compute_score(pd.concat([X[feature_baseline], X.filter(like="MSSubClass")], axis=1), y, 'Our Selection:')


Index(['PID', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'WoodDeckS

## Group-by/Aggregation Transformation

The value of a home often depends on how it compares to typical homes in its neighborhood. 

### Exercise III.3

* Create a feature `MedNHArea` that describes the median of `GrLivArea` grouped on `Neighborhood`.
* Again, add it to our feature list and compare with the previous models (incl. the base-line model)

In [96]:
#---------- SOLUTION ----------
X[["GrLivArea", "Neighborhood"]].groupby("Neighborhood").median()

Unnamed: 0_level_0,GrLivArea
Neighborhood,Unnamed: 1_level_1
Blmngtn,1455.5
Blueste,1118.0
BrDale,1092.0
BrkSide,1231.0
ClearCr,1694.0
CollgCr,1504.0
Crawfor,1648.0
Edwards,1184.0
Gilbert,1560.0
Greens,1226.0


In [98]:
X["MedNHArea"] = X[["GrLivArea", "Neighborhood"]].groupby("Neighborhood").median()
X["MedNHArea"]

Order
1      NaN
2      NaN
3      NaN
4      NaN
5      NaN
        ..
2926   NaN
2927   NaN
2928   NaN
2929   NaN
2930   NaN
Name: MedNHArea, Length: 2925, dtype: float64

------