# Imports

In [20]:
# General libraries
import pandas as pd
import numpy as np
import warnings
import seaborn as sns
import matplotlib.pyplot as plt

# Scikit-learn model selection and metrics
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV

from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score

# Scikit-learn transformers and preprocessors & Feature Selection
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import make_pipeline

# Scikit-learn regressors
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, SGDRegressor, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

# External regressor
import xgboost as xgb

# Miscellaneous settings
pd.set_option('display.max_columns', None)
warnings.filterwarnings("ignore")

In [3]:
houses = pd.read_csv('https://raw.githubusercontent.com/MerleSt/HousingPricePrediction/main/Data/housing_prices.csv')

In [4]:
houses.drop(columns=['Id'], inplace=True)

# Split Data

In [5]:
X = houses.copy()
y = X.pop('SalePrice')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# # Temporarily set the display options to show all rows
# pd.set_option('display.max_rows', None)

# # Display the NaN counts for all columns
# print(X_train.isna().sum())

# # Reset the display options back to default settings
# pd.reset_option('display.max_rows')

# Preprocess Data

In [8]:
num_features = X_train.select_dtypes(exclude=['object']).columns.tolist()
cat_features = X_train.select_dtypes(include=['object']).columns.tolist()

## Encoding

In [9]:
# Define orders
order_LandSlope = ['Sev', 'Mod', 'Gtl']
order_ExterQual = ['Po', 'Fa', 'TA', 'Gd', 'Ex']
order_ExterCond = ['Po', 'Fa', 'TA', 'Gd', 'Ex']
order_BsmtQual = ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
order_BsmtCond = ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
order_BsmtExposure = ['NA', 'No', 'Mn', 'Av', 'Gd']
order_BsmtFinType1 = ['NA', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ']
order_BsmtFinType2 = ['NA', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ']
order_HeatingQC = ['Po', 'Fa', 'TA', 'Gd', 'Ex']
order_KitchenQual = ['Po', 'Fa', 'TA', 'Gd', 'Ex']
order_FireplaceQu = ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
order_GarageFinish = ['NA', 'Unf', 'RFn', 'Fin']
order_GarageQual = ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
order_GarageCond = ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
order_PoolQC = ['NA', 'Fa', 'TA', 'Gd', 'Ex']
order_Fence = ['NA', 'MnWw', 'GdWo', 'MnPrv', 'GdPrv']

# Setup the encoder
ordinal_enc = OrdinalEncoder(
    categories=[
        order_LandSlope, order_ExterQual, order_ExterCond, order_BsmtQual, order_BsmtCond, 
        order_BsmtExposure, order_BsmtFinType1, order_BsmtFinType2, order_HeatingQC, 
        order_KitchenQual, order_FireplaceQu, order_GarageFinish, order_GarageQual, 
        order_GarageCond, order_PoolQC, order_Fence
    ],
    handle_unknown='use_encoded_value',
    unknown_value=-1
)


In [10]:
cat_ordinal = [
    'LandSlope', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 
    'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'HeatingQC', 
    'KitchenQual', 'FireplaceQu', 'GarageFinish', 'GarageQual', 
    'GarageCond', 'PoolQC', 'Fence'
]
cat_onehot = [col for col in cat_features if col not in cat_ordinal]

## Pipelines

In [14]:
num_pipeline = make_pipeline(
    SimpleImputer()
)

cat_ordinal_pipe = make_pipeline(
    SimpleImputer(strategy='constant'),
    ordinal_enc
)

cat_onehot_pipe = make_pipeline(
    SimpleImputer(strategy='constant'),
    OneHotEncoder(drop='first', handle_unknown='ignore')
)

In [15]:
preprocessor = ColumnTransformer(transformers=[
    ('num_pipe', num_pipeline, num_features),
    ('onehot', cat_onehot_pipe, cat_onehot),
    ('ordinal', cat_ordinal_pipe, cat_ordinal)
])

In [18]:
pipeline = make_pipeline(preprocessor, SelectFromModel(RandomForestRegressor(random_state=42)), RandomForestRegressor(random_state=42))

In [38]:
pipeline.fit(X_train, y_train)

In [39]:
predictions = pipeline.predict(X_test)
mae = mean_absolute_error(y_true=y_test, y_pred=predictions).round(2)
rmse = mean_squared_error(y_true=y_test, y_pred=predictions, squared=False).round(2)
mape = mean_absolute_percentage_error(y_true=y_test, y_pred=predictions).round(2)
r2 = r2_score(y_true=y_test, y_pred=predictions).round(2)

model_df = pd.DataFrame({
    "MAE": [mae],
    "RMSE": [rmse],
    "MAPE": [mape],
    "R2": [r2]},
)
model_df

Unnamed: 0,MAE,RMSE,MAPE,R2
0,17675.84,28383.28,0.11,0.89


# Hypertuning

## Randomized Grid Search

In [21]:
param_grid = {
    'selectfrommodel__estimator__n_estimators': [50, 100, 200, 300],
    'selectfrommodel__estimator__max_depth': [None, 10, 20, 30, 40],
    'selectfrommodel__estimator__min_samples_split': [2, 5, 10],
    'selectfrommodel__estimator__min_samples_leaf': [1, 2, 4],
    'randomforestregressor__n_estimators': [50, 100, 200, 300],
    'randomforestregressor__max_depth': [None, 10, 20, 30, 40],
    'randomforestregressor__min_samples_split': [2, 5, 10],
    'randomforestregressor__min_samples_leaf': [1, 2, 4]
}

In [22]:
search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_grid,
    n_iter=100,
    cv=5,
    verbose=1,
    random_state=42,
    n_jobs=-2
)

In [23]:
search.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits




In [24]:
search.best_params_

{'selectfrommodel__estimator__n_estimators': 100,
 'selectfrommodel__estimator__min_samples_split': 2,
 'selectfrommodel__estimator__min_samples_leaf': 2,
 'selectfrommodel__estimator__max_depth': 30,
 'randomforestregressor__n_estimators': 100,
 'randomforestregressor__min_samples_split': 5,
 'randomforestregressor__min_samples_leaf': 1,
 'randomforestregressor__max_depth': None}

## Evaluate Model

In [26]:
predictions = search.predict(X_test)
mae = mean_absolute_error(y_true=y_test, y_pred=predictions).round(2)
rmse = mean_squared_error(y_true=y_test, y_pred=predictions, squared=False).round(2)
mape = mean_absolute_percentage_error(y_true=y_test, y_pred=predictions).round(2)
r2 = r2_score(y_true=y_test, y_pred=predictions).round(2)

model_df = pd.DataFrame({
    "MAE": [mae],
    "RMSE": [rmse],
    "MAPE": [mape],
    "R2": [r2]},
)
model_df

In [27]:
model_df

Unnamed: 0,MAE,RMSE,MAPE,R2
0,17904.64,28600.5,0.11,0.89


## Grid Search

In [30]:
param_grid = {
    'selectfrommodel__estimator__n_estimators': [90, 100, 110],
    'selectfrommodel__estimator__max_depth': [25, 30, 35],
    'selectfrommodel__estimator__min_samples_split': [2],
    'selectfrommodel__estimator__min_samples_leaf': [1, 2, 3],
    'randomforestregressor__n_estimators': [90, 100, 110],
    'randomforestregressor__max_depth': [None, 5, 10],
    'randomforestregressor__min_samples_split': [4, 5, 6],
    'randomforestregressor__min_samples_leaf': [1, 2]
}

In [32]:
search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    verbose=10,
    n_jobs=-2
)

In [33]:
search.fit(X_train, y_train)

Fitting 5 folds for each of 1458 candidates, totalling 7290 fits




In [34]:
search.best_params_

{'randomforestregressor__max_depth': 10,
 'randomforestregressor__min_samples_leaf': 1,
 'randomforestregressor__min_samples_split': 5,
 'randomforestregressor__n_estimators': 110,
 'selectfrommodel__estimator__max_depth': 25,
 'selectfrommodel__estimator__min_samples_leaf': 1,
 'selectfrommodel__estimator__min_samples_split': 2,
 'selectfrommodel__estimator__n_estimators': 100}

## Evaluate Model

In [37]:
predictions = search.predict(X_test)
mae = mean_absolute_error(y_true=y_test, y_pred=predictions).round(2)
rmse = mean_squared_error(y_true=y_test, y_pred=predictions, squared=False).round(2)
mape = mean_absolute_percentage_error(y_true=y_test, y_pred=predictions).round(2)
r2 = r2_score(y_true=y_test, y_pred=predictions).round(2)

model_df = pd.DataFrame({
    "MAE": [mae],
    "RMSE": [rmse],
    "MAPE": [mape],
    "R2": [r2]},
)
model_df

Unnamed: 0,MAE,RMSE,MAPE,R2
0,17953.54,28754.36,0.11,0.89


# Principal Component Analysis

# The End

In [36]:
print('This is the End, you know...')

This is the End, you know...
