# Housing price prediction

## Import libraries and load the CSV file

In [28]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn import set_config
set_config(transform_output='pandas')
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, mean_absolute_percentage_error, r2_score
# Load the housing data; the path is relative to the notebook's working directory.
housing_csv_path = 'housing_data_regression/housing_iteration_6_regression.csv'
housing_df = pd.read_csv(housing_csv_path)

## Preprocessor

In [30]:
# -----------------------------
# Feature and Target Separation
# -----------------------------
# Select/drop instead of `pop`: popping removes the column from `housing_df`
# in place, so re-running this cell without reloading the CSV raises KeyError.
y = housing_df["SalePrice"]
X = housing_df.drop(columns="SalePrice")
# -----------------------------
# Train-Test Split
# -----------------------------
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=31416)
# -----------------------------
# Train data
# -----------------------------
# Training features and target combined in one new frame (X_train is left untouched).
train_data = X_train.assign(SalePrice=y_train)
# -----------------------------
# Preprocessing Pipelines
# -----------------------------
# Identify numerical and categorical features
num_feat = X_train.select_dtypes(include="number").columns
cat_feat = X_train.select_dtypes(exclude="number").columns

# Numerical pipeline: impute missing values using the mean.
# The step name "imputer" is part of the grid-search parameter key
# "columntransformer__num__imputer__strategy", so it must not be renamed.
num_imputer = SimpleImputer(strategy="mean")
num_pipe = Pipeline([
    ("imputer", num_imputer)
])

# Categorical preprocessing:
# - Impute missing categorical values with a constant 'N_A'
cat_imputer = SimpleImputer(strategy='constant', fill_value='N_A')

# Ordered category lists (lowest to highest) for the ordinally encoded features.
# 'N_A' is the imputer's fill value and is placed first where it is listed.
ExterQual_cats = ["Po", "Fa", "TA", "Gd", "Ex"]
ExterCond_cats = ["Po", "Fa", "TA", "Gd", "Ex"]
BsmtQual_cats = ["N_A", "Po", "Fa", "TA", "Gd", "Ex"]
BsmtCond_cats = ["N_A", "Po", "Fa", "TA", "Gd", "Ex"]
BsmtExposure_cats = ["N_A", "No", "Mn", "Av", "Gd"]
BsmtFinType1_cats = ["N_A", "Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"]
BsmtFinType2_cats = ["N_A", "Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"]
HeatingQC_cats = ["Po", "Fa", "TA", "Gd", "Ex"]
KitchenQual_cats = ["Po", "Fa", "TA", "Gd", "Ex"]
FireplaceQu_cats = ["N_A", "Po", "Fa", "TA", "Gd", "Ex"]
GarageQual_cats = ["N_A", "Po", "Fa", "TA", "Gd", "Ex"]
GarageCond_cats = ["N_A", "Po", "Fa", "TA", "Gd", "Ex"]
PoolQC_cats = ["N_A", "Fa", "TA", "Gd", "Ex"]

# Map each ordinal feature to its category order in ONE place, so the feature
# list and the category list handed to the encoder cannot drift out of alignment.
ordinal_categories = {
    "ExterQual": ExterQual_cats,
    "ExterCond": ExterCond_cats,
    "BsmtQual": BsmtQual_cats,
    "BsmtCond": BsmtCond_cats,
    "BsmtExposure": BsmtExposure_cats,
    "BsmtFinType1": BsmtFinType1_cats,
    "BsmtFinType2": BsmtFinType2_cats,
    "HeatingQC": HeatingQC_cats,
    "KitchenQual": KitchenQual_cats,
    "FireplaceQu": FireplaceQu_cats,
    "GarageQual": GarageQual_cats,
    "GarageCond": GarageCond_cats,
    "PoolQC": PoolQC_cats,
}
ord_feat = list(ordinal_categories)
categories = list(ordinal_categories.values())

# Initialize encoder.
# Values missing from a feature's list (e.g. an imputed 'N_A' in ExterQual, whose
# list has no 'N_A' entry) are encoded as -1 instead of raising during
# cross-validation or prediction. Listed categories keep their 0..n-1 codes.
ord_encoder = OrdinalEncoder(
    categories=categories,
    handle_unknown="use_encoded_value",
    unknown_value=-1,
)

# Identify nominal categorical features (those that will be one-hot encoded).
# Filtering `cat_feat` in order (rather than taking a set difference) keeps the
# resulting column order the same from one kernel session to the next.
oh_feat = [col for col in cat_feat if col not in ordinal_categories]
oh_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

# Combine ordinal and nominal encoders using a column transformer.
encoder = make_column_transformer(
    (ord_encoder, ord_feat),
    (oh_encoder, oh_feat)
)

# Create a categorical pipeline: imputation followed by encoding.
cat_pipe = Pipeline([
    ("imputer", cat_imputer),
    ("encoder", encoder)
])

# Combine numerical and categorical pipelines into a full preprocessor.
preprocessor = ColumnTransformer(transformers=[
    ("num", num_pipe, num_feat),
    ("cat", cat_pipe, cat_feat)
])

preprocessor

## Comparison of different models with GridSearchCV

In [35]:
# Define a function to perform a grid search, which helps to avoid duplicating code for different models
def run_grid_search(model, param_grid, X_train, y_train, preprocessor, scaler, cv=5, verbose=1,
                    scoring=None, n_jobs=None):
    """Tune ``model`` with an exhaustive grid search over a preprocessing pipeline.

    The pipeline is built with ``make_pipeline(preprocessor, scaler, model)``, so
    the keys of ``param_grid`` must use the step names that ``make_pipeline``
    generates (e.g. ``"columntransformer__..."``).

    Parameters
    ----------
    model : estimator
        Final estimator of the pipeline.
    param_grid : dict
        Parameter names mapped to the lists of values to try.
    X_train, y_train
        Training features and target the search is fitted on.
    preprocessor : transformer
        First pipeline step.
    scaler : transformer
        Second pipeline step, applied to the preprocessor's output.
    cv : int, default=5
        Forwarded to ``GridSearchCV``.
    verbose : int, default=1
        Forwarded to ``GridSearchCV``.
    scoring : optional, default=None
        Forwarded to ``GridSearchCV``; ``None`` keeps its default scoring.
    n_jobs : int or None, default=None
        Forwarded to ``GridSearchCV``; ``None`` keeps its default parallelism.

    Returns
    -------
    GridSearchCV
        The search object after ``fit`` has been called on the training data.
    """
    # Create a pipeline that first applies the data preprocessing steps, then fits the model
    pipe = make_pipeline(preprocessor, scaler, model)

    # GridSearchCV will test all possible combinations of parameters defined in 'param_grid'
    grid_search = GridSearchCV(
        pipe,
        param_grid,
        cv=cv,
        verbose=verbose,
        scoring=scoring,
        n_jobs=n_jobs,
    )

    # Fit the model on the training data with the various parameter combinations
    grid_search.fit(X_train, y_train)

    # Return the trained GridSearchCV object which holds the best parameters and model
    return grid_search

# Scaler passed to run_grid_search, which places it between the preprocessor and the model.
# NOTE(review): the models tuned in this cell are all tree-based; confirm whether scaling is needed for them.
scaler = StandardScaler()

# Every grid also tunes the imputation strategy of the numeric pipeline.
imputer_strategy_key = "columntransformer__num__imputer__strategy"
imputer_strategies = ("mean", "median")

# Hyperparameter grid for the decision tree model
dt_param_grid = {
    imputer_strategy_key: list(imputer_strategies),
    "decisiontreeregressor__criterion": ["squared_error", "friedman_mse", "absolute_error"],
    "decisiontreeregressor__splitter": ["best", "random"],
    "decisiontreeregressor__max_depth": [5, 10, 20, 30, 50],
    "decisiontreeregressor__min_samples_leaf": [1, 2, 4, 10],
}

# Hyperparameter grid for the gradient boosting model
gb_param_grid = {
    imputer_strategy_key: list(imputer_strategies),
    # number of boosting stages
    "gradientboostingregressor__n_estimators": [100, 200],
    # max depth of individual estimators
    "gradientboostingregressor__max_depth": [4, 5, 6, 7],
    # minimum samples at a leaf node
    "gradientboostingregressor__min_samples_leaf": [1, 2, 4],
    # loss function to optimize
    "gradientboostingregressor__loss": ["squared_error", "absolute_error"],
}

# Hyperparameter grid for the random forest model
rf_param_grid = {
    imputer_strategy_key: list(imputer_strategies),
    # number of trees in the forest
    "randomforestregressor__n_estimators": [100, 200],
    # maximum depth of each tree
    "randomforestregressor__max_depth": [None, 10, 20, 30],
    # minimum samples at each leaf
    "randomforestregressor__min_samples_leaf": [1, 2, 4],
    # number of features to consider at each split
    "randomforestregressor__max_features": ["sqrt", "log2", None],
}


# Seed given to every estimator so that repeated runs of the search are reproducible
# (without it each run fits differently randomized trees).
model_random_state = 31416

# Create Model/ param_grid dict:
model_param_dict = {
    'model': [
        DecisionTreeRegressor(random_state=model_random_state),
        GradientBoostingRegressor(random_state=model_random_state),
        RandomForestRegressor(random_state=model_random_state),
    ],
    'param_grid': [dt_param_grid, gb_param_grid, rf_param_grid],
}

# Get the lists for each key
model_list = model_param_dict['model']
param_list = model_param_dict['param_grid']

# zip() stops at the shorter list, so a missing grid would silently skip a model.
if len(model_list) != len(param_list):
    raise ValueError("model_param_dict needs exactly one param_grid per model")

model_search_list = []

# Loop through both lists simultaneously
for model, param_grid in zip(model_list, param_list):
    # Run the grid search for each regressor using its own parameter grid
    model_search = run_grid_search(
        model,
        param_grid,
        X_train,
        y_train,
        preprocessor,
        scaler
    )

    # Appending inside the loop keeps finished searches available if a later one fails.
    model_search_list.append(model_search)

Fitting 5 folds for each of 240 candidates, totalling 1200 fits
Fitting 5 folds for each of 96 candidates, totalling 480 fits
Fitting 5 folds for each of 144 candidates, totalling 720 fits


## Evaluation of models

In [40]:
model_scores = []


def score_model(model_name, y_true, y_pred):
    """Return a dict of rounded regression metrics for one model.

    Parameters
    ----------
    model_name : str
        Label stored under the 'Model' key.
    y_true
        True target values.
    y_pred
        Predicted target values.

    Returns
    -------
    dict
        'Model', 'MAE ($)' and 'RMSE ($)' (2 decimals), 'MAPE (%)'
        (fraction multiplied by 100, 2 decimals) and 'R-Squared' (3 decimals).
    """
    mae = mean_absolute_error(y_true, y_pred)
    rmse = root_mean_squared_error(y_true, y_pred)
    mape_percent = 100 * mean_absolute_percentage_error(y_true, y_pred)
    r_squared = r2_score(y_true, y_pred)

    return {
        'Model': model_name,
        'MAE ($)': round(mae, 2),
        'RMSE ($)': round(rmse, 2),
        'MAPE (%)': round(mape_percent, 2),
        'R-Squared': round(r_squared, 3),
    }

# Display names, in the same order as the searches stored in model_search_list.
model_name_list = ["Decision Tree", "GradientBoosting", "RandomForest"]

# zip() stops at the shorter list, so a mismatch would silently drop or mislabel a model.
if len(model_name_list) != len(model_search_list):
    raise ValueError("model_name_list needs exactly one name per fitted search")

# Score each fitted search on the held-out test set, pairing names and searches directly
# instead of indexing with a manual counter.
model_scores = [
    score_model(name, y_test, search.predict(X_test))
    for name, search in zip(model_name_list, model_search_list)
]

pd.DataFrame(model_scores)

Unnamed: 0,Model,MAE ($),RMSE ($),MAPE (%),R-Squared
0,Decision Tree,22249.09,35596.37,13.35,0.803
1,GradientBoosting,15310.94,30720.08,8.68,0.853
2,RandomForest,15831.97,30859.64,9.09,0.852
