# Housing value classification

## Import of libs and CSV file

In [15]:
from sklearn import set_config
set_config(transform_output='pandas')
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import make_column_transformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB, ComplementNB, GaussianNB, MultinomialNB
from sklearn.svm import LinearSVC, NuSVC, SVC
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
    accuracy_score,
    recall_score,
    precision_score,
    f1_score,
    balanced_accuracy_score,
    cohen_kappa_score
)
import pandas as pd
housing_df = pd.read_csv('housing_data/housing_iteration_5_classification.csv')

## Creation of Preprocessor
One pipeline for numerical features and one for categorical features with OneHot and Ordinal encoding.

In [17]:
# -----------------------------
# Feature and Target Separation
# -----------------------------
y = housing_df.pop("Expensive")
X = housing_df

# -----------------------------
# Train-Test Split
# -----------------------------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=31416)

# -----------------------------
# Preprocessing Pipelines
# -----------------------------

# Identify numerical and categorical features
num_feat = X.select_dtypes(include="number").columns
cat_feat = X.select_dtypes(exclude="number").columns

# Numerical pipeline: Impute missing values using the mean.
num_imputer = SimpleImputer(strategy="mean")
num_pipe = make_pipeline(num_imputer)

# Categorical preprocessing:
# - Impute missing categorical values with a constant 'N_A'
cat_imputer = SimpleImputer(strategy='constant', fill_value='N_A')

# For ordinal encoding, specify the feature and its explicit order.
# Identify which columns you want to ordinally encode
ord_feat = ["ExterQual", "ExterCond", "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2", "HeatingQC", "KitchenQual", "FireplaceQu", "GarageQual", "GarageCond", "PoolQC"]
# Create ordered lists for each of the ordinal features. Make sure to include the fill_value option
ExterQual_cats = ["Po", "Fa", "TA", "Gd", "Ex"]
ExterCond_cats = ["Po", "Fa", "TA", "Gd", "Ex"]
BsmtQual_cats = ["N_A", "Po", "Fa", "TA", "Gd", "Ex"]
BsmtCond_cats = ["N_A", "Po", "Fa", "TA", "Gd", "Ex"]
BsmtExposure_cats = ["N_A", "No", "Mn", "Av", "Gd"]
BsmtFinType1_cats = ["N_A", "Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"]
BsmtFinType2_cats = ["N_A", "Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"]
HeatingQC_cats = ["Po", "Fa", "TA", "Gd", "Ex"]
KitchenQual_cats = ["Po", "Fa", "TA", "Gd", "Ex"]
FireplaceQu_cats = ["N_A", "Po", "Fa", "TA", "Gd", "Ex"]
GarageQual_cats = ["N_A", "Po", "Fa", "TA", "Gd", "Ex"]
GarageCond_cats = ["N_A", "Po", "Fa", "TA", "Gd", "Ex"]
PoolQC_cats = ["N_A", "Fa", "TA", "Gd", "Ex"]
# Create a list of the ordered lists
categories = [ExterQual_cats, ExterCond_cats, BsmtQual_cats, BsmtCond_cats, BsmtExposure_cats, BsmtFinType1_cats, BsmtFinType2_cats, HeatingQC_cats, KitchenQual_cats, FireplaceQu_cats, GarageQual_cats, GarageCond_cats, PoolQC_cats]
# Initialize encoder
ord_encoder = OrdinalEncoder(categories=categories)

# Identify nominal categorical features (those that will be one-hot encoded)
oh_feat = list(set(cat_feat) - set(ord_feat))
oh_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

# Combine ordinal and nominal encoders using a column transformer.
encoder = make_column_transformer(
    (ord_encoder, ord_feat),
    (oh_encoder, oh_feat)
)

# Create a categorical pipeline: imputation followed by encoding.
cat_pipe = make_pipeline(
    cat_imputer,
    encoder
)

# Combine numerical and categorical pipelines into a full preprocessor.
preprocessor = make_column_transformer(
    (num_pipe, num_feat),
    (cat_pipe, cat_feat)
)
preprocessor

## Comparison of different models with GridSearchCV

In [19]:
# Define a function to perform a grid search, which helps to avoid duplicating code for different models
def run_grid_search(model, param_grid, X_train, y_train, preprocessor, scaler, cv=5, verbose=1):
    # Create a pipeline that first applies the data preprocessing steps, then fits the model
    pipe = make_pipeline(preprocessor, scaler, model)

    # GridSearchCV will test all possible combinations of parameters defined in 'param_grid'
    grid_search = GridSearchCV(pipe, param_grid, cv=cv, verbose=verbose)

    # Fit the model on the training data with the various parameter combinations
    grid_search.fit(X_train, y_train)

    # Return the trained GridSearchCV object which holds the best parameters and model
    return grid_search

# define scaler:
scaler = StandardScaler()

# Define a dictionary of hyperparameters to tune for the decision tree model
dt_param_grid = {
    "columntransformer__pipeline-1__simpleimputer__strategy": ["mean", "median"],
    "decisiontreeclassifier__max_depth": range(2, 14, 2),
    "decisiontreeclassifier__min_samples_leaf": range(3, 12, 2)
}
# Define a dictionary of hyperparameters to tune for the KNN model
knn_param_grid = {
    "columntransformer__pipeline-1__simpleimputer__strategy":["mean", "median"],
    "kneighborsclassifier__n_neighbors": range(1, 11),
    "kneighborsclassifier__weights": ["uniform", "distance"]
}
# Define a dictionary of hyperparameters to tune for the RandomForest model
rf_param_grid = {
    "columntransformer__pipeline-1__simpleimputer__strategy":["mean", "median"],
    "randomforestclassifier__max_depth": range(2, 12),
    "randomforestclassifier__min_samples_leaf": range(2, 10)
}
# Define a dictionary of hyperparameters to tune for the GradientBooster model
gb_param_grid = {
    "columntransformer__pipeline-1__simpleimputer__strategy":["mean", "median"],
    "gradientboostingclassifier__min_samples_leaf": range(2, 14, 2),
    "gradientboostingclassifier__max_depth": range(3, 12, 2),
}
# Define a dictionary of hyperparameters to tune for the SVC model
svc_param_grid = {
    "columntransformer__pipeline-1__simpleimputer__strategy":["mean", "median"],
    "svc__kernel":['linear', 'poly', 'rbf', 'sigmoid'],
    "svc__shrinking":[True, False]
}

# Create Model/ param_grid dict:
model_param_dict = {'model':[DecisionTreeClassifier(), KNeighborsClassifier(), RandomForestClassifier(), GradientBoostingClassifier(), SVC()], 
                    'param_grid': [dt_param_grid, knn_param_grid, rf_param_grid, gb_param_grid, svc_param_grid]}

# Get the lists for each key
model_list = model_param_dict['model']
param_list = model_param_dict['param_grid']
model_search_list = []

# Loop through both lists simultaneously
for model, param_grid in zip(model_list, param_list):
    # Run the grid search for all Classifiers using the specified parameters
    model_search = run_grid_search(
        model,
        param_grid,
        X_train,
        y_train,
        preprocessor,
        scaler
    )

    model_search_list.append(model_search)

Fitting 5 folds for each of 60 candidates, totalling 300 fits
Fitting 5 folds for each of 40 candidates, totalling 200 fits
Fitting 5 folds for each of 160 candidates, totalling 800 fits
Fitting 5 folds for each of 60 candidates, totalling 300 fits
Fitting 5 folds for each of 16 candidates, totalling 80 fits


## Evaluation of models

In [21]:
# Function to get the scores for our model(s)
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    scores = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Specificity": recall_score(y_test, y_pred, pos_label=0),
        "F1 Score": f1_score(y_test, y_pred),
        "Balanced Accuracy": balanced_accuracy_score(y_test, y_pred),
        "Cohen's Kappa": cohen_kappa_score(y_test, y_pred)
    }
    return scores

# Create an empty DataFrame to store model evaluation results
model_scores_df = pd.DataFrame(columns=[
    "Model", "Accuracy", "Recall", "Precision",
    "Specificity", "F1 Score", "Balanced Accuracy", "Cohen's Kappa"
])

model_name_list = ["Decision Tree", "KNN", "RandomForest", "GradientBoosting", "SVC"]
count = 0
for model in model_search_list:
    # Evaluate the Decision Tree model
    dt_scores = evaluate_model(model, X_test, y_test)
    dt_scores["Model"] = model_name_list[count]
    count += 1
    
    # Convert the dictionary to a Series matching the DataFrame columns, then assign as a new row
    model_scores_df.loc[len(model_scores_df)] = pd.Series(dt_scores, index=model_scores_df.columns)

# Display the DataFrame
model_scores_df

Unnamed: 0,Model,Accuracy,Recall,Precision,Specificity,F1 Score,Balanced Accuracy,Cohen's Kappa
0,Decision Tree,0.924658,0.763158,0.690476,0.948819,0.725,0.855988,0.681476
1,KNN,0.934932,0.684211,0.787879,0.972441,0.732394,0.828326,0.695566
2,RandomForest,0.938356,0.657895,0.833333,0.980315,0.735294,0.819105,0.700956
3,GradientBoosting,0.938356,0.815789,0.738095,0.956693,0.775,0.886241,0.739389
4,SVC,0.938356,0.736842,0.777778,0.968504,0.756757,0.852673,0.721492
