# Import

In [50]:
import pandas as pd
from imblearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline as make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import plot_tree
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import OrdinalEncoder
import category_encoders as ce
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import cohen_kappa_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score, make_scorer
pd.set_option('display.max_columns', None)

In [79]:
houses = pd.read_csv('/Users/merlesteffen/Documents/GitHub/HousingPrices/Data/iter-5/housing-classification-iter5.csv')

Remember to scale for a distance algorithm

# Split Data

In [80]:
X = houses.drop(columns='Expensive')
y = houses['Expensive']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [81]:
X_train.head()

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,MSZoning,Condition1,Heating,Street,CentralAir,Foundation,ExterQual,ExterCond,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,KitchenQual,FireplaceQu,MSSubClass,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,KitchenAbvGr,TotRmsAbvGrd,GarageYrBlt,GarageArea,OpenPorchSF,EnclosedPorch,3SsnPorch,MiscVal,MoSold,YrSold
254,8400,70.0,1314,3,0,0,1,250,0,RL,Norm,GasA,Pave,Y,CBlock,TA,Gd,TA,TA,No,Rec,TA,,20,5,6,1957,1957,0.0,922,0,392,1314,0,0,1314,1,0,1,0,1,5,1957.0,294,0,0,0,0,6,2010
1066,7837,59.0,799,3,1,0,2,0,0,RL,Norm,GasA,Pave,Y,PConc,Gd,TA,Gd,TA,No,Unf,TA,TA,60,6,7,1993,1994,0.0,0,0,799,799,772,0,1571,0,0,2,1,1,7,1993.0,380,40,0,0,0,5,2009
638,8777,67.0,796,2,0,0,0,328,0,RL,Feedr,GasA,Pave,Y,CBlock,TA,TA,Fa,TA,No,Unf,TA,,30,5,7,1910,1950,0.0,0,0,796,796,0,0,796,0,0,1,0,1,4,,0,0,164,0,0,5,2008
799,7200,60.0,731,3,2,0,1,0,0,RL,Feedr,GasA,Pave,Y,BrkTil,TA,TA,Gd,TA,No,ALQ,Gd,TA,50,5,7,1937,1950,252.0,569,0,162,981,787,0,1768,1,0,1,1,1,7,1939.0,240,0,264,0,0,6,2007
380,5000,50.0,1026,3,1,0,1,0,0,RL,Norm,GasA,Pave,Y,BrkTil,TA,TA,TA,TA,No,LwQ,Gd,Gd,50,5,6,1924,1950,0.0,218,0,808,1026,665,0,1691,0,0,2,0,1,6,1924.0,308,0,242,0,0,5,2010


First obersvations: Need to replace missing values and scale. Use two pipes and column transformer.

# Prepare & Pipeline

In [82]:
categoric_features = list(X_train.select_dtypes(include=["object"]))
numeric_features = list(X_train.select_dtypes(exclude=["object"]))

In [83]:
numeric_pipe = make_pipeline(
    SimpleImputer()
)
categoric_pipe = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(sparse_output=False, drop='first', min_frequency=30, handle_unknown='infrequent_if_exist')
)

In [84]:
preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipe", numeric_pipe, numeric_features),
        ("cat_pipe_onehot", categoric_pipe, categoric_features),
    ]
)

In [85]:
model_pipeline = make_pipeline(preprocessor, StandardScaler(), RandomForestClassifier(random_state=42))

# Fit Model

In [86]:
model_pipeline.fit(X_train, y_train)

# Evaluate Model

In [87]:
y_train_pred = model_pipeline.predict(X_train)

accuracy_score(y_train, y_train_pred)

1.0

In [88]:
y_test_pred = model_pipeline.predict(X_test)

accuracy_score(y_test, y_test_pred)

0.9554794520547946

In [89]:
recall_score(y_test, y_test_pred)

0.75

# Refine Model

In [90]:
model_pipeline_rf = make_pipeline(preprocessor, StandardScaler(), RandomForestClassifier(random_state=42))

## Randomized Grid Search

In [91]:
param_grid = {
    "columntransformer__num_pipe__simpleimputer__strategy": ["mean", "median", "most_frequent"],
    "randomforestclassifier__n_estimators": range(50,150,10),
    "randomforestclassifier__criterion" : ['gini', 'entroopy'],
    "randomforestclassifier__min_samples_leaf" : range(1,25)
}


In [92]:
search = RandomizedSearchCV(
    model_pipeline_rf,
    param_grid,
    n_iter=1000,
    cv=5,
    verbose=1,
    n_jobs=-2,
    random_state=42
)

In [93]:
search.fit(X_train, y_train)

Fitting 5 folds for each of 1000 candidates, totalling 5000 fits


2480 fits failed out of a total of 5000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
394 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/merlesteffen/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/merlesteffen/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/merlesteffen/anaconda3/lib/python3.11/site-packages/sklearn/pipeline.py", line 427, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/merles

In [94]:
search.best_score_

0.9571842558967022

In [95]:
y_train_pred = search.predict(X_train)

accuracy_score(y_train, y_train_pred)

0.9837328767123288

In [96]:
y_test_pred = search.predict(X_test)

accuracy_score(y_test, y_test_pred)

0.9657534246575342

In [97]:
search.best_params_

{'randomforestclassifier__n_estimators': 120,
 'randomforestclassifier__min_samples_leaf': 3,
 'randomforestclassifier__criterion': 'gini',
 'columntransformer__num_pipe__simpleimputer__strategy': 'median'}

In [99]:
recall_score(y_test, y_test_pred)

0.7916666666666666

## Grid Search

In [110]:
param_grid = {
    "columntransformer__num_pipe__simpleimputer__strategy": ["median"],
    "randomforestclassifier__n_estimators": range(110,130),
    "randomforestclassifier__criterion" : ['gini', 'entropy'],
    "randomforestclassifier__min_samples_leaf" : range(2,5)
}

In [111]:
search = GridSearchCV(
    model_pipeline_rf,
    param_grid,
    cv=10,
    n_jobs=-2,
    verbose=1
)

In [112]:
search.fit(X_train, y_train)

Fitting 10 folds for each of 120 candidates, totalling 1200 fits




In [113]:
search.best_score_

0.9554818744473916

In [114]:
y_train_pred = search.predict(X_train)

accuracy_score(y_train, y_train_pred)

0.9957191780821918

In [115]:
y_test_pred = search.predict(X_test)

accuracy_score(y_test, y_test_pred)

0.9554794520547946

In [116]:
search.best_params_

{'columntransformer__num_pipe__simpleimputer__strategy': 'median',
 'randomforestclassifier__criterion': 'entropy',
 'randomforestclassifier__min_samples_leaf': 2,
 'randomforestclassifier__n_estimators': 110}

In [117]:
recall_score(y_test, y_test_pred)

0.75

# Recall Scorer Model

In [127]:
param_grid = {
    "columntransformer__num_pipe__simpleimputer__strategy": ["mean", "median", "most_frequent"],
    "randomforestclassifier__n_estimators": range(50,150,10),
    "randomforestclassifier__criterion" : ['gini', 'entroopy'],
    "randomforestclassifier__min_samples_leaf" : range(1,25)
}

In [128]:
recall_scori=make_scorer(recall_score)

In [130]:
search = RandomizedSearchCV(
    model_pipeline_rf,
    param_grid,
    n_iter=1000,
    cv=5,
    verbose=1,
    n_jobs=-2,
    scoring=recall_scori,
    random_state=42
)

In [131]:
search.fit(X_train, y_train)

Fitting 5 folds for each of 1000 candidates, totalling 5000 fits


2480 fits failed out of a total of 5000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
409 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/merlesteffen/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/merlesteffen/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/merlesteffen/anaconda3/lib/python3.11/site-packages/sklearn/pipeline.py", line 427, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/merles

In [132]:
search.best_score_

0.7572192513368984

In [133]:
y_train_pred = search.predict(X_train)

accuracy_score(y_train, y_train_pred)

0.9811643835616438

In [134]:
y_test_pred = search.predict(X_test)

accuracy_score(y_test, y_test_pred)

0.9657534246575342

In [135]:
search.best_params_

{'randomforestclassifier__n_estimators': 140,
 'randomforestclassifier__min_samples_leaf': 3,
 'randomforestclassifier__criterion': 'gini',
 'columntransformer__num_pipe__simpleimputer__strategy': 'most_frequent'}

In [136]:
recall_score(y_test, y_test_pred)

0.7916666666666666