# Import

In [21]:
import pandas as pd
from imblearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline as make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import plot_tree
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import OrdinalEncoder
import category_encoders as ce
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import cohen_kappa_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import recall_score, make_scorer
# Display setting: passing None for display.max_columns removes the column limit,
# so wide frames such as the feature table are shown without truncation.
pd.set_option('display.max_columns', None)

In [2]:
# Location of the iteration-5 housing classification data set.
# NOTE(review): this is an absolute, machine-specific path, so the notebook only
# runs on the author's machine; consider a path relative to the repository.
HOUSES_CSV = '/Users/merlesteffen/Documents/GitHub/HousingPrices/Data/iter-5/housing-classification-iter5.csv'

# Load the raw data into a DataFrame.
houses = pd.read_csv(HOUSES_CSV)

Remember to scale the features: KNN is a distance-based algorithm.

# Split Data

In [3]:
# Separate the target column from the features, then hold out 20% of the rows
# as a test set (fixed seed so the split is reproducible).
target_column = 'Expensive'
y = houses[target_column]
X = houses.drop(columns=target_column)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Preview the first rows of the training features (dtypes and missing values).
X_train.head()

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,MSZoning,Condition1,Heating,Street,CentralAir,Foundation,ExterQual,ExterCond,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,KitchenQual,FireplaceQu,MSSubClass,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,KitchenAbvGr,TotRmsAbvGrd,GarageYrBlt,GarageArea,OpenPorchSF,EnclosedPorch,3SsnPorch,MiscVal,MoSold,YrSold
254,8400,70.0,1314,3,0,0,1,250,0,RL,Norm,GasA,Pave,Y,CBlock,TA,Gd,TA,TA,No,Rec,TA,,20,5,6,1957,1957,0.0,922,0,392,1314,0,0,1314,1,0,1,0,1,5,1957.0,294,0,0,0,0,6,2010
1066,7837,59.0,799,3,1,0,2,0,0,RL,Norm,GasA,Pave,Y,PConc,Gd,TA,Gd,TA,No,Unf,TA,TA,60,6,7,1993,1994,0.0,0,0,799,799,772,0,1571,0,0,2,1,1,7,1993.0,380,40,0,0,0,5,2009
638,8777,67.0,796,2,0,0,0,328,0,RL,Feedr,GasA,Pave,Y,CBlock,TA,TA,Fa,TA,No,Unf,TA,,30,5,7,1910,1950,0.0,0,0,796,796,0,0,796,0,0,1,0,1,4,,0,0,164,0,0,5,2008
799,7200,60.0,731,3,2,0,1,0,0,RL,Feedr,GasA,Pave,Y,BrkTil,TA,TA,Gd,TA,No,ALQ,Gd,TA,50,5,7,1937,1950,252.0,569,0,162,981,787,0,1768,1,0,1,1,1,7,1939.0,240,0,264,0,0,6,2007
380,5000,50.0,1026,3,1,0,1,0,0,RL,Norm,GasA,Pave,Y,BrkTil,TA,TA,TA,TA,No,LwQ,Gd,Gd,50,5,6,1924,1950,0.0,218,0,808,1026,665,0,1691,0,0,2,0,1,6,1924.0,308,0,242,0,0,5,2010


First observations: missing values need to be imputed and the features need to be scaled. Use two pipelines (numeric and categorical) combined in a column transformer.

# Prepare & Pipeline

In [5]:
# Split the column names by dtype: object columns are treated as categorical,
# everything else as numeric. The lists drive the ColumnTransformer routing.
categoric_features = X_train.select_dtypes(include=["object"]).columns.tolist()
numeric_features = X_train.select_dtypes(exclude=["object"]).columns.tolist()

Idea: try out different scalers (robust or standard).

In [6]:
# Numeric branch: impute missing values only (default SimpleImputer strategy;
# the strategy is tuned later through the search parameter grid).
numeric_pipe = make_pipeline(SimpleImputer())

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. Categories seen fewer than 30 times are grouped as
# "infrequent", and unknown categories at predict time are mapped to that
# group when it exists.
most_frequent_imputer = SimpleImputer(strategy='most_frequent')
onehot_encoder = OneHotEncoder(
    sparse_output=False,
    drop='first',
    min_frequency=30,
    handle_unknown='infrequent_if_exist',
)
categoric_pipe = make_pipeline(most_frequent_imputer, onehot_encoder)

In [7]:
# Route each group of columns through its own pipeline. The transformer names
# ("num_pipe", "cat_pipe_onehot") are part of the parameter paths used by the
# hyper-parameter searches, so they must not be renamed.
column_pipes = [
    ("num_pipe", numeric_pipe, numeric_features),
    ("cat_pipe_onehot", categoric_pipe, categoric_features),
]
preprocessor = ColumnTransformer(transformers=column_pipes)

In [None]:
# Baseline model: k-nearest neighbours with k=3.
# KNN is distance based, so the preprocessed features are standardised before
# they reach the classifier; without scaling, columns with large magnitudes
# dominate the distance computation. This mirrors the refined pipeline below.
knn = KNeighborsClassifier(n_neighbors=3)
model_pipeline = make_pipeline(preprocessor, StandardScaler(), knn)

# Fit Model

In [None]:
# Fit the preprocessing steps and the KNN classifier on the training split only.
model_pipeline.fit(X_train, y_train)

# Evaluate Model

In [None]:
# Accuracy on the data the model was fitted on (optimistic upper bound).
y_train_pred = model_pipeline.predict(X_train)
accuracy_score(y_true=y_train, y_pred=y_train_pred)

In [None]:
# Accuracy on the held-out test split.
y_test_pred = model_pipeline.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_test_pred)

# Refine Model

In [18]:
# Pipeline used for hyper-parameter tuning: preprocessing -> standardisation ->
# KNN with default settings (the searches below override the KNN parameters).
# make_pipeline names the steps after their classes, which is what the
# "kneighborsclassifier__..." keys in the parameter grids refer to.
feature_scaler = StandardScaler()
knn_default = KNeighborsClassifier()
model_pipeline = make_pipeline(preprocessor, feature_scaler, knn_default)

In [19]:
# Search space for the randomized search. Keys are "<step>__<parameter>" paths
# into model_pipeline.
param_grid = {
    # --- preprocessing: how missing numeric values are filled ---
    "columntransformer__num_pipe__simpleimputer__strategy": ["mean", "median", "most_frequent"],
    # --- KNN: parameters that change the predictions ---
    "kneighborsclassifier__n_neighbors": range(2, 10),
    "kneighborsclassifier__weights": ['uniform', 'distance'],
    "kneighborsclassifier__p": [1, 2],  # 1 = Manhattan, 2 = Euclidean distance
    # --- KNN: neighbour-search implementation ---
    # NOTE(review): algorithm and leaf_size are documented as affecting the
    # speed/memory of the neighbour search; they multiply the grid size by 234.
    "kneighborsclassifier__algorithm": ['ball_tree', 'kd_tree', 'brute'],
    "kneighborsclassifier__leaf_size": range(2, 80),
}


In [22]:
# Wrap recall_score as a scorer object so the search can optimise recall.
recall_scori = make_scorer(score_func=recall_score)

In [23]:
# Coarse search: sample 10000 parameter combinations from param_grid and score
# each with 5-fold cross-validation on recall (50000 fits in total).
# n_jobs=-2 uses all CPU cores but one; random_state fixes the sampled candidates.
search = RandomizedSearchCV(
    estimator=model_pipeline,
    param_distributions=param_grid,
    n_iter=10000,
    scoring=recall_scori,
    cv=5,
    n_jobs=-2,
    random_state=42,
    verbose=1,
)

In [24]:
# Run the randomized search on the training split. This is the expensive cell:
# the recorded run performed 50000 fits.
search.fit(X_train, y_train)

Fitting 5 folds for each of 10000 candidates, totalling 50000 fits




In [25]:
# Mean cross-validated score of the best candidate. The search was configured
# with the recall scorer, so this value is a recall, not an accuracy.
search.best_score_

0.692156862745098

In [26]:
# Training accuracy of the best model found by the search.
# NOTE: if the selected model uses weights='distance' (as in the recorded run),
# every training point is its own zero-distance neighbour, so a training
# accuracy of 1.0 is expected and says nothing about generalisation.
y_train_pred = search.predict(X_train)
accuracy_score(y_true=y_train, y_pred=y_train_pred)

1.0

In [27]:
# Accuracy of the best model on the held-out test split.
y_test_pred = search.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_test_pred)

0.9486301369863014

In [28]:
# Parameter combination that achieved the best cross-validated score.
search.best_params_

{'kneighborsclassifier__weights': 'distance',
 'kneighborsclassifier__p': 1,
 'kneighborsclassifier__n_neighbors': 2,
 'kneighborsclassifier__leaf_size': 37,
 'kneighborsclassifier__algorithm': 'brute',
 'columntransformer__num_pipe__simpleimputer__strategy': 'median'}

In [29]:
# Recall on the held-out test split, the metric the search optimised.
recall_score(y_true=y_test, y_pred=y_test_pred)

0.75

In [16]:
# Keep the winning n_neighbors and leaf_size so the grid search below can be
# centred on them. (best_samples_leaf holds the KNN leaf_size value.)
# NOTE(review): this cell's execution count is lower than the cells above and
# the recorded output below does not match the best_params_ shown earlier;
# re-run the notebook top to bottom before trusting these values.
best_params = search.best_params_
best_max_neighbors = best_params['kneighborsclassifier__n_neighbors']
best_samples_leaf = best_params['kneighborsclassifier__leaf_size']

In [17]:
# Show the two values the fine grid will be centred on: (n_neighbors, leaf_size).
best_max_neighbors, best_samples_leaf

(5, 36)

## Grid Search

In [None]:
# Fine grid centred on the best values from the randomized search
# (best value - 4 ... best value + 4 for n_neighbors and leaf_size).
# Both parameters must be >= 1, so the lower bounds are clamped: the randomized
# search can return values as low as 2, and an unclamped "best - 4" would put
# zero and negative values into the grid, making those candidates fail to fit.
# NOTE(review): algorithm and leaf_size are documented as affecting the
# speed/memory of the neighbour search; consider dropping them to shrink the grid.
param_grid = {
    "columntransformer__num_pipe__simpleimputer__strategy": ["mean", "median", "most_frequent"],
    "kneighborsclassifier__n_neighbors": range(max(1, best_max_neighbors - 4), best_max_neighbors + 5),
    "kneighborsclassifier__weights": ['uniform', 'distance'],
    "kneighborsclassifier__algorithm": ['ball_tree', 'kd_tree', 'brute'],
    "kneighborsclassifier__leaf_size": range(max(1, best_samples_leaf - 4), best_samples_leaf + 5),
    "kneighborsclassifier__p": [1, 2]
}

In [None]:
# Exhaustive search over the fine grid with 10-fold cross-validation.
# The scorer is passed explicitly so this search optimises recall, the same
# metric as the randomized search that produced the grid centre; without it
# GridSearchCV would fall back to the estimator's default score (accuracy).
# n_jobs=-2 uses all CPU cores but one.
search = GridSearchCV(
    estimator=model_pipeline,
    param_grid=param_grid,
    scoring=recall_scori,
    cv=10,
    n_jobs=-2,
    verbose=1,
)

In [None]:
# Run the exhaustive grid search on the training split.
search.fit(X_train, y_train)

In [None]:
# Mean cross-validated score of the best grid candidate.
search.best_score_

In [None]:
# Training accuracy of the best model found by the grid search.
y_train_pred = search.predict(X_train)
accuracy_score(y_true=y_train, y_pred=y_train_pred)

In [None]:
# Accuracy of the best grid-search model on the held-out test split.
y_test_pred = search.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_test_pred)

In [None]:
# Parameter combination selected by the grid search.
search.best_params_