# Import and organize Data

In [44]:
import os

import category_encoders as ce
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree

In [45]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

In [46]:
# Location of the iteration-4 housing CSV. The default is the original
# author's local checkout; set the HOUSING_CSV environment variable to run the
# notebook on another machine without editing this cell.
HOUSING_CSV = os.environ.get(
    'HOUSING_CSV',
    '/Users/merlesteffen/Documents/GitHub/HousingPrices/Data/iter-4/housing-classification-iter4.csv',
)
houses = pd.read_csv(HOUSING_CSV)

In [47]:
# Keep a copy of the frame as loaded, before any categories are merged or rows
# are dropped further down.
houses_all = houses.copy()

As you can see below, many categorical columns contain categories with only very few observations. Let us combine some of these categories to make the model more robust against overfitting; ideally, every remaining category has at least 50 observations. In some cases we can even drop a category that has very few observations.

In [6]:
# Optional inspection (kept commented out): print the value counts of every
# object-typed column to spot categories with few observations.
# cat_columns = houses.select_dtypes(include=["object"])
# for column in cat_columns:
#     print(f'The Name of the column is: {column}')
#     print(houses[column].value_counts())

In [48]:
# Per column: the categories that are merged into a single 'Other' level.
# An empty list means the column is left as it is.
categories_to_combine = {
    'MSZoning': ['FV', 'RH', 'C (all)'],
    'Condition1': ['RRAn', 'PosN', 'RRAe', 'PosA', 'RRNn', 'RRNe'],
    'Heating': ['GasW', 'Grav', 'Wall', 'OthW', 'Floor'],
    'Street': [],
    'CentralAir': [],  # All categories have counts over 50
    'Foundation': ['Slab', 'Stone', 'Wood'],
    'ExterQual': ['Ex', 'Fa'],
    'ExterCond': ['Fa', 'Ex', 'Po'],
    'BsmtQual': ['Fa', 'Ex'],
    'BsmtCond': ['Fa', 'Po'],
    'BsmtExposure': [],  # All categories have counts over 50
    'BsmtFinType1': [],
    'KitchenQual': ['Fa', 'Ex'],
    'FireplaceQu': ['Fa', 'Ex', 'Po'],
}

# Apply the mapping column by column; columns with nothing to merge are skipped.
for column, rare_levels in categories_to_combine.items():
    if not rare_levels:
        continue
    houses[column] = houses[column].replace(rare_levels, 'Other')

# Drop the rows whose Street is gravel ('Grvl'); per the original note there
# are only 6 such observations.
houses = houses.loc[houses['Street'] != 'Grvl']

In [49]:
# Snapshot of the frame after the rare categories were merged into 'Other'
# and the gravel-street rows were removed.
houses_other = houses.copy()

# Split Data

Only ever use one of the two cells below. Comment out the other.

In [50]:
# Features are every column except the target 'Expensive'; 20 % of the rows
# are held out as the test set (fixed seed for a repeatable split).
X, y = houses_other.drop('Expensive', axis=1), houses_other['Expensive']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [20]:
# Alternative split on the un-merged data (houses_all). Kept commented out
# while the houses_other split is the active one.
# X = houses_all.drop(columns='Expensive')
# y = houses_all['Expensive']
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Encoding

## Find categories that can be sorted meaningfully

In [11]:
# Disabled variant: category orders that include the merged 'Other' level,
# plus the matching encoder. Kept commented out in this cell.
# order_ExterQual = ['Other', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
# order_ExterCond = ['Other', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
# order_BsmtQual = ['NA', 'Other', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
# order_BsmtCond = ['NA', 'Other', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
# order_BsmtExposure = ['NA', 'No', 'Mn', 'Av', 'Gd']
# order_BsmtFinType1 = ['NA', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ']
# order_KitchenQual = ['Other', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
# order_FireplaceQu = ['NA', 'Other', 'Po', 'Fa', 'TA', 'Gd', 'Ex']

# ordinal_enc = OrdinalEncoder(
#     categories=[
#         order_ExterQual, order_ExterCond, order_BsmtQual, order_BsmtCond,
#         order_BsmtExposure, order_BsmtFinType1, order_KitchenQual, order_FireplaceQu
#     ],
#     handle_unknown='use_encoded_value',
#     unknown_value=-1
# )


In [12]:
# Custom category orders, listed in the sequence in which they are encoded.
# Several columns share the same five-step scale; the basement and fireplace
# columns additionally start with 'NA'.
quality_scale = ['Po', 'Fa', 'TA', 'Gd', 'Ex']

order_ExterQual = list(quality_scale)
order_ExterCond = list(quality_scale)
order_BsmtQual = ['NA'] + quality_scale
order_BsmtCond = ['NA'] + quality_scale
order_BsmtExposure = ['NA', 'No', 'Mn', 'Av', 'Gd']
order_BsmtFinType1 = ['NA', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ']
order_KitchenQual = list(quality_scale)
order_FireplaceQu = ['NA'] + quality_scale

# Ordinal encoder driven by the custom orders above, one order list per
# encoded column. Values not present in a list are encoded as -1.
ordinal_category_orders = [
    order_ExterQual,
    order_ExterCond,
    order_BsmtQual,
    order_BsmtCond,
    order_BsmtExposure,
    order_BsmtFinType1,
    order_KitchenQual,
    order_FireplaceQu,
]
ordinal_enc = OrdinalEncoder(
    categories=ordinal_category_orders,
    handle_unknown='use_encoded_value',
    unknown_value=-1,
)

In [13]:
# Columns that are ordinal-encoded; the sequence mirrors the order lists
# handed to ordinal_enc.
categoric_features_ordinal = [
    'ExterQual',
    'ExterCond',
    'BsmtQual',
    'BsmtCond',
    'BsmtExposure',
    'BsmtFinType1',
    'KitchenQual',
    'FireplaceQu',
]

# Missing basement/fireplace related values are filled with 'NA' before encoding.
categoric_pipe_ordinal = make_pipeline(
    SimpleImputer(fill_value='NA', strategy='constant'), ordinal_enc
)

In [14]:
# Split the feature columns by how they are preprocessed.
# The one-hot list is built with an order-preserving comprehension rather than
# a set difference: iterating a set of strings can yield a different order in
# each interpreter session, which would make the column order handed to the
# ColumnTransformer (and hence to the tree) non-reproducible.
all_categorical_cols = list(X_train.select_dtypes(include=["object"]))
categoric_features_onehot = [
    col for col in all_categorical_cols if col not in categoric_features_ordinal
]
numeric_features = list(X_train.select_dtypes(exclude=["object"]))

In [15]:
# Numeric columns: mean imputation followed by min-max scaling.
numeric_pipe = make_pipeline(SimpleImputer(strategy='mean'), MinMaxScaler())

# Nominal columns: fill gaps with the literal 'missing', then one-hot encode;
# categories unseen during fit are ignored at transform time.
categoric_pipe_onehot = make_pipeline(
    SimpleImputer(fill_value='missing', strategy='constant'),
    OneHotEncoder(handle_unknown='ignore'),
)

# Ordered columns: fill gaps with 'NA', then apply the custom ordinal encoder.
categoric_pipe_ordinal = make_pipeline(
    SimpleImputer(fill_value='NA', strategy='constant'),
    ordinal_enc,
)

In [16]:
# Route each group of columns through its own preprocessing pipeline.
# The step names are referenced by the hyper-parameter search keys.
column_steps = [
    ("num_pipe", numeric_pipe, numeric_features),
    ("cat_pipe_onehot", categoric_pipe_onehot, categoric_features_onehot),
    ("cat_pipe_ordinal", categoric_pipe_ordinal, categoric_features_ordinal),
]
preprocessor = ColumnTransformer(transformers=column_steps)

In [17]:
# Baseline decision tree with default hyper-parameters and a fixed seed.
dtree = DecisionTreeClassifier(random_state=42)

In [18]:
# Chain preprocessing and the classifier into a single estimator.
model_pipeline = make_pipeline(preprocessor, dtree)

In [19]:
# Fit the baseline pipeline on the training split.
model_pipeline.fit(X_train, y_train)

In [None]:
# Accuracy of the baseline pipeline on the training split.
y_train_pred = model_pipeline.predict(X_train)
accuracy_score(y_true=y_train, y_pred=y_train_pred)

In [None]:
# Accuracy of the baseline pipeline on the held-out test split.
y_test_pred = model_pipeline.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_test_pred)

# Refine Model

In [51]:
# Category orders for the model trained on the data with merged categories:
# the merged 'Other' level is placed ahead of the regular five-step scale
# (and after 'NA' where a column has it).
quality_scale = ['Po', 'Fa', 'TA', 'Gd', 'Ex']

order_ExterQual = ['Other', *quality_scale]
order_ExterCond = ['Other', *quality_scale]
order_BsmtQual = ['NA', 'Other', *quality_scale]
order_BsmtCond = ['NA', 'Other', *quality_scale]
order_BsmtExposure = ['NA', 'No', 'Mn', 'Av', 'Gd']
order_BsmtFinType1 = ['NA', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ']
order_KitchenQual = ['Other', *quality_scale]
order_FireplaceQu = ['NA', 'Other', *quality_scale]

# Ordinal encoder for the orders that include 'Other', one order list per
# encoded column. Values not present in a list are encoded as -1.
ordinal_category_orders = [
    order_ExterQual,
    order_ExterCond,
    order_BsmtQual,
    order_BsmtCond,
    order_BsmtExposure,
    order_BsmtFinType1,
    order_KitchenQual,
    order_FireplaceQu,
]
ordinal_enc = OrdinalEncoder(
    categories=ordinal_category_orders,
    handle_unknown='use_encoded_value',
    unknown_value=-1,
)

In [21]:
# Disabled variant: category orders without the merged 'Other' level, plus the
# matching encoder. Kept commented out in this cell.
# # Define the custom orders
# order_ExterQual = ['Po', 'Fa', 'TA', 'Gd', 'Ex']
# order_ExterCond = ['Po', 'Fa', 'TA', 'Gd', 'Ex']
# order_BsmtQual = ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
# order_BsmtCond = ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
# order_BsmtExposure = ['NA', 'No', 'Mn', 'Av', 'Gd']
# order_BsmtFinType1 = ['NA', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ']
# order_KitchenQual = ['Po', 'Fa', 'TA', 'Gd', 'Ex']
# order_FireplaceQu = ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex']

# # Setup the encoder
# ordinal_enc = OrdinalEncoder(
#     categories=[
#         order_ExterQual, order_ExterCond, order_BsmtQual, order_BsmtCond,
#         order_BsmtExposure, order_BsmtFinType1, order_KitchenQual, order_FireplaceQu
#     ],
#     handle_unknown='use_encoded_value',
#     unknown_value=-1
# )

In [52]:
# Columns that are ordinal-encoded; the sequence mirrors the order lists
# handed to ordinal_enc.
categoric_features_ordinal = [
    'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',
    'BsmtExposure', 'BsmtFinType1', 'KitchenQual', 'FireplaceQu'
]

# SimpleImputer's default strategy ('mean') cannot be applied to string
# columns, so a bare SimpleImputer() made this pipeline unusable unless a
# search overrode the strategy. Use the same valid default as the first model
# (fill missing values with 'NA'); the search cells still override
# strategy/fill_value, so their behavior is unchanged.
categoric_pipe_ordinal = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='NA'),
    ordinal_enc
)

In [53]:
# Split the feature columns by how they are preprocessed.
# The one-hot list is built with an order-preserving comprehension rather than
# a set difference: iterating a set of strings can yield a different order in
# each interpreter session, which would make the column order handed to the
# ColumnTransformer (and hence to the tree) non-reproducible.
all_categorical_cols = list(X_train.select_dtypes(include=["object"]))
categoric_features_onehot = [
    col for col in all_categorical_cols if col not in categoric_features_ordinal
]
numeric_features = list(X_train.select_dtypes(exclude=["object"]))

In [54]:
# Preprocessing pipelines for the refined model. The imputation strategies set
# here are only defaults: the search cells override them through the
# "...simpleimputer__strategy" / "...simpleimputer__fill_value" parameters.

# Numeric columns: mean imputation (SimpleImputer's default, now spelled out)
# followed by min-max scaling.
numeric_pipe = make_pipeline(
    SimpleImputer(strategy='mean'),
    MinMaxScaler()
)

# The two categorical imputers previously used a bare SimpleImputer(), whose
# default 'mean' strategy cannot be applied to string columns; the pipeline
# could therefore only be fitted through a search that replaced the strategy.
# They now carry defaults that are valid for categorical data.
categoric_pipe_onehot = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(handle_unknown='ignore')
)

categoric_pipe_ordinal = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='NA'),
    ordinal_enc
)

In [55]:
# Route each group of columns through its own preprocessing pipeline.
# The step names are referenced by the hyper-parameter search keys.
column_steps = [
    ("num_pipe", numeric_pipe, numeric_features),
    ("cat_pipe_onehot", categoric_pipe_onehot, categoric_features_onehot),
    ("cat_pipe_ordinal", categoric_pipe_ordinal, categoric_features_ordinal),
]
preprocessor = ColumnTransformer(transformers=column_steps)

In [56]:
# Decision tree with a fixed seed; its hyper-parameters are tuned by the
# searches below.
dtree = DecisionTreeClassifier(random_state=42)

In [57]:
# Chain preprocessing and the classifier; this is the estimator handed to the
# searches below.
model_pipeline = make_pipeline(preprocessor, dtree)

## Cross Validation

### Randomized Grid Search

In [58]:
# Search space for the randomized search. Keys follow the
# "<step>__<sub-step>__<parameter>" naming of the nested pipeline.
num_prefix = "columntransformer__num_pipe__"
onehot_prefix = "columntransformer__cat_pipe_onehot__"
ordinal_prefix = "columntransformer__cat_pipe_ordinal__"
tree_prefix = "decisiontreeclassifier__"

param_grid = {
    # numeric preprocessing
    num_prefix + "simpleimputer__strategy": ["mean", "median", "most_frequent"],
    num_prefix + "minmaxscaler__clip": [True, False],
    # one-hot encoded columns
    onehot_prefix + "simpleimputer__strategy": ["most_frequent"],
    onehot_prefix + "onehotencoder__drop": [None, 'first'],
    onehot_prefix + "onehotencoder__sparse_output": [True, False],
    # ordinal encoded columns
    ordinal_prefix + "simpleimputer__strategy": ["most_frequent", "constant"],
    ordinal_prefix + "simpleimputer__fill_value": ['NA', 'missing'],
    # decision tree
    tree_prefix + "max_depth": range(2, 25),
    tree_prefix + "min_samples_leaf": range(3, 30),
    tree_prefix + "criterion": ["gini", "entropy"],
}

In [59]:
# Randomized search over param_grid with 10-fold cross-validation.
search = RandomizedSearchCV(
    estimator=model_pipeline,
    param_distributions=param_grid,
    n_iter=10000,  # number of sampled candidates; lower it if compute is limited
    cv=10,
    n_jobs=-2,
    random_state=42,
    verbose=1,
)

In [60]:
# Run the randomized search on the training split (10000 candidates x 10 folds).
search.fit(X_train, y_train)

Fitting 10 folds for each of 10000 candidates, totalling 100000 fits


In [61]:
# Best cross-validated score found by the randomized search.
search.best_score_

0.9174108458591217

In [62]:
# Accuracy of the randomized-search model on the training split.
y_train_pred = search.predict(X_train)
accuracy_score(y_true=y_train, y_pred=y_train_pred)

0.9277730008598453

In [63]:
# Accuracy of the randomized-search model on the held-out test split.
y_test_pred = search.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_test_pred)

0.9381443298969072

In [64]:
# Parameter combination that achieved the best cross-validated score.
search.best_params_

{'decisiontreeclassifier__min_samples_leaf': 10,
 'decisiontreeclassifier__max_depth': 4,
 'decisiontreeclassifier__criterion': 'entropy',
 'columntransformer__num_pipe__simpleimputer__strategy': 'mean',
 'columntransformer__num_pipe__minmaxscaler__clip': False,
 'columntransformer__cat_pipe_ordinal__simpleimputer__strategy': 'most_frequent',
 'columntransformer__cat_pipe_ordinal__simpleimputer__fill_value': 'missing',
 'columntransformer__cat_pipe_onehot__simpleimputer__strategy': 'most_frequent',
 'columntransformer__cat_pipe_onehot__onehotencoder__sparse_output': False,
 'columntransformer__cat_pipe_onehot__onehotencoder__drop': 'first'}

### Grid Search

In [65]:
# Search space for the exhaustive grid search: tree depth and leaf size are
# narrowed compared with the randomized search above.
# NOTE(review): per the SimpleImputer documentation, fill_value only takes
# effect with strategy='constant', so the 'most_frequent' x fill_value
# combinations appear to be duplicates - confirm before trimming the grid.
num_prefix = "columntransformer__num_pipe__"
onehot_prefix = "columntransformer__cat_pipe_onehot__"
ordinal_prefix = "columntransformer__cat_pipe_ordinal__"
tree_prefix = "decisiontreeclassifier__"

param_grid = {
    # numeric preprocessing
    num_prefix + "simpleimputer__strategy": ["mean", "median", "most_frequent"],
    num_prefix + "minmaxscaler__clip": [True, False],
    # one-hot encoded columns
    onehot_prefix + "simpleimputer__strategy": ["most_frequent", "constant"],
    onehot_prefix + "simpleimputer__fill_value": ['NA', 'missing'],
    onehot_prefix + "onehotencoder__drop": [None, 'first'],
    onehot_prefix + "onehotencoder__sparse_output": [True, False],
    # ordinal encoded columns
    ordinal_prefix + "simpleimputer__strategy": ["most_frequent", "constant"],
    ordinal_prefix + "simpleimputer__fill_value": ['NA', 'missing'],
    # decision tree
    tree_prefix + "max_depth": range(2, 8),
    tree_prefix + "min_samples_leaf": range(8, 12),
    tree_prefix + "criterion": ["gini", "entropy"],
}

In [66]:
# Exhaustive grid search over param_grid with 10-fold cross-validation.
search = GridSearchCV(
    estimator=model_pipeline,
    param_grid=param_grid,
    cv=10,
    verbose=1,
    n_jobs=-2,
)

In [67]:
# Run the exhaustive grid search on the training split.
search.fit(X_train, y_train)

Fitting 10 folds for each of 18432 candidates, totalling 184320 fits


In [68]:
# Best cross-validated score found by the grid search.
search.best_score_

0.9174108458591217

In [69]:
# Accuracy of the grid-search model on the training split.
y_train_pred = search.predict(X_train)
accuracy_score(y_true=y_train, y_pred=y_train_pred)

0.9277730008598453

In [70]:
# Accuracy of the grid-search model on the held-out test split.
y_test_pred = search.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_test_pred)

0.9381443298969072

In [71]:
# Parameter combination that achieved the best cross-validated score.
search.best_params_

{'columntransformer__cat_pipe_onehot__onehotencoder__drop': 'first',
 'columntransformer__cat_pipe_onehot__onehotencoder__sparse_output': True,
 'columntransformer__cat_pipe_onehot__simpleimputer__fill_value': 'NA',
 'columntransformer__cat_pipe_onehot__simpleimputer__strategy': 'most_frequent',
 'columntransformer__cat_pipe_ordinal__simpleimputer__fill_value': 'NA',
 'columntransformer__cat_pipe_ordinal__simpleimputer__strategy': 'most_frequent',
 'columntransformer__num_pipe__minmaxscaler__clip': True,
 'columntransformer__num_pipe__simpleimputer__strategy': 'mean',
 'decisiontreeclassifier__criterion': 'entropy',
 'decisiontreeclassifier__max_depth': 4,
 'decisiontreeclassifier__min_samples_leaf': 9}

In [72]:
# Completion marker: printed once execution reaches the last cell.
print('This worked!')

This worked!
