In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree

In [2]:
# Location of the iteration-3 housing classification CSV. The path can be
# overridden through the HOUSING_CSV environment variable so the notebook is
# runnable on machines other than the author's; the original absolute path is
# kept as the fallback so existing behaviour is unchanged.
HOUSING_CSV = os.environ.get(
    'HOUSING_CSV',
    '/Users/merlesteffen/Documents/GitHub/HousingPrices/Data/iter-3/housing-classification-iter3.csv',
)
houses = pd.read_csv(HOUSING_CSV)

In [3]:
# Display the (rows, columns) dimensions of the loaded frame as a quick sanity check.
houses.shape

(1460, 16)

Some categorical features contain levels with very few observations, which caused issues. To mitigate this, we merge the rare levels of `Condition1` and `Heating` into a single `Other` category.

In [41]:
# Levels that are merged into a single 'Other' bucket, per column.
categories_to_combine_1 = ['RRAn', 'PosN', 'RRAe', 'PosA', 'RRNn', 'RRNe']
categories_to_combine_2 = ['GasW', 'Grav', 'Wall', 'OthW', 'Floor']

# Apply the same replacement to each (column, levels) pair.
for column_name, levels_to_merge in (
    ('Condition1', categories_to_combine_1),
    ('Heating', categories_to_combine_2),
):
    houses[column_name] = houses[column_name].replace(levels_to_merge, 'Other')

# Split Data

In [43]:
# Separate the target column from the feature columns.
target_column = 'Expensive'
y = houses[target_column]
X = houses.drop(columns=target_column)

# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# One Hot Encoding

In [44]:
# Column names grouped by dtype: object columns are treated as categorical,
# everything else as numeric.
categoric_features = X_train.select_dtypes(include=["object"]).columns.tolist()
numeric_features = X_train.select_dtypes(exclude=["object"]).columns.tolist()

Create two preprocessing pipelines, one for the numeric columns and one for the categorical columns. Each is later applied to its own subset of the dataframe's columns.

In [45]:
# Numeric columns: impute missing values (the strategy is tuned by the
# hyper-parameter search), then rescale with MinMaxScaler.
numeric_pipe = make_pipeline(
    SimpleImputer(),
    MinMaxScaler()
)

# Categorical columns: impute with the most frequent level, then one-hot encode.
# SimpleImputer's default strategy ("mean") raises on string data, so the
# strategy is set explicitly here; otherwise this pipeline would only work
# when a search happens to override the strategy.
categoric_pipe = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    # handle_unknown='ignore': a category not seen during fit is ignored at
    # transform time instead of raising an error.
    OneHotEncoder(handle_unknown='ignore')
)

- The ColumnTransformer takes in a list of transformers, where each transformer is applied to a specified subset of the columns in the input data.
- Each transformer is defined by a name, a transformer object (like a pipeline), and the columns it should be applied to.

In [46]:
# (name, transformer, columns) triples: each pipeline is routed to its own
# column subset. The names are used as prefixes in the search parameter keys.
column_transformers = [
    ("num_pipe", numeric_pipe, numeric_features),
    ("cat_pipe", categoric_pipe, categoric_features),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

# Choose a Model

In [47]:
# Decision tree classifier with a fixed seed so repeated runs give the same tree.
dtree = DecisionTreeClassifier(random_state=42)

# Cross Validation

## Randomized Grid Search

In [48]:
# Chain preprocessing and the classifier into one estimator. make_pipeline
# names the steps after their lowercased class names ("columntransformer",
# "decisiontreeclassifier"), which are the prefixes used in the search grids.
model_pipeline = make_pipeline(preprocessor, dtree)

In [49]:
# Search space for the numeric preprocessing branch.
numeric_prep_space = {
    "columntransformer__num_pipe__simpleimputer__strategy": ["mean", "median", "most_frequent"],
    "columntransformer__num_pipe__minmaxscaler__clip": [True, False],
}

# Search space for the categorical preprocessing branch.
categoric_prep_space = {
    "columntransformer__cat_pipe__simpleimputer__strategy": ["most_frequent"],
    "columntransformer__cat_pipe__onehotencoder__drop": [None, 'first'],
    "columntransformer__cat_pipe__onehotencoder__sparse_output": [True, False],
}

# Broad search space for the decision tree itself.
tree_space = {
    "decisiontreeclassifier__max_depth": range(2, 14),
    "decisiontreeclassifier__min_samples_leaf": range(3, 14),
    "decisiontreeclassifier__criterion": ["gini", "entropy"],
}

# Full space sampled by the randomized search.
param_grid = {**numeric_prep_space, **categoric_prep_space, **tree_space}

In [50]:
# Randomized search over the broad parameter space defined in `param_grid`.
# The sampled candidates are fixed by `random_state`, so running the fits in
# parallel (n_jobs=-1, matching the refined grid search) only changes
# wall-clock time, not which candidates are evaluated.
search = RandomizedSearchCV(
    model_pipeline,
    param_grid,
    n_iter=100,  # number of sampled candidates; adjust to the available compute budget
    cv=10,
    n_jobs=-1,
    verbose=1,
    random_state=42
)

In [51]:
# Run the search on the training split only; the test split stays untouched
# until the final evaluation.
search.fit(X_train, y_train)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


In [52]:
# Mean cross-validated score of the best candidate. No `scoring` argument was
# passed to the search, so the estimator's default scorer is used.
search.best_score_

0.9229885057471264

In [53]:
# Accuracy on the training split, using the best estimator found by the search.
y_train_pred = search.predict(X_train)
accuracy_score(y_true=y_train, y_pred=y_train_pred)

0.9332191780821918

In [54]:
# Accuracy on the held-out test split, using the best estimator found by the search.
y_test_pred = search.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_test_pred)

0.9383561643835616

In [55]:
# Parameter combination of the best candidate; used to narrow the ranges for
# the refined grid search that follows.
search.best_params_

{'decisiontreeclassifier__min_samples_leaf': 8,
 'decisiontreeclassifier__max_depth': 5,
 'decisiontreeclassifier__criterion': 'entropy',
 'columntransformer__num_pipe__simpleimputer__strategy': 'median',
 'columntransformer__num_pipe__minmaxscaler__clip': True,
 'columntransformer__cat_pipe__simpleimputer__strategy': 'most_frequent',
 'columntransformer__cat_pipe__onehotencoder__sparse_output': False,
 'columntransformer__cat_pipe__onehotencoder__drop': None}

## Refined Grid Search

In [65]:
# Preprocessing options are kept identical to the randomized search.
refined_numeric_prep_space = {
    "columntransformer__num_pipe__simpleimputer__strategy": ["mean", "median", "most_frequent"],
    "columntransformer__num_pipe__minmaxscaler__clip": [True, False],
}
refined_categoric_prep_space = {
    "columntransformer__cat_pipe__simpleimputer__strategy": ["most_frequent"],
    "columntransformer__cat_pipe__onehotencoder__drop": [None, 'first'],
    "columntransformer__cat_pipe__onehotencoder__sparse_output": [True, False],
}

# Tree ranges narrowed around the best randomized-search result
# (max_depth=5, min_samples_leaf=8).
refined_tree_space = {
    "decisiontreeclassifier__max_depth": range(3, 7),
    "decisiontreeclassifier__min_samples_leaf": range(6, 10),
    "decisiontreeclassifier__criterion": ["gini", "entropy"],
}

# Full grid evaluated exhaustively by the refined search.
param_grid = {
    **refined_numeric_prep_space,
    **refined_categoric_prep_space,
    **refined_tree_space,
}

In [66]:
# Exhaustive search over the refined grid, 10-fold cross-validated and run on
# all available cores. Rebinds `search`, replacing the randomized search object.
search = GridSearchCV(
    estimator=model_pipeline,
    param_grid=param_grid,
    cv=10,
    n_jobs=-1,
    verbose=1,
)

In [58]:
# Inspect how many training rows fall into each Heating level after the rare
# levels were merged into 'Other'.
X_train['Heating'].value_counts()

GasA     1140
Other      28
Name: Heating, dtype: int64

In [59]:
# Inspect how many training rows fall into each Condition1 level after the rare
# levels were merged into 'Other'.
X_train['Condition1'].value_counts()

Norm      1004
Feedr       66
Other       58
Artery      40
Name: Condition1, dtype: int64

In [60]:
# Fit the refined grid search on the training split.
# NOTE(review): this cell's execution count (In [60]) is lower than that of the
# grid/search definition cells (In [65]/[66]), so the notebook was not last run
# top to bottom. Restart the kernel and run all cells to confirm the outputs.
search.fit(X_train, y_train)

Fitting 10 folds for each of 768 candidates, totalling 7680 fits


In [61]:
# Mean cross-validated score of the best candidate from the refined grid search.
search.best_score_

0.9229885057471264

In [62]:
# Training-split accuracy of the best estimator from the refined grid search.
y_train_pred = search.predict(X_train)
accuracy_score(y_true=y_train, y_pred=y_train_pred)

0.9332191780821918

In [63]:
# Held-out test accuracy of the best estimator from the refined grid search.
y_test_pred = search.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_test_pred)

0.9383561643835616

In [64]:
# Prints a fixed completion message; it carries no analysis result.
print('This worked!')

This worked!
