In [27]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import plot_tree
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [2]:
# Location of the iteration-3 housing classification CSV.
# NOTE(review): this is an absolute, machine-specific path, so the notebook only
# runs where this exact directory layout exists — consider a configurable data dir.
DATA_PATH = '/Users/merlesteffen/Documents/GitHub/HousingPrices/Data/iter-3/housing-classification-iter3.csv'

# Load the raw table; all later cells derive their data from `houses`.
houses = pd.read_csv(DATA_PATH)

In [14]:
# Sanity check: show the (rows, columns) dimensions of the loaded frame.
houses.shape

(1460, 16)

# Split Data

In [16]:
# Separate the target column from the predictors.
y = houses['Expensive']
X = houses.drop(columns='Expensive')

# Hold out 20% of the rows for the final evaluation; the fixed seed keeps
# the split identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# One Hot Encoding

In [28]:
# Column names grouped by dtype: object-typed columns are treated as categorical,
# every other column as numeric. Derived from the training split only.
categoric_features = X_train.select_dtypes(include=["object"]).columns.tolist()
numeric_features = X_train.select_dtypes(exclude=["object"]).columns.tolist()

Create pipelines that are each applied to a different subset of the original DataFrame's columns: one for the numeric features and one for the categorical features.

In [35]:
# Scaler used as the last step of the numeric pipeline; it rescales each
# numeric feature into a fixed range (its settings are tuned in the grid search).
scaler = MinMaxScaler()

In [36]:
# Numeric branch: fill missing values with the column median, then rescale.
numeric_imputer = SimpleImputer(strategy="median")
numeric_pipe = make_pipeline(numeric_imputer, scaler)

# Categorical branch: replace missing values with the literal "N_A" category,
# then one-hot encode.
categoric_imputer = SimpleImputer(strategy="constant", fill_value="N_A")
# handle_unknown='ignore': a category that was not seen during fit is ignored
# at transform time instead of raising an error.
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')
categoric_pipe = make_pipeline(categoric_imputer, one_hot_encoder)

- The ColumnTransformer takes in a list of transformers, where each transformer is applied to a specified subset of the columns in the input data.
- Each transformer is defined by a name, a transformer object (like a pipeline), and the columns it should be applied to.

In [37]:
# (name, transformer, columns) triples: each column group is routed through its
# own pipeline. The names "num_pipe" / "cat_pipe" are part of the parameter
# paths used by the grid search, so they must not be changed independently.
column_routes = [
    ("num_pipe", numeric_pipe, numeric_features),
    ("cat_pipe", categoric_pipe, categoric_features),
]

preprocessor = ColumnTransformer(transformers=column_routes)

# Choose a Model

In [38]:
# Decision-tree classifier with default hyper-parameters; the fixed seed makes
# repeated fits on the same data reproducible.
dtree = DecisionTreeClassifier(random_state = 42)

# Build Final Pipeline

In [39]:
# Chain preprocessing and the classifier into a single estimator. make_pipeline
# auto-names the steps from their class names ("columntransformer",
# "decisiontreeclassifier"), which the grid-search parameter keys rely on.
model_pipeline = make_pipeline(preprocessor, dtree)

# Fit on the training split only; as the last expression of the cell, the
# fitted pipeline is also rendered as the cell output.
model_pipeline.fit(X_train, y_train)

In [40]:
# Accuracy of the untuned pipeline on the rows it was fitted on — an optimistic
# figure, useful mainly as a reference point for spotting overfitting.
predictions = model_pipeline.predict(X_train)
score = accuracy_score(y_true=y_train, y_pred=predictions)
print(score)

1.0


In [41]:
# Accuracy of the untuned pipeline on the held-out test split.
# Note: `predictions` and `score` are reassigned here, replacing the
# training-set values computed in the previous cell.
predictions = model_pipeline.predict(X_test)
score = accuracy_score(y_true=y_test, y_pred=predictions)
print(score)

0.9006849315068494


# Cross Validation

## Grid Search CV

In [46]:
# Search space, grouped by pipeline stage. Keys follow scikit-learn's
# "<step>__<sub-step>__<parameter>" path convention for nested estimators.

# Numeric branch of the ColumnTransformer (2 x 2 x 2 = 8 combinations).
# NOTE(review): decision-tree splits are generally insensitive to monotonic
# rescaling of features, so the two scaler options below likely multiply the
# number of fits by 4 without changing any score — consider dropping them.
numeric_grid = {
    "columntransformer__num_pipe__simpleimputer__strategy": ["mean", "median"],
    "columntransformer__num_pipe__minmaxscaler__feature_range": [(0,1), (0,2)],
    "columntransformer__num_pipe__minmaxscaler__clip": [True, False],
}

# Categorical branch (2 x 1 = 2 combinations).
categoric_grid = {
    "columntransformer__cat_pipe__simpleimputer__strategy": ["most_frequent", "constant"],
    # Only applies when strategy is "constant".
    "columntransformer__cat_pipe__simpleimputer__fill_value": ["N_A"],
}

# Tree hyper-parameters (12 x 7 x 2 = 168 combinations).
tree_grid = {
    "decisiontreeclassifier__max_depth": range(2, 14),
    "decisiontreeclassifier__min_samples_leaf": range(3, 10),
    "decisiontreeclassifier__criterion": ["gini", "entropy"],
}

# Full grid: 8 x 2 x 168 = 2688 candidate settings.
param_grid = {**numeric_grid, **categoric_grid, **tree_grid}

In [47]:
# Exhaustive grid search over the whole pipeline (preprocessing + tree), so the
# preprocessing steps are re-fitted inside every cross-validation fold.
search = GridSearchCV(
    model_pipeline,
    param_grid,
    # Stated explicitly for readability; accuracy is also what the classifier
    # pipeline's default scorer reports, so rankings are unchanged.
    scoring="accuracy",
    cv=10,
    # The grid expands to thousands of candidates x 10 folds; the fits are
    # independent, so spread them over all available CPU cores instead of
    # running them one after another.
    n_jobs=-1,
    verbose=1
)

In [48]:
# Run the search on the training split only: every candidate in param_grid is
# fitted and scored on each of the 10 cross-validation folds.
search.fit(X_train, y_train)

Fitting 10 folds for each of 2688 candidates, totalling 26880 fits


In [49]:
# Mean cross-validated score of the best parameter combination found.
search.best_score_

0.9204022988505747

In [50]:
# Predict with the tuned search object on the training split and report its
# accuracy (displayed as the cell output).
y_train_pred = search.predict(X_train)

accuracy_score(y_true=y_train, y_pred=y_train_pred)

0.9332191780821918

In [51]:
# Final check: accuracy of the tuned search object on the held-out test split,
# which played no part in the grid search (displayed as the cell output).
y_test_pred = search.predict(X_test)

accuracy_score(y_true=y_test, y_pred=y_test_pred)

0.9383561643835616