In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Env setup
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

# Preprocessing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

#Modeling
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

#Evaluating
from sklearn.metrics import accuracy_score

# Apply seaborn's "darkgrid" style to all plots created after this point.
sns.set_style("darkgrid")

In [3]:
#create transformer class to use in pipeline
from sklearn.base import BaseEstimator,TransformerMixin
class CleanDataTransformer(TransformerMixin, BaseEstimator, auto_wrap_output_keys=None):
    """Stateless pipeline step that drops a fixed set of columns.

    Mixins are listed before ``BaseEstimator`` (scikit-learn's documented
    inheritance order). ``auto_wrap_output_keys=None`` is consumed by
    scikit-learn's ``__init_subclass__`` hook and opts this class out of the
    automatic ``set_output`` wrapping of ``transform``.

    Parameters
    ----------
    columns : list of str or None, default=None
        Column labels removed by ``transform``. ``None`` means "drop nothing".
    """

    def __init__(self, *, columns=None):
        # Stored untouched so get_params / set_params / clone keep working.
        self.columns = columns

    def fit(self, X, y=None):
        """Learn nothing; present only to satisfy the estimator API.

        Returns
        -------
        self
        """
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``self.columns`` removed.

        Dropping is best-effort: if it fails (for example because a label is
        not present in ``X``) the reason is printed and an unmodified copy of
        ``X`` is returned, so the surrounding pipeline keeps running.

        Parameters
        ----------
        X : object exposing ``copy()`` and ``drop(columns=...)``
            Presumably a pandas DataFrame; the input itself is never mutated.

        Returns
        -------
        Copy of ``X``, without the configured columns when dropping succeeds.
        """
        X_ = X.copy()

        # Nothing configured: a plain copy is the intended result, not a failure.
        if self.columns is None:
            return X_

        try:
            X_ = X_.drop(columns=self.columns)
        except Exception as exc:
            # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
            # propagate; the cause is reported instead of being hidden.
            print(f" within CleanDataTransformer, dropping columns failed! ({exc!r})")

        return X_

## Import Data

In [4]:
# Forward slashes avoid the invalid "\h" escape sequence a backslash path
# produces in a normal string literal, and resolve on Windows, Linux and macOS.
house_df = pd.read_csv("datasets/housing-classification-iter-0-2/housing-classification-iter-0-2.csv")

In [5]:
# house_c_df = house_df.copy()
# house_c_df = house_c_df.drop(columns=["PoolArea","ScreenPorch"])

## Split Data

In [6]:
# Separate features and target without mutating house_df, so this cell can be
# re-run safely (DataFrame.pop would remove "Expensive" in place and make a
# second execution fail with KeyError).
X = house_df.drop(columns=["Expensive"])
y = house_df["Expensive"].copy()

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=272
)

## Create Pipeline

In [19]:
# Hyper-parameter search space. Each key is "<pipeline step>__<parameter>";
# the step names are the lowercased class names of the estimators.
param_grid = dict(
    simpleimputer__strategy=["mean", "median"],
    standardscaler__with_mean=[True, False],
    standardscaler__with_std=[True, False],
    decisiontreeclassifier__max_depth=range(2, 10),
    decisiontreeclassifier__min_samples_leaf=range(5, 50, 5),
    decisiontreeclassifier__min_samples_split=range(5, 50, 10),
    decisiontreeclassifier__criterion=["gini"],
)

In [8]:
# Building blocks for the pipeline.
# set_output(transform='pandas') makes the imputer and scaler return DataFrames.
my_imputer = SimpleImputer().set_output(transform='pandas')
my_scaler = StandardScaler().set_output(transform='pandas')

# Seeded so repeated fits on the same data yield the same tree; an unseeded
# tree breaks ties between equally good splits at random.
my_tree = DecisionTreeClassifier(random_state=272)

# Drops the listed columns when placed in a pipeline.
my_cleaner = CleanDataTransformer(columns=["PoolArea","ScreenPorch"])

In [9]:
# from tempfile import mkdtemp
# cachedir = mkdtemp()

In [20]:

# Impute -> scale -> decision tree. make_pipeline names every step after its
# lowercased class name, which is the prefix used in the search-space keys.
# Optional extras left disabled: my_cleaner as a first step, memory=cachedir.
pipe = make_pipeline(my_imputer, my_scaler, my_tree)

## Try RandomizedSearchCV

In [11]:
# Randomized search: 100 parameter combinations sampled from param_grid, each
# evaluated with 5-fold cross-validation on accuracy.
# random_state fixes which combinations are sampled, so best_params_ and
# best_score_ are reproducible across kernel restarts.
Rsearch = RandomizedSearchCV(
    pipe,
    param_grid,
    n_iter=100,
    cv=5,
    scoring='accuracy',
    random_state=272,
    verbose=1,
)

In [12]:
# Run the randomized search on the training split only.
Rsearch.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [13]:
# Parameter combination with the best mean cross-validated accuracy.
# NOTE(review): the recorded output has no standardscaler__* entries although
# param_grid defines them, so it appears to come from an earlier version of
# the grid -- confirm with Restart Kernel -> Run All.
Rsearch.best_params_

{'simpleimputer__strategy': 'mean',
 'decisiontreeclassifier__min_samples_split': 15,
 'decisiontreeclassifier__min_samples_leaf': 5,
 'decisiontreeclassifier__max_depth': 4,
 'decisiontreeclassifier__criterion': 'gini'}

In [14]:
# Mean cross-validated accuracy of the best candidate found by the search.
Rsearch.best_score_

0.9255309783206778

In [15]:
# Accuracy of the randomized search's predictions on the training split.
accuracy_score(y_true=y_train, y_pred=Rsearch.predict(X_train))

0.9375

In [16]:
# Accuracy of the randomized search's predictions on the held-out test split.
accuracy_score(y_true=y_test, y_pred=Rsearch.predict(X_test))

0.9041095890410958

## Try GridSearchCV

In [22]:
# Exhaustive search over every combination in param_grid, scored by accuracy
# with 5-fold cross-validation.
search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    verbose=1,
)

In [23]:
# Run the exhaustive grid search on the training split only.
search.fit(X_train, y_train)

Fitting 5 folds for each of 2880 candidates, totalling 14400 fits


In [24]:
# Parameter combination with the best mean cross-validated accuracy.
search.best_params_

{'decisiontreeclassifier__criterion': 'gini',
 'decisiontreeclassifier__max_depth': 5,
 'decisiontreeclassifier__min_samples_leaf': 5,
 'decisiontreeclassifier__min_samples_split': 45,
 'simpleimputer__strategy': 'mean',
 'standardscaler__with_mean': True,
 'standardscaler__with_std': True}

In [25]:
# Mean cross-validated accuracy of the best grid candidate.
search.best_score_

0.9289534499834928

In [26]:
# Accuracy of the grid search's predictions on the training split.
accuracy_score(y_true=y_train, y_pred=search.predict(X_train))

0.934931506849315

In [27]:
# Accuracy of the grid search's predictions on the held-out test split.
accuracy_score(y_true=y_test, y_pred=search.predict(X_test))

0.9041095890410958

In [129]:
# from shutil import rmtree
# # Clear the cache directory when you don't need it anymore
# rmtree(cachedir)