In [1]:
from scripts.final.DecisionTree import DecisionTree
from scripts.final.utils import *
import pandas as pd
import numpy as np
from ucimlrepo import fetch_ucirepo #for importing data
from summarytools import dfSummary

In [2]:
# Download the UCI dataset with repository id 848 (secondary mushroom data).
secondary_mushroom = fetch_ucirepo(id=848)

# Keep the freshly loaded feature and target frames untouched; later cells
# work on copies so the download never has to be repeated.
X_loaded, y_loaded = secondary_mushroom.data.features, secondary_mushroom.data.targets

In [3]:
# Work on copies so the loaded frames stay pristine for re-runs.
X, y = X_loaded.copy(), y_loaded.copy()

In [4]:
# Remove four features before modelling.
# NOTE(review): presumably dropped because they are mostly missing — confirm.
X = X.drop(
    columns=[
        'spore-print-color',
        'veil-type',
        'veil-color',
        'stem-root',
    ]
)

In [5]:
# Cast every object-typed column to str; other columns are left as they are.
# Note that astype(str) also turns missing values into the literal string 'nan'.
for col in X:
    if X.loc[:, col].dtype != 'object':
        continue
    X.loc[:, col] = X.loc[:, col].astype(str)

In [6]:
# Encode the class labels in y as 0s and 1s and show the label -> integer mapping.
# NOTE(review): y is used as the encoded target by later cells, so encode_labels
# presumably rewrites y in place and returns only the mapping — confirm in scripts.final.utils.
y_mapping = encode_labels(y)
print(y_mapping)

{'class': {'e': 0, 'p': 1}}


In [7]:
# Hold out 20% of the rows as a test set; random_state is fixed at 42 so the
# same split can be reproduced on a re-run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Now we train the tree with each of the three splitting criteria: entropy, Gini impurity, and training error.

Model 1: Entropy

In [11]:
# Fit a tree limited to depth 10 that picks its splits by entropy.
entropy_model = DecisionTree(max_depth=10, split_using='entropy')
entropy_model.fit(X_train, y_train)

In [12]:
# Score the entropy tree on the held-out test set: accuracy, precision, recall.
entropy_pred = entropy_model.predict(X_test)
for metric in (accuracy, precision, recall):
    print(metric(y_test, entropy_pred))

Accuracy: 0.8505813001473719
0.8505813001473719
Precision: 0.9733811591466868
0.9733811591466868
Recall: 0.7538011695906432
0.7538011695906432


Model 2: Gini impurity

In [13]:
# Fit a tree limited to depth 10 that picks its splits by Gini impurity.
gini_model = DecisionTree(max_depth=10, split_using='gini')
gini_model.fit(X_train, y_train)

In [14]:
# Score the Gini tree on the held-out test set: accuracy, precision, recall.
gini_pred = gini_model.predict(X_test)
for metric in (accuracy, precision, recall):
    print(metric(y_test, gini_pred))

Accuracy: 0.9042901588341248
0.9042901588341248
Precision: 0.9544798845968905
0.9544798845968905
Recall: 0.8706140350877193
0.8706140350877193


Model 3: Train Error

Next, we use the training error (zero-one loss) as the splitting criterion.

In [15]:
# Fit a tree limited to depth 10 that picks its splits by training error.
train_error_model = DecisionTree(max_depth=10, split_using='train_error')
train_error_model.fit(X_train, y_train)

In [16]:
# Score the training-error tree on the held-out test set: accuracy, precision, recall.
train_error_pred = train_error_model.predict(X_test)
for metric in (accuracy, precision, recall):
    print(metric(y_test, train_error_pred))

Accuracy: 0.8078434583265106
0.8078434583265106
Precision: 0.8805692021006268
0.8805692021006268
Recall: 0.7599415204678363
0.7599415204678363


In [17]:
# Zero-one loss of each fitted model on the data it was trained on,
# for comparison with the test-set metrics above.
entropy_train, gini_train, train_error_train = (
    fitted.predict(X_train)
    for fitted in (entropy_model, gini_model, train_error_model)
)
print(f"entropy train error: {zero_one_loss(y_train, entropy_train)}")
print(f"gini train error: {zero_one_loss(y_train, gini_train)}")
print(f"train_error train error: {zero_one_loss(y_train, train_error_train)}")

entropy train error: 0.1510387882509467
gini train error: 0.09415617644048715
train_error train error: 0.1857332923958653


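The training errors are close to the corresponding test errors (1 − test accuracy is about 0.149, 0.096 and 0.192 for the entropy, Gini and training-error models, versus training errors of about 0.151, 0.094 and 0.186), which indicates that the models did not overfit under the `max_depth=10` stopping criterion.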

Finally, we perform hyperparameter tuning to find the best value for the max-depth stopping criterion.

In [10]:
from joblib import Parallel, delayed
def evaluate_model(X_train, y_train, X_validate, y_validate, tune_on, split_using, i):
    """Fit one candidate tree and measure its validation error.

    A ``DecisionTree`` is built with the hyperparameter named by ``tune_on``
    set to ``i`` and with the given ``split_using`` criterion, fitted on the
    training data and scored with zero-one loss on the validation data.

    Returns a tuple ``(validation_error, fitted_tree, i)``.
    """
    # 'split_using' is written last, so it wins if tune_on names the same key.
    hyperparams = {tune_on: i}
    hyperparams['split_using'] = split_using

    tree = DecisionTree(**hyperparams)
    tree.fit(X_train, y_train)

    validation_error = zero_one_loss(y_validate, tree.predict(X_validate))
    # The progress line always reports the tree's max_depth attribute,
    # whichever hyperparameter is being tuned.
    print(f"tree depth: {tree.max_depth}, validation error: {validation_error}")
    return validation_error, tree, i

def tune(X, y, tune_on, split_using, start, stop, n_jobs=-1, random_state=None):
    """Search one integer hyperparameter and return the best validation tree.

    The data is split once into 80% training / 20% validation rows. One tree
    is then fitted per candidate value in ``range(start, stop)`` (in parallel
    through joblib) and the tree with the lowest validation error is returned.

    Parameters
    ----------
    X, y
        Features and labels to split into training and validation parts.
    tune_on : str
        Name of the ``DecisionTree`` keyword argument to vary.
    split_using : str
        Splitting criterion forwarded to every candidate tree.
    start, stop : int
        Candidate values are ``range(start, stop)``; ``stop`` is excluded.
    n_jobs : int, default -1
        Forwarded to ``joblib.Parallel``.
    random_state : optional
        Seed forwarded to ``train_test_split`` so the validation split (and
        therefore the tuning result) can be reproduced. When left as ``None``
        the split is called without a seed, exactly as before.

    Returns
    -------
    The fitted tree with the lowest validation error. It was trained on the
    training part of the internal split only, not on all of ``X``.

    Raises
    ------
    ValueError
        If ``range(start, stop)`` contains no candidate values.
    """
    candidates = range(start, stop)
    if len(candidates) == 0:
        # Fail before doing any work instead of letting min() complain later.
        raise ValueError(
            f"empty search range: start={start} must be smaller than stop={stop}"
        )

    # Only pass a seed when the caller asked for one, so the default call to
    # train_test_split is unchanged.
    split_kwargs = {"test_size": 0.2}
    if random_state is not None:
        split_kwargs["random_state"] = random_state
    X_train, X_validate, y_train, y_validate = train_test_split(X, y, **split_kwargs)

    results = Parallel(n_jobs=n_jobs)(
        delayed(evaluate_model)(X_train, y_train, X_validate, y_validate, tune_on, split_using, i)
        for i in candidates
    )

    # Each result is (validation_error, tree, value); pick the lowest error.
    best_error, best_tree, best_depth = min(results, key=lambda x: x[0])

    print(f"best depth: {best_depth} split criterion: {split_using} validation error: {round(best_error * 100, 2)} %")
    return best_tree


In [11]:
# Tune max_depth for the Gini criterion over depths 15..24 and check whether
# the validation error starts to rise again at some depth.
tuned_tree = tune(
    X_train,
    y_train,
    tune_on='max_depth',
    split_using='gini',
    start=15,
    stop=25,
)

In [None]:
# Retrain on the full training set with the chosen depth.
# NOTE(review): 20 is hard-coded; presumably the depth with the best validation
# error in the tuning run above — confirm against its printed output.
best_tree = DecisionTree(split_using='gini', max_depth=20)
best_tree.fit(X_train, y_train)

In [None]:
# Final evaluation of the retrained tree on the held-out test set.
y_pred = best_tree.predict(X_test)
for metric in (accuracy, precision, recall):
    print(metric(y_test, y_pred))

Accuracy: 0.997789421974783
Precision: 0.9980991373007749
Recall: 0.997953216374269
