In [2]:
# LOAD DATASET
from ucimlrepo import fetch_ucirepo 
from sklearn.model_selection import train_test_split
from modules.ML_tools import CV_score
import numpy as np

# Fetch the dataset with id=848 from the UCI ML repository.
secondary_mushroom = fetch_ucirepo(id=848)

# Data (as pandas dataframes).
X = secondary_mushroom.data.features
# Work on a copy so the frame owned by `secondary_mushroom` is never mutated:
# the raw labels stay recoverable and pandas has no reason to warn about
# setting values on a possible view.
y = secondary_mushroom.data.targets.copy()

# Encode the label as a boolean: 'e' -> True, 'p' -> False.
edible = y['class'].map({'e': True, 'p': False})
if edible.isna().any():
    # Any label outside {'e', 'p'} would silently become NaN and skew the
    # ratios printed below, so fail loudly instead.
    unexpected = sorted(y.loc[edible.isna(), 'class'].astype(str).unique())
    raise ValueError(f"Unexpected class labels: {unexpected}")
# Plain column assignment always replaces the column, so the dtype is bool on
# every pandas version (`y.loc[:, 'class'] = ...` sets values in place on
# pandas >= 2.0 and can leave the column as object dtype).
y['class'] = edible.astype(bool)

#X = X.drop(columns=['does-bruise-or-bleed'])

random_state = 24

# Split into train and test sets: 30% held out, shuffled with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=True, random_state=random_state
)

print("Dataset size      |", X.shape[0])
print("Training set size |", X_train.shape[0])
print("Test set size     |", X_test.shape[0])


# Fraction of True ('e') labels in each set; the mean of a boolean column is
# the same value as sum / row count.
print(f"Dataset edible ratio: {y['class'].mean()}")
print(f"Training set edible ratio: {y_train['class'].mean()}")
print(f"Test set edible ratio: {y_test['class'].mean()}")

Dataset size      | 61069
Training set size | 42748
Test set size     | 18321
Dataset edible ratio: 0.44508670520231214
Training set edible ratio: 0.4471086366613643
Test set edible ratio: 0.4403689754926041


In [3]:
# cross-validation
from modules.tree import TreePredictor

# Evaluate one tree configuration with CV_score (k=4) on the training split.
tp = TreePredictor(
    max_depth=28,
    min_samples_leaf=1,
    splitting_criterion="scaled_entropy",
)
accuracy, pr, re, f1 = CV_score(tp, X_train, y_train, num_thresholds=50, k=4)

# Print each returned metric rounded to four decimals, one per line.
for metric_label, metric_value in (
    ("accuracy: ", accuracy),
    ("precision: ", pr),
    ("recall: ", re),
    ("f1 score: ", f1),
):
    print(metric_label, round(metric_value, 4))

fold 1/4
fold 2/4
fold 3/4
fold 4/4
accuracy:  0.9988
precision:  0.9985
recall:  0.9988
f1 score:  0.9986


Grid search

In [None]:
# GridSearch
from modules.tree import TreePredictor
from modules.ML_tools import GridSearchCV

# Hyperparameter values handed to GridSearchCV.
param_grid = {
    "max_depth": list(range(20, 31, 2)),  # 20, 22, 24, 26, 28, 30
    "min_samples_leaf": [1, 2, 5, 10],
    "splitting_criterion": ["gini", "scaled_entropy", "sqrt_impurity"],
}

# Same settings as the single cross-validation run: 50 thresholds, k=4.
grid_search_options = {"num_thresholds": 50, "k": 4, "param_grid": param_grid}
results = GridSearchCV(TreePredictor, X_train, y_train, **grid_search_options)

In [13]:
def _combo_label(params):
    """Format a parameter combination as 'max_depth/min_samples_leaf/splitting_criterion'."""
    return f"{params['max_depth']}/{params['min_samples_leaf']}/{params['splitting_criterion']}"


def _print_best(title, entry):
    """Print the parameter combination and the four validation metrics of one entry.

    Args:
        title: heading printed in front of the combination label.
        entry: one element of `results`, laid out as
            (params, accuracy, precision, recall, f1).
    """
    params, val_acc, val_pr, val_re, val_f1 = entry
    print(title, _combo_label(params),
          "\n-val accuracy: ", round(val_acc, 4),
          "\n-val precision: ", round(val_pr, 4),
          "\n-val recall: ", round(val_re, 4),
          "\n-val f1: ", round(val_f1, 4))


# Entries that maximise accuracy (index 1), precision (index 2) and recall (index 3).
best_acc = max(results, key=lambda row: row[1])
best_pr = max(results, key=lambda row: row[2])
best_re = max(results, key=lambda row: row[3])

_print_best("Best val accuracy: ", best_acc)
# NOTE: the "precison" spelling is kept so the text matches the recorded cell output.
_print_best("\nBest val precison: ", best_pr)
_print_best("\nBest val recall: ", best_re)
print("-----------------")

# Full ranking by validation accuracy, best first. The loop uses its own names
# so the `accuracy`, `pr`, `re`, `f1` values from the cross-validation cell are
# not overwritten with the metrics of the last (worst) combination.
for params, val_acc, _val_pr, _val_re, _val_f1 in sorted(results, key=lambda row: row[1], reverse=True):
    print(_combo_label(params),
          "val accuracy: ", round(val_acc, 5))

Best val accuracy:  28/1/scaled_entropy 
-val accuracy:  0.9988 
-val precision:  0.9985 
-val recall:  0.9988 
-val f1:  0.9986

Best val precison:  28/1/scaled_entropy 
-val accuracy:  0.9988 
-val precision:  0.9985 
-val recall:  0.9988 
-val f1:  0.9986

Best val recall:  24/1/scaled_entropy 
-val accuracy:  0.998 
-val precision:  0.9968 
-val recall:  0.9988 
-val f1:  0.9978
-----------------
28/1/scaled_entropy val accuracy:  0.99878
30/1/scaled_entropy val accuracy:  0.99878
26/1/scaled_entropy val accuracy:  0.99855
28/2/scaled_entropy val accuracy:  0.99834
30/2/scaled_entropy val accuracy:  0.99834
26/1/gini val accuracy:  0.99832
28/1/gini val accuracy:  0.99832
30/1/gini val accuracy:  0.99832
30/1/sqrt_impurity val accuracy:  0.99829
30/2/sqrt_impurity val accuracy:  0.9982
24/1/gini val accuracy:  0.9982
26/2/scaled_entropy val accuracy:  0.99811
26/2/gini val accuracy:  0.99801
28/2/gini val accuracy:  0.99801
30/2/gini val accuracy:  0.99801
24/1/scaled_entropy val a