In [1]:
import time

import numpy as np
import pandas as pd

from sklearn.datasets import load_breast_cancer,make_regression
from sklearn.datasets import load_diabetes
from sklearn.metrics import accuracy_score,precision_score,recall_score,mean_squared_error,mean_absolute_error,r2_score
from sklearn.model_selection import train_test_split
# sklearn's implementation, used as the reference for comparison
from sklearn.tree import DecisionTreeClassifier as SkDecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor as SkDecisionTreeRegressor

# my own implementation
from decision_tree import DecisionTreeClassifier, DecisionTreeRegressor


### Testing the classifier model 

To evaluate the performance of my decision tree classifier implementation, I'll load a toy dataset and compare the results with Sklearn's implementation.

I won't do extensive feature engineering or hyperparameter tuning, since my goal is not to build the best possible model; the focus here is purely on testing the implementation.

In [2]:
# Breast-cancer toy dataset: feature matrix goes to X, class labels to y
data = load_breast_cancer()
X, y = data.data, data.target

# quick look at the feature-matrix shape and the class balance
print(X.shape)
print(pd.Series(y).value_counts())

(569, 30)
1    357
0    212
Name: count, dtype: int64


In [3]:
# hold out 30% of the samples for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [4]:
def test_classifier_model(
        which:str,
        loss:str,
        max_depth:int,
        X_train,
        X_test,
        y_train,
        y_test
):
    """
    Tests the performance of a decision tree classifier models with different parameters.
    """

    start_time = time.time()  # Record start time

    if which == "custom":
        clf = DecisionTreeClassifier(max_depth=max_depth,loss=loss)
        clf.train(X_train,y_train)

    elif which == "sklearn":
        clf = SkDecisionTreeClassifier(max_depth=max_depth,criterion=loss)
        clf.fit(X_train,y_train)

    else:
        raise ValueError("to the which parameter, custom and sklearn are the possible inputs")

    train_time = time.time() - start_time  # Calculate training time
    preds = clf.predict(X_test)

    
    return {
        "Model": which,
        "Impurity metric": loss,
        "Depth": max_depth,
        "Accuracy": accuracy_score(y_test,preds),
        "Precision": precision_score(y_test,preds),
        "Recall": recall_score(y_test,preds),
        "Time": train_time
    }

In [5]:
# parameter grid for the classifier comparison
models = ['custom','sklearn']
loss_types = ['gini','entropy']
max_depths = [4,8]

# one result row per (model, impurity metric, depth) combination;
# the model varies slowest and the depth fastest
results = [
    test_classifier_model(
        which=model,
        loss=loss,
        max_depth=max_depth,
        X_train=X_train,
        X_test=X_test,
        y_train=y_train,
        y_test=y_test,
    )
    for model in models
    for loss in loss_types
    for max_depth in max_depths
]

In [6]:
# render floats with two decimals in the tables displayed below
pd.set_option('display.float_format', '{:.2f}'.format)

# rows: (impurity metric, depth); columns: each metric split by model
classifier_results = pd.DataFrame(results)
classifier_results.pivot_table(
    index=['Impurity metric','Depth'],
    columns='Model',
    values=['Accuracy','Precision','Recall','Time'],
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,Accuracy,Precision,Precision,Recall,Recall,Time,Time
Unnamed: 0_level_1,Model,custom,sklearn,custom,sklearn,custom,sklearn,custom,sklearn
Impurity metric,Depth,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
entropy,4,0.96,0.96,0.96,0.95,0.97,0.98,0.75,0.0
entropy,8,0.94,0.96,0.94,0.95,0.95,0.98,0.93,0.0
gini,4,0.96,0.95,0.98,0.97,0.96,0.95,0.77,0.0
gini,8,0.92,0.92,0.96,0.96,0.92,0.92,1.17,0.0


Our model achieves almost the same results as sklearn's tree classifier! However, it's important to note that sklearn's implementation trains significantly faster. Since my primary goal was to implement the logic to refresh my knowledge, I'm really satisfied with the current outcome. Perhaps in the future, I can explore ways to optimize its performance.


### Testing the regressor model 

This time I will test the regression tree model on another toy dataset.

In [7]:
# load regression dataset
# (load_diabetes is imported in the first cell together with the other sklearn imports)
diabetes = load_diabetes()
print(diabetes.data.shape, diabetes.target.shape)

(442, 10) (442,)


In [8]:
# switch X / y over to the diabetes data (this replaces the classification arrays)
X, y = diabetes.data, diabetes.target

# same 70/30 split and seed as in the classification experiment
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [9]:
def test_regressor_model(
        which:str,
        loss:str,
        max_depth:int,
        X_train,
        X_test,
        y_train,
        y_test
):
    
    start_time = time.time()  # Record start time

    if which == "custom":
        clf = DecisionTreeRegressor(max_depth=max_depth,loss=loss)
        clf.train(X_train,y_train)

    elif which == "sklearn":
        error_mapping={
            'mae':'absolute_error',
            'mse':'squared_error'
        }
        clf = SkDecisionTreeRegressor(max_depth=max_depth,criterion=error_mapping[loss])
        clf.fit(X_train,y_train)

    else:
        raise ValueError("to the which parameter, custom and sklearn are the possible inputs")

    train_time = time.time() - start_time  # Calculate training time
    preds = clf.predict(X_test)

    
    return {
        "Model": which,
        "Impurity metric": loss,
        "Depth": max_depth,
        "rmse": np.sqrt(mean_squared_error(y_test,preds)),
        "mae": mean_absolute_error(y_test,preds),
        "r2": r2_score(y_test,preds),
        "Time": train_time
    }

In [10]:
# parameter grid for the regressor comparison
models = ['custom','sklearn']
loss_types = ['mae','mse']
max_depths = [2,3,4,5,6]

# one result row per (model, loss, depth) combination;
# the model varies slowest and the depth fastest
results = [
    test_regressor_model(
        which=model,
        loss=loss,
        max_depth=max_depth,
        X_train=X_train,
        X_test=X_test,
        y_train=y_train,
        y_test=y_test,
    )
    for model in models
    for loss in loss_types
    for max_depth in max_depths
]


In [11]:
# render floats with two decimals in the table displayed below
pd.set_option('display.float_format', '{:.2f}'.format)

# rows: (loss, depth); columns: each metric split by model
regressor_results = pd.DataFrame(results)
regressor_results.pivot_table(
    index=['Impurity metric','Depth'],
    columns='Model',
    values=['rmse','mae','r2','Time'],
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Time,Time,mae,mae,r2,r2,rmse,rmse
Unnamed: 0_level_1,Model,custom,sklearn,custom,sklearn,custom,sklearn,custom,sklearn
Impurity metric,Depth,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
mae,2,0.02,0.0,53.71,44.65,0.22,0.37,64.81,58.36
mae,3,0.04,0.0,45.23,46.03,0.39,0.3,57.55,61.51
mae,4,0.06,0.0,46.22,48.92,0.32,0.25,60.54,63.51
mae,5,0.08,0.0,48.16,52.61,0.27,0.11,62.57,69.25
mae,6,0.1,0.0,49.06,56.07,0.27,0.05,62.71,71.75
mse,2,0.02,0.0,53.83,46.5,0.21,0.36,65.43,58.99
mse,3,0.03,0.0,46.5,46.96,0.36,0.33,58.99,60.14
mse,4,0.06,0.0,46.96,47.03,0.33,0.35,60.14,59.08
mse,5,0.08,0.0,47.94,48.23,0.32,0.31,60.41,61.24
mse,6,0.1,0.0,49.67,49.04,0.26,0.26,63.12,63.24


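Similarly, our custom implementation achieves satisfactory results in the regression scenario as well. Interestingly, with the `mse` loss the custom tree at depths 3 and 4 reproduces sklearn's metrics at depths 2 and 3 exactly, which suggests the two implementations may count tree depth differently. It's also worth noting that, once again, training is significantly faster with the sklearn implementation.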