# **SML Project: Binary Tree Predictors**

*   **Author:** Matteo Onger
*   **Date:** October 2024

**Dataset documentation**:
*   [Secondary Mushroom](https://archive.ics.uci.edu/dataset/848/secondary+mushroom+dataset)

## VM Setup

In [None]:
# install the ucimlrepo package, which provides fetch_ucirepo used later to download the dataset
!pip install ucimlrepo

# clone the "dev" branch of the project repository
!git clone -b dev https://github.com/MatteoOnger/SML_Project.git

# move into the cloned repository; presumably needed so that the project modules
# (binrandomforest, bintreepredictor, data, utils) can be imported below.
# NOTE(review): the /content prefix assumes a Colab-like VM layout - confirm when running elsewhere
%cd /content/SML_Project/

## Code

In [27]:
# ---- LIBRARIES ----
import logging
import pandas as pd

from sklearn.model_selection import KFold
from typing import Tuple
from ucimlrepo import fetch_ucirepo

from binrandomforest import BinRandomForest
from bintreepredictor import BinTreePredictor
from data import DataSet
from utils import round_wrp

In [11]:
# configure the root logger: INFO level and "time - level - logger name - message" records;
# force=True replaces any handler already attached to the root logger, so re-running the cell takes effect
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    force=True,
)

In [30]:
# ---- FUNCTIONS ----
def k_folds_cross_val(
        k :int,
        predictor :BinRandomForest|BinTreePredictor,
        data :pd.DataFrame,
        shuffle :bool=True,
        random_state :int=1,
        verbose :bool=False,
        label_col :str="class"
    ) -> Tuple[float, float]:
    """
    Applies the k-folds cross validation to estimate the expected risk of the predictor.

    The same predictor instance is fitted again on the training part of every fold and
    then evaluated on the held-out part of that fold.

    Parameters
    ----------
    k : int
        Number of folds.
    predictor : BinRandomForest | BinTreePredictor
        Predictor that must be tested.
    data : pd.DataFrame
        Data used to train and test the predictor.
    shuffle : bool, optional
        Whether to shuffle the data before splitting into batches, by default True.
    random_state : int, optional
        When shuffle is True, random_state affects the ordering of the indices, which controls the randomness of each fold.
        Otherwise, this parameter is ignored, by default 1.
    verbose : bool, optional
        If True, training and test errors of each fold are printed, by default False.
    label_col : str, optional
        Name of the column of ``data`` that contains the labels, by default "class".

    Returns
    -------
    Tuple[float, float]
        The tuple returned contains the average training error and the average test error.
    """
    avg_train_err = 0.0
    avg_test_err = 0.0

    # KFold raises a ValueError if a random_state is given while shuffle is False,
    # so the seed is forwarded only when it is actually used
    cv = KFold(n_splits=k, shuffle=shuffle, random_state=random_state if shuffle else None)

    for i, (train_index, test_index) in enumerate(cv.split(data)):
        train_ds = DataSet(data.iloc[train_index], label_col=label_col)
        test_ds = DataSet(data.iloc[test_index], label_col=label_col)

        # fit returns the training error, predict returns a pair whose second item is the test error
        train_err = predictor.fit(train_ds)
        _, test_err = predictor.predict(test_ds)

        if verbose:
            print(f"round {i} - training_err:{round_wrp(train_err,4)} - test_err:{round_wrp(test_err,4)}")

        avg_train_err += train_err
        avg_test_err += test_err
    return avg_train_err / k, avg_test_err / k

### Dataset

In [5]:
# fetch the Secondary Mushroom dataset (UCI repository id 848) and preview its first rows
secondary_mushroom = fetch_ucirepo(id=848)
mushroom_df = secondary_mushroom.data.original
mushroom_df.head()

### Binary Tree Predictors

In [None]:
# estimate, through k-folds CV, the expected risk of the predictor obtained for each
# candidate value of the stop-criterion threshold (all the other hyper-parameters are fixed)
k = 5

# hyper-parameters shared by every candidate predictor
fixed_hyperparams = dict(
    loss_func="zero-one",
    prediction_criterion="mode",
    split_criterion="entropy",
    stop_criterion="max_height",
    max_features=None,
    max_thresholds=5,
)

# one dictionary per candidate configuration
hyperparams = [{"stop_criterion_threshold": threshold} for threshold in (5, 15, 20)]

# maps the index of each configuration to the pair (avg training error, avg test error)
results = {}
for i, hp in enumerate(hyperparams):
    predictor = BinTreePredictor(**fixed_hyperparams, **hp, id=i)
    avg_train_err, avg_test_err = k_folds_cross_val(k, predictor, mushroom_df)
    results[i] = avg_train_err, avg_test_err

results

In [None]:
# use nested CV to get a more precise estimate of the expected risk of the learning algorithm:
# for every outer fold, the inner CV selects the best hyper-parameters using the outer training part only,
# then a predictor built with the selected hyper-parameters is trained on that part and tested on the held-out one
outer_k = 10
inner_k = 5

fixed_hyperparams = {
    "loss_func":"zero-one",
    "prediction_criterion":"mode",
    "split_criterion":"entropy",
    "stop_criterion":"max_height",
    "max_features":None,
    "max_thresholds":5,
}

hyperparams = [
    {"stop_criterion_threshold":5},
    {"stop_criterion_threshold":15},
    {"stop_criterion_threshold":20},
]

outer_cv = KFold(n_splits=outer_k, shuffle=True, random_state=1)

# accumulators over the outer folds, divided by outer_k at the end
avg_train_err = 0
avg_val_err = 0
avg_test_err = 0

for train_index, test_index in outer_cv.split(mushroom_df):
    train_df = mushroom_df.iloc[train_index]
    test_df = mushroom_df.iloc[test_index]

    train_ds = DataSet(train_df, label_col="class")
    test_ds = DataSet(test_df, label_col="class")

    # model selection: keep the configuration with the lowest inner-CV validation error
    best_id, best_hyperparam, best_val_err = None, None, float("inf")

    for i, hp in enumerate(hyperparams):
        predictor = BinTreePredictor(**fixed_hyperparams, **hp, id=i)
        # a dedicated name is used so that the accumulator avg_val_err is not overwritten
        _, val_err = k_folds_cross_val(inner_k, predictor, train_df)

        if val_err < best_val_err:
            best_id = i
            best_val_err = val_err
            best_hyperparam = hp

    # retrain on the whole outer training part with the selected hyper-parameters (not with the last ones tried)
    predictor = BinTreePredictor(**fixed_hyperparams, **best_hyperparam, id=best_id)
    train_err = predictor.fit(train_ds)
    _, test_err = predictor.predict(test_ds)

    avg_train_err += train_err
    avg_val_err += best_val_err
    avg_test_err += test_err

avg_train_err = avg_train_err / outer_k
avg_val_err = avg_val_err / outer_k
avg_test_err = avg_test_err / outer_k

print(f"avg_training_err:{round_wrp(avg_train_err,4)} - avg_val_err:{round_wrp(avg_val_err,4)} - avg_test_err:{round_wrp(avg_test_err,4)}")