# **SML Project: Binary Tree Predictors**

*   **Author:** Matteo Onger
*   **Date:** October 2024

**Dataset documentation**:
*   [Secondary Mushroom](https://archive.ics.uci.edu/dataset/848/secondary+mushroom+dataset)

## VM Setup

In [None]:
# install the ucimlrepo package, which provides fetch_ucirepo (used later to download the dataset)
!pip install ucimlrepo

# clone the `dev` branch of the project repository
!git clone -b dev https://github.com/MatteoOnger/SML_Project.git

# move into the cloned repository so that the project modules can be imported
# NOTE(review): the /content prefix presumably assumes a Colab-like VM - adjust the path elsewhere
%cd /content/SML_Project/

## Code

In [27]:
# ---- LIBRARIES ----
import logging
import pandas as pd

from sklearn.model_selection import KFold
from typing import Any, Dict, List, Tuple, Type
from ucimlrepo import fetch_ucirepo

from binrandomforest import BinRandomForest
from bintreepredictor import BinTreePredictor
from data import DataSet
from utils import round_wrp

In [11]:
# configure the root logger at INFO level; force=True makes this configuration win
# even when handlers have already been attached to the root logger
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    level=logging.INFO,
    force=True,
)

In [30]:
# ---- FUNCTIONS ----
def k_folds_cross_val(
        k :int,
        predictor :BinRandomForest|BinTreePredictor,
        data :pd.DataFrame,
        label_col :str,
        shuffle :bool=True,
        random_state :int=1,
        verbose :bool=False
    ) -> Tuple[float, float]:
    """
    Applies the k-folds cross validation to estimate the expected risk of the predictor.

    The same ``predictor`` instance is fitted once per fold on the training part
    and then evaluated on the held-out part.

    Parameters
    ----------
    k : int
        Number of folds.
    predictor : BinRandomForest | BinTreePredictor
        Predictor that must be tested.
    data : pd.DataFrame
        Data used to train and test the predictor.
    label_col : str
        The name of the column of ``data`` that contains the labels.
    shuffle : bool, optional
        Whether to shuffle the data before splitting into batches, by default True.
    random_state : int, optional
        When shuffle is True, random_state affects the ordering of the indices, which controls the randomness of each fold.
        Otherwise, this parameter is ignored (it is not forwarded to ``KFold``), by default 1.
    verbose : bool, optional
        If True, training and test error of each fold are printed, by default False.

    Returns
    -------
    Tuple[float, float]
        The tuple returned contains the average training error and the average test error.
    """
    avg_train_err = 0
    avg_test_err = 0

    # KFold raises a ValueError if a random_state is given while shuffle is False,
    # therefore the seed is forwarded only when shuffling is actually requested
    cv = KFold(n_splits=k, shuffle=shuffle, random_state=random_state if shuffle else None)

    for i, (train_index, test_index) in enumerate(cv.split(data)):
        train_ds = DataSet(data.iloc[train_index], label_col=label_col)
        test_ds = DataSet(data.iloc[test_index], label_col=label_col)

        # fit returns the training error, predict returns a pair whose second item is the test error
        train_err = predictor.fit(train_ds)
        _, test_err = predictor.predict(test_ds)

        if verbose:
            print(f"round {i} - training_err:{round_wrp(train_err,4)} - test_err:{round_wrp(test_err,4)}")

        avg_train_err += train_err
        avg_test_err += test_err
    return avg_train_err / k, avg_test_err / k


def nested_cross_val(
        outer_k :int,
        inner_k :int,
        predicotr_class :Type[BinTreePredictor]|Type[BinRandomForest],
        fixed_hyperparams :Dict[str, Any],
        hyperparams :List[Dict[str, Any]],
        data :pd.DataFrame,
        label_col :str,
        shuffle :bool=True,
        random_state :int=1,
        verbose :bool=False
    ) -> Tuple[float, float]:
    """
    Applies the nested cross validation to estimate the expected risk of the learning algorithm.

    For each outer fold, every combination in ``hyperparams`` is evaluated with an inner
    k-folds CV on the outer training part; the combination with the lowest average
    validation error is then used to train a predictor on the whole outer training part,
    which is finally evaluated on the outer test part.

    Parameters
    ----------
    outer_k : int
        Number of folds of the outer CV.
    inner_k : int
        Number of folds of the inner CV.
    predicotr_class : Type[BinTreePredictor] | Type[BinRandomForest]
        Class of the predictor that must be tested.
    fixed_hyperparams : Dict[str, Any]
        Fixed hyper-parameters on which no tuning is performed.
    hyperparams : List[Dict[str, Any]]
        Hyper-parameters to be tuned.
        A non-empty list containing all the combinations of the hyper-parameters to try must be given,
        then the best combination will be used.
    data : pd.DataFrame
        Data used to train and test the predictor.
    label_col : str
        The name of the column of ``data`` that contains the labels.
    shuffle : bool, optional
        Whether to shuffle the data before splitting into batches, by default True.
    random_state : int, optional
        When shuffle is True, random_state affects the ordering of the indices, which controls the randomness of each fold.
        Otherwise, this parameter is ignored (it is not forwarded to ``KFold``), by default 1.
    verbose : bool, optional
        If True, training, validation and test error of each outer fold are printed, by default False.

    Returns
    -------
    Tuple[float, float]
        The tuple returned contains the average training error and the average test error
        computed over the outer folds.

    Raises
    ------
    ValueError
        If ``hyperparams`` is empty, since no combination could be selected.
    """
    # without candidates no best combination exists: fail early with a clear message
    # instead of crashing later while unpacking ``None``
    if len(hyperparams) == 0:
        raise ValueError("hyperparams must contain at least one combination of hyper-parameters")

    # KFold raises a ValueError if a random_state is given while shuffle is False,
    # therefore the seed is used (and forwarded to the inner CV) only when shuffling is requested
    seed = random_state if shuffle else None

    avg_train_err = 0
    avg_test_err = 0

    outer_cv = KFold(n_splits=outer_k, shuffle=shuffle, random_state=seed)

    for i, (train_index, test_index) in enumerate(outer_cv.split(data)):
        train_df = data.iloc[train_index]
        test_df = data.iloc[test_index]

        train_ds = DataSet(train_df, label_col=label_col)
        test_ds = DataSet(test_df, label_col=label_col)

        best_hyperparam, best_val_err = None, float("inf")

        # inner CV: model selection uses the outer training part only
        for j, hp in enumerate(hyperparams):
            predictor = predicotr_class(**fixed_hyperparams, **hp, id=j)
            _, avg_val_err = k_folds_cross_val(inner_k, predictor, train_df, label_col, shuffle, seed)

            if avg_val_err < best_val_err:
                best_val_err = avg_val_err
                best_hyperparam = hp

        # retrain with the selected combination on the whole outer training part
        predictor = predicotr_class(**fixed_hyperparams, **best_hyperparam, id=-i)
        train_err = predictor.fit(train_ds)
        _, test_err = predictor.predict(test_ds)

        if verbose:
            print(f"round {i} - training_err:{round_wrp(train_err,4)} - validation_err:{round_wrp(best_val_err,4)} - test_err:{round_wrp(test_err,4)}")

        avg_train_err += train_err
        avg_test_err += test_err
    return avg_train_err / outer_k, avg_test_err / outer_k


def eval_hyperparams(
        k :int,
        predicotr_class :Type[BinTreePredictor]|Type[BinRandomForest],
        fixed_hyperparams :Dict[str, Any],
        hyperparams :List[Dict[str, Any]],
        data :pd.DataFrame,
        label_col :str,
        shuffle :bool=True,
        random_state :int=1,
        verbose :bool=False
    ) -> Dict[int, Dict[str, float]]:
    """
    Builds one predictor for each combination of the hyper-parameters listed in ``hyperparams``
    and estimates its expected risk by means of the k-folds cross validation.

    Parameters
    ----------
    k : int
        Number of folds.
    predicotr_class : Type[BinTreePredictor] | Type[BinRandomForest]
        Class of the predictor that must be tested.
    fixed_hyperparams : Dict[str, Any]
        Hyper-parameters shared by all the predictors.
    hyperparams : List[Dict[str, Any]]
        A list containing all the combinations of the hyper-parameters to try.
    data : pd.DataFrame
        Data used to train and test the predictors.
    label_col : str
        The name of the column of ``data`` that contains the labels.
    shuffle : bool, optional
        Forwarded to the k-folds cross validation: whether to shuffle the data
        before splitting into batches, by default True.
    random_state : int, optional
        Forwarded to the k-folds cross validation: seed controlling the shuffling, by default 1.
    verbose : bool, optional
        Forwarded to the k-folds cross validation: if True, training and test error
        of each fold are printed, by default False.

    Returns
    -------
    Dict[int, Dict[str, float]]
        For each combination, identified by its position in ``hyperparams``, a dictionary with
        the keys ``"avg_train_err"`` and ``"avg_test_err"`` holding the average training error
        and the average test error measured by the cross validation.
    """
    def _evaluate(idx :int, combo :Dict[str, Any]) -> Dict[str, float]:
        # the position of the combination is also used as identifier of the predictor
        model = predicotr_class(**fixed_hyperparams, **combo, id=idx)
        train_risk, test_risk = k_folds_cross_val(k, model, data, label_col, shuffle, random_state, verbose)
        return {"avg_train_err":train_risk, "avg_test_err":test_risk}

    return {idx: _evaluate(idx, combo) for idx, combo in enumerate(hyperparams)}

### Dataset

In [5]:
# fetch the dataset with id 848 from the UCI repository (Secondary Mushroom, see the link above)
# and keep its `original` dataframe, which is the one containing the "class" label column used below
mushroom_df = fetch_ucirepo(id=848).data.original
# last expression of the cell: the notebook displays the first rows
mushroom_df.head()

### Binary Tree Predictors

In [None]:
# estimate, by means of the k-folds CV, the expected risk of the tree predictors
# obtained with each combination of split criterion and stop-criterion threshold
k = 5

# hyper-parameters shared by all the predictors
fixed_hyperparams = dict(
    loss_func="zero-one",
    prediction_criterion="mode",
    stop_criterion="max_nodes",
    max_features=None,
    max_thresholds=5,
)

# grid of the combinations to try: 3 split criteria x 3 thresholds, criterion-major order
hyperparams = [
    {"split_criterion":criterion, "stop_criterion_threshold":threshold}
    for criterion in ("entropy", "gini", "misclass")
    for threshold in (96, 128, 160)
]

eval_hyperparams(k, BinTreePredictor, fixed_hyperparams, hyperparams, mushroom_df, "class")

In [None]:
# train the predictor, test the performance and print the results
# 80/20 split: 80% of the rows are sampled (fixed seed) for training, the remaining rows are used for testing
# NOTE(review): unlike the CV functions above, no label_col is passed to DataSet here -
# presumably DataSet has a suitable default; confirm
train_ds = DataSet(mushroom_df.sample(frac=0.8, random_state=2106))
# relies on DataSet exposing an `index` attribute - presumably the index of the wrapped dataframe; TODO confirm
test_ds = DataSet(mushroom_df.drop(train_ds.index))

# NOTE(review): the positional arguments presumably follow the order loss_func, prediction_criterion,
# split_criterion, stop_criterion, stop_criterion_threshold - verify against BinTreePredictor
tree = BinTreePredictor("zero-one", "mode", "gini", "max_height", 20, max_thresholds=5)
# fit returns the training error; predict returns the predictions together with the test error
train_err = tree.fit(train_ds)
prediction, test_err = tree.predict(test_ds)

tree.print_tree()
print(f"Training error:{train_err}")
print(f"Test error:{test_err}")

# NOTE(review): `prediction` is computed on test_ds only, but it is inserted into a copy of the whole dataset;
# depending on the type returned by predict this either aligns on the index (leaving the training rows empty)
# or fails because of the length mismatch - confirm the intended behaviour
res = mushroom_df.copy(deep=True)
res.insert(1, "predicted-class", prediction)

print(res)