# **SML Project: Binary Tree Predictors**

*   **Author:** Matteo Onger
*   **Date:** October 2024

**Dataset documentation**:
*   [Secondary Mushroom](https://archive.ics.uci.edu/dataset/848/secondary+mushroom+dataset)

## VM Setup

In [None]:
# install the package that provides `fetch_ucirepo` (imported below from `ucimlrepo`)
!pip install ucimlrepo

# clone the `dev` branch of the project repository
!git clone -b dev https://github.com/MatteoOnger/SML_Project.git

# move into the cloned repository so that its modules can be imported
# NOTE(review): assumes the clone was created under /content (e.g. Google Colab) - confirm
%cd /content/SML_Project/

## Code

In [27]:
# ---- LIBRARIES ----
import logging
import pandas as pd

from sklearn.model_selection import KFold
from typing import Tuple
from ucimlrepo import fetch_ucirepo

from bintreepredictor import BinTreePredictor
from data import DataSet
from utils import round_wrp

In [11]:
# Configure the root logger: INFO level, records formatted as "time - level - logger name - message".
# `force=True` removes handlers already attached to the root logger, so re-running this cell
# re-applies the configuration instead of being silently ignored.
# At INFO level the per-fold `logger.debug` message of the cross-validation is not shown;
# switch to `logging.DEBUG` to see it.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", force=True)

In [30]:
# ---- FUNCTIONS ----
def k_folds_cross_val(k: int, predictor: BinTreePredictor, data: pd.DataFrame, shuffle: bool = True, random_state: int = 1) -> Tuple[float, float]:
    """
    Estimate the expected risk of a predictor with k-fold cross-validation.

    The data is split into ``k`` folds; for each fold the predictor is fitted on
    the other ``k - 1`` folds and evaluated on the held-out one. The label column
    of ``data`` must be named ``"class"``.

    Parameters
    ----------
    k : int
        Number of folds.
    predictor : BinTreePredictor
        Predictor to evaluate. The same instance is fitted once per fold.
    data : pd.DataFrame
        Data used to train and test the predictor.
    shuffle : bool, optional
        Whether to shuffle the data before splitting it into folds, by default True.
    random_state : int, optional
        Seed controlling the shuffling when ``shuffle`` is True, by default 1.
        Ignored when ``shuffle`` is False.

    Returns
    -------
    Tuple[float, float]
        Average training error and average test error over the ``k`` folds.
    """
    logger = logging.getLogger("crossval")

    # KFold raises a ValueError when a random_state is given together with
    # shuffle=False, so the seed is only forwarded when it is actually used.
    cv = KFold(n_splits=k, shuffle=shuffle, random_state=random_state if shuffle else None)

    total_train_err = 0.0
    total_test_err = 0.0

    for i, (train_index, test_index) in enumerate(cv.split(data)):
        train_ds = DataSet(data.iloc[train_index], label_col="class")
        test_ds = DataSet(data.iloc[test_index], label_col="class")

        # NOTE(review): the same predictor instance is refitted on every fold; this
        # presumably discards the previously fitted tree - confirm in BinTreePredictor.fit.
        train_err = predictor.fit(train_ds)
        _, test_err = predictor.predict(test_ds)

        # skip the rounding work entirely when debug output is disabled
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(
                "round %s - training_err:%s - test_err:%s",
                i, round_wrp(train_err, 4), round_wrp(test_err, 4),
            )

        total_train_err += train_err
        total_test_err += test_err

    return total_train_err / k, total_test_err / k

In [5]:
# fetch the Secondary Mushroom dataset (UCI repository id 848)
# NOTE(review): `.data.original` is presumably the full dataframe, i.e. the features plus the
# "class" label column that the cross-validation code reads - confirm against the ucimlrepo docs
mushroom_df = fetch_ucirepo(id=848).data.original

In [None]:
# hyper-parameter sweep: for each candidate value of `max_thresholds`, build a predictor
# and estimate its expected risk as the average test error of a 5-fold cross-validation
stop_criterion_thresholds = list(range(5, 35, 5))

# maps each candidate value to the average test error obtained with it
results = {}
for threshold in stop_criterion_thresholds:
    predictor = BinTreePredictor(
        "zero-one", "mode", "entropy", "max_nodes", 3,
        max_thresholds=threshold,
    )
    _, avg_test_err = k_folds_cross_val(5, predictor, mushroom_df)
    results[threshold] = avg_test_err

# last expression of the cell, so the notebook displays the dictionary
results

In [None]:
# ...
stop_criterion_thresholds = [i for i in range(5, 35, 5)]

outer_cv = KFold(n_splits=10, shuffle=True, random_state=1)

for train_index, test_index in outer_cv.split(mushroom_df):
    test_ds = DataSet(mushroom_df.iloc[test_index])
    train_ds = DataSet(mushroom_df.iloc[train_index])

    best_threshold = None
    best_err = float("inf")
    for threshold in stop_criterion_thresholds:
        predictor = BinTreePredictor("zero-one", "mode", "entropy", "max_nodes", 3, max_thresholds=threshold)
        _, avg_test_err = k_folds_cross_val(5, predictor, mushroom_df)