# **SML Project: Binary Tree Predictors**

*   **Author:** Matteo Onger
*   **Date:** October 2024

**Dataset documentation**:
*   [Secondary Mushroom](https://archive.ics.uci.edu/dataset/848/secondary+mushroom+dataset)

## VM Setup

In [1]:
# install dataset package
!pip install ucimlrepo

# download repository
!git clone -b dev https://github.com/MatteoOnger/SML_Project.git

# set working directory
%cd /content/SML_Project/

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7
Cloning into 'SML_Project'...
remote: Enumerating objects: 157, done.[K
remote: Counting objects: 100% (157/157), done.[K
remote: Compressing objects: 100% (112/112), done.[K
remote: Total 157 (delta 94), reused 91 (delta 44), pack-reused 0 (from 0)[K
Receiving objects: 100% (157/157), 40.99 KiB | 1.58 MiB/s, done.
Resolving deltas: 100% (94/94), done.
/content/SML_Project


## Code

In [27]:
# ---- LIBRARIES ----
import logging
import pandas as pd

from sklearn.model_selection import KFold
from ucimlrepo import fetch_ucirepo

from bintreepredictor import BinTreePredictor
from data import DataSet
from utils import round_wrp

In [11]:
# Configure the root logger: show records at INFO level and above, prefixed with
# timestamp, level and logger name. `force=True` makes the cell safe to re-run,
# since any handler already attached to the root logger is replaced.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    level=logging.INFO,
    force=True,
)

In [30]:
# ---- FUNCS ----
def k_folds_cross_val(k :int, predictor :BinTreePredictor, data :pd.DataFrame, random_state :int=1, label_col :str="class") -> float:
    """
    Estimate the test error of ``predictor`` with shuffled k-fold cross-validation.

    The rows of ``data`` are split into ``k`` folds; at every round the predictor
    is fitted on ``k-1`` folds and evaluated on the remaining one. The error of
    each round is logged on the ``"crossval"`` logger.

    Parameters
    ----------
    k : int
        Number of folds.
    predictor : BinTreePredictor
        Predictor to evaluate. The same instance is fitted once per fold
        (NOTE(review): this assumes ``fit`` discards any previously learned
        tree - confirm against ``BinTreePredictor``).
    data : pd.DataFrame
        Data points, features and label column together.
    random_state : int, optional
        Seed used to shuffle the rows before splitting, by default 1.
    label_col : str, optional
        Name of the column of ``data`` holding the labels, by default ``"class"``.

    Returns
    -------
    float
        Arithmetic mean of the test errors measured over the ``k`` folds.
    """
    logger = logging.getLogger("crossval")
    cv = KFold(n_splits=k, shuffle=True, random_state=random_state)

    test_err_sum = 0
    for i, (train_index, test_index) in enumerate(cv.split(data)):
        train_ds = DataSet(data.iloc[train_index], label_col=label_col)
        test_ds = DataSet(data.iloc[test_index], label_col=label_col)

        # `fit` returns the training error, `predict` returns a pair whose
        # second element is the error on the given dataset
        train_err = predictor.fit(train_ds)
        _, test_err = predictor.predict(test_ds)

        logger.info(f"round {i} - training_err:{round_wrp(train_err,4)} - test_err:{round_wrp(test_err,4)}")

        test_err_sum += test_err
    return test_err_sum / k

In [5]:
# Fetch dataset: UCI repository id 848 is the Secondary Mushroom dataset linked
# at the top of this notebook. The `original` table is the DataFrame passed to
# the cross-validation helper, which reads its labels from the "class" column.
mushroom_df = fetch_ucirepo(id=848).data.original

In [None]:
# Single 5-fold cross-validation run with one fixed configuration.
# `max_thresholds` is set explicitly: the cell previously read a global
# `threshold` that only exists after a later cell has been executed, so it
# failed on a fresh kernel. 5 is the smallest value of the grid explored in
# this notebook - adjust as needed.
predictor = BinTreePredictor("zero-one", "mode", "entropy", "max_nodes", 3, max_thresholds=5)
avg_test_err = k_folds_cross_val(5, predictor, mushroom_df)

# show the estimated test error as the cell output
avg_test_err

In [None]:
# Candidate values to compare: 5, 10, ..., 30.
# NOTE(review): the list is named after the stop criterion, yet each value is
# passed as `max_thresholds` while the argument following "max_nodes" stays
# fixed at 3 - confirm which hyper-parameter was meant to be tuned.
stop_criterion_thresholds = list(range(5, 35, 5))

# candidate value -> average test error of a 5-fold cross-validation
res = {}
for threshold in stop_criterion_thresholds:
    predictor = BinTreePredictor(
        "zero-one", "mode", "entropy", "max_nodes", 3,
        max_thresholds=threshold,
    )
    avg_test_err = k_folds_cross_val(5, predictor, mushroom_df)
    res[threshold] = avg_test_err

res

In [None]:
# Outer loop of what is presumably meant to become a nested cross-validation:
# 10 shuffled outer folds, each one with its own 3-fold inner splitter.
outer_cv = KFold(n_splits=10, shuffle=True, random_state=1)

for outer_train_index, outer_test_index in outer_cv.split(mushroom_df):
    # Index with the variables bound by this loop (`train_index`/`test_index`
    # do not exist at notebook scope) and name the label column explicitly,
    # as done everywhere else a DataSet is built.
    test_ds = DataSet(mushroom_df.iloc[outer_test_index], label_col="class")
    train_ds = DataSet(mushroom_df.iloc[outer_train_index], label_col="class")

    inner_cv = KFold(n_splits=3, shuffle=True, random_state=1)

    # TODO: run the hyper-parameter search on `train_ds` with `inner_cv`, then
    # evaluate the selected configuration on `test_ds`.
    # (A leftover `print(space)` was removed: `space` is never defined.)