# Homework 3 - classifiers

In this homework we will use 7 algorithms to classify data.

## Prerequisite knowledge


$$
\begin{aligned}
    \text{Precision} &= \frac{\text{True positives}}{\text{True positives} + \text{False positives}} \\
    \text{Recall} &= \frac{\text{True positives}}{\text{True positives} + \text{False negatives}} \\
    F_1 &= 2 \cdot \frac{\text{Precision} \cdot \text{Recall}}{\text{Precision} + \text{Recall}}
\end{aligned}
$$

**Note**: This is mostly for learning purposes.


## Dependencies

### General dependencies

In [None]:
import copy
import os
import platform

# On Windows, point R_HOME at a fixed R installation directory.
# NOTE(review): presumably needed by an R bridge used elsewhere — confirm.
if platform.system() == "Windows":
    # Raw string: "\P" and "\R" are invalid escape sequences in a normal
    # literal; the resulting value is unchanged.
    os.environ["R_HOME"] = r"C:\Program Files\R\R-4.3.3"

In [None]:
import typing as t
import csv
import itertools
import numpy as np
import numpy.typing as npt
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import xgboost as xgb

from IPython.display import display, HTML, IFrame

import warnings
# Suppress every warning for the rest of the session.
warnings.filterwarnings("ignore")

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Use 300 DPI for all figures created after this point.
mpl.rcParams['figure.dpi'] = 300

In [None]:
from sklearn.base import BaseEstimator
from sklearn import tree
from sklearn import neighbors
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.inspection import DecisionBoundaryDisplay

### Dataset-specific dependencies

In [None]:
# Name of the target (class label) column.
LABEL_VARIABLE = "NObeyesdad"

# Columns parsed as numbers.
NUMERICAL_VARIABLES = ["Age", "Height", "Weight", "FCVC", "NCP", "CH2O", "FAF", "TUE"]

# Columns kept as strings, excluding the target column.
CATEGORICAL_VARIABLES_NO_LABEL = [
    "FAVC",
    "CAEC",
    "CALC",
    "SCC",
    "MTRANS",
    "Gender",
    "family_history_with_overweight",
    "SMOKE",
]

# Same as above, with the target column appended last.
CATEGORICAL_VARIABLES = [
    *CATEGORICAL_VARIABLES_NO_LABEL,
    LABEL_VARIABLE,
]

# Every input column (numerical first), without and with the target column.
ALL_VARIABLES_NO_LABEL = [*NUMERICAL_VARIABLES, *CATEGORICAL_VARIABLES_NO_LABEL]
ALL_VARIABLES = [*NUMERICAL_VARIABLES, *CATEGORICAL_VARIABLES]

# Human-readable description for each column name.
# NOTE(review): the units in these descriptions (e.g. "Height (cm)") are not
# verifiable from this file — confirm against the dataset documentation.
LABEL_DICTIONARY = {
    "Age": "Age",
    "Height": "Height (cm)",
    "Weight": "Weight (kg)",
    # The stray leading space that used to be in this description was removed.
    "FCVC": "Frequency of consumption of vegetables (times per day)",
    "NCP": "Number of main meals",
    "CH2O": "Consumption of water daily (Liters)",
    "FAF": "Physical activity frequency (times per day)",
    "TUE": "Time using technology devices (hours)",
    "FAVC": "Frequent consumption of high caloric food",
    "CAEC": "Consumption of food between meals",
    "CALC": "Consumption of alcohol",
    "SCC": "Calories consumption monitoring",
    "MTRANS": "Transportation used",
    "Gender": "Gender",
    "family_history_with_overweight": "Family member suffered or suffers from overweight",
    "SMOKE": "Smoker or not",
    "NObeyesdad": "Obesity level",
}

# Generic type variable available to the helpers defined below.
T = t.TypeVar("T")


class Person:
    """One dataset row, with the numeric columns parsed from their CSV text.

    The constructor takes every column as a string (as produced by
    ``csv.DictReader``) and converts the numeric ones to ``np.float32``.
    All values are stored as plain instance attributes, so ``vars(person)``
    yields a ``{column name: value}`` dict in constructor-argument order.
    """

    Gender: str
    # Stored as float32 like the other numeric columns (see __init__).
    Age: np.float32
    Height: np.float32
    Weight: np.float32
    family_history_with_overweight: str
    FAVC: str
    FCVC: np.float32
    NCP: np.float32
    CAEC: str
    SMOKE: str
    CH2O: np.float32
    SCC: str
    FAF: np.float32
    TUE: np.float32
    CALC: str
    MTRANS: str
    NObeyesdad: str

    def __init__(
        self,
        Gender: str,
        Age: str,
        Height: str,
        Weight: str,
        family_history_with_overweight: str,
        FAVC: str,
        FCVC: str,
        NCP: str,
        CAEC: str,
        SMOKE: str,
        CH2O: str,
        SCC: str,
        FAF: str,
        TUE: str,
        CALC: str,
        MTRANS: str,
        NObeyesdad: str,
    ):
        """Store the row; numeric columns are converted with ``np.float32``.

        Raises:
            ValueError: if a numeric column cannot be parsed as a float.
        """
        self.Gender = Gender
        self.Age = np.float32(Age)
        self.Height = np.float32(Height)
        self.Weight = np.float32(Weight)
        self.family_history_with_overweight = family_history_with_overweight
        self.FAVC = FAVC
        self.FCVC = np.float32(FCVC)
        self.NCP = np.float32(NCP)
        self.CAEC = CAEC
        self.SMOKE = SMOKE
        self.CH2O = np.float32(CH2O)
        self.SCC = SCC
        self.FAF = np.float32(FAF)
        self.TUE = np.float32(TUE)
        self.CALC = CALC
        self.MTRANS = MTRANS
        self.NObeyesdad = NObeyesdad

    def __str__(self) -> str:
        """Return the attribute dict rendered as text.

        ``__str__`` must return a ``str``; returning the dict itself made
        ``str(person)`` and ``print(person)`` raise ``TypeError``.
        """
        return str(vars(self))

    def __len__(self) -> int:
        """Return the number of stored attributes (columns)."""
        return len(vars(self))

    def __repr__(self) -> str:
        """Return ``Person(column=value, ...)`` for debugging."""
        fields = ", ".join(
            f"{column}={value!r}" for column, value in vars(self).items()
        )
        return f"{type(self).__name__}({fields})"


class DatasetManager:
    """Helpers for loading the CSV, encoding features, sampling and scoring.

    ``__init__`` stores two attributes: ``path_to_csv`` (the file read by
    ``load_as_obj_list``) and ``label_encoder`` (the ``LabelEncoder`` reused
    by ``encode_categorial_to_numerical``).
    """

    def __init__(self, path_to_csv: str):
        self.path_to_csv = path_to_csv
        self.label_encoder = LabelEncoder()

    def load_as_obj_list(self) -> "list[Person]":
        """Read the CSV and build one ``Person`` per row.

        Each row dict is expanded into ``Person`` keyword arguments, so the
        CSV header must match the ``Person`` constructor parameters.
        """
        # newline="" is the csv module's documented way to open its input,
        # so newlines embedded in quoted fields are handled by the reader.
        with open(self.path_to_csv, newline="") as csv_file:
            return [Person(**row) for row in csv.DictReader(csv_file)]

    def make_label_combinations(
        self, labels: t.List[str], k: int = 2
    ) -> t.List[t.List[str]]:
        """Return every ``k``-element combination of ``labels``.

        Each combination is a ``list`` (not a tuple) so that it can be used
        directly as a multi-column selector on a DataFrame.
        """
        return [list(combination) for combination in itertools.combinations(labels, k)]

    def labels_to_onehot(
        self, dataframe: pd.DataFrame, labels: t.Sequence[str]
    ) -> pd.DataFrame:
        """Select ``labels`` from ``dataframe`` and one-hot encode them.

        The encoding is done by ``pd.get_dummies`` with ``drop_first=True``
        and integer dummy columns.
        """
        # A tuple selector would be read by pandas as a single (multi-level)
        # key, so any non-string sequence is turned into a list first.
        columns = labels if isinstance(labels, str) else list(labels)
        return pd.get_dummies(dataframe[columns], drop_first=True, dtype=int)

    def sample(
        self, dataframe: pd.DataFrame, frac: float, random_state: t.Optional[int] = 42
    ) -> t.Tuple[pd.DataFrame, pd.DataFrame]:
        """Split ``dataframe`` into a random sample and the remaining rows.

        Returns:
            ``(sample, rest)``: ``sample`` holds a ``frac`` share of the rows
            and ``rest`` is ``dataframe`` with the sampled index labels
            dropped.
        """
        chosen = dataframe.sample(frac=frac, random_state=random_state)
        remainder = dataframe.drop(chosen.index)

        return (chosen, remainder)

    def encode_categorial_to_numerical(self, dataframe: pd.DataFrame) -> pd.Series:
        """Fit the stored label encoder on ``dataframe`` and return the codes.

        The result is a new ``pd.Series`` with a default index; the encoder
        is re-fitted on every call.
        """
        return pd.Series(data=self.label_encoder.fit_transform(dataframe))

    def compute_metrics(self, y_true: pd.DataFrame, y_hat: pd.DataFrame) -> pd.DataFrame:
        """Return a one-row DataFrame with precision, recall and F1.

        All three scores use ``average="micro"``.
        NOTE(review): for single-label multiclass targets, micro-averaged
        precision, recall and F1 are expected to be equal — confirm that
        this is the intended averaging.
        """
        precision = precision_score(y_true, y_hat, average="micro")
        recall = recall_score(y_true, y_hat, average="micro")
        f1 = f1_score(y_true, y_hat, average="micro")

        return pd.DataFrame(
            {
                "Precision score": [precision],
                "Recall score": [recall],
                "F1 score": [f1],
            }
        )

In [None]:
class PlotManager:
    """Grid of matplotlib subplots addressed by a flat sample index.

    Sample ``i`` lives at row ``i // cols`` and column ``i % cols`` of the
    grid. The figure and the 2-D array of axes are exposed as ``fig`` and
    ``axs``.
    """

    def __init__(self, cols: int, samples: int) -> None:
        """Create a grid with ``cols`` columns and room for ``samples`` plots."""
        # Ceiling division, with at least one row so the grid is never empty.
        self.rows = max(1, -(-samples // cols))
        self.cols = cols
        self.samples = samples
        self.index = 0

        # squeeze=False keeps `axs` two-dimensional even for a single row or
        # column, so (row, col) indexing always works.
        fig, axs = plt.subplots(self.rows, self.cols, squeeze=False)
        fig.tight_layout(h_pad=2, w_pad=2, pad=0)

        self.fig = fig
        self.axs = axs

    def __len__(self):
        """Return the number of samples the grid was created for."""
        return self.samples

    def __getitem__(self, i: int):
        """Return the axes for sample ``i`` (negative indices count from the end).

        Raises:
            IndexError: if ``i`` is outside ``range(-samples, samples)``.
        """
        if i < 0:
            i += self.samples
        if not 0 <= i < self.samples:
            raise IndexError("subplot index out of range")

        return self.axs[divmod(i, self.cols)]

    def __next__(self):
        """Return the axes for the next unused sample and advance the cursor.

        Raises:
            StopIteration: once every sample slot has been handed out.
        """
        if self.index >= self.samples:
            raise StopIteration

        subplot = self[self.index]
        self.index += 1

        return subplot

In [None]:
# Load the CSV once and keep both the object list and a DataFrame view of it.
dataset_manager = DatasetManager("data/ObesityDataSet.csv")
dataset_obj_list = dataset_manager.load_as_obj_list()

# One record per person, built from each object's attribute dict.
dataset_dataframe = pd.DataFrame.from_records(data=list(map(vars, dataset_obj_list)))

# Number of distinct values in the label column.
dataset_output_classes = len(dataset_dataframe[LABEL_VARIABLE].unique())

### Algorithms utility functions

In [None]:
def generate_k_folds(X: pd.DataFrame, Y: pd.DataFrame, k: int = 8):
    """Split ``X`` and ``Y`` into ``k`` shuffled cross-validation folds.

    The rows are shuffled once with a fixed seed and cut into ``k`` nearly
    equal test parts; each fold uses one part for testing and all other rows
    for training. Rows are selected by position, so ``X`` and ``Y`` only need
    to have the same length and row order.

    Args:
        X: feature rows.
        Y: targets, aligned row by row with ``X``.
        k: number of folds (at least 2, at most the number of rows).

    Returns:
        A list of ``k`` tuples ``(X_train, X_test, Y_train, Y_test)``.

    Raises:
        ValueError: if ``X`` and ``Y`` differ in length or ``k`` is not in
            ``2..len(X)``.
    """
    random_state = 42
    row_count = len(X)

    if len(Y) != row_count:
        raise ValueError("X and Y must have the same number of rows")
    if not 2 <= k <= row_count:
        raise ValueError("k must be between 2 and the number of rows")

    shuffled_positions = np.random.default_rng(random_state).permutation(row_count)

    folds = []
    for test_positions in np.array_split(shuffled_positions, k):
        # Everything that is not in the current test part is training data.
        is_train = np.ones(row_count, dtype=bool)
        is_train[test_positions] = False
        train_positions = np.flatnonzero(is_train)

        folds.append(
            (
                X.iloc[train_positions],
                X.iloc[test_positions],
                Y.iloc[train_positions],
                Y.iloc[test_positions],
            )
        )

    return folds


def run_for_k_fold_cross_validation(
    dataset_manager: DatasetManager,
    classifier_generator: t.Callable[[t.Dict], BaseEstimator],
    params_set: t.Dict,
    X: pd.DataFrame,
    Y: pd.DataFrame,
):
    """Train one classifier per fold and keep the one with the highest F1.

    Args:
        dataset_manager: provides ``compute_metrics``.
        classifier_generator: builds a fresh, unfitted classifier from
            ``params_set``.
        params_set: parameters handed to ``classifier_generator``.
        X: feature rows.
        Y: encoded targets, aligned with ``X``.

    Returns:
        ``(best_classifier, best_metrics)``. ``best_classifier`` is ``None``
        only when ``generate_k_folds`` yields no folds; in that case the
        metrics are the all-zero placeholder.
    """
    best_current_classifier = None
    best_current_metrics = pd.DataFrame(
        {"Precision score": [0], "Recall score": [0], "F1 score": [0]}
    )

    for X_train, X_test, Y_train, Y_test in generate_k_folds(X=X, Y=Y):
        classifier = classifier_generator(params_set)
        classifier.fit(X_train, Y_train)

        Y_hat = pd.DataFrame({LABEL_VARIABLE: classifier.predict(X_test)})
        metrics = dataset_manager.compute_metrics(y_true=Y_test, y_hat=Y_hat)

        # Always accept the first fold: with a strict ">" against the zero
        # placeholder, a run whose folds all score F1 == 0 used to return
        # None instead of a fitted classifier.
        if (
            best_current_classifier is None
            or metrics["F1 score"].iloc[0] > best_current_metrics["F1 score"].iloc[0]
        ):
            best_current_metrics = metrics
            best_current_classifier = classifier

    return best_current_classifier, best_current_metrics


def run_for_parameter_set(
    dataset_manager: DatasetManager,
    dataset_dataframe: pd.DataFrame,
    classifier_generator: t.Callable[[t.Dict], BaseEstimator],
    params_set: t.Dict,
    label_combinations: t.List[t.Tuple[str, ...]],
    name: str,
    random_state: int,
    run_for_k_fold_cross_validation_fn: t.Callable[
        [
            DatasetManager,
            t.Callable[[t.Dict], BaseEstimator],
            t.Dict,
            pd.DataFrame,
            pd.DataFrame,
        ],
        t.Tuple[BaseEstimator, pd.DataFrame],
    ] = run_for_k_fold_cross_validation,
):
    """Evaluate one parameter set on every feature combination and plot it.

    For each entry of ``label_combinations`` the selected columns are
    one-hot encoded, a classifier is chosen by
    ``run_for_k_fold_cross_validation_fn``, its decision boundary and the
    data points are drawn on one subplot, and its metrics are displayed.

    Args:
        dataset_manager: encoding and scoring helper.
        dataset_dataframe: full dataset including the label column.
        classifier_generator: builds an unfitted classifier from ``params_set``.
        params_set: classifier parameters (also shown in the header).
        label_combinations: feature-name pairs; the first two names label
            the plot axes.
        name: classifier name shown in the header.
        random_state: only shown in the header; it is not passed on to the
            classifier here.
        run_for_k_fold_cross_validation_fn: strategy that trains and selects
            a classifier for one feature combination.

    Returns:
        ``(best_classifier, best_metrics)`` over all feature combinations.
    """
    best_classifier = None
    best_metrics = pd.DataFrame(
        {"Precision score": [0], "Recall score": [0], "F1 score": [0]}
    )

    plot_manager = PlotManager(cols=3, samples=len(label_combinations))

    display(HTML(f"<h1>{name} classifier</h1>"))
    display(HTML(f"<p>Parameters: {params_set}, random state: {random_state}.</p>"))

    # The target column does not depend on the feature combination, so it is
    # encoded once instead of on every iteration.
    Y = dataset_dataframe[LABEL_VARIABLE]
    Y_encoded = dataset_manager.encode_categorial_to_numerical(Y)

    for i, label_combination in enumerate(label_combinations):
        subplot = plot_manager[i]

        X = dataset_manager.labels_to_onehot(dataset_dataframe, label_combination)

        best_current_classifier, best_current_metrics = (
            run_for_k_fold_cross_validation_fn(
                dataset_manager=dataset_manager,
                classifier_generator=classifier_generator,
                params_set=params_set,
                X=X,
                Y=Y_encoded,
            )
        )

        # Accept the first result unconditionally so that an all-zero F1 run
        # still reports a classifier instead of None.
        if (
            best_classifier is None
            or best_current_metrics["F1 score"].iloc[0]
            > best_metrics["F1 score"].iloc[0]
        ):
            best_metrics = best_current_metrics
            best_classifier = best_current_classifier

        DecisionBoundaryDisplay.from_estimator(
            best_current_classifier,
            X,
            cmap=plt.cm.RdYlBu,
            response_method="predict",
            ax=subplot,
            xlabel=label_combination[0],
            ylabel=label_combination[1],
        )

        # Overlay the data points, coloured by their encoded class.
        subplot.scatter(
            X[X.columns[0]],
            X[X.columns[1]],
            c=Y_encoded,
            cmap=plt.cm.RdYlBu,
            linewidths=0.25,
            edgecolor="black",
            s=5,
        )

        display(
            HTML(
                f"<p>For input label combination {label_combination} and output label '{LABEL_VARIABLE}', best metrics were:</p>"
            )
        )
        display(best_current_metrics)

    plt.show()

    return best_classifier, best_metrics


def run(
    dataset_manager: DatasetManager,
    dataset_dataframe: pd.DataFrame,
    classifier_generator: t.Callable[[t.Dict], BaseEstimator],
    params_sets: t.List[t.Dict],
    label_combinations: t.List[t.Tuple[str, ...]],
    name: str,
    random_state: int,
    run_for_parameter_set_fn: t.Callable[
        [
            DatasetManager,
            pd.DataFrame,
            t.Callable[[t.Dict], BaseEstimator],
            t.Dict,
            t.List[t.Tuple[str, ...]],
            str,
            int,
        ],
        t.Tuple[BaseEstimator, pd.DataFrame],
    ] = run_for_parameter_set,
):
    """Evaluate every parameter set and return the best classifier found.

    Args:
        dataset_manager: encoding and scoring helper, passed through.
        dataset_dataframe: full dataset including the label column.
        classifier_generator: builds an unfitted classifier from one
            parameter set.
        params_sets: parameter sets to try, in order.
        label_combinations: feature-name combinations, passed through.
        name: classifier name, passed through.
        random_state: passed through to ``run_for_parameter_set_fn``.
        run_for_parameter_set_fn: strategy that evaluates one parameter set
            and returns ``(classifier, metrics)``.

    Returns:
        ``(best_classifier, best_metrics)`` for the parameter set with the
        highest F1 score; ``(None, all-zero metrics)`` when ``params_sets``
        is empty.
    """
    best_classifier = None
    best_metrics = pd.DataFrame(
        {"Precision score": [0], "Recall score": [0], "F1 score": [0]}
    )

    for params_set in params_sets:
        candidate_classifier, candidate_metrics = run_for_parameter_set_fn(
            dataset_manager=dataset_manager,
            dataset_dataframe=dataset_dataframe,
            classifier_generator=classifier_generator,
            params_set=params_set,
            label_combinations=label_combinations,
            name=name,
            random_state=random_state,
        )

        # Keep the best result so far; previously each iteration overwrote
        # the result, so the last parameter set always won.
        if (
            best_classifier is None
            or candidate_metrics["F1 score"].iloc[0] > best_metrics["F1 score"].iloc[0]
        ):
            best_classifier = candidate_classifier
            best_metrics = candidate_metrics

    return best_classifier, best_metrics

## Data preprocessing

In [None]:
# TODO: Perform better selection

# Hand-picked input columns; the classifier cells below build their feature
# combinations from this list.
SELECTED_FEATURES = ["NCP", "Height", "Weight", "FAVC", "family_history_with_overweight"]

## Classifiers

### 1. Decision trees

In [None]:
label_combinations = dataset_manager.make_label_combinations(SELECTED_FEATURES)
# One seed for this cell: it is given to the estimator and reported by run(),
# so the value shown in the output is the one that was actually used.
random_state = np.random.randint(low=0, high=100)
params_sets = [
    {"criterion": "gini", "splitter": "best"},
    {"criterion": "gini", "splitter": "random"},
]


run(
    dataset_manager=dataset_manager,
    dataset_dataframe=dataset_dataframe,
    classifier_generator=lambda params_set: tree.DecisionTreeClassifier(
        criterion=params_set["criterion"],
        splitter=params_set["splitter"],
        # Seed the tree so a run can be reproduced from the displayed seed.
        random_state=random_state,
    ),
    params_sets=params_sets,
    label_combinations=label_combinations,
    name="Decision trees",
    random_state=random_state,
)

### 2. RandomForest or ExtraTrees

In [None]:
label_combinations = dataset_manager.make_label_combinations(SELECTED_FEATURES)
# One seed for this cell: it is given to the estimator and reported by run(),
# so the value shown in the output is the one that was actually used.
random_state = np.random.randint(low=0, high=100)
params_sets = [
    {"criterion": "gini", "splitter": "best"},
    {"criterion": "gini", "splitter": "random"},
]

# NOTE(review): this trains a single tree.ExtraTreeClassifier, not an
# ensemble — confirm whether an ensemble estimator was intended here.
run(
    dataset_manager=dataset_manager,
    dataset_dataframe=dataset_dataframe,
    classifier_generator=lambda params_set: tree.ExtraTreeClassifier(
        criterion=params_set["criterion"],
        splitter=params_set["splitter"],
        # Seed the tree so a run can be reproduced from the displayed seed.
        random_state=random_state,
    ),
    params_sets=params_sets,
    label_combinations=label_combinations,
    name="ExtraTree",
    random_state=random_state,
)

### 3. XGBoost

In [None]:
label_combinations = dataset_manager.make_label_combinations(SELECTED_FEATURES)
# One seed for this cell: it is given to the estimator and reported by run(),
# so the value shown in the output is the one that was actually used.
random_state = np.random.randint(low=0, high=100)

# The parameter sets differ only in learning rate and number of boosting
# rounds. "num_class" is kept in each set (it is shown in the output) but is
# not forwarded to XGBClassifier below.
params_sets = [
    {
        "max_depth": 2,
        "learning_rate": learning_rate,
        "objective": "multi:softmax",
        "num_class": dataset_output_classes,
        "rounds": rounds,
    }
    for learning_rate, rounds in [(0.3, 100), (0.2, 200), (0.02, 500)]
]

run(
    dataset_manager=dataset_manager,
    dataset_dataframe=dataset_dataframe,
    classifier_generator=lambda params_set: xgb.XGBClassifier(
        max_depth=params_set["max_depth"],
        learning_rate=params_set["learning_rate"],
        n_estimators=params_set["rounds"],
        objective=params_set["objective"],
        tree_method="hist",
        random_state=random_state,
    ),
    params_sets=params_sets,
    label_combinations=label_combinations,
    name="XGBoost",
    random_state=random_state,
)


### 4. Naive Bayes

### 5. k-NN

In [None]:
label_combinations = dataset_manager.make_label_combinations(SELECTED_FEATURES)
random_state = np.random.randint(low=0, high=100)

# Full grid over neighbour count, search structure and distance metric; the
# rightmost option varies fastest.
params_sets = [
    {"n_neighbors": n_neighbors, "algorithm": algorithm, "metric": metric}
    for n_neighbors, algorithm, metric in itertools.product(
        (5, 3), ("ball_tree", "kd_tree"), ("minkowski", "cityblock")
    )
]


run(
    dataset_manager=dataset_manager,
    dataset_dataframe=dataset_dataframe,
    # Every key of a parameter set is a KNeighborsClassifier keyword argument.
    classifier_generator=lambda params_set: neighbors.KNeighborsClassifier(
        **params_set
    ),
    params_sets=params_sets,
    label_combinations=label_combinations,
    name="k-NN",
    random_state=42,
)

### 6. Neural networks

### 7. SVM

### 8. Logistic regression