In [1]:
# Define the metric and load the data
import pandas as pd
from sklearn.metrics import f1_score, make_scorer

# Load the feature table and the target values
data = pd.read_csv("../data/processed/compositing/data.csv")
target_table = pd.read_csv("../data/processed/compositing/target.csv")
target = target_table["0"]  # the target values live in the column named "0"

# Scorer object wrapping scikit-learn's F1 score
f1_scorer = make_scorer(f1_score)

In [2]:
# Create save folder and wrapper functions
from pathlib import Path
from typing import Any

# Folder handed to each search as `save_folder`; create it up front if missing
save_folder = "../models/"
models_dir = Path(save_folder)
models_dir.mkdir(exist_ok=True, parents=True)


def _suggest_categorical(*args: Any, **kwargs: Any) -> tuple:  # noqa: ANN401
    """Capture the arguments of a deferred ``suggest_categorical`` call.

    Nothing is sampled here; the arguments are only recorded.
    NOTE(review): presumably ``hyperparam_search`` dispatches on the recorded
    name when building each trial — confirm in ``slc.models``.

    Returns:
        The 3-tuple ``("suggest_categorical", args, kwargs)``.
    """
    method_name = "suggest_categorical"
    return (method_name, args, kwargs)


def _suggest_discrete_uniform(*args: Any, **kwargs: Any) -> tuple:  # noqa: ANN401
    """Capture the arguments of a deferred ``suggest_discrete_uniform`` call.

    Nothing is sampled here; the arguments are only recorded. No search space
    visible in this notebook uses this wrapper.
    NOTE(review): Optuna has deprecated ``Trial.suggest_discrete_uniform`` in
    favour of ``suggest_float(..., step=...)`` — confirm this is still needed.

    Returns:
        The 3-tuple ``("suggest_discrete_uniform", args, kwargs)``.
    """
    method_name = "suggest_discrete_uniform"
    return (method_name, args, kwargs)


def _suggest_float(*args: Any, **kwargs: Any) -> tuple:  # noqa: ANN401
    """Capture the arguments of a deferred ``suggest_float`` call.

    Nothing is sampled here; the arguments are only recorded.
    NOTE(review): presumably ``hyperparam_search`` dispatches on the recorded
    name when building each trial — confirm in ``slc.models``.

    Returns:
        The 3-tuple ``("suggest_float", args, kwargs)``.
    """
    method_name = "suggest_float"
    return (method_name, args, kwargs)


def _suggest_int(*args: Any, **kwargs: Any) -> tuple:  # noqa: ANN401
    """Capture the arguments of a deferred ``suggest_int`` call.

    Nothing is sampled here; the arguments are only recorded.
    NOTE(review): presumably ``hyperparam_search`` dispatches on the recorded
    name when building each trial — confirm in ``slc.models``.

    Returns:
        The 3-tuple ``("suggest_int", args, kwargs)``.
    """
    method_name = "suggest_int"
    return (method_name, args, kwargs)

In [3]:
from skelm import ELMClassifier
from slc.models import hyperparam_search

# Extreme Learning Machine: parameter ranges explored by the search
# NOTE(review): only `1` and `None` are tried for `n_neurons` — confirm intended
search_space = [
    _suggest_float("alpha", 1e-8, 1e5, log=True),
    _suggest_categorical("include_original_features", [True, False]),
    _suggest_categorical("ufunc", ["tanh", "sigm", "relu", "lin"]),
    _suggest_categorical("n_neurons", [1, None]),
    _suggest_float("density", 0.01, 0.99),
]
elm_model = ELMClassifier(random_state=42)

# Trial budget, folder holding the saved study/model, and seed
elm_search_options = {"n_trials": 100, "save_folder": save_folder, "random_state": 42}
elm_pipe, elm_study = hyperparam_search(
    elm_model, search_space, data, target, f1_scorer, **elm_search_options
)

elm_pipe

Files already exist, skipping search: ../models/ELMClassifier_study.pkl, ../models/ELMClassifier.pkl


In [4]:
# K-Nearest Neighbour
from sklearn.neighbors import KNeighborsClassifier
from slc.models import hyperparam_search

# Parameter ranges explored for the k-nearest-neighbour classifier
search_space = [
    _suggest_int("n_neighbors", 1, 100),
    _suggest_categorical("weights", ["uniform", "distance"]),
    _suggest_categorical("algorithm", ["auto", "ball_tree", "kd_tree", "brute"]),
]
knn_model = KNeighborsClassifier(n_jobs=-1)

# Trial budget, folder holding the saved study/model, and seed
knn_search_options = {"n_trials": 100, "save_folder": save_folder, "random_state": 42}
knn_pipe, knn_study = hyperparam_search(
    knn_model, search_space, data, target, f1_scorer, **knn_search_options
)

knn_pipe

Files already exist, skipping search: ../models/KNeighborsClassifier_study.pkl, ../models/KNeighborsClassifier.pkl


In [5]:
# Linear classifier trained with stochastic gradient descent (SGD)
from sklearn.linear_model import SGDClassifier

# scikit-learn only uses `l1_ratio` when penalty="elasticnet"; with the default
# penalty="l2" the tuned `l1_ratio` would be silently ignored. Elastic net still
# covers pure L2 (l1_ratio=0) and pure L1 (l1_ratio=1), so nothing is lost.
# NOTE: a study/model already saved in `save_folder` predates this setting and
# has to be removed for the search to run again with it.
sgd_model = SGDClassifier(penalty="elasticnet", random_state=42)
search_space = [
    _suggest_categorical(
        "loss",
        [
            "squared_epsilon_insensitive",
            "modified_huber",
            "log_loss",
            "perceptron",
            "squared_error",
            "squared_hinge",
            "epsilon_insensitive",
            "huber",
            "hinge",
        ],
    ),
    _suggest_float("alpha", 1e-6, 1e5, log=True),
    _suggest_float("l1_ratio", 0, 1),
    _suggest_int("max_iter", 1, 10000),
]

sgd_pipe, sgd_study = hyperparam_search(
    sgd_model,
    search_space,
    data,
    target,
    f1_scorer,
    n_trials=100,
    save_folder=save_folder,
    random_state=42,
)

sgd_pipe

Files already exist, skipping search: ../models/SGDClassifier_study.pkl, ../models/SGDClassifier.pkl


In [6]:
# Support Vector Machine
from sklearn.svm import SVC

# Parameter ranges explored for the support vector classifier
search_space = [
    _suggest_float("C", 1e-5, 1e5, log=True),
    _suggest_categorical("kernel", ["poly", "rbf", "sigmoid"]),
]
svm_model = SVC()

# Trial budget, folder holding the saved study/model, seed, and the
# standardization switch that only this search sets
svm_search_options = {
    "n_trials": 100,
    "save_folder": save_folder,
    "random_state": 42,
    "always_standardize": True,  # TODO: pack this info into the paper
}
svm_pipe, svm_study = hyperparam_search(
    svm_model, search_space, data, target, f1_scorer, **svm_search_options
)

svm_pipe

Files already exist, skipping search: ../models/SVC_study.pkl, ../models/SVC.pkl


In [7]:
from sklearn.ensemble import ExtraTreesClassifier

# Parameter ranges explored for the extremely randomized trees classifier
search_space = [
    _suggest_int("n_estimators", 1, 200),
    _suggest_float("min_impurity_decrease", 1e-5, 0.5, log=True),
    _suggest_categorical("criterion", ["gini", "entropy", "log_loss"]),
]
et_model = ExtraTreesClassifier(n_jobs=-1, random_state=42)

# Trial budget, folder holding the saved study/model, and seed
et_search_options = {"n_trials": 100, "save_folder": save_folder, "random_state": 42}
et_pipe, et_study = hyperparam_search(
    et_model, search_space, data, target, f1_scorer, **et_search_options
)

et_pipe

Files already exist, skipping search: ../models/ExtraTreesClassifier_study.pkl, ../models/ExtraTreesClassifier.pkl


In [8]:
# Histogram-based gradient boosting classifier
from sklearn.ensemble import HistGradientBoostingClassifier

# Parameter ranges explored by the search
search_space = [
    _suggest_int("max_iter", 10, 1000),
    _suggest_float("learning_rate", 0.001, 0.5, log=True),
    _suggest_int("max_leaf_nodes", 2, 1000),
    _suggest_categorical("l2_regularization", [0, 1e-10, 1e-5, 1e-3, 1e-1, 1]),
]
hgb_model = HistGradientBoostingClassifier(random_state=42)

# Trial budget, folder holding the saved study/model, and seed
hgb_search_options = {"n_trials": 100, "save_folder": save_folder, "random_state": 42}
hgb_pipe, hgb_study = hyperparam_search(
    hgb_model, search_space, data, target, f1_scorer, **hgb_search_options
)

hgb_pipe

Resuming search from cache at trial 23.


[I 2024-09-07 16:22:24,282] Trial 23 finished with value: 0.625 and parameters: {'do_standardize': True, 'do_pca': False, 'max_iter': 711, 'learning_rate': 0.01737043138817676, 'max_leaf_nodes': 216, 'l2_regularization': 1e-05}. Best is trial 9 with value: 0.632768361581921.
[I 2024-09-07 16:23:29,074] Trial 24 finished with value: 0.6143790849673203 and parameters: {'do_standardize': True, 'do_pca': False, 'max_iter': 723, 'learning_rate': 0.02014196795989707, 'max_leaf_nodes': 249, 'l2_regularization': 1e-05}. Best is trial 9 with value: 0.632768361581921.
[I 2024-09-07 16:24:14,341] Trial 25 finished with value: 0.6358381502890174 and parameters: {'do_standardize': True, 'do_pca': False, 'max_iter': 839, 'learning_rate': 0.037908057335103, 'max_leaf_nodes': 254, 'l2_regularization': 1e-05}. Best is trial 25 with value: 0.6358381502890174.
[I 2024-09-07 16:24:27,264] Trial 26 finished with value: 0.56 and parameters: {'do_standardize': False, 'do_pca': True, 'n_components': 242, 'max

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/Users/peterhofinger/miniforge3/envs/slc/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3577, in run_code
  File "/var/folders/rk/ysjkqtws30s2_qqhb20f6vqm0000gn/T/ipykernel_5407/1861627933.py", line 12, in <module>
    hgb_pipe, hgb_study = hyperparam_search(
                          ^^^^^^^^^^^^^^^^^^
  File "/Users/peterhofinger/Documents/leaf-type-mixture/src/slc/models.py", line 562, in hyperparam_search
    study.optimize(_objective, callbacks=[_callback], n_trials=n_trials, n_jobs=n_jobs)
  File "/Users/peterhofinger/miniforge3/envs/slc/lib/python3.12/site-packages/optuna/study/study.py", line 475, in optimize
  File "/Users/peterhofinger/miniforge3/envs/slc/lib/python3.12/site-packages/optuna/study/_optimize.py", line 63, in _optimize
  File "/Users/peterhofinger/miniforge3/envs/slc/lib/python3.12/site-packages/optuna/study/_optimize.py", line 160, in _optimize_sequential
  File "/Users/peterhofinger/miniforge3/en

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Parameter ranges explored for the random forest classifier. The float-valued
# `min_samples_split` / `min_samples_leaf` are read by scikit-learn as
# fractions of the number of samples.
search_space = [
    _suggest_int("n_estimators", 1, 200),
    _suggest_int("max_depth", 1, 1000),
    _suggest_float("max_features", 0.1, 1.0),
    _suggest_float("min_samples_split", 1e-5, 0.5, log=True),
    _suggest_float("min_samples_leaf", 1e-5, 0.5, log=True),
    _suggest_categorical("bootstrap", [True, False]),
    _suggest_categorical("criterion", ["gini", "entropy", "log_loss"]),
]
rf_model = RandomForestClassifier(n_jobs=-1, random_state=42)

# Trial budget, folder holding the saved study/model, and seed
rf_search_options = {"n_trials": 100, "save_folder": save_folder, "random_state": 42}
rf_pipe, rf_study = hyperparam_search(
    rf_model, search_space, data, target, f1_scorer, **rf_search_options
)

rf_pipe

In [None]:
from slc.models import hyperparam_search
from xgboost import XGBClassifier

# Parameter ranges explored for the XGBoost classifier
# NOTE(review): `reg_alpha` is sampled as an integer in [40, 180], so every
# trial applies a large L1 penalty — confirm this range is intended.
search_space = [
    _suggest_int("n_estimators", 10, 200),
    _suggest_int("max_depth", 1, 20),
    _suggest_float("learning_rate", 0.001, 0.5, log=True),
    _suggest_float("gamma", 0, 0.5),
    _suggest_int("min_child_weight", 1, 11),
    _suggest_int("reg_alpha", 40, 180),
    _suggest_float("reg_lambda", 0, 1),
    _suggest_float("colsample_bytree", 0.5, 1),
]
xgb_model = XGBClassifier(n_jobs=-1, random_state=42)

# Trial budget, folder holding the saved study/model, and seed
xgb_search_options = {"n_trials": 100, "save_folder": save_folder, "random_state": 42}
xgb_pipe, xgb_study = hyperparam_search(
    xgb_model, search_space, data, target, f1_scorer, **xgb_search_options
)

xgb_pipe

In [None]:
# Plot the hyperparam search scores during search
import matplotlib.pyplot as plt
import scienceplots  # noqa: F401

plt.style.use("science")

studies = [
    elm_study,
    knn_study,
    sgd_study,
    svm_study,
    et_study,
    hgb_study,
    rf_study,
    xgb_study,
]

# Derive the grid from the number of studies so that adding or removing a model
# neither drops a study silently nor leaves empty panels in the figure.
# Eight studies still give the 2x4 grid with a 20x10 inch figure.
n_cols = 4
n_rows = max(1, -(-len(studies) // n_cols))  # ceiling division
fig, axs = plt.subplots(
    n_rows, n_cols, figsize=(5 * n_cols, 5 * n_rows), squeeze=False
)
axes = axs.ravel()  # squeeze=False guarantees a 2-D array, flatten it once

# One panel per study: per-trial score plus a dashed line at the best score.
# The last grid row may hold more axes than studies, hence strict=False.
for ax, study in zip(axes, studies, strict=False):
    trials = study.trials_dataframe()
    ax.plot(trials["value"], label="Score")
    ax.axhline(study.best_value, color="g", linestyle="--", label="Best score")
    ax.set(title=study.study_name, xlabel="Trial", ylabel="F1 Score", ylim=(0, 1))
    ax.legend()

# Hide grid cells that have no study to show
for unused_ax in axes[len(studies) :]:
    unused_ax.set_visible(False)

figure_path = "../reports/figures/hyperparameter_tuning/Hyperparameter Tuning.svg"
Path(figure_path).parent.mkdir(parents=True, exist_ok=True)
fig.savefig(figure_path, dpi=300, transparent=True)

In [None]:
from collections import defaultdict

import pandas as pd
from sklearn.metrics import accuracy_score, cohen_kappa_score, f1_score
from sklearn.model_selection import StratifiedKFold, cross_validate
from tqdm.notebook import tqdm

# Cross-validated scores of the tuned pipelines are cached in this CSV. When
# the file exists it is reused as-is, so delete it after re-tuning any model.
csv_path = "../reports/hyperparameter_tuning.csv"
scoring = {
    "F1 Score": make_scorer(f1_score),
    "Accuracy": make_scorer(accuracy_score),
    "Kappa": make_scorer(cohen_kappa_score),
}

tuned_models = [
    elm_pipe,
    knn_pipe,
    sgd_pipe,
    svm_pipe,
    rf_pipe,
    et_pipe,
    hgb_pipe,
    xgb_pipe,
]

if not Path(csv_path).exists():
    # Name each row after the class of the pipeline's final step
    model_names = [model.steps[-1][1].__class__.__name__ for model in tuned_models]
    columns = defaultdict(list)
    columns["Model"] = model_names

    # Cross validate the tuned models. The splitter is seeded, so one instance
    # yields the same folds for every model and can be created once.
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    for model in tqdm(tuned_models):
        cv_result = cross_validate(
            model, data, target, cv=cv, scoring=scoring, n_jobs=-1
        )

        # Keep the mean test score across folds for each metric
        for metric in scoring:
            columns[metric].append(cv_result[f"test_{metric}"].mean())

    # Create dataframe
    results = pd.DataFrame(columns)
    results = results.set_index("Model")
    # Do not rely on another cell having created the reports folder
    Path(csv_path).parent.mkdir(parents=True, exist_ok=True)
    results.to_csv(csv_path)
else:
    results = pd.read_csv(csv_path, index_col="Model")

# Sort the dataframe by F1 Score
results.sort_values("F1 Score", ascending=False)