In this notebook I focus on model selection. I will test several models and choose the best one for hyperparameter tuning. So far I have tested three models. Below are their Brier scores (smaller is better). All models are evaluated with 10-fold time-series cross-validation.

For the men's dataset:
1. Logistic regression: 0.20055
2. Random forest: 0.20602
3. Decision tree: 0.40546

For the women's dataset:
1. Logistic regression: 0.23619
2. Random forest: 0.56513
3. Decision tree: 0.70500

## Load libraries

In [18]:
import fireducks.pandas as pd
import numpy as np
from sklearn.metrics import brier_score_loss
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import TimeSeriesSplit
from custom_utils import transform_data, enrich_data
import mlflow

# Send every run in this notebook to the local MLflow tracking server.
tracking_server = "http://127.0.0.1:8080"
mlflow.set_tracking_uri(tracking_server)

# Kept as the final expression so the cell displays the selected Experiment.
mlflow.set_experiment(experiment_name="NCAA Classifiers")

<Experiment: artifact_location='mlflow-artifacts:/577994152132513070', creation_time=1747996421002, experiment_id='577994152132513070', last_update_time=1747996421002, lifecycle_stage='active', name='NCAA Classifiers', tags={}>

## Load and prepare data

In [6]:
# --- Raw inputs -------------------------------------------------------------
# Compact game results (regular season + NCAA tournament) for women and men.
wregularseason = pd.read_csv("data/WRegularSeasonCompactResults.csv")
wtourneyseason = pd.read_csv("data/WNCAATourneyCompactResults.csv")

mregularseason = pd.read_csv("data/MRegularSeasonCompactResults.csv")
mtourneyseason = pd.read_csv("data/MNCAATourneyCompactResults.csv")

# Season tables; loaded here but not used further in this cell.
mseason = pd.read_csv("data/MSeasons.csv")
wseason = pd.read_csv("data/WSeasons.csv")

# --- Stack regular-season and tournament games, keep seasons after 2015 ------
wgames_all = pd.concat([wregularseason, wtourneyseason], ignore_index=True)
wdata = wgames_all[wgames_all["Season"] > 2015].reset_index(drop=True)

mgames_all = pd.concat([mregularseason, mtourneyseason], ignore_index=True)
mdata = mgames_all[mgames_all["Season"] > 2015].reset_index(drop=True)

# --- Feature engineering via the project helpers -----------------------------
# Call order is kept as-is (transform: women, men; enrich: men, women).
wprep = transform_data(wdata)
mprep = transform_data(mdata)

mprep_enh = enrich_data(mprep, "M")
wprep_enh = enrich_data(wprep, "W")

# --- Split into features and target ------------------------------------------
# "Result" is the target; "DayNum" is dropped from the feature matrix.
ym = mprep_enh["Result"]
X_featm = mprep_enh.drop(columns=["Result", "DayNum"])

yw = wprep_enh["Result"]
X_featw = wprep_enh.drop(columns=["Result", "DayNum"])

# NOTE(review): TimeSeriesSplit assumes rows are already in chronological order.
# The concat above places all tournament games after all regular-season games;
# confirm that transform_data / enrich_data sort by time before relying on it.
tscv = TimeSeriesSplit(n_splits=10)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


## Model selection

### Decision tree

#### Men

In [15]:
# Time-series cross-validation of a default decision tree on the men's data.
# Logs one Brier score per fold plus their average to a single MLflow run.
with mlflow.start_run():
    mlflow.set_tags({"Gender": "Men", "Model" : "Decision tree", "Hyperparams" : "Basic"})
    brier_score_cv = {}

    for id_split, (train_idx, test_idx) in enumerate(tscv.split(X_featm)):
        X_train, y_train = X_featm.iloc[train_idx], ym.iloc[train_idx]
        X_test, y_test = X_featm.iloc[test_idx], ym.iloc[test_idx]

        # A fresh model per fold so nothing carries over between splits.
        dt_clf = DecisionTreeClassifier(random_state=42)
        dt_clf.fit(X_train, y_train)

        # Second predict_proba column = probability of the second class in classes_.
        pred_proba = dt_clf.predict_proba(X_test)[:, 1]
        # float() rather than .item(): works whether scikit-learn returns a NumPy
        # scalar or a built-in float (the latter has no .item() method).
        brier_score_fold = float(brier_score_loss(y_test, pred_proba))
        brier_score_cv[f"Fold_{id_split}_Brier_Score"] = brier_score_fold

    # The right-hand side is evaluated before the key is inserted, so only the
    # per-fold scores are averaged.
    brier_score_cv["Avg_Brier_Score"] = float(np.mean(list(brier_score_cv.values())))
    mlflow.log_metrics(brier_score_cv)

🏃 View run painted-auk-659 at: http://127.0.0.1:8080/#/experiments/577994152132513070/runs/dc13db4370f14aa2969063fd5f7cb8b3
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/577994152132513070


#### Women

In [22]:
# Time-series cross-validation of a default decision tree on the women's data.
# Logs one Brier score per fold plus their average to a single MLflow run.
with mlflow.start_run():
    mlflow.set_tags({"Gender": "Women", "Model" : "Decision tree", "Hyperparams" : "Basic"})
    brier_score_cv = {}

    for id_split, (train_idx, test_idx) in enumerate(tscv.split(X_featw)):
        X_train, y_train = X_featw.iloc[train_idx], yw.iloc[train_idx]
        X_test, y_test = X_featw.iloc[test_idx], yw.iloc[test_idx]

        # A fresh model per fold so nothing carries over between splits.
        dt_clf = DecisionTreeClassifier(random_state=42)
        dt_clf.fit(X_train, y_train)

        # Second predict_proba column = probability of the second class in classes_.
        pred_proba = dt_clf.predict_proba(X_test)[:, 1]
        # float() rather than .item(): works whether scikit-learn returns a NumPy
        # scalar or a built-in float (the latter has no .item() method).
        brier_score_fold = float(brier_score_loss(y_test, pred_proba))
        brier_score_cv[f"Fold_{id_split}_Brier_Score"] = brier_score_fold

    # The right-hand side is evaluated before the key is inserted, so only the
    # per-fold scores are averaged.
    brier_score_cv["Avg_Brier_Score"] = float(np.mean(list(brier_score_cv.values())))
    mlflow.log_metrics(brier_score_cv)

🏃 View run rumbling-conch-863 at: http://127.0.0.1:8080/#/experiments/577994152132513070/runs/a33e6985a49c4415b6d587f47430314a
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/577994152132513070


### Random forest

#### Men

In [16]:
# Time-series cross-validation of a default random forest on the men's data.
# Logs one Brier score per fold plus their average to a single MLflow run.
with mlflow.start_run():
    mlflow.set_tags({"Gender": "Men", "Model" : "Random forest", "Hyperparams" : "Basic"})
    brier_score_cv = {}

    for id_split, (train_idx, test_idx) in enumerate(tscv.split(X_featm)):
        X_train, y_train = X_featm.iloc[train_idx], ym.iloc[train_idx]
        X_test, y_test = X_featm.iloc[test_idx], ym.iloc[test_idx]

        # A fresh model per fold; named rf_clf (was dt_clf, a copy-paste leftover).
        rf_clf = RandomForestClassifier(random_state=42)
        rf_clf.fit(X_train, y_train)

        # Second predict_proba column = probability of the second class in classes_.
        pred_proba = rf_clf.predict_proba(X_test)[:, 1]
        # float() rather than .item(): works whether scikit-learn returns a NumPy
        # scalar or a built-in float (the latter has no .item() method).
        brier_score_fold = float(brier_score_loss(y_test, pred_proba))
        brier_score_cv[f"Fold_{id_split}_Brier_Score"] = brier_score_fold

    # The right-hand side is evaluated before the key is inserted, so only the
    # per-fold scores are averaged.
    brier_score_cv["Avg_Brier_Score"] = float(np.mean(list(brier_score_cv.values())))
    mlflow.log_metrics(brier_score_cv)

🏃 View run spiffy-skunk-267 at: http://127.0.0.1:8080/#/experiments/577994152132513070/runs/fdc7363b659247c2b6263a348746bd9f
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/577994152132513070


#### Women

In [23]:
# Time-series cross-validation of a default random forest on the women's data.
# Logs one Brier score per fold plus their average to a single MLflow run.
with mlflow.start_run():
    mlflow.set_tags({"Gender": "Women", "Model" : "Random forest", "Hyperparams" : "Basic"})
    brier_score_cv = {}

    for id_split, (train_idx, test_idx) in enumerate(tscv.split(X_featw)):
        X_train, y_train = X_featw.iloc[train_idx], yw.iloc[train_idx]
        X_test, y_test = X_featw.iloc[test_idx], yw.iloc[test_idx]

        # A fresh model per fold; named rf_clf (was dt_clf, a copy-paste leftover).
        rf_clf = RandomForestClassifier(random_state=42)
        rf_clf.fit(X_train, y_train)

        # Second predict_proba column = probability of the second class in classes_.
        pred_proba = rf_clf.predict_proba(X_test)[:, 1]
        # float() rather than .item(): works whether scikit-learn returns a NumPy
        # scalar or a built-in float (the latter has no .item() method).
        brier_score_fold = float(brier_score_loss(y_test, pred_proba))
        brier_score_cv[f"Fold_{id_split}_Brier_Score"] = brier_score_fold

    # The right-hand side is evaluated before the key is inserted, so only the
    # per-fold scores are averaged.
    brier_score_cv["Avg_Brier_Score"] = float(np.mean(list(brier_score_cv.values())))
    mlflow.log_metrics(brier_score_cv)

🏃 View run luminous-hen-922 at: http://127.0.0.1:8080/#/experiments/577994152132513070/runs/fdb0716291894185a3d3500cc1bc7cd1
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/577994152132513070


### Logistic Regression

#### Men

In [None]:
# Time-series cross-validation of a default logistic regression on the men's data.
# Logs one Brier score per fold plus their average to a single MLflow run.
with mlflow.start_run():
    mlflow.set_tags({"Gender": "Men", "Model" : "Logistic regression", "Hyperparams" : "Basic"})
    brier_score_cv = {}

    for id_split, (train_idx, test_idx) in enumerate(tscv.split(X_featm)):
        X_train, y_train = X_featm.iloc[train_idx], ym.iloc[train_idx]
        X_test, y_test = X_featm.iloc[test_idx], ym.iloc[test_idx]

        # A fresh model per fold; named lr_clf (was dt_clf, a copy-paste leftover).
        lr_clf = LogisticRegression(random_state=42)
        lr_clf.fit(X_train, y_train)

        # Second predict_proba column = probability of the second class in classes_.
        pred_proba = lr_clf.predict_proba(X_test)[:, 1]
        # float() rather than .item(): works whether scikit-learn returns a NumPy
        # scalar or a built-in float (the latter has no .item() method).
        brier_score_fold = float(brier_score_loss(y_test, pred_proba))
        brier_score_cv[f"Fold_{id_split}_Brier_Score"] = brier_score_fold

    # The right-hand side is evaluated before the key is inserted, so only the
    # per-fold scores are averaged.
    brier_score_cv["Avg_Brier_Score"] = float(np.mean(list(brier_score_cv.values())))
    mlflow.log_metrics(brier_score_cv)

#### Women

In [None]:
# Time-series cross-validation of a default logistic regression on the women's data.
# Logs one Brier score per fold plus their average to a single MLflow run.
with mlflow.start_run():
    mlflow.set_tags({"Gender": "Women", "Model" : "Logistic regression", "Hyperparams" : "Basic"})
    brier_score_cv = {}

    for id_split, (train_idx, test_idx) in enumerate(tscv.split(X_featw)):
        X_train, y_train = X_featw.iloc[train_idx], yw.iloc[train_idx]
        X_test, y_test = X_featw.iloc[test_idx], yw.iloc[test_idx]

        # A fresh model per fold; named lr_clf (was dt_clf, a copy-paste leftover).
        lr_clf = LogisticRegression(random_state=42)
        lr_clf.fit(X_train, y_train)

        # Second predict_proba column = probability of the second class in classes_.
        pred_proba = lr_clf.predict_proba(X_test)[:, 1]
        # float() rather than .item(): works whether scikit-learn returns a NumPy
        # scalar or a built-in float (the latter has no .item() method).
        brier_score_fold = float(brier_score_loss(y_test, pred_proba))
        brier_score_cv[f"Fold_{id_split}_Brier_Score"] = brier_score_fold

    # The right-hand side is evaluated before the key is inserted, so only the
    # per-fold scores are averaged.
    brier_score_cv["Avg_Brier_Score"] = float(np.mean(list(brier_score_cv.values())))
    mlflow.log_metrics(brier_score_cv)