In [60]:
# Dependencies

# Data Manip
import pandas as pd

# Linear Algebra
import numpy as np

# Machine Learning
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from models.lr import LREstimator
from models.dummy import DummyEstimator
from sklearn.decomposition import PCA

# Optimization
import optuna
from optuna.storages import RDBStorage
from functools import partial

# System & Files
import os
import json

# Visualization
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from tabulate import tabulate

seed = 42
np.random.seed(seed)
import random
random.seed(seed)

In [61]:
# Data collection

data_path = "data/nba_filtered_capped.csv"
df = pd.read_csv(data_path)
df = df.drop(columns="Name")

In [62]:
# Train / Test split
target = "TARGET_5Yrs"

X = df.drop(columns=[target]).values
y = df[target].values

test_size=0.2
X_train, X_test, y_train, y_test = train_test_split(X,y,stratify=y,test_size=test_size,shuffle=True,random_state=seed)

n_splits = 4 # to get same number of samples within validation fold as in test set

In [63]:
# Model
max_iter = 10000
n_components = 0.99
cv_folds = 4
beta = 0.5 # precision twice as important as recall
n_trials = 200
scaler_type = "robust"

params = {
    "dummy":{
        "model": "dummy",
        "cv_folds": cv_folds,
        "random_state": seed,
        "strategy": "most_frequent",
        "beta": beta,
    },
    "base": {
        "model": "lr",
        "penalty": None,
        "random_state": seed,
        "cv_folds": cv_folds,
        "class_weight": "balanced",
        "n_jobs": -1,
        "max_iter": max_iter,
        "scaler_type": scaler_type,
        "pca_level": None,
        "beta": beta,
    },
    "l1": {
        "model": "lr_l1",
        "penalty": "l1", 
        "random_state": seed,
        "cv_folds": cv_folds,
        "solver": "liblinear",
        "class_weight": "balanced",
        "max_iter": max_iter,
        "scaler_type": scaler_type,
        "pca_level": None,
        "beta": beta,
        "c_min": 1e-1,
        "c_max": 1,
        "n_trials": n_trials,
    },
    "l2": {
        "model": "lr_l2",
        "penalty": "l2", 
        "random_state": seed,
        "cv_folds": cv_folds,
        "solver": "liblinear",
        "class_weight": "balanced",
        "max_iter": max_iter,
        "scaler_type": scaler_type,
        "pca_level": None,
        "beta": beta,
        "c_min": 1e-2,
        "c_max": 1,
        "n_trials": n_trials,
    },
    "base_pca": {
        "model": "lr_pca",
        "penalty": None,
        "random_state": seed,
        "cv_folds": cv_folds,
        "class_weight": "balanced",
        "n_jobs": -1,
        "max_iter": max_iter,
        "scaler_type": scaler_type,
        "pca_level": n_components,
        "beta": beta,
    },
    "l1_pca": {
        "model": "lr_l1_pca",
        "penalty": "l1", 
        "random_state": seed,
        "cv_folds": cv_folds,
        "solver": "liblinear",
        "class_weight": "balanced",
        "max_iter": max_iter,
        "scaler_type": scaler_type,
        "pca_level": n_components, 
        "beta": beta,
        "c_min": 1e-2,
        "c_max": 1,
        "n_trials": n_trials,
    },
    "l2_pca": {
        "model": "lr_l2_pca",
        "penalty": "l2", 
        "random_state": seed,
        "cv_folds": cv_folds,
        "solver": "liblinear",
        "class_weight": "balanced",
        "max_iter": max_iter,
        "scaler_type": scaler_type,
        "pca_level": n_components,
        "beta": beta,
        "c_min": 1e-2,
        "c_max": 1,
        "n_trials": n_trials,
    }

}

In [64]:
# HP fine-tuning
def objective(trial, args: dict, X_train: np.array, y_train: np.array):
    model_name = args["model"]
    if "lr" in model_name:
        h_params = args.copy()
        h_params["C"] = trial.suggest_float("C", args["c_min"], args["c_max"], log=True)
        if args["scaler_type"] == "robust":
            h_params["scaler"] = RobustScaler()
        if args["pca_level"] is not None:
            h_params["pca"] = PCA(n_components=args["pca_level"], random_state=args["random_state"])
        else:
            h_params["pca"] = None
    else:
        raise NotImplementedError(f"Objective function not developped for model {model_name}")
    
    return cv_score(model_name, h_params, X_train, y_train)

def cv_score(model_name: str, h_params: dict, X_train: np.array, y_train: np.array):
    if "lr" in model_name:
        model = LREstimator(h_params)
        score = model.cross_validate(X_train, y_train)
        return score
    else:
        raise NotImplementedError(f"Scoring function not developped for model {model_name}")


storage_url = "sqlite:///db.sqlite3"
storage = RDBStorage(url=storage_url)

for config_name, config_args in params.items():
    study_name = config_args["model"] + "_" + target
    if "c_min" in config_args.keys(): # only treat regularized models
        studies = storage.get_all_studies()
        if any(s.study_name == study_name for s in studies):
            optuna.delete_study(study_name=study_name, storage=storage_url)
            print(f"Deleted existing study: {study_name}")
        
        study = optuna.create_study(
            storage=storage_url,
            study_name=study_name,
            direction="maximize",
            sampler=optuna.samplers.TPESampler(seed=seed)
        )
        
        objective_partial = partial(
            objective,
            args=config_args,
            X_train=X_train,
            y_train=y_train,
        )
    
        print(f"Optimizing {study_name}...")
        study.optimize(objective_partial, n_trials=config_args["n_trials"], n_jobs=-1)
        print(f"Best value for {study_name}: {study.best_value}")
        
        # Verify the best score by re-running with best params
        best_params_copy = config_args.copy()
        best_params_copy["C"] = study.best_params["C"]
        if best_params_copy["scaler_type"] == "robust":
            best_params_copy["scaler"] = RobustScaler()
        if best_params_copy["pca_level"] is not None:
            best_params_copy["pca"] = PCA(n_components=best_params_copy["pca_level"], random_state=seed)
        else:
            best_params_copy["pca"] = None
            
        verification_score = cv_score(study_name.split("_")[0], best_params_copy, X_train, y_train)
        print(f"Verification score for {study_name}: {verification_score} (should match best: {study.best_value})")

[I 2025-10-03 12:47:23,553] A new study created in RDB with name: lr_l1_TARGET_5Yrs


Deleted existing study: lr_l1_TARGET_5Yrs
Optimizing lr_l1_TARGET_5Yrs...


[I 2025-10-03 12:47:24,083] Trial 5 finished with value: 0.7709378897648195 and parameters: {'C': 0.23230315797210782}. Best is trial 5 with value: 0.7709378897648195.
[I 2025-10-03 12:47:24,111] Trial 3 finished with value: 0.7703038984035535 and parameters: {'C': 0.5839629328584863}. Best is trial 5 with value: 0.7709378897648195.
[I 2025-10-03 12:47:24,156] Trial 4 finished with value: 0.7709300713548545 and parameters: {'C': 0.1983067955827161}. Best is trial 5 with value: 0.7709378897648195.
[I 2025-10-03 12:47:24,194] Trial 9 finished with value: 0.7703038984035535 and parameters: {'C': 0.48252877861526766}. Best is trial 5 with value: 0.7709378897648195.
[I 2025-10-03 12:47:24,444] Trial 2 finished with value: 0.7657963861153664 and parameters: {'C': 0.10244881998323918}. Best is trial 5 with value: 0.7709378897648195.
[I 2025-10-03 12:47:24,488] Trial 1 finished with value: 0.7729934469467612 and parameters: {'C': 0.3107259996287767}. Best is trial 1 with value: 0.7729934469467

Best value for lr_l1_TARGET_5Yrs: 0.7745586985530044
Verification score for lr_l1_TARGET_5Yrs: 0.7745586985530044 (should match best: 0.7745586985530044)


[I 2025-10-03 12:47:31,931] A new study created in RDB with name: lr_l2_TARGET_5Yrs


Deleted existing study: lr_l2_TARGET_5Yrs
Optimizing lr_l2_TARGET_5Yrs...


[I 2025-10-03 12:47:32,343] Trial 1 finished with value: 0.7634908643002415 and parameters: {'C': 0.3598130340301906}. Best is trial 1 with value: 0.7634908643002415.
[I 2025-10-03 12:47:32,400] Trial 4 finished with value: 0.7682618521565087 and parameters: {'C': 0.028616338966572674}. Best is trial 4 with value: 0.7682618521565087.
[I 2025-10-03 12:47:32,437] Trial 7 finished with value: 0.76259191535705 and parameters: {'C': 0.017825527113339065}. Best is trial 4 with value: 0.7682618521565087.
[I 2025-10-03 12:47:32,474] Trial 10 finished with value: 0.7734770018563072 and parameters: {'C': 0.08081080599855617}. Best is trial 10 with value: 0.7734770018563072.
[I 2025-10-03 12:47:32,501] Trial 12 finished with value: 0.7680666711661545 and parameters: {'C': 0.20134552292014604}. Best is trial 10 with value: 0.7734770018563072.
[I 2025-10-03 12:47:32,647] Trial 0 finished with value: 0.7630953739061864 and parameters: {'C': 0.43644431495830666}. Best is trial 10 with value: 0.773477

Best value for lr_l2_TARGET_5Yrs: 0.7747103177332844
Verification score for lr_l2_TARGET_5Yrs: 0.7747103177332844 (should match best: 0.7747103177332844)
Deleted existing study: lr_l1_pca_TARGET_5Yrs
Optimizing lr_l1_pca_TARGET_5Yrs...


[I 2025-10-03 12:47:40,620] Trial 3 finished with value: 0.7711968227134284 and parameters: {'C': 0.22173222260897327}. Best is trial 3 with value: 0.7711968227134284.
[I 2025-10-03 12:47:40,812] Trial 6 finished with value: 0.7480267533585544 and parameters: {'C': 0.04039258497527492}. Best is trial 3 with value: 0.7711968227134284.
[I 2025-10-03 12:47:40,872] Trial 4 finished with value: 0.7725886072673686 and parameters: {'C': 0.3438633777304755}. Best is trial 4 with value: 0.7725886072673686.
[I 2025-10-03 12:47:40,902] Trial 10 finished with value: 0.7750665396649448 and parameters: {'C': 0.464029957481303}. Best is trial 10 with value: 0.7750665396649448.
[I 2025-10-03 12:47:40,962] Trial 11 finished with value: 0.7724834516258935 and parameters: {'C': 0.202721858060749}. Best is trial 10 with value: 0.7750665396649448.
[I 2025-10-03 12:47:40,988] Trial 9 finished with value: 0.768423676333286 and parameters: {'C': 0.1236522469875996}. Best is trial 10 with value: 0.775066539664

Best value for lr_l1_pca_TARGET_5Yrs: 0.7761247652832185
Verification score for lr_l1_pca_TARGET_5Yrs: 0.7761247652832185 (should match best: 0.7761247652832185)


[I 2025-10-03 12:47:48,584] A new study created in RDB with name: lr_l2_pca_TARGET_5Yrs


Deleted existing study: lr_l2_pca_TARGET_5Yrs
Optimizing lr_l2_pca_TARGET_5Yrs...


[I 2025-10-03 12:47:49,088] Trial 0 finished with value: 0.7738468954584985 and parameters: {'C': 0.061627880377664666}. Best is trial 0 with value: 0.7738468954584985.
[I 2025-10-03 12:47:49,138] Trial 1 finished with value: 0.7650784140407272 and parameters: {'C': 0.014722975149352549}. Best is trial 0 with value: 0.7738468954584985.
[I 2025-10-03 12:47:49,186] Trial 4 finished with value: 0.7738208610381411 and parameters: {'C': 0.08876476893173993}. Best is trial 0 with value: 0.7738468954584985.
[I 2025-10-03 12:47:49,188] Trial 7 finished with value: 0.770327894442901 and parameters: {'C': 0.4150065822997125}. Best is trial 0 with value: 0.7738468954584985.
[I 2025-10-03 12:47:49,217] Trial 8 finished with value: 0.770327894442901 and parameters: {'C': 0.3296528915806858}. Best is trial 0 with value: 0.7738468954584985.
[I 2025-10-03 12:47:49,235] Trial 5 finished with value: 0.7684296272455027 and parameters: {'C': 0.03985975081133386}. Best is trial 0 with value: 0.773846895458

Best value for lr_l2_pca_TARGET_5Yrs: 0.7762884839303122
Verification score for lr_l2_pca_TARGET_5Yrs: 0.7762884839303122 (should match best: 0.7762884839303122)


In [65]:
# Training Pipeline
for key in ["l1", "l2", "l1_pca", "l2_pca"]:
    study_name = params[key]["model"] + "_" + target
    study = optuna.load_study(study_name=study_name, storage=storage_url)
    best_C = study.best_params["C"]
    params[key]["C"] = best_C
    if params[key]["scaler_type"] == "robust":
        params[key]["scaler"] = RobustScaler()
    if params[key]["pca_level"] is not None:
        params[key]["pca"] = PCA(n_components=n_components, random_state=seed)
    else:
        params[key]["pca"] = None

for key in ["dummy", "base", "base_pca"]:
    if "scaler_type" in params[key] and params[key]["scaler_type"] == "robust":
        params[key]["scaler"] = RobustScaler()
    if "pca_level" in params[key] and params[key]["pca_level"] is not None:
        params[key]["pca"] = PCA(n_components=n_components, random_state=seed)
    else:
        params[key]["pca"] = None

dummy = DummyEstimator(params["dummy"])
lr = LREstimator(params["base"])
lr_pca = LREstimator(params["base_pca"])
lr_l1 = LREstimator(params["l1"])
lr_l1_pca = LREstimator(params["l1_pca"])
lr_l2 = LREstimator(params["l2"])
lr_l2_pca = LREstimator(params["l2_pca"])

In [66]:
# Fine tuning
for model in [dummy, lr, lr_pca, lr_l1, lr_l1_pca, lr_l2, lr_l2_pca]:
    model.cross_validate(X_train, y_train)

In [67]:
fig = make_subplots(
    rows=2, cols=3,
    subplot_titles=[
        "Train Precision", "Train Recall", f"Train F{beta}",
        "Valid Precision", "Valid Recall", f"Valid F{beta}"
    ]
)

model_names = [
    ("dummy", "Dummy"),
    ("lr", "Base"),
    ("lr_l1", "L1"),
    ("lr_l2", "L2"),
    ("lr_pca", "Base (PCA)"),
    ("lr_l1_pca", "L1 (PCA)"),
    ("lr_l2_pca", "L2 (PCA)")
]

# Unique color per model
model_colors = {
    "Dummy": "red",
    "Base": "blue",
    "L1": "orange",
    "L2": "green",
    "Base (PCA)": "blue",
    "L1 (PCA)": "orange",
    "L2 (PCA)": "green"
}

metrics = ["precision", "recall", f"F{beta}"]
scenarios = ["train", "valid"]

for row, scenario in enumerate(scenarios, start=1):
    for col, metric in enumerate(metrics, start=1):
        for model_var, model_label in model_names:
            model = globals()[model_var]
            y_vals = model.cv_scores[scenario][metric]
            is_pca = "PCA" in model_label
            fig.add_trace(
                go.Scatter(
                    y=y_vals,
                    mode="lines+markers",
                    name=model_label,
                    legendgroup=model_label,
                    line=dict(
                        dash="dash" if is_pca else "solid",
                        color=model_colors[model_label]
                    ),
                    showlegend=(row == 1 and col == 1)
                ),
                row=row, col=col
            )

fig.update_layout(
    height=700, width=1500,
    title_text="Cross-Validation Metrics for All Models",
    legend_title_text="Model"
)
fig.show()

1) Model generalize very well (metrics across train / valid folds are close) <br>
2) All LR models outperform dummy classifier. <br>
3) PCA seems to improving both precision and recall (especially during validation) : denoising the data helps the model focusing on relevant patterns !
4) Best model seems to be L2 + PCA : best score on validation + improves the learning process in the sens of the given metric (focusing more on improving precision than recall)

In [68]:
# Mettrics comparison
results = []

for model in [dummy, lr, lr_pca, lr_l1, lr_l1_pca, lr_l2, lr_l2_pca]:
    row = [model.name]
    for scenario in scenarios:
        for metric in metrics:
            avg = np.mean(model.cv_scores[scenario][metric])
            row.append(avg)
    results.append(row)

headers = ["Model"] + [f"{scenario.capitalize()} {metric.capitalize()}" for scenario in scenarios for metric in metrics]
print(tabulate(results, headers=headers, tablefmt="github", floatfmt=".4f"))

| Model     |   Train Precision |   Train Recall |   Train F0.5 |   Valid Precision |   Valid Recall |   Valid F0.5 |
|-----------|-------------------|----------------|--------------|-------------------|----------------|--------------|
| dummy     |            0.6243 |         1.0000 |       0.6750 |            0.6243 |         1.0000 |       0.6750 |
| lr        |            0.8024 |         0.6646 |       0.7704 |            0.7972 |         0.6609 |       0.7654 |
| lr_pca    |            0.7971 |         0.6739 |       0.7690 |            0.7973 |         0.6733 |       0.7688 |
| lr_l1     |            0.7986 |         0.6718 |       0.7695 |            0.8052 |         0.6733 |       0.7746 |
| lr_l1_pca |            0.8058 |         0.6641 |       0.7728 |            0.8108 |         0.6640 |       0.7761 |
| lr_l2     |            0.7996 |         0.6677 |       0.7692 |            0.8073 |         0.6687 |       0.7747 |
| lr_l2_pca |            0.8002 |         0.6677 |      

1) Regularization is stabilizing model generalization (smaller gaps between train and valid + across folds) <br>
2) PCA is improving learning <br>
3) Elastic Net not adapted to this scenario : over-fitting <br>
4) Recall curve is a bit less consistent across folds than precision : might be due to our focus on precision with beta = 0.5. <br>
5) Best model chosen according to validation Fbeta score and curves : PCA + L2

In [69]:
# Saving best configs

best_config = params["l2_pca"].copy() # choose manually
best_config.pop("scaler")
best_config.pop("pca")
best_config.pop("c_min")
best_config.pop("c_max")
best_config.pop("n_trials")
best_config.pop("model")

save_config_path = os.path.join("models","params","lr.json")
os.makedirs(os.path.dirname(save_config_path), exist_ok=True)
with open(save_config_path, 'w') as f:
    json.dump(best_config, f, indent=4)

print(f"Best configuration saved to: {save_config_path}")

Best configuration saved to: models/params/lr.json


In [70]:
# Fit all models and evaluate on test set
test_results = []

for model in [dummy, lr, lr_pca, lr_l1, lr_l1_pca, lr_l2, lr_l2_pca]:
    model.fit(X_train, y_train)
    test_metrics = model.evaluate(X_test, y_test)
    
    row = [model.name, test_metrics["precision"], test_metrics["recall"], test_metrics[f"F{beta}"]]
    test_results.append(row)

# Display results
test_headers = ["Model", "Precision", "Recall", f"F{beta}-Score"]
print("\nTest Set Performance:")
print(tabulate(test_results, headers=test_headers, tablefmt="github", floatfmt=".4f"))

Evaluating on test set
Evaluating on test set
Evaluating on test set
Evaluating on test set
Evaluating on test set
Evaluating on test set
Evaluating on test set

Test Set Performance:
| Model     |   Precision |   Recall |   F0.5-Score |
|-----------|-------------|----------|--------------|
| dummy     |      0.6240 |   1.0000 |       0.6748 |
| lr        |      0.8345 |   0.7205 |       0.8089 |
| lr_pca    |      0.8264 |   0.7391 |       0.8073 |
| lr_l1     |      0.8264 |   0.7391 |       0.8073 |
| lr_l1_pca |      0.8310 |   0.7329 |       0.8093 |
| lr_l2     |      0.8169 |   0.7205 |       0.7956 |
| lr_l2_pca |      0.8227 |   0.7205 |       0.8000 |


Good generalization property (higher and close to  metrics obtained with cross validation validation set) ! <br>
L2 + PCA almost gives best scores on test set (close margin) !