In [111]:
# Dependencies

# Data Manip
import pandas as pd

# Linear Algebra
import numpy as np

# Machine Learning
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from models.lr import LREstimator
from models.dummy import DummyEstimator
from sklearn.decomposition import PCA

# Optimization
import optuna
from optuna.storages import RDBStorage
from functools import partial

# Visualization
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from tabulate import tabulate

# System & Files
import os
import json

import random

# Single seed shared by every stochastic component of the notebook.
seed = 42

# Seed both the stdlib and the legacy NumPy global generators.
random.seed(seed)
np.random.seed(seed)

In [112]:
# Data collection
# Load the CSV and keep every column except those whose name contains
# "capped" and the "Name" column.
data_path = "data/nba_filtered_capped.csv"
df = pd.read_csv(data_path).loc[
    :,
    lambda frame: ~frame.columns.str.contains("capped") & (frame.columns != "Name"),
]

In [113]:
# Train / Test split
target = "TARGET_5Yrs"
test_size = 0.2

# Features are every remaining column except the target.
y = df[target].values
X = df.drop(columns=[target]).values

# Stratified, shuffled and seeded hold-out split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=seed,
    shuffle=True,
    stratify=y,
)

# Cross Validation Splitter
# Chosen so that a validation fold holds about as many samples as the test set.
n_splits = 4

In [114]:
# Model
# Experiment-wide settings shared by every configuration below.
max_iter = 10000
n_components = 0.99
cv_folds = 4
beta = 0.5  # precision twice as important as recall
n_trials = 100
cap_factor = 1.5
scaler_type = "robust"
pca_level = 0.99


def _plain_lr_config(model_name, pca):
    """Build the config of a non-regularised logistic regression.

    `pca` is stored under "pca_level" (None means no PCA). Key order matters:
    it is the order in which a config would be serialised to JSON.
    """
    return {
        "model": model_name,
        "penalty": None,
        "random_state": seed,
        "cv_folds": cv_folds,
        "class_weight": "balanced",
        "n_jobs": -1,
        "max_iter": max_iter,
        "scaler_type": scaler_type,
        "pca_level": pca,
        "beta": beta,
        "cap_factor": cap_factor,
    }


def _penalized_lr_config(model_name, penalty, pca, c_min):
    """Build the config of a regularised logistic regression tuned over C.

    "c_min" / "c_max" bound the search range used for C and "n_trials" is the
    number of optimisation trials. Key order is kept stable on purpose.
    """
    return {
        "model": model_name,
        "penalty": penalty,
        "random_state": seed,
        "cv_folds": cv_folds,
        "solver": "liblinear",
        "class_weight": "balanced",
        "max_iter": max_iter,
        "scaler_type": scaler_type,
        "pca_level": pca,
        "beta": beta,
        "c_min": c_min,
        "c_max": 1,
        "n_trials": n_trials,
        "cap_factor": cap_factor,
    }


# One entry per candidate model; the "model" value names the estimator / study.
params = {
    "dummy": {
        "model": "dummy",
        "cv_folds": cv_folds,
        "random_state": seed,
        "strategy": "most_frequent",
        "beta": beta,
    },
    "base": _plain_lr_config("lr", None),
    "l1": _penalized_lr_config("lr_l1", "l1", None, 1e-1),
    "l2": _penalized_lr_config("lr_l2", "l2", None, 1e-2),
    "base_pca": _plain_lr_config("lr_pca", pca_level),
    "l1_pca": _penalized_lr_config("lr_l1_pca", "l1", pca_level, 1e-2),
    "l2_pca": _penalized_lr_config("lr_l2_pca", "l2", pca_level, 1e-2),
}

In [115]:
# HP fine-tuning
def objective(trial, args: dict, X_train: np.array, y_train: np.array):
    """Optuna objective: sample ``C`` for one trial and return its CV score.

    Args:
        trial: Optuna trial used to sample ``C`` log-uniformly in
            ``[args["c_min"], args["c_max"]]``.
        args: Model configuration; it is copied, never modified.
        X_train: Training features forwarded to ``cv_score``.
        y_train: Training labels forwarded to ``cv_score``.

    Returns:
        Whatever ``cv_score`` returns for the sampled configuration.

    Raises:
        NotImplementedError: If ``args["model"]`` does not contain ``"lr"``.
    """
    model_name = args["model"]
    if "lr" not in model_name:
        raise NotImplementedError(f"Objective function not developped for model {model_name}")

    trial_params = args.copy()
    trial_params["C"] = trial.suggest_float("C", args["c_min"], args["c_max"], log=True)

    # New, unfitted transformer instances for every trial.
    if args["scaler_type"] == "robust":
        trial_params["scaler"] = RobustScaler()
    level = args["pca_level"]
    trial_params["pca"] = (
        None if level is None else PCA(n_components=level, random_state=args["random_state"])
    )

    return cv_score(model_name, trial_params, X_train, y_train)

def cv_score(model_name: str, h_params: dict, X_train: np.array, y_train: np.array):
    """Cross-validate the model described by ``h_params`` and return its score.

    Args:
        model_name: Model identifier; only names containing ``"lr"`` are handled.
        h_params: Configuration passed as-is to ``LREstimator``.
        X_train: Training features.
        y_train: Training labels.

    Returns:
        The value returned by ``LREstimator.cross_validate``.

    Raises:
        NotImplementedError: If ``model_name`` does not contain ``"lr"``.
    """
    if "lr" not in model_name:
        raise NotImplementedError(f"Scoring function not developped for model {model_name}")

    estimator = LREstimator(h_params)
    return estimator.cross_validate(X_train, y_train)


# Optuna studies are persisted in a local SQLite database.
storage_url = "sqlite:///db.sqlite3"
storage = RDBStorage(url=storage_url)

for config_name, config_args in params.items():
    study_name = f'{config_args["model"]}_{target}'

    # Only regularised models define a search range for C.
    if "c_min" not in config_args:
        continue

    # Start from a clean slate: drop any study left over from a previous run.
    existing_names = {stored.study_name for stored in storage.get_all_studies()}
    if study_name in existing_names:
        optuna.delete_study(study_name=study_name, storage=storage_url)
        print(f"Deleted existing study: {study_name}")

    study = optuna.create_study(
        storage=storage_url,
        study_name=study_name,
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=seed),
    )

    # Bind the configuration and data so Optuna only has to pass the trial.
    objective_partial = partial(objective, args=config_args, X_train=X_train, y_train=y_train)

    print(f"Optimizing {study_name}...")
    # n_jobs = 1 to ensure reproducibility
    study.optimize(objective_partial, n_trials=config_args["n_trials"], n_jobs=1)
    print(f"Best value for {study_name}: {study.best_value}")

[I 2025-10-06 10:46:57,631] A new study created in RDB with name: lr_l1_TARGET_5Yrs
[I 2025-10-06 10:46:57,684] Trial 0 finished with value: 0.7715905921025441 and parameters: {'C': 0.23688639503640782}. Best is trial 0 with value: 0.7715905921025441.
[I 2025-10-06 10:46:57,735] Trial 1 finished with value: 0.7648870014074804 and parameters: {'C': 0.8927180304353626}. Best is trial 0 with value: 0.7715905921025441.
[I 2025-10-06 10:46:57,779] Trial 2 finished with value: 0.7696336299926714 and parameters: {'C': 0.5395030966670228}. Best is trial 0 with value: 0.7715905921025441.


Deleted existing study: lr_l1_TARGET_5Yrs
Optimizing lr_l1_TARGET_5Yrs...


[I 2025-10-06 10:46:57,819] Trial 3 finished with value: 0.7738985474011544 and parameters: {'C': 0.3968793330444371}. Best is trial 3 with value: 0.7738985474011544.
[I 2025-10-06 10:46:57,858] Trial 4 finished with value: 0.7666146584550485 and parameters: {'C': 0.1432249371823025}. Best is trial 3 with value: 0.7738985474011544.
[I 2025-10-06 10:46:57,896] Trial 5 finished with value: 0.7666146584550485 and parameters: {'C': 0.14321698289111517}. Best is trial 3 with value: 0.7738985474011544.
[I 2025-10-06 10:46:57,935] Trial 6 finished with value: 0.7635313650659294 and parameters: {'C': 0.1143098387631322}. Best is trial 3 with value: 0.7738985474011544.
[I 2025-10-06 10:46:57,975] Trial 7 finished with value: 0.768538971895039 and parameters: {'C': 0.7348118405270447}. Best is trial 3 with value: 0.7738985474011544.
[I 2025-10-06 10:46:58,014] Trial 8 finished with value: 0.7738985474011544 and parameters: {'C': 0.3991305878561679}. Best is trial 3 with value: 0.7738985474011544

Best value for lr_l1_TARGET_5Yrs: 0.7745586985530044
Deleted existing study: lr_l2_TARGET_5Yrs
Optimizing lr_l2_TARGET_5Yrs...


[I 2025-10-06 10:47:01,749] Trial 3 finished with value: 0.7697500417584893 and parameters: {'C': 0.15751320499779725}. Best is trial 0 with value: 0.7740646529113744.
[I 2025-10-06 10:47:01,784] Trial 4 finished with value: 0.7641177664773184 and parameters: {'C': 0.020513382630874502}. Best is trial 0 with value: 0.7740646529113744.
[I 2025-10-06 10:47:01,822] Trial 5 finished with value: 0.7641177664773184 and parameters: {'C': 0.020511104188433976}. Best is trial 0 with value: 0.7740646529113744.
[I 2025-10-06 10:47:01,862] Trial 6 finished with value: 0.7650543114236775 and parameters: {'C': 0.01306673923805328}. Best is trial 0 with value: 0.7740646529113744.
[I 2025-10-06 10:47:01,904] Trial 7 finished with value: 0.7627677918102422 and parameters: {'C': 0.5399484409787431}. Best is trial 0 with value: 0.7740646529113744.
[I 2025-10-06 10:47:01,940] Trial 8 finished with value: 0.7697500417584893 and parameters: {'C': 0.15930522616241014}. Best is trial 0 with value: 0.774064652

Best value for lr_l2_TARGET_5Yrs: 0.7760597713481889
Deleted existing study: lr_l1_pca_TARGET_5Yrs
Optimizing lr_l1_pca_TARGET_5Yrs...


[I 2025-10-06 10:47:05,464] Trial 3 finished with value: 0.7727019797545804 and parameters: {'C': 0.15751320499779725}. Best is trial 1 with value: 0.7748633001870691.
[I 2025-10-06 10:47:05,499] Trial 4 finished with value: 0.748206590399019 and parameters: {'C': 0.020513382630874502}. Best is trial 1 with value: 0.7748633001870691.
[I 2025-10-06 10:47:05,535] Trial 5 finished with value: 0.748206590399019 and parameters: {'C': 0.020511104188433976}. Best is trial 1 with value: 0.7748633001870691.
[I 2025-10-06 10:47:05,571] Trial 6 finished with value: 0.737289624135473 and parameters: {'C': 0.01306673923805328}. Best is trial 1 with value: 0.7748633001870691.
[I 2025-10-06 10:47:05,606] Trial 7 finished with value: 0.7729005582633969 and parameters: {'C': 0.5399484409787431}. Best is trial 1 with value: 0.7748633001870691.
[I 2025-10-06 10:47:05,643] Trial 8 finished with value: 0.7727019797545804 and parameters: {'C': 0.15930522616241014}. Best is trial 1 with value: 0.774863300187

Best value for lr_l1_pca_TARGET_5Yrs: 0.776329430708636
Deleted existing study: lr_l2_pca_TARGET_5Yrs
Optimizing lr_l2_pca_TARGET_5Yrs...


[I 2025-10-06 10:47:09,192] Trial 3 finished with value: 0.7726461229563969 and parameters: {'C': 0.15751320499779725}. Best is trial 3 with value: 0.7726461229563969.
[I 2025-10-06 10:47:09,230] Trial 4 finished with value: 0.7665003635515295 and parameters: {'C': 0.020513382630874502}. Best is trial 3 with value: 0.7726461229563969.
[I 2025-10-06 10:47:09,266] Trial 5 finished with value: 0.7665003635515295 and parameters: {'C': 0.020511104188433976}. Best is trial 3 with value: 0.7726461229563969.
[I 2025-10-06 10:47:09,302] Trial 6 finished with value: 0.7638839705916523 and parameters: {'C': 0.01306673923805328}. Best is trial 3 with value: 0.7726461229563969.
[I 2025-10-06 10:47:09,339] Trial 7 finished with value: 0.769041640765557 and parameters: {'C': 0.5399484409787431}. Best is trial 3 with value: 0.7726461229563969.
[I 2025-10-06 10:47:09,376] Trial 8 finished with value: 0.7726461229563969 and parameters: {'C': 0.15930522616241014}. Best is trial 3 with value: 0.7726461229

Best value for lr_l2_pca_TARGET_5Yrs: 0.7745755292550132


In [116]:
# Training Pipeline

def _with_fresh_transformers(config: dict) -> dict:
    """Return a shallow copy of `config` with new, unfitted scaler / PCA objects.

    - "scaler" is set to a new RobustScaler only when "scaler_type" is "robust".
    - "pca" is a new PCA built from the config's own "pca_level", or None when
      the config has no "pca_level" (or it is None).

    The input dict is left untouched.
    """
    fresh = config.copy()
    if fresh.get("scaler_type") == "robust":
        fresh["scaler"] = RobustScaler()
    level = fresh.get("pca_level")
    # Use the level stored in the config itself rather than an unrelated global,
    # so changing "pca_level" in `params` is always honoured.
    fresh["pca"] = PCA(n_components=level, random_state=seed) if level is not None else None
    return fresh


# Regularised models: inject the best C found by the matching Optuna study.
for key in ["l1", "l2", "l1_pca", "l2_pca"]:
    study_name = params[key]["model"] + "_" + target
    study = optuna.load_study(study_name=study_name, storage=storage_url)
    best_C = study.best_params["C"]
    params[key] = _with_fresh_transformers({**params[key], "C": best_C})

# Non-optimised models only need fresh transformer instances.
for key in ["dummy", "base", "base_pca"]:
    params[key] = _with_fresh_transformers(params[key])

dummy = DummyEstimator(params["dummy"])
lr = LREstimator(params["base"])
lr_pca = LREstimator(params["base_pca"])
lr_l1 = LREstimator(params["l1"])
lr_l1_pca = LREstimator(params["l1_pca"])
lr_l2 = LREstimator(params["l2"])
lr_l2_pca = LREstimator(params["l2_pca"])

In [117]:
# Cross-validate every candidate on the training split; the per-fold scores
# are read back from each estimator's `cv_scores` attribute further down.
candidates = (dummy, lr, lr_pca, lr_l1, lr_l1_pca, lr_l2, lr_l2_pca)
for model in candidates:
    model.cross_validate(X_train, y_train)

In [118]:
# 2 x 3 grid: one row per scenario (train / valid), one column per metric.
fig = make_subplots(
    rows=2, cols=3,
    subplot_titles=[
        "Train Precision", "Train Recall", f"Train F{beta}",
        "Valid Precision", "Valid Recall", f"Valid F{beta}"
    ]
)

# (estimator key, legend label) pairs, in plotting order.
model_names = [
    ("dummy", "Dummy"),
    ("lr", "Base"),
    ("lr_l1", "L1"),
    ("lr_l2", "L2"),
    ("lr_pca", "Base (PCA)"),
    ("lr_l1_pca", "L1 (PCA)"),
    ("lr_l2_pca", "L2 (PCA)")
]

# Explicit key -> estimator mapping: avoids looking variables up by name in
# globals(), which breaks silently when a variable is renamed.
estimators_by_key = {
    "dummy": dummy,
    "lr": lr,
    "lr_l1": lr_l1,
    "lr_l2": lr_l2,
    "lr_pca": lr_pca,
    "lr_l1_pca": lr_l1_pca,
    "lr_l2_pca": lr_l2_pca,
}

# One color per model family; the PCA variant shares its family's color and is
# told apart by a dashed line.
model_colors = {
    "Dummy": "red",
    "Base": "blue",
    "L1": "orange",
    "L2": "green",
    "Base (PCA)": "blue",
    "L1 (PCA)": "orange",
    "L2 (PCA)": "green"
}

metrics = ["precision", "recall", f"F{beta}"]
scenarios = ["train", "valid"]

for row, scenario in enumerate(scenarios, start=1):
    for col, metric in enumerate(metrics, start=1):
        for model_var, model_label in model_names:
            model = estimators_by_key[model_var]
            y_vals = model.cv_scores[scenario][metric]
            is_pca = "PCA" in model_label
            fig.add_trace(
                go.Scatter(
                    y=y_vals,
                    mode="lines+markers",
                    name=model_label,
                    legendgroup=model_label,
                    line=dict(
                        dash="dash" if is_pca else "solid",
                        color=model_colors[model_label]
                    ),
                    # Show each model once in the legend (first subplot only).
                    showlegend=(row == 1 and col == 1)
                ),
                row=row, col=col
            )

fig.update_layout(
    height=700, width=1500,
    title_text="Cross-Validation Metrics for All Models",
    legend_title_text="Model"
)
fig.show()

1) The models generalize very well (metrics across train / valid folds are close). <br>
2) All LR models outperform dummy classifier. <br>
3) PCA seems to improve both precision and recall (especially during validation): denoising the data helps the model focus on relevant patterns! <br>
4) The best model seems to be L1 + PCA: it improves the learning process in the sense of the given metric (focusing more on improving precision than recall compared to other models), has the best score on the validation set, and has stable metrics across folds.

In [119]:
# Metrics comparison: mean of each cross-validation metric per model,
# for the train folds and the validation folds.
compared_models = [dummy, lr, lr_pca, lr_l1, lr_l1_pca, lr_l2, lr_l2_pca]
score_columns = [(scenario, metric) for scenario in scenarios for metric in metrics]

results = [
    [candidate.name]
    + [np.mean(candidate.cv_scores[scenario][metric]) for scenario, metric in score_columns]
    for candidate in compared_models
]

headers = ["Model"] + [
    f"{scenario.capitalize()} {metric.capitalize()}" for scenario, metric in score_columns
]
print(tabulate(results, headers=headers, tablefmt="github", floatfmt=".4f"))

| Model     |   Train Precision |   Train Recall |   Train F0.5 |   Valid Precision |   Valid Recall |   Valid F0.5 |
|-----------|-------------------|----------------|--------------|-------------------|----------------|--------------|
| dummy     |            0.6243 |         1.0000 |       0.6750 |            0.6243 |         1.0000 |       0.6750 |
| lr        |            0.8035 |         0.6651 |       0.7714 |            0.7964 |         0.6578 |       0.7640 |
| lr_pca    |            0.7970 |         0.6734 |       0.7688 |            0.7956 |         0.6718 |       0.7671 |
| lr_l1     |            0.7986 |         0.6718 |       0.7695 |            0.8052 |         0.6733 |       0.7746 |
| lr_l1_pca |            0.8045 |         0.6646 |       0.7719 |            0.8098 |         0.6671 |       0.7763 |
| lr_l2     |            0.7999 |         0.6687 |       0.7697 |            0.8080 |         0.6718 |       0.7761 |
| lr_l2_pca |            0.8016 |         0.6677 |      

In [120]:
# Saving best configs
# Keys that must not be persisted: search-space settings, the model label and
# the live transformer objects (not JSON-serialisable).
excluded_keys = ("model", "c_min", "c_max", "n_trials", "scaler", "pca")

# Build a filtered copy instead of popping from params["l1_pca"]: the original
# dict may still be referenced by the estimator built from it, and popping in
# place makes this cell fail with KeyError when it is run a second time.
lr_config = {
    name: value
    for name, value in params["l1_pca"].items()
    if name not in excluded_keys
}

save_config_path = os.path.join("models", "params", "lr.json")
os.makedirs(os.path.dirname(save_config_path), exist_ok=True)
with open(save_config_path, "w", encoding="utf-8") as f:
    json.dump(lr_config, f, indent=4)

print(f"Best configuration saved to: {save_config_path}")

Best configuration saved to: models/params/lr.json


In [121]:
# Get coefficients from the L1+PCA model
# NOTE(review): at this point the estimator has only gone through
# cross_validate(); which fit these coefficients come from depends on
# LREstimator's implementation. Confirm, or call fit() before this cell.
coefs = lr_l1_pca.classifier.coef_[0]

# Bar height is the coefficient magnitude; the text label on each bar keeps the
# signed value so the direction of the effect is still readable.
fig = go.Figure(data=go.Bar(
    x=list(range(len(coefs))),
    y=np.abs(coefs),
    text=[f'{coef:.3f}' for coef in coefs],
    textposition='auto'
))

fig.update_layout(
    title="L1 + PCA Model Coefficients",
    xaxis_title="Principal Component Index",
    # The bars show |coef|, so the axis title says so explicitly.
    yaxis_title="Absolute Coefficient Value",
    height=500,
    width=800
)

fig.show()

L1 + PCA semble être un modèle très intéressant pour notre approche : il permet d'affiner encore plus les résultats de recherche en ne sélectionnant que les axes utiles. <br> En particulier, toutes les variables d'origine (filtrées) ont une contribution significative à ce sous-ensemble de composantes principales. <br>
Un point intéressant pourrait être de ne pas sélectionner les axes dont le coefficient associé est en dessous d'un certain seuil. Le modèle présenté ici est cependant suffisamment satisfaisant pour décider de le mettre en production.

In [122]:
# Refit every candidate on the full training split, then score it on the
# held-out test set.
fbeta_key = f"F{beta}"
reported_keys = ("precision", "recall", fbeta_key)

test_results = []
for model in (dummy, lr, lr_pca, lr_l1, lr_l1_pca, lr_l2, lr_l2_pca):
    model.fit(X_train, y_train)
    test_metrics = model.evaluate(X_test, y_test)
    test_results.append([model.name, *(test_metrics[key] for key in reported_keys)])

# Display results
test_headers = ["Model", "Precision", "Recall", f"{fbeta_key}-Score"]
print("\nTest Set Performance:")
print(tabulate(test_results, headers=test_headers, tablefmt="github", floatfmt=".4f"))


Test Set Performance:
| Model     |   Precision |   Recall |   F0.5-Score |
|-----------|-------------|----------|--------------|
| dummy     |      0.6240 |   1.0000 |       0.6748 |
| lr        |      0.8345 |   0.7205 |       0.8089 |
| lr_pca    |      0.8264 |   0.7391 |       0.8073 |
| lr_l1     |      0.8264 |   0.7391 |       0.8073 |
| lr_l1_pca |      0.8322 |   0.7391 |       0.8117 |
| lr_l2     |      0.8169 |   0.7205 |       0.7956 |
| lr_l2_pca |      0.8227 |   0.7205 |       0.8000 |


Good generalization: the test metrics are very close to those obtained on the cross-validation validation folds, and even slightly higher! <br>
L1 + PCA gives best scores (very little margin) on test set !