In [1]:
# Dependencies

# Data Manip
import pandas as pd

# Linear Algebra
import numpy as np

# Machine Learning
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import StratifiedKFold, train_test_split
from models.svc import SVEstimator
from models.dummy import DummyEstimator
from sklearn.decomposition import PCA

# Optimization
import optuna
from optuna.storages import RDBStorage
from functools import partial

# System & Files
import os
import json

# Visualization
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from tabulate import tabulate

# Single seed reused throughout the notebook (train/test split, CV splitter, PCA, Optuna sampler)
# so that results are reproducible across runs; also seeds NumPy's legacy global RNG.
seed = 42
np.random.seed(seed)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Data collection
data_path = "data/nba_filtered_capped.csv"

# Load the CSV, then keep only the columns whose name does not contain "capped".
df = pd.read_csv(data_path)
is_capped_column = df.columns.str.contains("capped")
df = df.loc[:, ~is_capped_column]

In [3]:
# Train / Test split
target = "TARGET_5Yrs"

# Features are every column except the target; both are converted to NumPy arrays.
X = df.drop(columns=[target]).values
y = df[target].values

test_size = 0.2
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    stratify=y,  # keep the class proportions identical in train and test
    shuffle=True,
    random_state=seed,
)

# Cross Validation Splitter
# With 20% held out for testing, 4 folds over the remaining 80% give validation
# folds of roughly the same size as the test set.
n_splits = 4
# Stratified folds keep the same class proportions across sets.
splitter = StratifiedKFold(
    n_splits=n_splits,
    shuffle=True,
    random_state=seed,
)

In [4]:
# Fine-tuning

beta = 0.5             # beta of the F-beta score used as metric (metric keys are named f"F{beta}")
n_components = 0.99    # value forwarded to PCA(n_components=...) for the "svc_pca" configuration
scaler_type = "robust"
n_trials = 200

# Settings shared by both search configurations.
# Search bounds for non-linear kernels (gamma_min/gamma_max, degree_min/degree_max,
# coef0_min/coef0_max) are intentionally not defined: only the linear kernel is tuned.
_shared_svc_args = {
    # best empirical kernel (performances + convergence. Ex polynomial kernel with
    # degree >= 3 does not converge (very long time per trial))
    "kernel": "linear",
    "C_min": 0.1,
    "C_max": 1,
    "class_weight": "balanced",
    "random_state": seed,
    "beta": beta,
    "n_trials": n_trials,
    "cv_folds": n_splits,
    "scaler_type": scaler_type,
}

# One entry per model: identical settings, differing only by the model name and
# by whether PCA is applied (pca_level=None disables it).
args = {
    model_name: {"model": model_name, **_shared_svc_args, "pca_level": pca_level}
    for model_name, pca_level in (("svc", None), ("svc_pca", n_components))
}


def objective(trial, args: dict, X_train: np.array, y_train: np.array):
    """Optuna objective: sample hyper-parameters for one trial and score them by cross-validation.

    Args:
        trial: Optuna trial used to sample the regularisation strength ``C``.
        args: Configuration of the model being tuned. Reads ``model``, ``kernel``,
            ``C_min``, ``C_max``, ``scaler_type``, ``pca_level`` and ``random_state``;
            every entry is also forwarded to the estimator.
        X_train: Training features passed through to ``cv_score``.
        y_train: Training labels passed through to ``cv_score``.

    Returns:
        The value returned by ``cv_score`` for the sampled parameters.

    Raises:
        NotImplementedError: If ``args["model"]`` is neither ``"svc"`` nor ``"svc_pca"``.
    """
    model_name = args["model"]
    if model_name not in ("svc", "svc_pca"):
        raise NotImplementedError(f"Objective function not developped for model {model_name}")

    # Start from a shallow copy so the shared configuration dict is left untouched.
    h_params = dict(args)
    # The kernel is fixed to the configured one (best empirical kernel) rather than sampled.
    h_params["kernel"] = args["kernel"]
    # C is sampled on a log scale between the configured bounds.
    h_params["C"] = trial.suggest_float("C", args["C_min"], args["C_max"], log=True)

    # NOTE(review): no "scaler" entry is added for any other scaler_type — confirm
    # that SVEstimator copes with the key being absent.
    if args["scaler_type"] == "robust":
        h_params["scaler"] = RobustScaler()

    pca_level = args["pca_level"]
    h_params["pca"] = (
        None
        if pca_level is None
        else PCA(n_components=pca_level, random_state=h_params["random_state"])
    )

    return cv_score(model_name, h_params, X_train, y_train)

def cv_score(model_name: str, h_params: dict, X_train: np.array, y_train: np.array):
    """Build the estimator for ``model_name`` and return its cross-validation score.

    Args:
        model_name: Either ``"svc"`` or ``"svc_pca"``.
        h_params: Hyper-parameters handed to ``SVEstimator``.
        X_train: Training features.
        y_train: Training labels.

    Returns:
        The value returned by ``SVEstimator.cross_validate``.

    Raises:
        NotImplementedError: For any other ``model_name``.
    """
    if model_name not in ("svc", "svc_pca"):
        raise NotImplementedError(f"Scoring function not developped for model {model_name}")
    return SVEstimator(h_params).cross_validate(X_train, y_train)


storage_url = "sqlite:///db.sqlite3"
storage = RDBStorage(url=storage_url)

# Run optimization for both SVC configurations
for config_name, config_args in args.items():
    study_name = f'{config_args["model"]}_{target}'

    # Start from a clean slate: drop any study of the same name left by a previous run.
    stored_study_names = {stored.study_name for stored in storage.get_all_studies()}
    if study_name in stored_study_names:
        optuna.delete_study(study_name=study_name, storage=storage_url)
        print(f"Deleted existing study: {study_name}")

    # Fresh study; the seeded TPE sampler makes the sequence of sampled values reproducible.
    study = optuna.create_study(
        study_name=study_name,
        storage=storage_url,
        sampler=optuna.samplers.TPESampler(seed=seed),
        direction="maximize",
    )

    # Bind the current configuration and the training data so Optuna only supplies `trial`.
    objective_partial = partial(objective, args=config_args, X_train=X_train, y_train=y_train)

    print(f"Optimizing {study_name}...")
    study.optimize(objective_partial, n_trials=config_args["n_trials"], n_jobs=1)
    print(f"Best value for {study_name}: {study.best_value}")

[I 2025-10-06 15:59:10,249] A new study created in RDB with name: svc_TARGET_5Yrs


Deleted existing study: svc_TARGET_5Yrs
Optimizing svc_TARGET_5Yrs...


[I 2025-10-06 15:59:10,454] Trial 0 finished with value: 0.7666021406217158 and parameters: {'C': 0.23688639503640782}. Best is trial 0 with value: 0.7666021406217158.
[I 2025-10-06 15:59:11,008] Trial 1 finished with value: 0.7670981183212886 and parameters: {'C': 0.8927180304353626}. Best is trial 1 with value: 0.7670981183212886.
[I 2025-10-06 15:59:11,383] Trial 2 finished with value: 0.7689834411491798 and parameters: {'C': 0.5395030966670228}. Best is trial 2 with value: 0.7689834411491798.
[I 2025-10-06 15:59:11,652] Trial 3 finished with value: 0.7689148415832968 and parameters: {'C': 0.3968793330444371}. Best is trial 2 with value: 0.7689834411491798.
[I 2025-10-06 15:59:11,810] Trial 4 finished with value: 0.7667558400136225 and parameters: {'C': 0.1432249371823025}. Best is trial 2 with value: 0.7689834411491798.
[I 2025-10-06 15:59:11,970] Trial 5 finished with value: 0.7667558400136225 and parameters: {'C': 0.14321698289111517}. Best is trial 2 with value: 0.76898344114917

Best value for svc_TARGET_5Yrs: 0.7720589954222524
Deleted existing study: svc_pca_TARGET_5Yrs
Optimizing svc_pca_TARGET_5Yrs...


[I 2025-10-06 16:00:30,959] Trial 1 finished with value: 0.7574664440409282 and parameters: {'C': 0.8927180304353626}. Best is trial 0 with value: 0.7590669562048207.
[I 2025-10-06 16:00:31,218] Trial 2 finished with value: 0.7584469379791685 and parameters: {'C': 0.5395030966670228}. Best is trial 0 with value: 0.7590669562048207.
[I 2025-10-06 16:00:31,435] Trial 3 finished with value: 0.7590669562048207 and parameters: {'C': 0.3968793330444371}. Best is trial 0 with value: 0.7590669562048207.
[I 2025-10-06 16:00:31,557] Trial 4 finished with value: 0.7584469379791685 and parameters: {'C': 0.1432249371823025}. Best is trial 0 with value: 0.7590669562048207.
[I 2025-10-06 16:00:31,683] Trial 5 finished with value: 0.7590669562048207 and parameters: {'C': 0.14321698289111517}. Best is trial 0 with value: 0.7590669562048207.
[I 2025-10-06 16:00:31,794] Trial 6 finished with value: 0.7590669562048207 and parameters: {'C': 0.1143098387631322}. Best is trial 0 with value: 0.759066956204820

Best value for svc_pca_TARGET_5Yrs: 0.7590669562048207


In [5]:
# Best models details
# Reload each study from the Optuna storage and collect its best hyper-parameters.
best_configs = {}

study_names = ["svc_TARGET_5Yrs", "svc_pca_TARGET_5Yrs"]

for study_name in study_names:
    # Study names are "<model>_TARGET_5Yrs": dropping the last two "_"-separated
    # tokens leaves the model name ("svc" or "svc_pca").
    name = "_".join(study_name.split("_")[:-2])
    print(name)
    study = optuna.load_study(study_name=study_name, storage=storage_url)
    best_configs[name] = study.best_params
    # Deliberately NOT named `cv_score`: that would rebind the module-level
    # cv_score() function to a float and break any later call to objective().
    best_cv_score = study.best_value
    print(f"Best CV score: {best_cv_score:.4f}")

svc
Best CV score: 0.7721
svc_pca
Best CV score: 0.7591


In [6]:
# Instantiate estimators and cross validate

# Baseline using the "most_frequent" strategy, scored with the same folds and beta.
dummy_params = {
    "strategy": "most_frequent",
    "random_state": seed,
    "cv_folds": n_splits,
    "beta": beta,
}
dummy = DummyEstimator(dummy_params)
dummy_score = dummy.cross_validate(X_train, y_train)

# SVC without PCA
# Work on a copy: updating best_configs['svc'] in place would pollute the stored
# Optuna best parameters with estimator objects and fixed settings.
svc_params = dict(best_configs['svc'])
svc_params.update({
    "kernel": args['svc']["kernel"],
    "class_weight": args['svc']["class_weight"],
    "random_state": seed,
    "cv_folds": n_splits,
    "beta": beta,
    "pca": None,
})
if args["svc"]["scaler_type"] == "robust":
    svc_params["scaler"] = RobustScaler()

svc = SVEstimator(svc_params)
svc_score = svc.cross_validate(X_train, y_train)

# SVC with PCA
svc_pca_params = dict(best_configs['svc_pca'])  # copy, same reason as above
svc_pca_params.update({
    "kernel": args['svc_pca']["kernel"],
    "class_weight": args['svc_pca']["class_weight"],
    "random_state": seed,
    "cv_folds": n_splits,
    "beta": beta,
})
if args["svc_pca"]["scaler_type"] == "robust":
    svc_pca_params["scaler"] = RobustScaler()

if args["svc_pca"]["pca_level"] is not None:
    svc_pca_params["pca"] = PCA(n_components=args["svc_pca"]["pca_level"], random_state=args["svc_pca"]["random_state"])
else:
    # Mirror objective(): always provide a "pca" entry, None meaning "no PCA".
    svc_pca_params["pca"] = None

svc_pca = SVEstimator(svc_pca_params)
svc_pca_score = svc_pca.cross_validate(X_train, y_train)

print(f"Dummy CV Score: {dummy_score:.4f}")
print(f"SVC CV Score: {svc_score:.4f}")
print(f"SVC+PCA CV Score: {svc_pca_score:.4f}")

# Create metrics visualization across folds
# Grid layout: one row per data split (train / validation), one column per metric.
fig = make_subplots(
    rows=2, cols=3,
    subplot_titles=('Precision', 'Recall', f'F{beta} Score'),
    row_titles=['Train', 'Validation'],
    vertical_spacing=0.1
)

models = {'Dummy': dummy, 'SVC': svc, 'SVC+PCA': svc_pca}
colors = {'Dummy': 'red', 'SVC': 'blue', 'SVC+PCA': 'green'}

fold_numbers = list(range(1, n_splits+1))

# (key in model.cv_scores, label used in the trace name, subplot row)
split_layout = [('train', 'Train', 1), ('valid', 'Valid', 2)]
# (key in model.cv_scores[split], label used in the trace name, subplot column)
metric_layout = [
    ('precision', 'Precision', 1),
    ('recall', 'Recall', 2),
    (f'F{beta}', f'F{beta}', 3),
]

for model_name, model in models.items():
    color = colors[model_name]

    for split_key, split_label, row in split_layout:
        for metric_key, metric_label, col in metric_layout:
            # Only the first trace of each model (train precision) carries the legend
            # entry; the others share its legendgroup so they toggle together.
            is_legend_trace = (row == 1 and col == 1)
            trace_name = (
                f'{model_name}'
                if is_legend_trace
                else f'{model_name} {split_label} {metric_label}'
            )
            fig.add_trace(
                go.Scatter(x=fold_numbers,
                          y=model.cv_scores[split_key][metric_key],
                          mode='lines+markers',
                          name=trace_name,
                          line=dict(color=color),
                          legendgroup=model_name,
                          showlegend=is_legend_trace),
                row=row, col=col
            )

fig.update_xaxes(title_text="Fold")
fig.update_yaxes(title_text="Score")
fig.update_layout(height=600, width=1500, title="Cross-Validation Metrics Across Folds")
fig.show()

Dummy CV Score: 0.6750
SVC CV Score: 0.7721
SVC+PCA CV Score: 0.7591


0) Small difference in metrics compared to the Optuna experiment.
1) SVC models outperform the dummy classifier!
2) No over-fitting: metrics are consistent across folds, with small gaps between train and validation scores.
3) PCA helps the model focus on recall but not on precision: not really useful in our setting (beta < 1, i.e. precision matters more than recall).

In [7]:
# Saving best configs
# Persist the tuned SVC (without PCA) configuration as JSON. The scaler / PCA
# entries are replaced by the plain settings they were built from
# ("scaler_type", "pca_level") before dumping.
best_config = svc_params.copy()
# "scaler" is only present when scaler_type == "robust"; a default avoids a
# KeyError for any other scaler_type.
best_config.pop("scaler", None)
best_config["scaler_type"] = args["svc"]["scaler_type"]
best_config.pop("pca", None)
best_config["pca_level"] = args["svc"]["pca_level"]

save_config_path = os.path.join("models","params","svc.json")
os.makedirs(os.path.dirname(save_config_path), exist_ok=True)
with open(save_config_path, 'w') as f:
    json.dump(best_config, f, indent=4)

print(f"Best configuration saved to: {save_config_path}")


Best configuration saved to: models/params/svc.json


In [8]:
# Test results

# Refit every estimator on the full training set before scoring the held-out test set.
for estimator in (dummy, svc, svc_pca):
    estimator.fit(X_train, y_train)

dummy_test_scores = dummy.evaluate(X_test, y_test)
svc_test_scores = svc.evaluate(X_test, y_test)
svc_pca_test_scores = svc_pca.evaluate(X_test, y_test)

# Display test results in a table: header row first, then one row per model
# with every metric formatted to four decimals.
metric_keys = ('precision', 'recall', f'F{beta}')
scores_by_model = [
    ('Dummy', dummy_test_scores),
    ('SVC', svc_test_scores),
    ('SVC+PCA', svc_pca_test_scores),
]
results_data = [['Model', 'Precision', 'Recall', f'F{beta} Score']]
results_data.extend(
    [label] + [f"{scores[key]:.4f}" for key in metric_keys]
    for label, scores in scores_by_model
)

print("Test Set Results:")
print(tabulate(results_data, headers='firstrow', tablefmt='github'))

Test Set Results:
| Model   |   Precision |   Recall |   F0.5 Score |
|---------|-------------|----------|--------------|
| Dummy   |      0.624  |   1      |       0.6748 |
| SVC     |      0.8369 |   0.7329 |       0.8138 |
| SVC+PCA |      0.8429 |   0.7329 |       0.8183 |


Good generalization properties:
1) Models still outperform the dummy classifier.
2) Scores are a bit higher than during training, i.e. than the cross-validation scores (especially for SVC+PCA), which might indicate some fold dependency.