In [1]:
# Dependencies

# Data Manip
import pandas as pd

# Linear Algebra
import numpy as np

# Optimization
import optuna
from optuna.storages import RDBStorage
from functools import partial

# Machine Learning
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import confusion_matrix, precision_score, recall_score, fbeta_score, make_scorer
from models.rf import RFEstimator
from models.dummy import DummyEstimator


# Visualization
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from tabulate import tabulate

# System & Files
import os
import json

# Single seed shared by the notebook: it seeds NumPy's global RNG here and is
# reused below for the train/test split, the Optuna sampler and the estimators.
seed = 42
np.random.seed(seed)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Data collection
# Original note: random forest is robust to outliers
# (presumably why this file is used without further outlier handling -- TODO confirm).
data_path = "data/nba_filtered.csv"
df = pd.read_csv(filepath_or_buffer=data_path)

In [3]:
# Train / Test split
target = "TARGET_5Yrs"

# Separate the label column from the feature matrix (both as NumPy arrays).
y = df[target].to_numpy()
X = df.drop(columns=[target]).to_numpy()

# Stratified, shuffled hold-out split driven by the notebook-wide seed.
test_size = 0.2
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    stratify=y,
    shuffle=True,
    random_state=seed,
)

# Cross Validation Splitter
# With 20% held out for testing, 4 folds over the remaining 80% give validation
# folds of roughly the same size as the test set.
cv_folds = 4

There are many hyperparameters to fine-tune, so we use a specialized library: Optuna. <br>
Optuna performs Bayesian optimization: it progressively narrows the search towards the regions of the hyperparameter space where the scoring function is highest. <br>
Trial logs are stored in the SQLite database "db.sqlite3" for more details.

In [4]:
# Hyperparameter Tuning
beta = 0.5
model_name = "random_forest"

# Everything the objective needs: fixed settings plus the [min, max] bounds of
# each hyperparameter explored by Optuna.
args = {
    # --- fixed settings ---
    "model": model_name,
    "cv_folds": cv_folds,
    "class_weight": "balanced_subsample",
    # --- search space: number of trees ---
    "min_est": 5,
    "max_est": 50,
    # --- search space: tree depth ---
    "max_depth_min": 5,
    "max_depth_max": 50,
    # --- search space: split / leaf sizes ---
    "min_samples_split_min": 2,
    "min_samples_split_max": 30,
    "min_samples_leaf_min": 1,
    "min_samples_leaf_max": 20,
    # --- search space: regularisation ---
    "min_impurity_decrease_min": 1e-5,
    "min_impurity_decrease_max": 1e-3,
    "ccp_alpha_min": 1e-3,
    "ccp_alpha_max": 1e-2,
    # --- study settings ---
    "n_trials": 200,
    "beta": beta,
    "random_state": seed,
}

def objective(trial, args: dict, X_train: np.ndarray, y_train: np.ndarray):
    """Optuna objective: sample one hyperparameter set and return its CV score.

    Args:
        trial: Optuna trial used to draw hyperparameter values
            (via ``suggest_int`` / ``suggest_float``).
        args: Settings dict. Reads ``"model"``, the ``*_min`` / ``*_max`` (and
            ``min_est`` / ``max_est``) search bounds, and the fixed settings
            ``"beta"``, ``"cv_folds"``, ``"random_state"`` and ``"class_weight"``.
        X_train: Training features, forwarded unchanged to ``cv_score``.
        y_train: Training labels, forwarded unchanged to ``cv_score``.

    Returns:
        Whatever ``cv_score`` returns for the sampled configuration.

    Raises:
        NotImplementedError: If ``args["model"]`` is not ``"random_forest"``.
    """
    model_name = args["model"]
    if model_name != "random_forest":
        raise NotImplementedError(f"Objective function not developped for model {model_name}")

    # The suggest_* calls are evaluated top to bottom, in the same order as before.
    # NOTE(review): changing this order may change the values drawn by a seeded
    # sampler -- keep it stable.
    h_params = {
        "n_estimators": trial.suggest_int("n_estimators", args["min_est"], args["max_est"]),
        "min_samples_split": trial.suggest_int(
            "min_samples_split", args["min_samples_split_min"], args["min_samples_split_max"]
        ),
        # Bounds span two orders of magnitude, hence the log-scale sampling.
        "min_impurity_decrease": trial.suggest_float(
            "min_impurity_decrease",
            args["min_impurity_decrease_min"],
            args["min_impurity_decrease_max"],
            log=True,
        ),
        "min_samples_leaf": trial.suggest_int(
            "min_samples_leaf", args["min_samples_leaf_min"], args["min_samples_leaf_max"]
        ),
        "max_depth": trial.suggest_int("max_depth", args["max_depth_min"], args["max_depth_max"]),
        "ccp_alpha": trial.suggest_float("ccp_alpha", args["ccp_alpha_min"], args["ccp_alpha_max"]),
        # Fixed (non-tuned) settings passed through to the estimator.
        "beta": args["beta"],
        "cv_folds": args["cv_folds"],
        "random_state": args["random_state"],
        "class_weight": args["class_weight"],
    }

    return cv_score(model_name, h_params, X_train, y_train)

def cv_score(model_name: str, h_params: dict, X_train: np.ndarray, y_train: np.ndarray):
    """Cross-validate the model identified by ``model_name`` with ``h_params``.

    Args:
        model_name: Model identifier; only ``"random_forest"`` is supported.
        h_params: Configuration dict handed as-is to ``RFEstimator``.
        X_train: Training features passed to ``cross_validate``.
        y_train: Training labels passed to ``cross_validate``.

    Returns:
        The value returned by ``RFEstimator.cross_validate``.

    Raises:
        NotImplementedError: If ``model_name`` is not ``"random_forest"``.
    """
    if model_name != "random_forest":
        raise NotImplementedError(f"Scoring function not developped for model {model_name}")

    model = RFEstimator(h_params)
    return model.cross_validate(X_train, y_train)

# Trials are persisted in a local SQLite database through Optuna's RDB storage.
storage_url = "sqlite:///db.sqlite3"
study_name = f"{model_name}_{target}"
storage = RDBStorage(url=storage_url)
studies = storage.get_all_studies()

# Start from a clean slate: drop any earlier study stored under the same name.
existing_study_names = {stored.study_name for stored in studies}
if study_name in existing_study_names:
    optuna.delete_study(study_name=study_name, storage=storage_url)
    print(f"Deleted existing study: {study_name}")

# Seeded TPE sampler; the study maximizes the cross-validation score.
tpe_sampler = optuna.samplers.TPESampler(seed=seed)
study = optuna.create_study(
    storage=storage_url,
    study_name=study_name,
    direction="maximize",
    sampler=tpe_sampler,
)

# Bind everything except `trial`, which Optuna supplies on each call.
objective_partial = partial(objective, args=args, X_train=X_train, y_train=y_train)

study.optimize(objective_partial, n_trials=args["n_trials"], n_jobs=1)

[I 2025-10-06 16:01:31,334] A new study created in RDB with name: random_forest_TARGET_5Yrs
[I 2025-10-06 16:01:31,468] Trial 0 finished with value: 0.7560858802874928 and parameters: {'n_estimators': 22, 'min_samples_split': 29, 'min_impurity_decrease': 0.000291063591313307, 'min_samples_leaf': 12, 'max_depth': 12, 'ccp_alpha': 0.002403950683025824}. Best is trial 0 with value: 0.7560858802874928.


Deleted existing study: random_forest_TARGET_5Yrs


[I 2025-10-06 16:01:31,530] Trial 1 finished with value: 0.7389790395061033 and parameters: {'n_estimators': 7, 'min_samples_split': 27, 'min_impurity_decrease': 0.00015930522616241006, 'min_samples_leaf': 15, 'max_depth': 5, 'ccp_alpha': 0.00972918866945795}. Best is trial 0 with value: 0.7560858802874928.
[I 2025-10-06 16:01:31,737] Trial 2 finished with value: 0.7638066152895209 and parameters: {'n_estimators': 43, 'min_samples_split': 8, 'min_impurity_decrease': 2.3102018878452926e-05, 'min_samples_leaf': 4, 'max_depth': 18, 'ccp_alpha': 0.005722807884690141}. Best is trial 2 with value: 0.7638066152895209.
[I 2025-10-06 16:01:31,868] Trial 3 finished with value: 0.7485910764751367 and parameters: {'n_estimators': 24, 'min_samples_split': 10, 'min_impurity_decrease': 0.00016738085788752134, 'min_samples_leaf': 3, 'max_depth': 18, 'ccp_alpha': 0.004297256589643226}. Best is trial 2 with value: 0.7638066152895209.
[I 2025-10-06 16:01:31,998] Trial 4 finished with value: 0.75239636574

The hyperparameter search intervals were selected and adjusted manually by looking at the Optuna logs.

In [5]:
# Best model details
# Use Optuna's own accessor rather than sorting the trials by hand: the previous
# sort key mapped trials without a value (e.g. failed or interrupted ones) to +inf,
# so in this maximization study such a trial could be picked as the "best" one.
best_trial = study.best_trial
best_params = best_trial.params
best_score = best_trial.value
print("Best score:", best_score)

Best score: 0.7675127183200857


According to the Optuna logs: <br>
1) All hyperparameters seem relevant for the model (importance coefficients between 0.05 and 0.18).
2) The fine-tuning process is going well (the score shows a global increasing trend over the trials).

In [6]:
# Feature importance
# Build the final configuration as a new dict: tuned hyperparameters first, then
# the fixed settings. Merging into a copy leaves `best_params` untouched, so it
# keeps holding only the values sampled by Optuna.
best_config = {
    **best_params,
    "random_state": args["random_state"],
    "class_weight": args["class_weight"],
    "beta": args["beta"],
    "cv_folds": args["cv_folds"],
}

# Refit a single estimator with the best configuration on the full training set.
model = RFEstimator(best_config)
model.fit(X_train, y_train)

# Importances come from the fitted classifier wrapped by the estimator; feature
# names are the dataframe columns minus the target, matching how X was built.
importances = model.classifier.feature_importances_
feature_names = df.drop(columns=[target]).columns

fig = go.Figure(
    go.Bar(
        x=feature_names,
        y=importances,
        marker=dict(color="royalblue"),
    )
)
fig.update_layout(
    title="Feature Importances (Random Forest)",
    xaxis_title="Features",
    yaxis_title="Importance",
    width=900,
    height=500,
)
fig.show()

Feature importances range from around 2% to 20%. Removing the less important features results in a less precise / less stable model (when looking at cross-validation scores across folds).<br>
I don't think we should remove any of them (all are significantly important).

In [7]:
# Score details on each fold
# Baseline: a dummy estimator sharing the same beta / folds / seed as the random forest.
dummy_params = {
    "beta": args["beta"],
    "cv_folds": args["cv_folds"],
    "strategy": "most_frequent",
    "random_state": args["random_state"],
}
dummy = DummyEstimator(dummy_params)
dummy_cv_score = dummy.cross_validate(X_train, y_train)

model_cv_score = model.cross_validate(X_train, y_train)

print(f"Dummy CV Score: {dummy_cv_score:.4f}")
print(f"Random Forest CV Score: {model_cv_score:.4f}")

# Per-fold metrics, indexed below as cv_scores[split][metric]
# (presumably populated by cross_validate -- confirm in the estimator classes).
dummy_scores = dummy.cv_scores
model_scores = model.cv_scores

fig = make_subplots(
    rows=2, cols=3,
    subplot_titles=('Precision', 'Recall', f'F{beta} Score'),
    row_titles=['Train', 'Validation'],
    vertical_spacing=0.1
)

models = {'Dummy': dummy, 'Random Forest': model}
colors = {'Dummy': 'red', 'Random Forest': 'green'}

# Grid layout: one row per split, one column per metric.
fold_ids = list(range(1, cv_folds + 1))
split_rows = (('train', 'Train', 1), ('valid', 'Valid', 2))
metric_cols = (('precision', 'Precision', 1), ('recall', 'Recall', 2), (f'F{beta}', f'F{beta}', 3))

# Loop variables are deliberately NOT called `model` / `model_name`: reusing those
# names would rebind the notebook-level random forest estimator and model name,
# which later cells still rely on.
for estimator_label, estimator in models.items():
    for split_key, split_label, row in split_rows:
        for metric_key, metric_label, col in metric_cols:
            # Only the first trace of each estimator gets a legend entry; the others
            # share its legend group so they toggle together.
            is_legend_entry = row == 1 and col == 1
            if is_legend_entry:
                trace_name = f'{estimator_label}'
            else:
                trace_name = f'{estimator_label} {split_label} {metric_label}'
            fig.add_trace(
                go.Scatter(
                    x=fold_ids,
                    y=estimator.cv_scores[split_key][metric_key],
                    mode='lines+markers',
                    name=trace_name,
                    line=dict(color=colors[estimator_label]),
                    legendgroup=estimator_label,
                    showlegend=is_legend_entry,
                ),
                row=row, col=col
            )

fig.update_xaxes(title_text="Fold")
fig.update_yaxes(title_text="Score")
fig.update_layout(height=600, width=1500, title="Cross-Validation Metrics Across Folds")
fig.show()

Dummy CV Score: 0.6750
Random Forest CV Score: 0.7675


1) We observe some overfitting (a gap in the metrics on fold 3), but the scores remain consistent.
2) The model shows higher precision than recall, as expected: the F0.5 objective weights precision more heavily than recall.
3) The dummy classifier is clearly outperformed!

In [8]:
# Saving best configs
# Write the final configuration as indented JSON, creating the target folder if needed.
save_config_path = os.path.join("models", "params", "rf.json")
config_dir = os.path.dirname(save_config_path)
os.makedirs(config_dir, exist_ok=True)

with open(save_config_path, 'w') as config_file:
    json.dump(best_config, config_file, indent=4)

print(f"Best configuration saved to: {save_config_path}")

Best configuration saved to: models/params/rf.json


In [9]:
# Evaluation metrics
# Score both estimators on the held-out test set.
model_test_scores = model.evaluate(X_test, y_test)
dummy_test_scores = dummy.evaluate(X_test, y_test)

# Create comparison table: header row first, then one formatted row per estimator.
metric_keys = ("precision", "recall", f"F{beta}")
scores_by_label = (
    ("Dummy", dummy_test_scores),
    ("Random Forest", model_test_scores),
)
data = [["Model", "Precision", "Recall", f"F{beta} Score"]] + [
    [row_label, *(f"{row_scores[metric]:.4f}" for metric in metric_keys)]
    for row_label, row_scores in scores_by_label
]

print("Test Set Performance Comparison:")
print(tabulate(data, headers="firstrow", tablefmt="github"))

Test Set Performance Comparison:
| Model         |   Precision |   Recall |   F0.5 Score |
|---------------|-------------|----------|--------------|
| Dummy         |      0.624  |   1      |       0.6748 |
| Random Forest |      0.8138 |   0.7329 |       0.7962 |


The random forest generalizes well to unseen data (its test metrics are almost the same as during cross-validation), and it still outperforms the dummy classifier!