In [8]:
# Dependencies

# Data Manip
import pandas as pd

# Linear Algebra
import numpy as np

# Machine Learning
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from models.lr import LREstimator
from models.rf import RFEstimator
from models.knn import KNEstimator
from models.svc import SVEstimator
from sklearn.decomposition import PCA

# System & Files
import os
import json

# Visualization
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from tabulate import tabulate

# Reproducibility: one shared seed for the stdlib and NumPy global generators
import random

seed = 42
random.seed(seed)
np.random.seed(seed)

In [9]:
# Loading best models
# Every estimator is built from the configuration stored as JSON under models/params.
best_config_folder_path = os.path.join("models","params")


def _load_best_config(config_path, with_preprocessing=True):
    """Load one model configuration and attach its preprocessing objects.

    Parameters
    ----------
    config_path : str
        Path of the JSON file holding the configuration.
    with_preprocessing : bool, default True
        When True, the "scaler_type" and "pca_level" entries (and
        "random_state" when PCA is enabled) are read to populate the
        "scaler" and "pca" entries of the configuration.

    Returns
    -------
    dict
        The parsed configuration, augmented in place.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file does not contain valid JSON.
    KeyError
        If an entry required for preprocessing is missing.
    """
    # JSON is UTF-8 by specification: do not depend on the platform default encoding.
    with open(config_path, 'r', encoding="utf-8") as f:
        config = json.load(f)

    if with_preprocessing:
        # Only the "robust" scaler is handled here; for any other value the
        # "scaler" entry is left exactly as it was loaded.
        if config["scaler_type"] == "robust":
            config["scaler"] = RobustScaler()

        pca_level = config["pca_level"]
        # "random_state" is only looked up when PCA is actually enabled.
        config["pca"] = (
            PCA(n_components=pca_level, random_state=config["random_state"])
            if pca_level is not None
            else None
        )

    return config


# KNN
knn_config_path = os.path.join(best_config_folder_path,"knn.json")
knn_config = _load_best_config(knn_config_path)
knn = KNEstimator(knn_config)

# LR
lr_config_path = os.path.join(best_config_folder_path,"lr.json")
lr_config = _load_best_config(lr_config_path)
lr = LREstimator(lr_config)

# RF (configuration is passed through as loaded: no scaler / PCA objects are attached)
rf_config_path = os.path.join(best_config_folder_path,"rf.json")
rf_config = _load_best_config(rf_config_path, with_preprocessing=False)
rf = RFEstimator(rf_config)

# SVC
svc_config_path = os.path.join(best_config_folder_path,"svc.json")
svc_config = _load_best_config(svc_config_path)
svc = SVEstimator(svc_config)

In [10]:
# Data Collection
# Two versions of the dataset are read from disk: "filtered" and "filtered_capped".
data_filtered_path, data_filtered_capped_path = (
    "data/nba_filtered.csv",
    "data/nba_filtered_capped.csv",
)

df_filtered, df_filtered_capped = (
    pd.read_csv(csv_path)
    for csv_path in (data_filtered_path, data_filtered_capped_path)
)

In [11]:
# Data Split
# Both datasets get the same stratified hold-out split (same seed, same test
# fraction), so that models trained on either version are evaluated consistently.
target = "TARGET_5Yrs"
test_size = 0.2
cv_folds = 4  # not referenced in this cell

# Non-capped dataset.
# Features and labels are taken from the SAME frame so that rows stay aligned
# even if the two CSV files ever differ in row order or content.
X_filtered = df_filtered.drop(columns=[target]).values
y_filtered = df_filtered[target].values

(
    X_train_filtered,
    X_test_filtered,
    y_train_filtered,
    y_test_filtered,
) = train_test_split(
    X_filtered,
    y_filtered,
    stratify=y_filtered,
    test_size=test_size,
    shuffle=True,
    random_state=seed,
)

# Capped dataset.
X_filtered_capped = df_filtered_capped.drop(columns=[target]).values
y_filtered_capped = df_filtered_capped[target].values

(
    X_train_filtered_capped,
    X_test_filtered_capped,
    y_train_filtered_capped,
    y_test_filtered_capped,
) = train_test_split(
    X_filtered_capped,
    y_filtered_capped,
    stratify=y_filtered_capped,
    test_size=test_size,
    shuffle=True,
    random_state=seed,
)

In [12]:
# Cross validating
# KNN, LR and SVC are validated on the capped training split, RF on the non-capped one.
# The same y could be used for all models, but for readability each model
# receives the labels of its own split.
capped_train_split = {
    "X_train": X_train_filtered_capped,
    "y_train": y_train_filtered_capped,
}
uncapped_train_split = {
    "X_train": X_train_filtered,
    "y_train": y_train_filtered,
}

knn_cv_score = knn.cross_validate(**capped_train_split)
lr_cv_score = lr.cross_validate(**capped_train_split)
svc_cv_score = svc.cross_validate(**capped_train_split)
rf_cv_score = rf.cross_validate(**uncapped_train_split)

In [13]:
# Metric detail on folds
# Grid layout: one row per data split (training on top, validation below),
# one column per metric; every model contributes one line per subplot.
fig = make_subplots(
    rows=2,
    cols=3,
    subplot_titles=('Precision', 'Recall', 'F-Beta') * 2,
    row_titles=['Training', 'Validation'],
    vertical_spacing=0.1,
)

model_colors = dict(KNN='blue', LR='red', SVC='green', RF='orange')

# Per-fold scores of each model, in display order
fold_scores_dict = {
    'KNN': knn.cv_scores,
    'LR': lr.cv_scores,
    'SVC': svc.cv_scores,
    'RF': rf.cv_scores,
}

for model_name, scores_by_split in fold_scores_dict.items():
    for row_idx, split_key in enumerate(('train', 'valid'), start=1):
        for col_idx, metric_key in enumerate(('precision', 'recall', 'F0.5'), start=1):
            per_fold_values = scores_by_split[split_key][metric_key]
            trace = go.Scatter(
                x=list(range(len(per_fold_values))),
                y=per_fold_values,
                mode='markers+lines',
                name=model_name,
                marker_color=model_colors[model_name],
                legendgroup=model_name,
                # a single legend entry per model: the first training subplot
                showlegend=(row_idx == 1 and col_idx == 1),
            )
            fig.add_trace(trace, row=row_idx, col=col_idx)

fig.update_layout(title="Model Performance Metrics - Training vs Validation", height=600)

fig.show()

# Average scores on validation set
cv_scores_summary = {
    'Model': list(fold_scores_dict),
    'CV Score': [knn_cv_score, lr_cv_score, svc_cv_score, rf_cv_score],
}

cv_scores_df = pd.DataFrame(cv_scores_summary)
cv_scores_table = tabulate(
    cv_scores_df,
    headers='keys',
    tablefmt='github',
    floatfmt='.4f',
    showindex=False,
)
print(cv_scores_table)

| Model   |   CV Score |
|---------|------------|
| KNN     |     0.7539 |
| LR      |     0.7759 |
| SVC     |     0.7692 |
| RF      |     0.7675 |


During validation, all models see their metrics drop on the third and fourth folds (especially recall), which I think is due to a **shift in distribution** of some features. However, the diversity observed in the curves shows that the models are learning (without significant over-fitting)! <br>
In particular, **KNN** seems really good at finding potential "good" athletes (**recall** > precision even with beta < 1), whereas the **other models** are better at finding real "good" athletes (**precision** > recall). <br>
**Logistic Regression** shows very good learning in terms of **precision** and has the **best average cross-validation score**: I choose this model as the classifier to be put in production.

In [14]:
# Test set evaluation
# Each model is refitted on its own training split, then scored once on the
# matching held-out test split (capped data for LR / KNN / SVC, non-capped for RF).
lr.fit(X_train_filtered_capped, y_train_filtered_capped)
knn.fit(X_train_filtered_capped, y_train_filtered_capped)
svc.fit(X_train_filtered_capped, y_train_filtered_capped)
rf.fit(X_train_filtered, y_train_filtered)

lr_test_scores = lr.evaluate(X_test_filtered_capped, y_test_filtered_capped)
knn_test_scores = knn.evaluate(X_test_filtered_capped, y_test_filtered_capped)
svc_test_scores = svc.evaluate(X_test_filtered_capped, y_test_filtered_capped)
rf_test_scores = rf.evaluate(X_test_filtered, y_test_filtered)

# Create a summary table with test set metrics for all models.
# Rows are derived from one model -> scores mapping so that the model names and
# their metrics cannot get out of sync.
test_scores_by_model = {
    'KNN': knn_test_scores,
    'LR': lr_test_scores,
    'SVC': svc_test_scores,
    'RF': rf_test_scores,
}

# Table column header -> key in the dictionaries returned by evaluate()
metric_key_by_column = {
    'Precision': 'precision',
    'Recall': 'recall',
    'F0.5 Score': 'F0.5',
}

test_scores_summary = {'Model': list(test_scores_by_model)}
for column_name, metric_key in metric_key_by_column.items():
    test_scores_summary[column_name] = [
        model_scores[metric_key] for model_scores in test_scores_by_model.values()
    ]

test_scores_df = pd.DataFrame(test_scores_summary)
print(tabulate(test_scores_df, headers='keys', tablefmt='github', floatfmt='.4f', showindex=False))

| Model   |   Precision |   Recall |   F0.5 Score |
|---------|-------------|----------|--------------|
| KNN     |      0.7706 |   0.8137 |       0.7788 |
| LR      |      0.8333 |   0.7453 |       0.8141 |
| SVC     |      0.8357 |   0.7267 |       0.8114 |
| RF      |      0.8156 |   0.7143 |       0.7931 |


All models generalise well, as expected (test metrics are close to those obtained during validation). Logistic Regression shows the best F0.5 score, which supports our approach!