In [1]:
# Dependencies

# Data Manip
import pandas as pd

# Linear Algebra
import numpy as np

# Machine Learning
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from models.dummy import DummyEstimator
from models.knn import KNEstimator
from sklearn.decomposition import PCA

# System & Files
import os
import json

# Visualization
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from tabulate import tabulate

seed = 42
np.random.seed(seed)

In [2]:
# Data collection
data_path = "data/nba_filtered_capped.csv"
df = pd.read_csv(data_path)
df = df.loc[:, ~(df.columns.str.contains("capped"))]

In [3]:
# Train / Test split
target = "TARGET_5Yrs"

X = df.drop(columns=[target]).values
y = df[target].values

test_size=0.2
X_train, X_test, y_train, y_test = train_test_split(X,y,stratify=y,test_size=test_size,shuffle=True,random_state=seed)

# Cross Validation Splitter
cv_folds = 4 # to get same number of samples within validation fold as in test set
splitter = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=seed) # Keeps same class proportions across sets

In [4]:
# Fine-tuning pipeline
beta= 0.5
n_components = 0.99
scorer= make_scorer(fbeta_score,beta=beta)

h_params = {
    "classifier__n_neighbors": list(range(10,100)),
}

pipeline = Pipeline([
    ("scaler", RobustScaler()),
    ("classifier", KNeighborsClassifier(weights="uniform"))
])

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=h_params,
    cv=cv_folds,
    n_jobs=-1,
    verbose=1,
    scoring=scorer,
)


grid_search.fit(X_train, y_train)

pipeline_pca = Pipeline([
    ("scaler", RobustScaler()),
    ("pca", PCA(n_components=n_components, random_state=seed)),
    ("classifier", KNeighborsClassifier(weights="uniform"))
])

grid_search_pca = GridSearchCV(
    estimator=pipeline_pca,
    param_grid=h_params,
    cv=cv_folds,
    n_jobs=-1,
    verbose=1,
    scoring=scorer,
)

grid_search_pca.fit(X_train, y_train)

Fitting 4 folds for each of 90 candidates, totalling 360 fits
Fitting 4 folds for each of 90 candidates, totalling 360 fits


0,1,2
,estimator,Pipeline(step...lassifier())])
,param_grid,"{'classifier__n_neighbors': [10, 11, ...]}"
,scoring,"make_scorer(f...ct', beta=0.5)"
,n_jobs,-1
,refit,True
,cv,4
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,with_centering,True
,with_scaling,True
,quantile_range,"(25.0, ...)"
,copy,True
,unit_variance,False

0,1,2
,n_components,0.99
,copy,True
,whiten,False
,svd_solver,'auto'
,tol,0.0
,iterated_power,'auto'
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,42

0,1,2
,n_neighbors,44
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [5]:
scores = grid_search.cv_results_['mean_test_score']
neighbors = grid_search.cv_results_['param_classifier__n_neighbors'].data

# Plot scores for the first grid search (without PCA)
fig = go.Figure()
fig.add_trace(go.Scatter(x=neighbors, y=scores, mode='lines+markers', name='Without PCA'))
fig.update_layout(
    title='Grid Search Scores by n_neighbors',
    xaxis_title='n_neighbors',
    yaxis_title=f'Mean Validation Score (F{beta})'
)

best_score = np.max(scores)
fig.add_shape(
    type="line",
    x0=neighbors[0], x1=neighbors[-1],
    y0=best_score, y1=best_score,
    line=dict(color="red", dash="dash"),
)
fig.add_annotation(
    x=neighbors[-1],
    y=best_score,
    text=f"Best Score: {best_score:.3f}",
    showarrow=False,
    yanchor="bottom",
    font=dict(color="red")
)

# Fit the PCA grid search and add its results
scores_pca = grid_search_pca.cv_results_['mean_test_score']
neighbors_pca = grid_search_pca.cv_results_['param_classifier__n_neighbors'].data

fig.add_trace(go.Scatter(x=neighbors_pca, y=scores_pca, mode='lines+markers', name='With PCA'))

best_score_pca = np.max(scores_pca)
fig.add_shape(
    type="line",
    x0=neighbors_pca[0], x1=neighbors_pca[-1],
    y0=best_score_pca, y1=best_score_pca,
    line=dict(color="blue", dash="dash"),
)
fig.add_annotation(
    x=neighbors_pca[-1] - 5,
    y=best_score_pca,
    text=f"Best Score PCA: {best_score_pca:.3f}",
    showarrow=False,
    yanchor="top",
    font=dict(color="blue")
)

fig.show()

In [6]:
# CV details
params = {
    "dummy":{
        "cv_folds": cv_folds,
        "random_state": seed,
        "strategy": "most_frequent",
        "beta": beta,
    },
    "knn": {
        "cv_folds": cv_folds,
        "beta": beta,
        "weights": "uniform",
        "scaler": RobustScaler(),
        "seed": seed,
        "n_neighbors": grid_search.best_params_['classifier__n_neighbors'],
        "pca": None,
    },
    "knn_pca": {
        "cv_folds": cv_folds,
        "beta": beta,
        "weights": "uniform",
        "scaler": RobustScaler(),
        "seed": seed,
        "n_neighbors": grid_search.best_params_['classifier__n_neighbors'],
        "pca": PCA(n_components=n_components, random_state=seed),
    },
}

knn = KNEstimator(params["knn"])
knn_pca = KNEstimator(params["knn_pca"])
dummy = DummyEstimator(params["dummy"])

dummy_cv_score = dummy.cross_validate(X_train, y_train)
print("Dummy CV score : ", dummy_cv_score)
knn_cv_score = knn.cross_validate(X_train, y_train)
print("Knn CV score : ",knn_cv_score)
knn_pca_cv_score = knn_pca.cross_validate(X_train, y_train)
print("Knn + pca CV score : ", knn_pca_cv_score)

# Fold metric plots
fig = make_subplots(
    rows=2, cols=3,
    subplot_titles=('Precision', 'Recall', f'F{beta}'),
    row_titles=['Training', 'Validation'],
    vertical_spacing=0.1
)


colors = {'knn': 'blue', 'dummy': 'red', 'knn_pca': 'green'}

for i, metric in enumerate(['precision', 'recall', f'F{beta}'], 1):
    # KNN training
    fig.add_trace(
        go.Scatter(
            x=list(range(1, cv_folds + 1)),
            y=knn.cv_scores['train'][metric],
            mode='lines+markers',
            name='KNN' if i == 1 else None,
            line=dict(color=colors['knn']),
            legendgroup='knn',
            showlegend=(i == 1)
        ),
        row=1, col=i
    )

    fig.add_trace(
        go.Scatter(
            x=list(range(1, cv_folds + 1)),
            y=knn_pca.cv_scores['train'][metric],
            mode='lines+markers',
            name='KNN PCA' if i == 1 else None,
            line=dict(color=colors['knn_pca']),
            legendgroup='knn pca',
            showlegend=(i == 1)
        ),
        row=1, col=i
    )
    
    # Dummy training
    fig.add_trace(
        go.Scatter(
            x=list(range(1, cv_folds + 1)),
            y=dummy.cv_scores['train'][metric],
            mode='lines+markers',
            name='Dummy' if i == 1 else None,
            line=dict(color=colors['dummy']),
            legendgroup='dummy',
            showlegend=(i == 1)
        ),
        row=1, col=i
    )

for i, metric in enumerate(['precision', 'recall', f'F{beta}'], 1):
    # KNN validation
    fig.add_trace(
        go.Scatter(
            x=list(range(1, cv_folds + 1)),
            y=knn.cv_scores['valid'][metric],
            mode='lines+markers',
            name=None,
            line=dict(color=colors['knn']),
            legendgroup='knn',
            showlegend=False
        ),
        row=2, col=i
    )

    fig.add_trace(
        go.Scatter(
            x=list(range(1, cv_folds + 1)),
            y=knn_pca.cv_scores['valid'][metric],
            mode='lines+markers',
            name=None,
            line=dict(color=colors['knn_pca']),
            legendgroup='knn pca',
            showlegend=False
        ),
        row=2, col=i
    )
    
    # Dummy validation
    fig.add_trace(
        go.Scatter(
            x=list(range(1, cv_folds + 1)),
            y=dummy.cv_scores['valid'][metric],
            mode='lines+markers',
            name=None,
            line=dict(color=colors['dummy']),
            legendgroup='dummy',
            showlegend=False
        ),
        row=2, col=i
    )

fig.update_layout(
    title='Cross-Validation Metrics Comparison',
    height=800,
    showlegend=True
)

fig.update_xaxes(title_text="CV Fold")
fig.update_yaxes(title_text="Score")

fig.show()

Dummy CV score :  0.6749947949461583
Knn CV score :  0.7574294303100964
Knn + pca CV score :  0.7553782399761212


0) Some difference in scores compared to the ones obtained with GridSearchCV, maybe due to difference in splitting
1) No over fitting appearing : learning process stable across folds (and scenario)
2) KNN is outperforming dummy classifier by a large margin
3) Model has a higher recall than precision, not really the desired effect when focusing more on precise results (beta < 1)
4) PCA is not bringing much to the current model (slightly worse learning process)

In [7]:
best_config = params["knn"].copy()
best_config.pop("scaler")
best_config.pop("pca")
best_config["pca_level"] = None
best_config["scaler_type"] = "robust"

save_config_path = os.path.join("models","params","knn.json")
os.makedirs(os.path.dirname(save_config_path), exist_ok=True)
with open(save_config_path, 'w') as f:
    json.dump(best_config, f, indent=4)

print(f"Best configuration saved to: {save_config_path}")

Best configuration saved to: models/params/knn.json


In [8]:
# Fit models and evaluate on test set
knn.fit(X_train, y_train)
dummy.fit(X_train, y_train)

knn_test_scores = knn.evaluate(X_test, y_test)
dummy_test_scores = dummy.evaluate(X_test, y_test)

# Create comparison table
test_results = []
test_results.append({
    'Model': 'KNN',
    'Precision': f"{knn_test_scores['precision']:.4f}",
    'Recall': f"{knn_test_scores['recall']:.4f}",
    f'F{beta}': f"{knn_test_scores[f'F{beta}']:.4f}"
})
test_results.append({
    'Model': 'Dummy',
    'Precision': f"{dummy_test_scores['precision']:.4f}",
    'Recall': f"{dummy_test_scores['recall']:.4f}",
    f'F{beta}': f"{dummy_test_scores[f'F{beta}']:.4f}"
})

# Display results table
print("Test Set Evaluation Results:")
print(tabulate(test_results, headers="keys", tablefmt="github"))

Test Set Evaluation Results:
| Model   |   Precision |   Recall |   F0.5 |
|---------|-------------|----------|--------|
| KNN     |      0.7661 |   0.8137 | 0.7751 |
| Dummy   |      0.624  |   1      | 0.6748 |


Good generalization : scores close to the ones observed during cross validation and still outperforming dummy classifier. <br>
This algorithm is a good start to be used to compare / combine with other models.