In [1]:
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
df =  pd.read_parquet("all_teams_last10seasons.parquet")

In [2]:
df.head()

Unnamed: 0,MATCHUP,TARGET_WL,HOME,WIN_STREAK,FGM_rolling5,FGA_rolling5,FG_PCT_rolling5,FG3M_rolling5,FG3A_rolling5,FG3_PCT_rolling5,FTA_rolling5,FT_PCT_rolling5,OREB_rolling5,DREB_rolling5,AST_rolling5,STL_rolling5,BLK_rolling5,TOV_rolling5,PF_rolling5,PTS_rolling5
0,ATL @ CLE,0,0,1,45.0,94.0,0.479,15.0,35.0,0.429,9.0,0.889,6.0,49.0,31.0,8.0,9.0,13.0,16.0,113.0
1,ATL vs. DET,1,1,0,41.5,96.5,0.4315,12.5,34.5,0.3615,12.0,0.7445,11.5,43.0,25.5,6.5,6.0,11.0,19.5,104.0
2,ATL @ NOP,1,0,1,43.0,94.333333,0.458,12.333333,33.666667,0.366,15.0,0.782,11.0,41.666667,25.0,8.0,5.0,11.666667,19.333333,110.0
3,ATL @ WAS,0,0,2,42.25,94.75,0.44775,11.25,32.75,0.34125,15.5,0.7925,13.5,39.75,24.0,7.0,4.75,11.5,18.0,108.0
4,ATL @ PHI,0,0,0,43.4,93.4,0.4672,10.2,30.4,0.3302,15.2,0.7626,12.0,39.2,24.4,6.4,4.4,11.8,17.6,108.6


In [3]:
# --- 2️⃣ Compute advanced metrics using rolling features ---

# True Shooting % (TS%) = PTS / (2 * (FGA + 0.44 * FTA))
df['TS%'] = df['PTS_rolling5'] / (2 * (df['FGA_rolling5'] + 0.44 * df['FTA_rolling5']))

# Assist-to-Turnover Ratio
df['AST_TOV'] = df['AST_rolling5'] / df['TOV_rolling5'].replace(0, np.nan)

# --- 3️⃣ Clean up ---
df = df.replace([np.inf, -np.inf], np.nan).dropna()

In [4]:
df = df.drop(columns=['FGM_rolling5', 'FG3M_rolling5', 'FG_PCT_rolling5', 'FG3_PCT_rolling5', 'AST_rolling5', 'TOV_rolling5'])
corr = df.corr()
upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))

# Show pairs with correlation > 0.9 (highly correlated)
high_corr = [(col, row, upper.loc[row, col]) 
             for col in upper.columns 
             for row in upper.index 
             if pd.notnull(upper.loc[row, col]) and abs(upper.loc[row, col]) > 0.7]

high_corr

ValueError: could not convert string to float: 'ATL @ CLE'

In [None]:
corr['TS%'].sort_values(ascending=False)

TS%                1.000000
PTS_rolling5       0.778078
AST_TOV            0.363218
WIN_STREAK         0.347222
FT_PCT_rolling5    0.258807
FTA_rolling5       0.137157
TARGET_WL          0.118817
DREB_rolling5      0.071562
BLK_rolling5       0.049537
FG3A_rolling5      0.035833
HOME              -0.004040
PF_rolling5       -0.052513
STL_rolling5      -0.075796
FGA_rolling5      -0.270930
OREB_rolling5     -0.401133
Name: TS%, dtype: float64

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

target = 'TARGET_WL'
features = [col for col in df.columns if col != target]

df_model = df.dropna(subset=features + [target])

# Split chronologically (e.g., 80% train, 20% test)
split_idx = int(len(df_model) * 0.8)
train_df = df_model.iloc[:split_idx]
test_df = df_model.iloc[split_idx:]

X_train = train_df[features]
y_train = train_df[target]
X_test = test_df[features]
y_test = test_df[target]

model = Pipeline([
    ('scaler', StandardScaler()),
    ('logreg', LogisticRegression(max_iter=1000))
])

model.fit(X_train, y_train)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import statsmodels.api as sm
import numpy as np

# Add constant (intercept)
X_train_sm = sm.add_constant(X_train)
X_test_sm = sm.add_constant(X_test)

# Fit logistic regression
model = sm.Logit(y_train, X_train_sm).fit()

# Predict probabilities (not classes yet)
y_proba = model.predict(X_test_sm)

# Convert probabilities to class predictions (threshold = 0.5)
y_pred_log = (y_proba >= 0.5).astype(int)

print("\nAIC:", model.aic.round(4))
print("BIC:", model.bic.round(4))
print("\nAccuracy:", round(accuracy_score(y_test, y_pred_log),4))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_log))
print("\nClassification Report:\n", classification_report(y_test, y_pred_log))


Optimization terminated successfully.
         Current function value: 0.674427
         Iterations 5

AIC: 10729.1033
BIC: 10833.7832

Accuracy: 0.5796

Confusion Matrix:
 [[731 475]
 [359 419]]

Classification Report:
               precision    recall  f1-score   support

           0       0.67      0.61      0.64      1206
           1       0.47      0.54      0.50       778

    accuracy                           0.58      1984
   macro avg       0.57      0.57      0.57      1984
weighted avg       0.59      0.58      0.58      1984



In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# --- SVM ---
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

svm_model = SVC(kernel='rbf', probability=True, random_state=42)
svm_model.fit(X_train_scaled, y_train)
y_pred_svm = svm_model.predict(X_test_scaled)

print("\n=== SVM ===")
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm))


=== SVM ===
Accuracy: 0.5483870967741935
Confusion Matrix:
 [[661 545]
 [351 427]]
              precision    recall  f1-score   support

           0       0.65      0.55      0.60      1206
           1       0.44      0.55      0.49       778

    accuracy                           0.55      1984
   macro avg       0.55      0.55      0.54      1984
weighted avg       0.57      0.55      0.55      1984



In [None]:
# --- Random Forest ---
rf_model = RandomForestClassifier(n_estimators=200, max_depth=None, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

print("\n=== Random Forest ===")
print("Accuracy:", round(accuracy_score(y_test, y_pred_rf), 4))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))


=== Random Forest ===
Accuracy: 0.5509
Confusion Matrix:
 [[708 498]
 [393 385]]
              precision    recall  f1-score   support

           0       0.64      0.59      0.61      1206
           1       0.44      0.49      0.46       778

    accuracy                           0.55      1984
   macro avg       0.54      0.54      0.54      1984
weighted avg       0.56      0.55      0.55      1984



In [None]:
# --- Collect all model metrics ---
results = []

# Logistic Regression
results.append({
    "model": "Logistic Regression",
    "accuracy": round(accuracy_score(y_test, y_pred_log), 4),
    "precision": round(precision_score(y_test, y_pred_log, average='macro'), 4),
    "recall": round(recall_score(y_test, y_pred_log, average='macro'), 4),
    "f1_score": round(f1_score(y_test, y_pred_log, average='macro'), 4)
})

# SVM
results.append({
    "model": "SVM",
    "accuracy": round(accuracy_score(y_test, y_pred_svm), 4),
    "precision": round(precision_score(y_test, y_pred_svm, average='macro'), 4),
    "recall": round(recall_score(y_test, y_pred_svm, average='macro'), 4),
    "f1_score": round(f1_score(y_test, y_pred_svm, average='macro'), 4)
})

# Random Forest
results.append({
    "model": "Random Forest",
    "accuracy": round(accuracy_score(y_test, y_pred_rf), 4),
    "precision": round(precision_score(y_test, y_pred_rf, average='macro'), 4),
    "recall": round(recall_score(y_test, y_pred_rf, average='macro'), 4),
    "f1_score": round(f1_score(y_test, y_pred_rf, average='macro'), 4)
})

# --- Convert to DataFrame and save ---
comparison_table = pd.DataFrame(results)
comparison_table

Unnamed: 0,model,accuracy,precision,recall,f1_score
0,Logistic Regression,0.5796,0.5697,0.5723,0.569
1,SVM,0.5484,0.5462,0.5485,0.542
2,Random Forest,0.5509,0.5395,0.541,0.5387
