In [None]:
import warnings
# Warning is the base class of every warning category, so this filter
# silences all warnings raised by the cells below.
warnings.filterwarnings("ignore", category=Warning)
# FutureWarning is a subclass of Warning and is therefore already covered by
# the filter above; this line has no additional effect.
warnings.simplefilter(action='ignore', category=FutureWarning)
#Libraries for Data
import json
import requests 
from requests.exceptions import ReadTimeout
import time
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
#Specific NBA Libraries
from nba_api.stats.endpoints import playercareerstats
from nba_api.stats.endpoints import commonteamroster
from nba_api.stats.static import teams
from nba_api.stats.endpoints import teamestimatedmetrics

In [None]:
#Help with API Timeouts
from nba_api.stats.endpoints import commonplayerinfo
# Browser-like HTTP request headers.
# NOTE(review): `headers` is not referenced by any cell visible in this
# notebook — presumably it is meant to be passed to nba_api endpoint calls;
# confirm or remove.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:118.0) Gecko/20100101 Firefox/118.0",
    "Referer": "https://www.nba.com/",
    "Accept-Language": "en-US,en;q=0.5",
}

In [None]:
#Read in csvs
# Each dataset comes as a "current season" file plus a "2022-25" history file.
# read_csv already returns a DataFrame, so no extra wrapping is needed.

#Players
playersCurrent = pd.read_csv('player_stats_current.csv')
playersPast = pd.read_csv('player_stats_2022-25.csv')

#Teams
teamsCurrent = pd.read_csv('team_metrics_current.csv')
teamsPast = pd.read_csv('team_metrics_2022-25.csv')

#Games
gamesCurrent = pd.read_csv('games_current.csv')
gamesPast = pd.read_csv('games_2022-25.csv')

In [None]:
#Fix game data
# Keep only the leading rows of each game log.
# NOTE(review): 608 and 7627 are hard-coded row counts whose origin is not
# shown in this notebook — confirm they still match the CSV contents.
gamesCurrent = gamesCurrent.head(608)
gamesPast = gamesPast.head(7627)

In [None]:
# Persist the truncated game logs back to the files they were read from.
# index=False keeps the positional index out of the file; without it every
# run writes an extra unnamed column that comes back as "Unnamed: N" on the
# next read_csv, so the files would grow one column per execution.
gamesCurrent.to_csv('games_current.csv', index=False)
gamesPast.to_csv('games_2022-25.csv', index=False)

In [None]:
# Inspect column names, dtypes and non-null counts of the historical team metrics.
teamsPast.info()

In [None]:
#Merge datasets of same type

#Team stats
# The current-season file gets its SEASON label set here before stacking.
# Note: the name `teams` rebinds (shadows) the `teams` module imported from
# nba_api.stats.static in the first cell.
teamsCurrent['SEASON'] = '2025-26'
teams = pd.concat([teamsCurrent, teamsPast], ignore_index=True).astype({'TEAM_ID': int})

#Player stats
players = pd.concat([playersCurrent, playersPast], ignore_index=True).astype(
    {'TEAM_ABBREVIATION': str}
)

#Games
games = pd.concat([gamesCurrent, gamesPast], ignore_index=True)
# Discard index columns left behind by earlier CSV round-trips.
unnamed_mask = games.columns.str.contains("^Unnamed")
games = games.loc[:, ~unnamed_mask]
games['GAME_DATE'] = pd.to_datetime(games['GAME_DATE'])

In [None]:
#Clean data
# Drop the first column (by position) of both frames.
# NOTE(review): presumably this removes a saved index column from the CSVs —
# confirm. The drop is positional, so re-running this cell removes one more
# column each time.
teams = teams.iloc[:, 1:]
players = players.iloc[:, 1:]

In [None]:
#Matchups
# MATCHUP is written from the row team's point of view:
#   "AAA vs. BBB" -> AAA (this row's team) is at home
#   "AAA @ BBB"   -> AAA (this row's team) is away, BBB is the host
# so the first abbreviation is only the home team for "vs." rows. The regex
# is a raw string so that `\.` is a literal dot rather than an invalid escape.
matchup_parts = games['MATCHUP'].str.split(r' vs\. | @ ', expand=True)
is_away = games['MATCHUP'].str.contains(' @ ', regex=False, na=False)
games['HOME_ABBR'] = np.where(is_away, matchup_parts[1], matchup_parts[0])
games['AWAY_ABBR'] = np.where(is_away, matchup_parts[0], matchup_parts[1])

# Numeric target: 1 for a win, 0 for a loss (anything else becomes NaN).
games['WL_NUM'] = games['WL'].map({'W': 1, 'L': 0})

In [None]:
#Sort by date to get rolling averages for good predictions
# Oldest game first within each team: a trailing rolling window then looks
# back at earlier games. With a descending sort the window would instead
# average games played *after* the row it is attached to.
games = games.sort_values(['TEAM_ID', 'GAME_DATE'], ascending=True)
# Preview only; rendering the whole frame bloats the notebook output.
games.head()

In [None]:
#Calc rolling averages
# Box-score columns that are replaced by their trailing 5-game mean (<col>_RA).
rolling = ['PTS', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB',
          'AST', 'STL', 'BLK', 'TOV', 'PF', 'PLUS_MINUS']

# Order chronologically per team here so the result does not depend on how
# `games` happens to be sorted when this cell runs.
games_by_date = games.sort_values(['TEAM_ID', 'GAME_DATE'])

# Per team: shift(1) drops the current game from its own window (its box
# score determines the W/L label, so including it would leak the target),
# then average the 5 games before it. Grouping by TEAM_ID keeps a window from
# running across two different teams.
rolling_avgs = (
    games_by_date.groupby('TEAM_ID')[rolling]
    .transform(lambda stat: stat.shift(1).rolling(window=5).mean())
    .add_suffix('_RA')
)

# Swap the raw box-score columns for their rolling averages (aligned on index).
# No variable named `str` is used, so the builtin stays intact for other cells.
games = games.drop(columns=rolling).join(rolling_avgs)
games.info()

In [None]:
#Clean
# Drop every row with a missing value in any column; this includes the first
# games of each team, which have no complete rolling window.
games = games.dropna()
# Convert GAME_DATE to seconds since the Unix epoch. Dividing a Timedelta by
# one second works for any datetime resolution and does not rely on the
# platform's `int` being 64-bit, unlike `.astype(int) / 10**9`.
# Assumes GAME_DATE is timezone-naive, as produced by pd.to_datetime on plain
# date strings — TODO confirm.
games['GAME_DATE'] = (games['GAME_DATE'] - pd.Timestamp('1970-01-01')) / pd.Timedelta(seconds=1)
games.info()

In [None]:
#Make feature matrix and target
# The label is the numeric win flag; both label columns are excluded from the features.
label_columns = ['WL', 'WL_NUM']
y = games['WL_NUM']
X = games.drop(label_columns, axis=1)

In [None]:
#Standardize
# Scale every int64/float64 feature column with StandardScaler.
# Note: the scaler is fitted on all rows of X, i.e. before the train/test
# split performed in a later cell.
scaler = StandardScaler()
col = list(X.select_dtypes(include=['int64', 'float64']).columns)
scaled_values = scaler.fit_transform(X[col])
X[col] = scaled_values
X.columns

In [None]:
#one-hot encoding
# get_dummies already returns a DataFrame with the non-numeric columns
# expanded into indicator columns.
X = pd.get_dummies(X)
X

In [None]:
# Now split and train
# Hold out 20% of the rows, stratified on the label, with a fixed seed.
split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# XGBoost hyperparameter candidates; every combination is scored by
# 5-fold cross-validated accuracy on the training split.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1],
    gamma=[0, 0.1, 0.5],
)

xgb_estimator = XGBClassifier(eval_metric='logloss', use_label_encoder=False, random_state=42)
grid_search = GridSearchCV(
    estimator=xgb_estimator,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train, y_train)

print("Best Score:", grid_search.best_score_)
print("Best Params:", grid_search.best_params_)

In [None]:
# Keep the XGBoost estimator selected by the grid search above.
# No training happens in this cell: it only takes the best estimator from
# `grid_search`. It must run before `grid_search` is reassigned by the
# LightGBM search further down.
model = grid_search.best_estimator_

In [None]:
# Evaluate
# Score the selected XGBoost model on the held-out test split.
y_pred = model.predict(X_test)
xgb_accuracy = accuracy_score(y_test, y_pred)
xgb_report = classification_report(y_test, y_pred)
print("\nAccuracy:", xgb_accuracy)
print("\nClassification Report:\n", xgb_report)

In [None]:
# Same report as above, as a nested dict so it can be tabulated and plotted.
# At this point y_pred still holds the XGBoost predictions.
report_dict = classification_report(y_test, y_pred, output_dict = True)

In [None]:
# Tabulate the report, drop its last row (the support counts in sklearn's
# report layout) and transpose so each row is a class/average and each
# column a metric.
report_frame = pd.DataFrame(report_dict)
df_report = report_frame.iloc[:-1, :].T

# Plot the heatmap
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    df_report,
    annot=True,
    cmap="Blues",
    fmt='.2f',
    cbar=True,
    ax=ax,
)
ax.set_title('XGBoost Classification Report Heatmap')
plt.show()

In [None]:
#LightGBM
import lightgbm as lgb

In [None]:
#GridSearchCV for hyperparameter tuning
# LightGBM candidates; note this rebinds `param_grid` and `grid_search`,
# replacing the XGBoost search objects defined earlier.
param_grid = dict(
    n_estimators=[200, 500, 1000],
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[3, 5, 7],
    num_leaves=[15, 31, 63],
    min_child_samples=[10, 20],
)

lgbm_estimator = lgb.LGBMClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=lgbm_estimator,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    verbose=0,
    n_jobs=-1,  # use all available cores
)

grid_search.fit(X_train, y_train)

In [None]:
# Results
print("Best parameters:", grid_search.best_params_)
# The search uses scoring='accuracy', so best_score_ is already the mean
# cross-validated accuracy. Negating it (as is done for neg_* scorers) would
# print a negative accuracy.
print("Best CV score:", grid_search.best_score_)

In [None]:
# Evaluate on test set
# Score the selected LightGBM model; y_pred is overwritten with its predictions.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
lgb_accuracy = accuracy_score(y_test, y_pred)
lgb_report = classification_report(y_test, y_pred)
print("\nAccuracy:", lgb_accuracy)
print("\nClassification Report:\n", lgb_report)

In [None]:
#save models
import joblib
# `model` is the XGBoost estimator and `best_model` the LightGBM estimator
# selected by their respective grid searches.
joblib.dump(model, 'xgb_model.pkl')
joblib.dump(best_model, 'lgb_model.pkl')

#Save scaler
# The scaler was fitted on the int64/float64 columns of X before one-hot
# encoding; that column list itself is not saved here.
joblib.dump(scaler, 'scaler.pkl')

#Save feature names for reference
# Column names after one-hot encoding, in training order.
joblib.dump(X_train.columns.tolist(), 'feature_names.pkl')

In [None]:
#Dump games dataset
# Persists the `games` frame as left by the cleaning cells above (rolling
# averages added, raw box-score columns dropped, GAME_DATE numeric).
# Relies on `joblib` having been imported by the previous cell.
joblib.dump(games, 'data.pkl')

In [None]:
# Rebuild the report dict for plotting; y_pred now holds the LightGBM
# predictions from the evaluation cell above, so this describes LightGBM.
report_dict = classification_report(y_test, y_pred, output_dict = True)

In [None]:
# Tabulate the report, drop its last row (the support counts in sklearn's
# report layout) and transpose so each row is a class/average and each
# column a metric.
report_frame = pd.DataFrame(report_dict)
df_report = report_frame.iloc[:-1, :].T

#heatmap
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    df_report,
    annot=True,
    cmap="Blues",
    fmt='.2f',
    cbar=True,
    ax=ax,
)
ax.set_title('LightGBM Classification Report Heatmap')
plt.show()