In [1]:
from nba_api.stats.endpoints import leagueleaders
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from nba_api.stats.endpoints import leaguedashplayerstats, synergyplaytypes, leaguedashteamstats
from nba_api.stats.endpoints import teamyearbyyearstats, playercareerstats
from nba_api.stats.static.players import find_players_by_full_name, find_player_by_id
from nba_api.stats.static.teams import find_team_name_by_id
from matplotlib.offsetbox import OffsetImage, AnnotationBbox, DrawingArea, TextArea
from matplotlib.patches import Circle, FancyArrowPatch, FancyArrow, ArrowStyle, Arrow, Rectangle
import matplotlib.ticker as mtick
import time
from tqdm import tqdm
from sklearn.cluster import KMeans
import numpy as np
import functools
import seaborn as sns
import glob
from sklearn import preprocessing
from scipy import stats
from fitter import Fitter, get_common_distributions, get_distributions
import sklearn
from sklearn.mixture import GaussianMixture, BayesianGaussianMixture
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from scipy.stats import gamma, exponpow, lognorm, cauchy, genhyperbolic
from datetime import date
from PIL import Image
from bs4 import BeautifulSoup, Comment
import requests
import html5lib

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

plt.style.use('fivethirtyeight')

import warnings
warnings.filterwarnings('ignore')

  from IPython.core.display import display, HTML


In [2]:
def convert_season_to_int(season):
    season = season.split('-')[0]
    return int(season) + 1

def convert_int_to_season(year):
    previos_year = year - 1
    year = str(year)
    previous_year = str(previos_year)
    year = year[-2:]
    return f'{previos_year}-{year}'

In [107]:
def getSoupFromURL(url):

    try:
        r = requests.get(url)
    except:
        return None

    return BeautifulSoup(r.text, "html.parser")

def relative_table(table, avg_table, skip_names):
    for column_name in table.columns:
        if column_name in skip_names:
            continue
    
        table[f'{column_name}'] = (table[f'{column_name}'] / avg_table[f'{column_name}']) - 1
        
    return table

def get_bbref_teams_dataframe(first_year,
                              last_year,
                              find_champion=True,
                              save=True,
                              relative=True,
                              time_to_sleep=2):

    all_teams = pd.DataFrame()

    for i, year in tqdm(enumerate(range(first_year, last_year + 1)), total=last_year + 1 - first_year):

        url = f'https://www.basketball-reference.com/leagues/NBA_{year}.html'

        advanced_table_id = "advanced-team"
        per_game_table_id = 'per_game-team'

        soup = getSoupFromURL(url)
        time.sleep(time_to_sleep)

        advanced_table = soup.find(lambda tag: tag.name=='table' and tag.has_attr('id') and tag['id']==advanced_table_id)
        per_game_table = soup.find(lambda tag: tag.name=='table' and tag.has_attr('id') and tag['id']==per_game_table_id)
        
        if find_champion:
            champion = soup.find(lambda tag: tag.name=='div' and tag.has_attr('id') and tag['id']=='info')
            champion_name = " ".join(champion.find_all('div')[2].find('p').text.split(" ")[2:])

        advanced_stats = pd.read_html(str(advanced_table), header=1)[0]
        advanced_stats = advanced_stats.drop(['Unnamed: 17', 'Unnamed: 22', 'Unnamed: 27', 'Arena', 'Attend.'], axis=1)
        advanced_league_avg = advanced_stats.iloc[-1]
        advanced_stats = advanced_stats[:-1]

        per_game_stats = pd.read_html(str(per_game_table), header=0)[0]
        per_game_league_avg = per_game_stats.iloc[-1]
        per_game_stats = per_game_stats[:-1]
        
        if relative:
            per_game_stats = relative_table(per_game_stats, per_game_league_avg, skip_names=['Team', 'Rk'])
            advanced_stats = relative_table(advanced_stats, advanced_league_avg, skip_names=['Team', 'Rk', 'W', 'L', 'MOV', 'SOS', 'SRS', 'NRtg'])

        combined_team_stats = pd.merge(per_game_stats, advanced_stats, on=['Team'])
        combined_team_stats['Team'] = combined_team_stats['Team'].str.replace('*', '')

        combined_team_stats['Season'] = convert_int_to_season(year)
        combined_team_stats['Win%'] = combined_team_stats['W'] / (combined_team_stats['W'] + combined_team_stats['L'])
        
        combined_team_stats = combined_team_stats.fillna(0)
        combined_team_stats = combined_team_stats.drop(['Rk_x', 'Rk_y', 'G', 'MP'], axis=1)
        
        if find_champion:
            combined_team_stats['Champion'] = 0
            combined_team_stats.loc[combined_team_stats['Team'] == champion_name, 'Champion'] = 1
            
            if combined_team_stats.Champion.sum() != 1:
                print(f'[ERROR] - {season_year}')
                break
                
        all_teams = pd.concat([all_teams, combined_team_stats])

    if save:
        all_teams.to_csv('historical_relative_data.csv')
    
    return all_teams

In [115]:
first_year = 1980
last_year = 2023

teams = get_bbref_teams_dataframe(first_year, last_year, save=True)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 44/44 [03:08<00:00,  4.28s/it]


In [117]:
teams

Unnamed: 0,Team,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,...,ORB%,FT/FGA,eFG%.1,TOV%.1,DRB%,FT/FGA.1,Attend./G,Season,Win%,Champion
0,San Antonio Spurs,0.077982,0.041943,0.035343,-0.250000,-0.107143,-0.100000,0.081585,0.045506,0.034836,...,-0.050746,0.114894,0.041152,-0.070968,0.004511,-0.080851,-0.392126,1979-80,0.500000,0
1,Los Angeles Lakers,0.089450,-0.007726,0.099792,-0.750000,-0.571429,-0.285714,0.102564,0.007964,0.094262,...,-0.026866,-0.063830,-0.022634,-0.096774,0.006015,-0.229787,-0.327559,1979-80,0.731707,1
2,Cleveland Cavaliers,0.066514,0.082781,-0.014553,-0.500000,-0.178571,-0.310714,0.072261,0.089875,-0.014344,...,-0.011940,-0.097872,0.039095,0.051613,-0.009023,-0.080851,-0.406299,1979-80,0.451220,0
3,New York Knicks,0.064220,0.033113,0.031185,-0.375000,-0.178571,-0.214286,0.069930,0.037543,0.030738,...,0.005970,-0.059574,0.024691,0.058065,-0.037594,0.119149,-0.247244,1979-80,0.475610,0
4,Boston Celtics,0.011468,-0.005519,0.018711,1.500000,0.821429,0.371429,-0.018648,-0.034130,0.016393,...,0.038806,0.097872,-0.022634,0.064516,0.019549,-0.004255,6.697638,1979-80,0.743902,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25,Orlando Magic,-0.035714,-0.022650,-0.010526,-0.121951,-0.090643,-0.041551,0.006757,0.020333,-0.016423,...,-0.008333,0.091346,0.009174,0.048000,0.022368,0.014423,-0.012616,2022-23,0.414634,0
26,Charlotte Hornets,-0.016667,0.023783,-0.037895,-0.130081,-0.049708,-0.085873,0.030405,0.070240,-0.036496,...,-0.008333,-0.062500,-0.001835,0.000000,-0.006579,0.014423,-0.048352,2022-23,0.329268,0
27,Houston Rockets,-0.033333,0.006795,-0.037895,-0.154472,-0.067251,-0.094183,0.020270,0.051756,-0.032847,...,0.258333,0.033654,0.034862,-0.056000,-0.002632,0.048077,-0.093314,2022-23,0.268293,0
28,Detroit Pistons,-0.057143,-0.013590,-0.044211,-0.073171,-0.052632,-0.027701,-0.047297,0.009242,-0.058394,...,0.037500,0.091346,0.022018,-0.048000,-0.026316,0.110577,0.033513,2022-23,0.207317,0


In [118]:
corr = teams.select_dtypes(include=np.number).corr()
corr.style.background_gradient(cmap='coolwarm').format(precision=2)

Unnamed: 0,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Age,W,L,PW,PL,MOV,SOS,SRS,ORtg,DRtg,NRtg,Pace,FTr,3PAr,TS%,eFG%,TOV%,ORB%,FT/FGA,eFG%.1,TOV%.1,DRB%,FT/FGA.1,Attend./G,Win%,Champion
FG,1.0,0.65,0.63,0.15,0.13,0.14,0.73,0.35,0.59,0.07,0.04,0.1,0.14,0.35,0.38,0.62,0.24,0.1,-0.15,-0.04,0.87,0.1,0.4,-0.4,0.42,-0.42,0.43,-0.17,0.43,0.64,0.02,0.42,0.62,-0.19,0.04,0.5,0.58,-0.37,0.11,-0.15,0.02,-0.05,-0.07,-0.15,0.08,0.41,0.18
FGA,0.65,1.0,-0.17,0.13,0.19,-0.07,0.46,0.55,-0.14,-0.1,-0.09,-0.04,0.54,0.11,0.47,0.21,0.23,-0.02,-0.12,0.09,0.51,-0.21,-0.11,0.11,-0.11,0.11,-0.11,0.04,-0.11,0.08,0.27,-0.11,0.71,-0.43,0.05,-0.23,-0.15,-0.45,0.23,-0.42,0.3,0.03,-0.21,-0.0,-0.07,-0.12,-0.02
FG%,0.63,-0.17,1.0,0.07,-0.02,0.26,0.47,-0.1,0.92,0.2,0.15,0.17,-0.37,0.34,0.01,0.59,0.08,0.15,-0.07,-0.15,0.61,0.34,0.64,-0.63,0.66,-0.66,0.67,-0.27,0.67,0.75,-0.25,0.67,0.08,0.2,0.0,0.89,0.91,-0.02,-0.09,0.24,-0.29,-0.09,0.13,-0.19,0.17,0.65,0.25
3P,0.15,0.13,0.07,1.0,0.97,0.69,-0.36,-0.49,0.23,-0.04,-0.07,0.07,-0.15,0.1,-0.02,0.09,-0.03,-0.12,-0.16,-0.09,0.32,0.16,0.2,-0.2,0.21,-0.21,0.21,-0.1,0.21,0.32,0.01,0.21,0.13,-0.1,0.96,0.32,0.37,-0.19,-0.2,-0.08,0.0,-0.08,-0.0,-0.13,0.1,0.2,0.04
3PA,0.13,0.19,-0.02,0.97,1.0,0.52,-0.41,-0.52,0.2,-0.05,-0.05,0.03,-0.1,0.09,0.01,0.04,0.01,-0.14,-0.13,-0.08,0.31,0.12,0.15,-0.14,0.15,-0.15,0.15,-0.07,0.15,0.27,0.05,0.15,0.18,-0.11,0.99,0.25,0.3,-0.18,-0.19,-0.1,0.05,-0.06,-0.04,-0.13,0.06,0.15,0.02
3P%,0.14,-0.07,0.26,0.69,0.52,1.0,-0.12,-0.25,0.21,0.01,-0.06,0.18,-0.24,0.11,-0.08,0.18,-0.1,-0.06,-0.19,-0.08,0.23,0.2,0.26,-0.27,0.29,-0.29,0.29,-0.15,0.29,0.35,-0.08,0.29,-0.05,-0.03,0.54,0.38,0.39,-0.14,-0.18,0.03,-0.12,-0.09,0.08,-0.07,0.14,0.28,0.05
2P,0.73,0.46,0.47,-0.36,-0.41,-0.12,1.0,0.81,0.28,0.09,0.08,0.04,0.26,0.16,0.3,0.44,0.21,0.14,-0.04,0.02,0.45,-0.06,0.14,-0.14,0.14,-0.14,0.15,-0.05,0.15,0.27,0.06,0.15,0.4,-0.09,-0.48,0.13,0.15,-0.21,0.28,-0.07,0.07,0.02,-0.06,-0.02,0.03,0.15,0.1
2PA,0.35,0.55,-0.1,-0.49,-0.52,-0.25,0.81,1.0,-0.34,-0.04,-0.03,-0.02,0.49,-0.05,0.29,0.09,0.14,0.07,-0.02,0.11,0.05,-0.28,-0.24,0.24,-0.26,0.26,-0.25,0.11,-0.25,-0.19,0.2,-0.26,0.31,-0.22,-0.6,-0.43,-0.42,-0.21,0.36,-0.21,0.23,0.07,-0.12,0.11,-0.08,-0.25,-0.05
2P%,0.59,-0.14,0.92,0.23,0.2,0.21,0.28,-0.34,1.0,0.21,0.19,0.1,-0.38,0.35,0.02,0.55,0.11,0.12,-0.03,-0.15,0.65,0.36,0.63,-0.63,0.65,-0.65,0.66,-0.25,0.66,0.75,-0.23,0.66,0.15,0.22,0.22,0.9,0.94,0.0,-0.14,0.24,-0.26,-0.09,0.1,-0.22,0.16,0.65,0.26
FT,0.07,-0.1,0.2,-0.04,-0.05,0.01,0.09,-0.04,0.21,1.0,0.92,0.32,0.07,0.16,0.17,0.01,0.11,0.11,0.13,0.14,0.46,0.05,0.23,-0.23,0.25,-0.25,0.25,-0.11,0.25,0.39,0.03,0.25,0.27,0.87,-0.03,0.39,0.16,0.04,0.16,0.95,-0.0,-0.01,-0.01,0.02,0.06,0.24,-0.04


In [149]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

X_train, X_test, y_train, y_test = train_test_split(teams.loc[:, teams.columns != 'Champion'], teams['Champion'])

In [122]:
clf = make_pipeline(StandardScaler(), SVC(gamma='auto', probability=True))
clf.fit(teams.select_dtypes(include=np.number), teams.Champion)
# accuracy = clf.score(X_test.select_dtypes(include=np.number), y_test)
# print(f'Acc: {accuracy}')

Acc: 1.0


In [158]:
current_year = get_bbref_teams_dataframe(first_year=2024,
                                         last_year=2024,
                                         find_champion=False,
                                         save=False)

current_year_with_metrics = pd.DataFrame(current_year)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.44s/it]


In [162]:
clf = make_pipeline(SVC(gamma='auto', probability=True))

X = teams.loc[:, teams.columns != 'Champion'].select_dtypes(include=np.number)
label = teams['Champion']
clf.fit(X, label)

probs = clf.predict_proba(current_year.select_dtypes(include=np.number))

# print(classification_report(y_true=label, y_pred=probs[:,1]))

current_year_with_metrics['SVM_Champion%'] = 100 * probs[:,1]

In [143]:
import xgboost as xgb

dtrain = xgb.DMatrix(X, label=label)
param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
param['nthread'] = 4
param['eval_metric'] = 'auc'

evallist = [(dtrain, 'train')]

num_round = 100
bst = xgb.train(param, dtrain, num_round, evallist)

[0]	train-auc:0.94133
[1]	train-auc:0.95953
[2]	train-auc:0.97461
[3]	train-auc:0.98186
[4]	train-auc:0.98887
[5]	train-auc:0.99249
[6]	train-auc:0.99502
[7]	train-auc:0.99654
[8]	train-auc:0.99780
[9]	train-auc:0.99836
[10]	train-auc:0.99911
[11]	train-auc:0.99973
[12]	train-auc:0.99988
[13]	train-auc:1.00000
[14]	train-auc:1.00000
[15]	train-auc:1.00000
[16]	train-auc:1.00000
[17]	train-auc:1.00000
[18]	train-auc:1.00000
[19]	train-auc:1.00000
[20]	train-auc:1.00000
[21]	train-auc:1.00000
[22]	train-auc:1.00000
[23]	train-auc:1.00000
[24]	train-auc:1.00000
[25]	train-auc:1.00000
[26]	train-auc:1.00000
[27]	train-auc:1.00000
[28]	train-auc:1.00000
[29]	train-auc:1.00000
[30]	train-auc:1.00000
[31]	train-auc:1.00000
[32]	train-auc:1.00000
[33]	train-auc:1.00000
[34]	train-auc:1.00000
[35]	train-auc:1.00000
[36]	train-auc:1.00000
[37]	train-auc:1.00000
[38]	train-auc:1.00000
[39]	train-auc:1.00000
[40]	train-auc:1.00000
[41]	train-auc:1.00000
[42]	train-auc:1.00000
[43]	train-auc:1.0000

In [146]:
dtest = xgb.DMatrix(current_year.select_dtypes(include=np.number))
ypred = bst.predict(dtest)

In [164]:
current_year_with_metrics['XGBoost_Champion%'] = 100 * ypred

In [167]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

clf = LinearDiscriminantAnalysis()
clf.fit(X, label)

lda_probs = clf.predict_proba(current_year.select_dtypes(include=np.number))
current_year_with_metrics['LDA_Champion%'] = 100 * lda_probs[:,1]

In [172]:
current_year_with_metrics.sort_values(by=['LDA_Champion%'], ascending=False)

Unnamed: 0,Team,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,...,eFG%.1,TOV%.1,DRB%,FT/FGA.1,Attend./G,Season,Win%,SVM_Champion%,XGBoost_Champion%,LDA_Champion%
1,Boston Celtics,0.040284,0.014623,0.027426,0.289062,0.210826,0.060109,-0.064846,-0.113383,0.055046,...,-0.043876,-0.107438,0.006596,-0.244792,0.022151,2023-24,0.780488,26.00762,0.74553,73.192441
13,Denver Nuggets,0.042654,-0.001125,0.046414,-0.085938,-0.111111,0.021858,0.102389,0.068773,0.031193,...,-0.038391,-0.07438,-0.002639,0.015625,0.07657,2023-24,0.695122,3.083037,42.218678,22.063024
0,Indiana Pacers,0.113744,0.042745,0.06962,0.03125,0.005698,0.021858,0.153584,0.066914,0.080734,...,0.016453,0.0,-0.022427,0.1875,-0.09615,2023-24,0.573171,3.010499,0.009464,5.77734
6,Dallas Mavericks,0.021327,0.008999,0.014768,0.140625,0.125356,0.008197,-0.027304,-0.068773,0.044037,...,0.001828,0.0,-0.007916,-0.041667,0.105721,2023-24,0.609756,3.017613,8.6e-05,5.511086
5,Los Angeles Lakers,0.035545,-0.015748,0.052743,-0.078125,-0.105413,0.030055,0.085324,0.04461,0.040367,...,0.005484,-0.033058,0.005277,-0.203125,0.035058,2023-24,0.573171,3.015312,1.335598,4.795177
19,Cleveland Cavaliers,-0.009479,-0.019123,0.010549,0.054688,0.048433,0.002732,-0.03413,-0.063197,0.029358,...,-0.021938,0.016529,0.014512,-0.010417,0.062787,2023-24,0.585366,2.995003,0.003544,4.026324
11,Los Angeles Clippers,0.004739,-0.024747,0.031646,-0.015625,-0.054131,0.040984,0.013652,-0.005576,0.018349,...,-0.010969,-0.033058,-0.013193,-0.046875,0.036207,2023-24,0.621951,3.046131,8.2e-05,3.659035
3,Milwaukee Bucks,0.021327,-0.004499,0.027426,0.109375,0.08547,0.019126,-0.010239,-0.063197,0.053211,...,-0.014625,-0.123967,0.01847,-0.015625,-0.034183,2023-24,0.597561,3.013486,0.031044,2.861559
7,Golden State Warriors,0.035545,0.030371,0.006329,0.15625,0.108262,0.038251,-0.013652,-0.020446,0.005505,...,-0.014625,-0.049587,0.003958,0.015625,-0.012032,2023-24,0.560976,3.04128,0.001634,2.373676
9,Phoenix Suns,0.007109,-0.031496,0.040084,-0.03125,-0.071225,0.043716,0.023891,-0.003717,0.029358,...,-0.02011,-0.066116,-0.002639,-0.072917,-0.066342,2023-24,0.597561,3.024859,0.000658,2.163761


In [173]:
def sklearn_fit_predict_probs(clf, X, label, test):
    clf.fit(X, label)
    probs = clf.predict_proba(test.select_dtypes(include=np.number))
    return probs[:,1]

In [178]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(
    penalty='l2',
    solver='newton-cholesky',
    C=0.1
)

current_year_with_metrics['LR_Champion%'] = 100 * sklearn_fit_predict_probs(lr, X, label, current_year)


In [190]:
champion_columns = current_year_with_metrics.filter(like='_Champion%')
average_champion = current_year_with_metrics.filter(like='_Champion%').mean(axis=1)
current_year_with_metrics['AVG_Champion%'] = average_champion
columns_to_display = ['Team']
columns_to_display.extend(list(champion_columns.columns))

current_year_with_metrics.sort_values(by=['AVG_Champion%'], ascending=False)[columns_to_display]

Unnamed: 0,Team,SVM_Champion%,XGBoost_Champion%,LDA_Champion%,LR_Champion%,AVG_Champion%
1,Boston Celtics,26.00762,0.74553,73.192441,47.860959,36.951638
13,Denver Nuggets,3.083037,42.218678,22.063024,10.858003,19.555685
2,Oklahoma City Thunder,2.968284,0.095717,1.675457,10.748941,3.8721
17,Minnesota Timberwolves,2.928547,0.013612,2.154564,8.088607,3.296332
6,Dallas Mavericks,3.017613,8.6e-05,5.511086,1.446231,2.493754
5,Los Angeles Lakers,3.015312,1.335598,4.795177,0.659979,2.451516
0,Indiana Pacers,3.010499,0.009464,5.77734,0.746177,2.38587
11,Los Angeles Clippers,3.046131,8.2e-05,3.659035,2.013339,2.179647
19,Cleveland Cavaliers,2.995003,0.003544,4.026324,0.92215,1.986755
3,Milwaukee Bucks,3.013486,0.031044,2.861559,1.103174,1.752316


['SVM_Champion%',
 'XGBoost_Champion%',
 'LDA_Champion%',
 'LR_Champion%',
 'AVG_Champion%']