In [6]:
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from IPython.core.display import display, HTML
from bs4 import BeautifulSoup
from tqdm import tqdm

display(HTML("<style>.container { width:100% !important; }</style>"))

plt.style.use('fivethirtyeight')

import warnings
warnings.filterwarnings('ignore')

In [7]:
def convert_season_to_int(season):
    season = season.split('-')[0]
    return int(season) + 1

def convert_int_to_season(year):
    previuos_year = year - 1
    year = str(year)
    previous_year = str(previuos_year)
    year = year[-2:]
    return f'{previous_year}-{year}'

In [8]:
def getSoupFromURL(url):

    try:
        r = requests.get(url)
    except:
        return None

    return BeautifulSoup(r.text, "html.parser")

def relative_table(table, avg_table, skip_names):
    for column_name in table.columns:
        if column_name in skip_names:
            continue
    
        table[f'{column_name}'] = (table[f'{column_name}'] / avg_table[f'{column_name}']) - 1
        
    return table

def get_bbref_teams_dataframe(first_year,
                              last_year,
                              find_champion=True,
                              save=True,
                              relative=True,
                              time_to_sleep=2):

    all_teams = pd.DataFrame()

    for i, year in tqdm(enumerate(range(first_year, last_year + 1)), total=last_year + 1 - first_year):

        url = f'https://www.basketball-reference.com/leagues/NBA_{year}.html'

        advanced_table_id = "advanced-team"
        per_game_table_id = 'per_game-team'

        soup = getSoupFromURL(url)
        time.sleep(time_to_sleep)

        advanced_table = soup.find(lambda tag: tag.name=='table' and tag.has_attr('id') and tag['id']==advanced_table_id)
        per_game_table = soup.find(lambda tag: tag.name=='table' and tag.has_attr('id') and tag['id']==per_game_table_id)
        
        if find_champion:
            champion = soup.find(lambda tag: tag.name=='div' and tag.has_attr('id') and tag['id']=='info')
            champion_name = " ".join(champion.find_all('div')[2].find('p').text.split(" ")[2:])

        advanced_stats = pd.read_html(str(advanced_table), header=1)[0]
        advanced_stats = advanced_stats.drop(['Unnamed: 17', 'Unnamed: 22', 'Unnamed: 27', 'Arena', 'Attend.'], axis=1)
        advanced_league_avg = advanced_stats.iloc[-1]
        advanced_stats = advanced_stats[:-1]

        per_game_stats = pd.read_html(str(per_game_table), header=0)[0]
        per_game_league_avg = per_game_stats.iloc[-1]
        per_game_stats = per_game_stats[:-1]
        
        if relative:
            per_game_stats = relative_table(per_game_stats, per_game_league_avg, skip_names=['Team', 'Rk'])
            advanced_stats = relative_table(advanced_stats, advanced_league_avg, skip_names=['Team', 'Rk', 'W', 'L', 'MOV', 'SOS', 'SRS', 'NRtg'])

        combined_team_stats = pd.merge(per_game_stats, advanced_stats, on=['Team'])
        combined_team_stats['Team'] = combined_team_stats['Team'].str.replace('*', '')

        combined_team_stats['Season'] = convert_int_to_season(year)
        combined_team_stats['Win%'] = combined_team_stats['W'] / (combined_team_stats['W'] + combined_team_stats['L'])
        
        combined_team_stats = combined_team_stats.fillna(0)
        combined_team_stats = combined_team_stats.drop(['Rk_x', 'Rk_y', 'G', 'MP'], axis=1)
        
        if find_champion:
            combined_team_stats['Champion'] = 0
            combined_team_stats.loc[combined_team_stats['Team'] == champion_name, 'Champion'] = 1
            
            if combined_team_stats.Champion.sum() != 1:
                print(f'[ERROR] - {year}')
                break
                
        all_teams = pd.concat([all_teams, combined_team_stats])
        
        time.sleep(5)

    if save:
        all_teams.to_csv('historical_relative_data.csv')
    
    return all_teams

In [9]:
first_year = 1980
last_year = 2024

teams = get_bbref_teams_dataframe(first_year, last_year, save=True)

100%|██████████| 45/45 [05:46<00:00,  7.69s/it]


In [10]:
teams

Unnamed: 0,Team,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,...,ORB%,FT/FGA,eFG%.1,TOV%.1,DRB%,FT/FGA.1,Attend./G,Season,Win%,Champion
0,San Antonio Spurs,0.077982,0.041943,0.035343,-0.250000,-0.107143,-0.100000,0.081585,0.045506,0.034836,...,-0.050746,0.114894,0.041152,-0.070968,0.004511,-0.080851,-0.345574,1979-80,0.500000,0
1,Los Angeles Lakers,0.089450,-0.007726,0.099792,-0.750000,-0.571429,-0.285714,0.102564,0.007964,0.094262,...,-0.026866,-0.063830,-0.022634,-0.096774,0.006015,-0.229787,-0.720000,1979-80,0.731707,1
2,Cleveland Cavaliers,0.066514,0.082781,-0.014553,-0.500000,-0.178571,-0.310714,0.072261,0.089875,-0.014344,...,-0.011940,-0.097872,0.039095,0.051613,-0.009023,-0.080851,-0.220328,1979-80,0.451220,0
3,New York Knicks,0.064220,0.033113,0.031185,-0.375000,-0.178571,-0.214286,0.069930,0.037543,0.030738,...,0.005970,-0.059574,0.024691,0.058065,-0.037594,0.119149,-0.060984,1979-80,0.475610,0
4,Boston Celtics,0.011468,-0.005519,0.018711,1.500000,0.821429,0.371429,-0.018648,-0.034130,0.016393,...,0.038806,0.097872,-0.022634,0.064516,0.019549,-0.004255,8.501639,1979-80,0.743902,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25,Miami Heat,-0.056872,-0.037120,-0.018987,-0.023438,-0.039886,0.010929,-0.064846,-0.035316,-0.033028,...,-0.099174,0.093750,-0.007313,0.049587,0.030343,-0.093750,0.077469,2023-24,0.560976,0
26,Detroit Pistons,-0.030806,-0.007874,-0.023207,-0.140625,-0.096866,-0.049180,0.017065,0.050186,-0.031193,...,-0.012397,0.005208,0.020110,-0.082645,0.019789,0.151042,-0.009329,2023-24,0.170732,0
27,Charlotte Hornets,-0.052133,-0.021372,-0.029536,-0.054688,-0.031339,-0.030055,-0.044369,-0.014870,-0.031193,...,-0.128099,-0.130208,0.045704,0.016529,-0.017150,-0.020833,-0.102619,2023-24,0.256098,0
28,Portland Trail Blazers,-0.066351,0.008999,-0.073840,-0.101562,-0.054131,-0.057377,-0.047782,0.050186,-0.093578,...,0.136364,-0.057292,0.020110,0.057851,-0.023747,0.125000,-0.000164,2023-24,0.256098,0


In [11]:
corr = teams.select_dtypes(include=np.number).corr()
corr.style.background_gradient(cmap='coolwarm').format(precision=2)

Unnamed: 0,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Age,W,L,PW,PL,MOV,SOS,SRS,ORtg,DRtg,NRtg,Pace,FTr,3PAr,TS%,eFG%,TOV%,ORB%,FT/FGA,eFG%.1,TOV%.1,DRB%,FT/FGA.1,Attend./G,Win%,Champion
FG,1.0,0.65,0.64,0.15,0.13,0.15,0.73,0.35,0.6,0.07,0.04,0.1,0.13,0.35,0.37,0.62,0.24,0.1,-0.15,-0.04,0.87,0.1,0.4,-0.4,0.42,-0.42,0.43,-0.17,0.43,0.64,0.03,0.43,0.62,-0.19,0.04,0.5,0.59,-0.38,0.1,-0.15,0.02,-0.05,-0.07,-0.15,0.07,0.41,0.18
FGA,0.65,1.0,-0.17,0.13,0.19,-0.07,0.46,0.54,-0.14,-0.11,-0.1,-0.04,0.54,0.11,0.47,0.21,0.23,-0.02,-0.12,0.1,0.51,-0.21,-0.11,0.11,-0.11,0.12,-0.11,0.05,-0.11,0.09,0.27,-0.11,0.71,-0.43,0.05,-0.23,-0.14,-0.45,0.23,-0.42,0.31,0.02,-0.21,0.0,-0.06,-0.12,-0.02
FG%,0.64,-0.17,1.0,0.07,-0.02,0.27,0.48,-0.1,0.92,0.2,0.15,0.17,-0.37,0.34,0.01,0.59,0.08,0.16,-0.08,-0.15,0.61,0.35,0.64,-0.64,0.66,-0.66,0.67,-0.27,0.67,0.75,-0.25,0.67,0.08,0.2,0.0,0.89,0.91,-0.03,-0.1,0.24,-0.29,-0.09,0.13,-0.19,0.15,0.66,0.25
3P,0.15,0.13,0.07,1.0,0.96,0.69,-0.36,-0.49,0.23,-0.04,-0.07,0.07,-0.15,0.11,-0.01,0.09,-0.03,-0.12,-0.16,-0.09,0.32,0.16,0.2,-0.2,0.21,-0.21,0.21,-0.1,0.21,0.32,0.01,0.21,0.12,-0.11,0.96,0.32,0.36,-0.19,-0.2,-0.08,0.0,-0.08,-0.0,-0.13,0.1,0.21,0.04
3PA,0.13,0.19,-0.02,0.96,1.0,0.52,-0.4,-0.52,0.2,-0.05,-0.06,0.03,-0.09,0.09,0.01,0.04,0.01,-0.13,-0.13,-0.08,0.31,0.12,0.14,-0.14,0.15,-0.15,0.15,-0.06,0.15,0.26,0.05,0.15,0.18,-0.12,0.99,0.24,0.3,-0.18,-0.19,-0.1,0.05,-0.07,-0.03,-0.13,0.05,0.15,0.03
3P%,0.15,-0.07,0.27,0.69,0.52,1.0,-0.12,-0.25,0.22,0.01,-0.06,0.18,-0.24,0.12,-0.08,0.18,-0.1,-0.06,-0.2,-0.08,0.23,0.21,0.27,-0.28,0.29,-0.29,0.3,-0.15,0.29,0.35,-0.09,0.3,-0.05,-0.03,0.54,0.38,0.4,-0.15,-0.17,0.03,-0.13,-0.09,0.08,-0.07,0.14,0.28,0.05
2P,0.73,0.46,0.48,-0.36,-0.4,-0.12,1.0,0.81,0.28,0.09,0.08,0.04,0.25,0.16,0.29,0.44,0.21,0.14,-0.04,0.03,0.45,-0.06,0.14,-0.14,0.14,-0.14,0.15,-0.05,0.15,0.27,0.06,0.15,0.4,-0.08,-0.48,0.13,0.16,-0.21,0.26,-0.06,0.07,0.02,-0.06,-0.02,0.02,0.15,0.1
2PA,0.35,0.54,-0.1,-0.49,-0.52,-0.25,0.81,1.0,-0.33,-0.04,-0.03,-0.02,0.48,-0.06,0.28,0.09,0.14,0.06,-0.02,0.12,0.05,-0.28,-0.24,0.24,-0.26,0.26,-0.25,0.11,-0.25,-0.19,0.2,-0.26,0.3,-0.21,-0.6,-0.42,-0.42,-0.2,0.35,-0.21,0.23,0.07,-0.12,0.12,-0.06,-0.25,-0.06
2P%,0.6,-0.14,0.92,0.23,0.2,0.22,0.28,-0.33,1.0,0.21,0.18,0.1,-0.38,0.35,0.02,0.55,0.11,0.13,-0.04,-0.15,0.66,0.36,0.63,-0.63,0.65,-0.65,0.66,-0.25,0.66,0.75,-0.23,0.66,0.15,0.22,0.22,0.9,0.94,-0.01,-0.15,0.23,-0.26,-0.09,0.1,-0.22,0.14,0.64,0.26
FT,0.07,-0.11,0.2,-0.04,-0.05,0.01,0.09,-0.04,0.21,1.0,0.92,0.32,0.06,0.17,0.17,0.0,0.12,0.11,0.12,0.14,0.46,0.05,0.24,-0.23,0.25,-0.25,0.25,-0.11,0.25,0.39,0.02,0.25,0.27,0.87,-0.03,0.39,0.16,0.04,0.16,0.95,-0.01,-0.01,-0.01,0.02,0.06,0.24,-0.04


In [15]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(teams.loc[:, teams.columns != 'Champion'], teams['Champion'])

In [16]:
clf = make_pipeline(StandardScaler(), SVC(gamma='auto', probability=True))
clf.fit(teams.select_dtypes(include=np.number), teams.Champion)
accuracy = clf.score(X_test.select_dtypes(include=np.number), y_test)
print(f'Acc: {accuracy}')

ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- Champion


In [17]:
current_year = get_bbref_teams_dataframe(first_year=2025,
                                         last_year=2025,
                                         find_champion=False,
                                         save=False)

current_year_with_metrics = pd.DataFrame(current_year)

100%|██████████| 1/1 [00:07<00:00,  7.86s/it]


In [18]:
clf = make_pipeline(SVC(gamma='auto', probability=True))

X = teams.loc[:, teams.columns != 'Champion'].select_dtypes(include=np.number)
label = teams['Champion']
clf.fit(X, label)

probs = clf.predict_proba(current_year.select_dtypes(include=np.number))

# print(classification_report(y_true=label, y_pred=probs[:,1]))

current_year_with_metrics['SVM_Champion%'] = 100 * probs[:,1]

In [19]:
import xgboost as xgb

dtrain = xgb.DMatrix(X, label=label)
param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
param['nthread'] = 4
param['eval_metric'] = 'auc'

evallist = [(dtrain, 'train')]

num_round = 100
bst = xgb.train(param, dtrain, num_round, evallist)

[0]	train-auc:0.94097
[1]	train-auc:0.95951
[2]	train-auc:0.97451
[3]	train-auc:0.98202
[4]	train-auc:0.98787
[5]	train-auc:0.99376
[6]	train-auc:0.99515
[7]	train-auc:0.99729
[8]	train-auc:0.99855
[9]	train-auc:0.99906
[10]	train-auc:0.99949
[11]	train-auc:0.99987
[12]	train-auc:0.99998
[13]	train-auc:0.99998
[14]	train-auc:1.00000
[15]	train-auc:1.00000
[16]	train-auc:1.00000
[17]	train-auc:1.00000
[18]	train-auc:1.00000
[19]	train-auc:1.00000
[20]	train-auc:1.00000
[21]	train-auc:1.00000
[22]	train-auc:1.00000
[23]	train-auc:1.00000
[24]	train-auc:1.00000
[25]	train-auc:1.00000
[26]	train-auc:1.00000
[27]	train-auc:1.00000
[28]	train-auc:1.00000
[29]	train-auc:1.00000
[30]	train-auc:1.00000
[31]	train-auc:1.00000
[32]	train-auc:1.00000
[33]	train-auc:1.00000
[34]	train-auc:1.00000
[35]	train-auc:1.00000
[36]	train-auc:1.00000
[37]	train-auc:1.00000
[38]	train-auc:1.00000
[39]	train-auc:1.00000
[40]	train-auc:1.00000
[41]	train-auc:1.00000
[42]	train-auc:1.00000
[43]	train-auc:1.0000

In [20]:
dtest = xgb.DMatrix(current_year.select_dtypes(include=np.number))
ypred = bst.predict(dtest)

In [21]:
current_year_with_metrics['XGBoost_Champion%'] = 100 * ypred

In [22]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

clf = LinearDiscriminantAnalysis()
clf.fit(X, label)

lda_probs = clf.predict_proba(current_year.select_dtypes(include=np.number))
current_year_with_metrics['LDA_Champion%'] = 100 * lda_probs[:,1]

In [23]:
current_year_with_metrics.sort_values(by=['LDA_Champion%'], ascending=False)

Unnamed: 0,Team,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,...,eFG%.1,TOV%.1,DRB%,FT/FGA.1,Attend./G,Season,Win%,SVM_Champion%,XGBoost_Champion%,LDA_Champion%
3,Oklahoma City Thunder,0.067146,0.038117,0.029979,0.074074,0.032,0.044444,0.067616,0.042553,0.020183,...,-0.051565,0.181102,-0.002674,0.115789,-0.008113,2024-25,0.822785,73.948009,1.103692,27.031195
0,Cleveland Cavaliers,0.069544,0.015695,0.055675,0.177778,0.104,0.066667,0.021352,-0.05029,0.073394,...,-0.025783,0.007874,-0.002674,-0.042105,0.072406,2024-25,0.797468,31.577536,0.494801,26.389707
7,Boston Celtics,0.0,0.01009,-0.008565,0.325926,0.288,0.027778,-0.153025,-0.193424,0.049541,...,-0.034991,-0.086614,0.018717,-0.178947,0.057174,2024-25,0.746835,2.974627,69.340691,22.307777
18,Los Angeles Lakers,-0.019185,-0.042601,0.025696,-0.014815,-0.026667,0.013889,-0.017794,-0.052224,0.033028,...,-0.007366,-0.047244,0.0,-0.057895,0.032892,2024-25,0.607595,2.736891,0.004349,0.600456
2,Denver Nuggets,0.091127,0.006726,0.083512,-0.103704,-0.144,0.05,0.188612,0.117988,0.06055,...,-0.001842,-0.110236,-0.002674,-0.078947,0.091611,2024-25,0.594937,2.783013,0.003334,0.55127
8,New York Knicks,0.043165,0.001121,0.044968,-0.074074,-0.096,0.025,0.103203,0.071567,0.027523,...,0.007366,0.023622,-0.004011,-0.084211,0.092715,2024-25,0.632911,2.695908,0.000578,0.475712
14,San Antonio Spurs,0.002398,0.008969,-0.004283,0.044444,0.058667,-0.011111,-0.014235,-0.027079,0.011009,...,0.016575,-0.055118,-0.024064,-0.168421,-0.00894,2024-25,0.405063,2.810164,0.000115,0.455905
12,Dallas Mavericks,0.007194,-0.014574,0.023555,-0.074074,-0.085333,0.013889,0.049822,0.03675,0.009174,...,-0.005525,-0.07874,-0.020053,-0.047368,0.07787,2024-25,0.481013,2.798889,0.000358,0.416717
6,Indiana Pacers,0.045564,-0.003363,0.049251,-0.022222,-0.048,0.027778,0.081851,0.029014,0.047706,...,0.005525,0.023622,-0.004011,0.010526,-0.077097,2024-25,0.607595,2.715239,0.000258,0.414123
11,Milwaukee Bucks,0.002398,-0.033632,0.038544,0.037037,-0.029333,0.066667,-0.010676,-0.03675,0.025688,...,-0.020258,-0.102362,0.020053,-0.068421,-0.038079,2024-25,0.56962,2.780701,0.000232,0.360505


In [24]:
def sklearn_fit_predict_probs(clf, X, label, test):
    clf.fit(X, label)
    probs = clf.predict_proba(test.select_dtypes(include=np.number))
    return probs[:,1]

In [25]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(
    penalty='l2',
    solver='newton-cholesky',
    C=0.1
)

current_year_with_metrics['LR_Champion%'] = 100 * sklearn_fit_predict_probs(lr, X, label, current_year)


In [28]:
champion_columns = current_year_with_metrics.filter(like='_Champion%')
average_champion = current_year_with_metrics.filter(like='_Champion%').mean(axis=1)
current_year_with_metrics['AVG_Champion%'] = average_champion
columns_to_display = ['Team']
columns_to_display.extend(list(champion_columns.columns))

current_year_with_metrics.sort_values(by=['AVG_Champion%'], ascending=False)[columns_to_display]

Unnamed: 0,Team,SVM_Champion%,XGBoost_Champion%,LDA_Champion%,LR_Champion%,AVG_Champion%
3,Oklahoma City Thunder,73.948009,1.103692,27.031195,71.340033,43.355732
7,Boston Celtics,2.974627,69.340691,22.307777,29.489392,31.028122
0,Cleveland Cavaliers,31.577536,0.494801,26.389707,53.773139,28.058796
13,Houston Rockets,2.58688,0.008484,0.131137,5.235132,1.990408
8,New York Knicks,2.695908,0.000578,0.475712,2.945855,1.529513
2,Denver Nuggets,2.783013,0.003334,0.55127,1.340768,1.169596
18,Los Angeles Lakers,2.736891,0.004349,0.600456,1.285345,1.15676
6,Indiana Pacers,2.715239,0.000258,0.414123,1.488419,1.15451
19,Los Angeles Clippers,2.816171,0.104569,0.231438,1.37125,1.130857
1,Memphis Grizzlies,2.821283,6.9e-05,0.280122,1.368765,1.11756


['SVM_Champion%',
 'XGBoost_Champion%',
 'LDA_Champion%',
 'LR_Champion%',
 'AVG_Champion%']