## Grading NBA Finals MVP with Machine Learning
#### *by Noah Ford*


#### Importing Packages and Helper Functions

I've moved most of the imports and helper functions into an accompanying notebook, [helper_funcs.ipynb](helper_funcs.ipynb).

In [53]:
# Run the companion notebook; it supplies the imports and helper functions
# used by later cells (presumably including save, save_tag and name_csv,
# which are not defined in the cells shown here - verify).
%run helper_funcs.ipynb
print('success!')

success!


#### Getting Set Up

Here we'll set up the paths to the data we will access throughout.

In [14]:
# Folder handed to save()/save_tag() whenever a page is fetched.
DIR = 'series'
# In the cells shown here, only the commented-out CSV-moving cell uses this.
DIR2 = 'html_tables'
# Folder holding the per-series CSV files written by save_table().
DIR3 = 'csvs'

The jumping-off point for this notebook is getting access to the playoff history.  We will use [basketball-reference.com](https://www.basketball-reference.com) for all of our data. \
The first link we access takes us to a page with information for every playoff series: winner, loser, Finals MVP, hyperlinks to more stats, etc.

In [24]:
# Site root; prepended to relative hrefs when absolute series URLs are built.
BASE = "https://www.basketball-reference.com"
# Index page listing every playoff series.
url = "https://www.basketball-reference.com/playoffs/series.html"
# save takes a path and a folder, and fetches the html we're looking for
text = save(url,DIR)
bs = BeautifulSoup(text, 'html.parser')
# Element with id 'div_playoffs_series'; its <a> tags are scanned for series links later on.
table = bs.find(id = 'div_playoffs_series')

found!


In [36]:
def get_html_table():
    """Return the 'div_playoffs_series' element of the playoff-series index page.

    The page is obtained through save(), using DIR as its folder argument.
    """
    page = save("https://www.basketball-reference.com/playoffs/series.html", DIR)
    return BeautifulSoup(page, 'html.parser').find(id='div_playoffs_series')

In [43]:
def read_df(url,header_col=False):
    """Return the first HTML table found at ``url`` as a DataFrame.

    The page is obtained once through save() (with DIR as its folder
    argument) and that text is what gets parsed, so pandas does not load
    the source a second time.

    Parameters
    ----------
    url : str
        Location passed to save().
    header_col : bool, default False
        When True, replace the columns with their level-1 values; this
        requires the parsed table to have a two-level header.

    Returns
    -------
    pandas.DataFrame
    """
    from io import StringIO

    # assumes save() returns the page's HTML as text, as the other cells
    # that feed its result to BeautifulSoup do - TODO confirm
    text = save(url,DIR)
    # Wrapped in StringIO: passing literal HTML strings to read_html is deprecated.
    df = pd.read_html(StringIO(text))[0]
    if header_col:
        df.columns = df.columns.get_level_values(1)
    return df

In [49]:
def finals_df():
    """Return the playoff-series table reduced to NBA series after 1968
    whose 'Series' label contains none of 'Conf', 'Semi' or 'Div'.

    Rows with a missing 'Series' or 'Lg' value are discarded. The result
    has a fresh 0..n-1 index.
    """
    url = "https://www.basketball-reference.com/playoffs/series.html"
    df = read_df(url, True)

    # Positional drop of five columns (4, 7, 10 and the last two).
    unwanted = df.columns[[4, 7, 10, -2, -1]]
    df.drop(unwanted, axis=1, inplace=True)

    # Non-numeric year cells become 0 and are removed by the year filter below.
    df['Yr'] = pd.to_numeric(df['Yr'], errors='coerce').fillna(0).astype(int)

    series = df['Series']
    is_nba = df['Lg'].str.contains('NBA', na=False)
    is_modern = df['Yr'] > 1968
    # na=True on the first test makes a missing label count as a match,
    # so those rows are excluded together with the earlier rounds.
    earlier_round = (
        series.str.contains('Conf', na=True)
        | series.str.contains('Semi', na=False)
        | series.str.contains('Div', na=False)
    )
    return df[is_nba & is_modern & ~earlier_round].reset_index(drop=True)

In [68]:
# Preview the first five rows of the filtered series table.
finals_df().head(5)

found!


Unnamed: 0,Yr,Lg,Series,Unnamed: 3_level_1,Team,W,Team.1,W.1
0,2024,NBA,Finals,"Jun 6 - Jun 17, 2024",Boston Celtics (1),4,Dallas Mavericks (5),1
1,2023,NBA,Finals,"Jun 1 - Jun 12, 2023",Denver Nuggets (1),4,Miami Heat (8),1
2,2022,NBA,Finals,"Jun 2 - Jun 16, 2022",Golden State Warriors (3),4,Boston Celtics (2),2
3,2021,NBA,Finals,"Jul 6 - Jul 20, 2021",Milwaukee Bucks (3),4,Phoenix Suns (2),2
4,2020,NBA,Finals,"Sep 30 - Oct 11, 2020",Los Angeles Lakers (1),4,Miami Heat (5),2


In [69]:
# Source page for the Finals MVP table (not referenced again in this cell).
url = 'https://www.basketball-reference.com/awards/finals_mvp.html'
# NOTE(review): the call parses the local copy 'series/finals_mvp.html'
# rather than `url`; the garbled name in row 1 of the output presumably
# comes from how that file is decoded - TODO confirm.
df = read_df('series/finals_mvp.html',True)
df.head(5)

found!


Unnamed: 0,Season,Lg,Player,Age,Tm,G,MP,PTS,TRB,AST,STL,BLK,FG%,3P%,FT%
0,2023-24,NBA,Jaylen Brown,27,BOS,5,38.6,20.8,5.4,5.0,1.6,0.8,0.44,0.235,0.733
1,2022-23,NBA,Nikola JokiÄ,27,DEN,5,41.2,30.2,14.0,7.2,0.8,1.4,0.583,0.421,0.838
2,2021-22,NBA,Stephen Curry,33,GSW,6,37.5,31.2,6.0,5.0,2.0,0.2,0.482,0.437,0.857
3,2020-21,NBA,Giannis Antetokounmpo,26,MIL,6,39.8,35.2,13.2,5.0,1.2,1.8,0.618,0.2,0.659
4,2019-20,NBA,LeBron James,35,LAL,6,39.3,29.8,11.8,8.5,1.2,0.5,0.591,0.417,0.667


*If you have an eagle eye, you'll notice something a hair off in row 1 there - we'll get to it in a bit.*

In [59]:
def add_mvp():
    """Load 'csvs/finals_mvp.csv', using its first column as the index."""
    return pd.read_csv('csvs/finals_mvp.csv', index_col=0)

In [71]:
def complete_table():
    """Return the series table, the MVP table and a 'url' column side by side.

    The 'url' column holds the absolute link of every post-1968 matchup
    link in the series index whose address contains 'nba-finals'. The three
    pieces are joined column-wise on their indexes, so they are expected
    to line up row for row.
    """
    table = get_html_table()
    df = finals_df()

    anchors = table.find_all('a')
    # Matchup pages are the links with "vs" in their address.
    matchup_hrefs = [a['href'] for a in anchors if "vs" in a['href']]
    # The third path segment starts with the season year, e.g. "2024-...".
    modern_hrefs = [
        h for h in matchup_hrefs
        if int(h.split("/")[2].split("-")[0]) > 1968
    ]
    finals_urls = [BASE + h for h in modern_hrefs if 'nba-finals' in BASE + h]
    url_frame = pd.DataFrame(finals_urls, columns=["url"])

    with_mvp = pd.concat([df, add_mvp()], axis=1)
    return pd.concat([with_mvp, url_frame], axis=1)

In [72]:
# Build the combined table once and keep it as a global for later cells.
FULL_DF = complete_table()

found!
found!


In [76]:
def winner_abbrev(url):
    """Return the team code taken from the first team link on a series page.

    The page is obtained through save(); the first anchor whose href
    contains both 'teams' and '.html' is used, and the third '/'-separated
    piece of that href is returned (e.g. 'BOS'). Presumably that first link
    belongs to the series winner, as the function name suggests - verify.

    Raises
    ------
    IndexError
        If the page contains no such link.
    """
    # Explicit parser, matching the other BeautifulSoup calls in this
    # notebook, so the result does not depend on which parsers are installed.
    bs = BeautifulSoup(save(url,DIR), 'html.parser')
    # href=True skips anchors without an href instead of raising KeyError.
    team_links = [
        a['href'] for a in bs.find_all('a', href=True)
        if 'teams' in a['href'] and '.html' in a['href']
    ]
    return team_links[0].split('/')[2]

In [78]:
# Spot check on the first series URL in FULL_DF.
winner_abbrev(FULL_DF['url'][0])

found!


'BOS'

We enter some dicey territory here because, for some reason, Basketball Reference keeps its playoff series stats only inside an HTML comment.  That means we have to parse the comment into usable HTML before we can proceed.

In [82]:
def save_table(url):
    """Return the stats table for a series page, caching it as a CSV.

    If the CSV for ``url`` (named by name_csv, inside DIR3) does not exist
    yet, the page section is fetched with save_tag, the table hidden inside
    an HTML comment is parsed, and the result is written to that CSV. The
    returned frame is always the one read back from the CSV, so a fresh
    download and a cache hit give identically named columns (read_csv
    renames duplicated headers such as 'MP' to 'MP.1').

    Parameters
    ----------
    url : str
        Address of the series page.

    Returns
    -------
    pandas.DataFrame
        The cached table without the saved index column.
    """
    from io import StringIO

    p = os.path.join(DIR3, name_csv(url))
    if not os.path.exists(p):
        # assumes save_tag returns the section's HTML on the first call,
        # the way save() does - TODO confirm
        text = save_tag(url,DIR,f'all_{winner_abbrev(url)}')
        bs = BeautifulSoup(text, 'html.parser')
        # The table markup lives inside a comment node; take the first
        # comment longer than 10000 characters.
        comments = bs.find_all(string=lambda node: isinstance(node, Comment))
        table_c = [c for c in comments if len(c) > 10000][0]
        table = BeautifulSoup(table_c, 'html.parser').find('table')
        # StringIO: passing literal HTML strings to read_html is deprecated.
        fresh = pd.read_html(StringIO(str(table)))[0]
        fresh.columns = fresh.columns.get_level_values(1)
        fresh.drop(fresh.columns[[0]], axis=1, inplace=True)
        # to_csv writes the file itself and returns None when given a path,
        # so a single call is all that is needed.
        fresh.to_csv(p)

    df = pd.read_csv(p)
    # The first CSV column is the index that to_csv wrote out.
    df.drop(df.columns[[0]], axis=1, inplace=True)
    return df

In [None]:
# Cache the stats table of every series listed in FULL_DF.
for link in tqdm(FULL_DF['url']):
    # save_table requires the series URL as its argument.
    save_table(link)

In [None]:
def get_table(url):
    """Read the cached CSV for ``url`` from DIR3 and return it without its first column."""
    csv_path = os.path.join(DIR3, name_csv(url))
    raw = pd.read_csv(csv_path)
    # The first column is dropped by label, exactly one column in a normal file.
    return raw.drop(raw.columns[[0]], axis=1)

In [None]:
# Make sure every series in df['url'] has a cached CSV.
for url in tqdm(df['url']):
    try:
        save_table(url)
    except Exception:
        # Retry once: if the first attempt got as far as writing the CSV,
        # the second call takes save_table's cached-file branch.
        # Exception (not a bare except) lets Ctrl-C stop the loop.
        save_table(url)
100%|███████████████████████████████████████████| 56/56 [00:01<00:00, 53.39it/s]


In [None]:
# Load and display the cached table for the series at label 2 of df['url'].
url = df['url'][2]
get_table(url)

Unnamed: 0,Player,Age,G,MP,FG,FGA,3P,3PA,FT,FTA,...,PTS,FG%,3P%,FT%,MP.1,PTS.1,TRB.1,AST.1,STL.1,BLK.1
0,Stephen Curry,33.0,6,225,66,137,31,71,24,28,...,187,0.482,0.437,0.857,37.5,31.2,6.0,5.0,2.0,0.2
1,Andrew Wiggins,26.0,6,235,45,101,11,37,9,13,...,110,0.446,0.297,0.692,39.2,18.3,8.8,2.2,1.5,1.5
2,Klay Thompson,31.0,6,230,36,101,20,57,10,10,...,102,0.356,0.351,1.0,38.3,17.0,3.0,2.0,1.3,0.5
3,Jordan Poole,22.0,6,125,27,62,15,39,10,11,...,79,0.435,0.385,0.909,20.8,13.2,1.8,1.8,0.5,0.2
4,Draymond Green,31.0,6,217,14,42,2,16,7,12,...,37,0.333,0.125,0.583,36.2,6.2,8.0,6.2,1.7,0.7
5,Gary Payton II,29.0,5,93,13,22,2,7,7,10,...,35,0.591,0.286,0.7,18.6,7.0,3.2,1.4,1.6,0.4
6,Otto Porter Jr.,28.0,6,102,10,17,9,16,2,2,...,31,0.588,0.563,1.0,17.0,5.2,2.0,1.0,1.0,0.2
7,Kevon Looney,25.0,6,130,14,22,0,0,2,2,...,30,0.636,,1.0,21.7,5.0,7.5,2.7,0.7,0.8
8,Nemanja Bjelica,33.0,5,29,4,8,1,2,0,0,...,9,0.5,0.5,,5.8,1.8,1.6,0.2,0.4,0.0
9,Andre Iguodala,38.0,4,19,3,4,1,1,0,0,...,7,0.75,1.0,,4.8,1.8,0.0,1.3,0.0,0.0


In [None]:
def clean_table(url):
    """Return the cached series table trimmed, type-coerced and flagged with the MVP.

    Steps:
    * drop nine columns by position (5, 7, 9 and the last six); with the
      layout shown for get_table's output these are FGA, 3PA, FTA and the
      six '.1' columns,
    * drop the last row (presumably a totals row - verify),
    * coerce the listed stat columns to numbers, turning unparsable or
      missing cells into 0,
    * add a boolean 'mvp' column that is True where 'Player' equals the
      name returned by mvp_from_url(url).

    Parameters
    ----------
    url : str
        Address of the series page.

    Returns
    -------
    pandas.DataFrame
    """
    t = get_table(url)
    mvp = mvp_from_url(url)
    t.drop(t.columns[[5,7,9,-6,-5,-4,-3,-2,-1]], axis=1, inplace=True)
    t = t.drop(t.index[-1])

    # Column -> target type; one shared coercion recipe for all of them.
    coercions = (
        ('3P', int), ('FG%', float), ('3P%', float), ('FT%', float),
        ('STL', int), ('BLK', int), ('ORB', int), ('DRB', int),
        ('TOV', int), ('Age', int),
    )
    for col, kind in coercions:
        t[col] = pd.to_numeric(t[col], errors='coerce').fillna(0).astype(kind)

    # Exact string match, so a name must be spelled identically in both sources.
    t['mvp'] = t['Player'] == mvp
    return t

In [None]:
# Column labels at positions 2-12 and 15-18 of t (t is assigned outside the cells shown here).
t.columns[2:13].append(t.columns[15:19])

Index(['G', 'MP', 'FG', '3P', 'FT', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK',
       'PTS', 'FG%', '3P%', 'FT%'],
      dtype='object')

In [None]:
# Column labels at positions 13 and 14 of t.
t.columns[13:15]

Index(['TOV', 'PF'], dtype='object')

In [None]:
def rank_table(t):
    """Replace the columns at positions 2-18 of ``t`` with within-table ranks.

    Each of those columns is swapped for a '<name>!' column holding its
    integer rank (ties share the lowest rank). The columns at offsets 11
    and 12 of that slice rank the smallest value first; all others rank
    the largest value first. ``t`` is modified in place and returned.
    """
    stat_cols = list(t.columns[2:19])
    for offset, name in enumerate(stat_cols):
        smallest_first = offset in (11, 12)
        ranks = t[name].rank(ascending=smallest_first, method='min')
        t[name + '!'] = ranks.astype(int)
        del t[name]
    return t

In [None]:
def top_table(url,top):
    """Return the ranked table for the first ``top`` rows of a series.

    The cleaned table is cut to its first ``top`` rows, passed through
    rank_table, and given a leading 'Year' column taken from the URL.
    """
    season = year_from_url(url)
    cleaned = clean_table(url)
    leaders = cleaned.drop(cleaned.index[top:])
    ranked = rank_table(leaders)
    ranked.insert(0, 'Year', season)
    return ranked

In [None]:
# Cleaned table for the series at label 55 of DF['url'] (DF is assigned outside the cells shown here).
clean_table(DF['url'][55])

Unnamed: 0,Player,Age,G,MP,FG,3P,FT,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,FG%,3P%,FT%,mvp
0,John Havlicek,28,7,336,74,0,50,0,0,77,31,0,0,0,25,198,0.457,0.0,0.847,False
1,Sam Jones,35,7,211,56,0,19,0,0,25,16,0,0,0,23,131,0.471,0.0,0.826,False
2,Larry Siegfried,29,7,181,36,0,26,0,0,18,20,0,0,0,30,98,0.391,0.0,0.897,False
3,Don Nelson,28,7,141,32,0,19,0,0,41,8,0,0,0,24,83,0.421,0.0,0.792,False
4,Em Bryant,30,7,233,31,0,15,0,0,35,19,0,0,0,27,77,0.403,0.0,0.882,False
5,Bailey Howell,32,7,193,31,0,12,0,0,37,4,0,0,0,34,74,0.333,0.0,0.6,False
6,Bill Russell,34,7,336,25,0,14,0,0,148,36,0,0,0,29,64,0.397,0.0,0.583,False
7,Tom Sanders,30,5,39,6,0,2,0,0,6,1,0,0,0,13,14,0.462,0.0,1.0,False
8,Don Chaney,22,2,10,0,0,2,0,0,1,0,0,0,0,4,2,0.0,0.0,0.667,False


In [None]:
# full top 8
df = top_table(GLOBAL_DF['url'][0],8)
for i in tqdm(range(len(GLOBAL_DF['url'])-1)):
    df = pd.concat([df,top_table(GLOBAL_DF['url'][i+1],8)],axis=0)

100%|██████████████████████████████████████████| 55/55 [00:00<00:00, 110.39it/s]


In [None]:
# df.reset_index(drop=True).to_csv('top8_full.csv')
# Renumber the stacked rows 0..n-1; later cells join on this index.
complete = df.reset_index(drop=True)
complete

Unnamed: 0,Year,Player,Age,mvp,G!,MP!,FG!,3P!,FT!,ORB!,...,TRB!,AST!,STL!,BLK!,TOV!,PF!,PTS!,FG%!,3P%!,FT%!
0,2024,Jayson Tatum,25,False,1,1,1,3,1,5,...,1,1,4,4,8,5,1,7,5,4
1,2024,Jaylen Brown,27,True,1,2,2,4,2,5,...,4,2,1,3,7,8,2,5,6,6
2,2024,Jrue Holiday,33,False,1,3,3,4,5,1,...,2,3,5,4,6,7,3,2,3,1
3,2024,Derrick White,29,False,1,4,4,1,3,2,...,5,4,2,1,5,5,4,6,4,1
4,2024,Sam Hauser,26,False,1,6,5,2,6,3,...,6,7,6,7,1,3,5,4,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
443,1969,Don Nelson,28,False,1,7,4,1,3,1,...,3,6,1,1,1,3,4,4,1,6
444,1969,Em Bryant,30,False,1,3,5,1,5,1,...,5,4,1,1,1,5,5,5,1,3
445,1969,Bailey Howell,32,False,1,5,5,1,7,1,...,4,7,1,1,1,8,6,8,1,7
446,1969,Bill Russell,34,False,1,1,7,1,6,1,...,1,1,1,1,1,6,7,6,1,8


In [None]:
# Features are every rank column; identifiers and the label are removed.
X = complete.drop(['Year','Player','mvp'],axis=1)
y = complete['mvp']
# Stratified 70/30 split so both parts keep the same MVP share.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=2022)
# Oversample the minority class of the training part only; seeded like the
# split so a re-run produces the same resampled set.
smote = SMOTE(sampling_strategy='minority', random_state=2022)
X_train_SMOTE, y_train_SMOTE = smote.fit_resample(X_train,y_train)

In [None]:
# Class balance of the held-out labels.
pd.DataFrame(y_test).value_counts()

mvp  
False    118
True      17
dtype: int64

In [None]:
# Class balance of the training labels after SMOTE resampling.
y_train_SMOTE.value_counts()

False    275
True     275
Name: mvp, dtype: int64

In [None]:
logistic_classifier = LogisticRegression(max_iter=200)
# The model is trained on the un-resampled training split. To train on the
# SMOTE set instead, swap in the commented line and also change the
# training-probability line below to X_train_SMOTE.
logistic_classifier.fit(X_train, y_train)
# logistic_classifier.fit(X_train_SMOTE, y_train_SMOTE)
y_pred = logistic_classifier.predict(X_test)
# Probability of the True class, scored on the same data the model was fit on.
y_train_pred_proba = logistic_classifier.predict_proba(X_train)[:, 1]
y_test_pred_proba = logistic_classifier.predict_proba(X_test)[:, 1]
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[116   2]
 [  7  10]]
              precision    recall  f1-score   support

       False       0.94      0.98      0.96       118
        True       0.83      0.59      0.69        17

    accuracy                           0.93       135
   macro avg       0.89      0.79      0.83       135
weighted avg       0.93      0.93      0.93       135



In [None]:
# Truth, predicted label and rounded True-class probability for each test row,
# keyed by the test rows' original index.
test_and_pred_and_percent = pd.DataFrame({'mvp': y_test,'pred':y_pred,'prob':y_test_pred_proba.round(3)})
y_rounded = test_and_pred_and_percent.sort_index()
# The inner join keeps only rows of `complete` that landed in the test split,
# most probable first.
output = complete.drop(columns=['mvp']).join(y_rounded, how="inner").sort_values(by=['prob'],ascending=False)
# Test-split rows from the 1978 series.
winners = output[(output['Year']==1978)]
winners

Unnamed: 0,Year,Player,Age,G!,MP!,FG!,3P!,FT!,ORB!,DRB!,...,BLK!,TOV!,PF!,PTS!,FG%!,3P%!,FT%!,mvp,pred,prob
375,1978,Larry Wright,23,1,8,8,1,8,8,8,...,7,1,1,8,8,1,1,False,False,0.0
374,1978,Wes Unseld,31,1,3,7,1,6,2,1,...,6,2,6,7,2,1,8,True,False,0.0
372,1978,Charles Johnson,28,1,6,4,1,7,6,7,...,7,3,1,5,6,1,3,False,False,0.0


**Stack dataframes to make one giant one**

In [None]:
def test_table(url):
    """Return clean_table(url) without its first column."""
    cleaned = clean_table(url)
    return cleaned.drop(cleaned.columns[[0]], axis=1)

In [None]:
def y_table(url):
    """Return the 'mvp' flags of clean_table(url) as an array of 0/1 integers."""
    flags = clean_table(url)['mvp']
    return flags.map(int).values

In [None]:
# Table for the series at label 3 of df['url']; the trailing semicolon
# suppresses the notebook's display of the value.
test = test_table(df['url'][3])
test;

In [None]:
# Table for the series at label 0 of df['url'].
celtics = test_table(df['url'][0])
celtics

Unnamed: 0,Age,G,TOV,PF,PTS_Total,FG%,3P%,FT%,MP,PTS,TRB,AST,STL,BLK,mvp
0,25,5,16,11,111,0.388,0.263,0.926,40.2,22.2,7.8,7.2,1.0,0.6,False
1,27,5,14,16,104,0.44,0.235,0.733,38.6,20.8,5.4,5.0,1.6,0.8,True
2,33,5,7,14,72,0.536,0.421,1.0,37.8,14.4,7.4,3.8,0.6,0.6,False
3,29,5,6,11,69,0.389,0.395,1.0,36.8,13.8,4.8,2.6,1.2,1.0,False
4,26,5,0,7,41,0.519,0.478,1.0,15.6,8.2,3.0,0.6,0.2,0.0,False
5,28,3,1,5,37,0.583,0.222,0.875,20.0,12.3,3.7,0.3,0.0,1.7,False
6,37,5,3,9,35,0.52,0.471,0.5,30.0,7.0,6.2,2.6,1.2,0.6,False
7,26,5,3,6,17,0.25,0.188,0.0,12.4,3.4,1.4,1.6,0.2,0.0,False
8,25,3,0,0,7,0.5,1.0,1.0,6.3,2.3,1.0,0.0,0.0,0.3,False
9,25,2,1,3,6,0.667,1.0,1.0,9.5,3.0,2.5,0.5,0.0,1.0,False


In [None]:
# Boolean MVP flags of that table, shown as a plain array.
y = celtics['mvp']
y.values

array([False,  True, False, False, False, False, False, False, False,
       False, False, False, False, False, False])

In [None]:
# Train on one series and predict on another.
train_url = df['url'][0]
test_url = df['url'][3]
# test_table still carries the 'mvp' label column; it is removed from both
# feature matrices so the classifier is not handed the answer as an input.
X = test_table(train_url).drop(columns='mvp').values
test = test_table(test_url).drop(columns='mvp').values
y = y_table(train_url)

In [None]:
# Depth-limited decision tree fit on the single training series.
clf = tree.DecisionTreeClassifier(max_depth=3)
clf.fit(X,y)

In [None]:
# Predicted 0/1 MVP flag for each row of the test series.
Y_pred = clf.predict(test)
Y_pred

array([0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [None]:
def year_from_url(url):
    """Return the integer that precedes the first '-' in the last path segment of ``url``."""
    last_segment = url.rsplit('/', 1)[-1]
    year_text = last_segment.partition('-')[0]
    return int(year_text)

In [None]:
def mvp_from_year(year, latest_year=2024):
    """Return the 'Player' entry of the Finals MVP table for ``year``.

    The row is looked up at index label ``latest_year - year``. This assumes
    the table returned by add_mvp() is indexed 0, 1, 2, ... with one row per
    season, newest first, and that label 0 is the ``latest_year`` season -
    verify against the CSV.

    Parameters
    ----------
    year : int
        Season year of the Finals.
    latest_year : int, default 2024
        Year of the season stored in the first row; raise it when a newer
        season is added to the CSV.
    """
    return add_mvp()['Player'][latest_year - year]

In [None]:
def mvp_from_url(url):
    """Return the Finals MVP name for the season encoded in ``url``."""
    season = year_from_url(url)
    return mvp_from_year(season)

In [None]:
def dirty(url):
    """Return the 'Player' whose PTS + TRB + AST sum is the largest in the cleaned table.

    If several rows share the maximum, the first one wins.
    """
    df = clean_table(url)
    # Quick-and-dirty score: points plus rebounds plus assists.
    df['dirty'] = df['PTS'] + df['TRB'] + df['AST']
    best_row = df['dirty'].idxmax()
    return df.loc[best_row, 'Player']

In [None]:
# add_link() is not defined in the cells shown here; its result is used as a
# frame with 'url' and 'Player' columns.
df = add_link()
# Heuristic pick for every series: the player with the largest PTS + TRB + AST.
guess = df['url'].apply(dirty)
# Place the guess directly to the right of the 'Player' column for comparison.
df.insert(df.columns.get_loc('Player')+1,'guess',guess)
df.head(5)

Unnamed: 0,Yr,Lg,Series,Unnamed: 3_level_1,Team,W,Team.1,W.1,Player,guess,url
0,2024,NBA,Finals,"Jun 6 - Jun 17, 2024",Boston Celtics (1),4,Dallas Mavericks (5),1,Jaylen Brown,Jayson Tatum,https://www.basketball-reference.com/playoffs/...
1,2023,NBA,Finals,"Jun 1 - Jun 12, 2023",Denver Nuggets (1),4,Miami Heat (8),1,Nikola Jokić,Nikola JokiÄ,https://www.basketball-reference.com/playoffs/...
2,2022,NBA,Finals,"Jun 2 - Jun 16, 2022",Golden State Warriors (3),4,Boston Celtics (2),2,Stephen Curry,Stephen Curry,https://www.basketball-reference.com/playoffs/...
3,2021,NBA,Finals,"Jul 6 - Jul 20, 2021",Milwaukee Bucks (3),4,Phoenix Suns (2),2,Giannis Antetokounmpo,Giannis Antetokounmpo,https://www.basketball-reference.com/playoffs/...
4,2020,NBA,Finals,"Sep 30 - Oct 11, 2020",Los Angeles Lakers (1),4,Miami Heat (5),2,LeBron James,LeBron James,https://www.basketball-reference.com/playoffs/...


In [None]:
# Cleaned table for the series at label 1 of df['url'].
clean_table(df['url'][1])

Unnamed: 0,Player,Age,G,TOV,PF,PTS_Total,FG%,3P%,FT%,MP,PTS,TRB,AST,STL,BLK
0,Nikola JokiÄ,27,5,17,17,151,0.583,0.421,0.838,41.2,30.2,14.0,7.2,0.8,1.4
1,Jamal Murray,25,5,17,9,107,0.451,0.387,0.929,42.2,21.4,6.2,10.0,1.0,0.0
2,Aaron Gordon,27,5,3,14,70,0.604,0.556,0.467,35.6,14.0,7.4,3.0,0.8,0.6
3,Bruce Brown,26,5,8,7,57,0.457,0.368,0.727,26.8,11.4,4.4,1.0,1.0,0.8
4,Michael Porter Jr.,24,5,2,6,48,0.328,0.143,0.75,29.4,9.6,8.4,0.8,0.0,0.4
5,Kentavious Caldwell-Pope,29,5,4,13,37,0.355,0.263,0.769,34.2,7.4,3.4,1.4,1.4,1.2
6,Christian Braun,21,5,3,6,29,0.706,0.0,0.556,16.4,5.8,2.0,1.2,1.0,0.4
7,Jeff Green,36,5,2,6,24,0.889,1.0,1.0,13.0,4.8,0.4,0.6,0.2,0.0
8,Thomas Bryant,25,1,0,0,0,,,,0.0,0.0,0.0,0.0,0.0,0.0
9,Vlatko ÄanÄar,25,1,0,0,0,,,,0.0,0.0,0.0,0.0,0.0,0.0


**Moving csv files to separate folder**

In [None]:
# import shutil

# for filename in os.listdir(DIR2):
#     if filename.endswith('.csv'):
#         source_file = os.path.join(DIR2, filename)
#         destination_file = os.path.join(DIR3, filename)
        
#         # Move the file
#         shutil.move(source_file, destination_file)
#         print(f'Moved: {filename}')