In [None]:
import requests
import json
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
from sklearn.preprocessing import MultiLabelBinarizer
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import r2_score, mean_squared_error, confusion_matrix
import matplotlib.pyplot as plt
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import Ridge, LinearRegression
import tensorflow as tf
pd.set_option('display.max_columns',500)

In [None]:
# Compiling links for NBA stats tables
# Maps a short table name to the stats.nba.com endpoint URL that serves it.
# The three leaguedash/hustle URLs already carry their query defaults
# (PerMode=PerGame, SeasonType=Regular+Season, ...); 'player_general' only
# carries LeagueID. None of the URLs contains a `Season=` parameter, so the
# season is supplied by the fetch functions through `params`.
# NOTE(review): the fetchers send the season under the lowercase key
# 'season' - confirm the API treats parameter names case-insensitively.
# NOTE(review): 'team_game' is not referenced by any cell visible in this
# notebook.
link_dict = {
    'team_game': 'https://stats.nba.com/stats/leaguedashteamstats?Conference=&DateFrom=&DateTo=&Division=&GameScope=&GameSegment=&LastNGames=0&LeagueID=00&Location=&MeasureType=Base&Month=0&OpponentTeamID=0&Outcome=&PORound=0&PaceAdjust=N&PerMode=PerGame&Period=0&PlayerExperience=&PlayerPosition=&PlusMinus=N&Rank=N&SeasonSegment=&SeasonType=Regular+Season&ShotClockRange=&StarterBench=&TeamID=0&TwoWay=0&VsConference=&VsDivision=',
    'player_game': 'https://stats.nba.com/stats/leaguedashplayerstats?College=&Conference=&Country=&DateFrom=&DateTo=&Division=&DraftPick=&DraftYear=&GameScope=&GameSegment=&Height=&LastNGames=0&LeagueID=00&Location=&MeasureType=Base&Month=0&OpponentTeamID=0&Outcome=&PORound=0&PaceAdjust=N&PerMode=PerGame&Period=0&PlayerExperience=&PlayerPosition=&PlusMinus=N&Rank=N&SeasonSegment=&SeasonType=Regular+Season&ShotClockRange=&StarterBench=&TeamID=0&TwoWay=0&VsConference=&VsDivision=&Weight=',
    'player_hustle': 'https://stats.nba.com/stats/leaguehustlestatsplayer?College=&Conference=&Country=&DateFrom=&DateTo=&Division=&DraftPick=&DraftYear=&GameScope=&Height=&LastNGames=0&LeagueID=00&Location=&Month=0&OpponentTeamID=0&Outcome=&PORound=0&PaceAdjust=N&PerMode=PerGame&PlayerExperience=&PlayerPosition=&PlusMinus=N&Rank=N&SeasonSegment=&SeasonType=Regular+Season&TeamID=0&VsConference=&VsDivision=&Weight=',
    'player_general': 'https://stats.nba.com/stats/playerindex?LeagueID=00'
}

In [None]:
# Headers for querying NBA JSON data api
# Sent unchanged with every `requests.get` call made by the fetch functions.
# They imitate a desktop Chrome session coming from https://stats.nba.com/.
# NOTE(review): presumably the API does not answer without the browser-like
# User-Agent / Referer / x-nba-stats-* headers - confirm before trimming any.
headers  = {
    'Connection': 'keep-alive',
    'Accept': 'application/json, text/plain, */*',
    'x-nba-stats-token': 'true',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
    'x-nba-stats-origin': 'stats',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-Mode': 'cors',
    'Referer': 'https://stats.nba.com/',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.9',
}

In [None]:
# Season labels that are fetched one by one and stacked by `make_or_concat`.
seasons = ['2017-18','2018-19','2019-20','2020-21','2021-22']
# NOTE(review): every fetch function builds its own local `params`, so this
# module-level dict is not read by any cell visible in this notebook.
params = {'season':'2021-22', 'SeasonType':'Regular+Season'}

In [5]:
def get_player_general(season, timeout=30):
    """Fetch the player index table and tag it with ``season``.

    Requests ``link_dict['player_general']`` with the shared ``headers`` and
    turns the first entry of ``resultSets`` into a DataFrame.

    Parameters
    ----------
    season : str
        Season label (the notebook uses the form ``'2021-22'``). It is sent
        as the ``season`` query parameter and stored in the ``SEASON`` column.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up, so a
        stalled connection cannot hang the notebook forever.

    Returns
    -------
    pandas.DataFrame
        Name, team, position, height, weight and draft/career-year columns,
        with ``PERSON_ID`` renamed to ``PLAYER_ID`` and a trailing ``SEASON``
        column.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    keep_cols = ['PLAYER_FIRST_NAME','PLAYER_LAST_NAME','TEAM_ABBREVIATION','PERSON_ID',
                 'POSITION','HEIGHT','WEIGHT','DRAFT_YEAR','FROM_YEAR','TO_YEAR']
    params = {'season':season, 'SeasonType':'Regular+Season'}
    response = requests.get(url=link_dict['player_general'], headers=headers,
                            params=params, timeout=timeout)
    # Fail with a clear HTTP error instead of a JSON/KeyError further down.
    response.raise_for_status()
    result_set = response.json()['resultSets'][0]
    # Build the frame as one chain of new objects: the original added SEASON
    # to a column-subset of the raw frame, which triggers pandas'
    # SettingWithCopyWarning and is not guaranteed to stick.
    player_general = (
        pd.DataFrame(result_set['rowSet'], columns=result_set['headers'])
        .loc[:, keep_cols]
        .rename(columns={'PERSON_ID':'PLAYER_ID'})
        .assign(SEASON=season)
    )
    return player_general

In [6]:
def get_player_game(season,min_games=40,min_minutes=15,timeout=30):
    """Fetch per-game player box-score stats for one season.

    Requests ``link_dict['player_game']`` with the shared ``headers``, adds a
    few derived columns, removes bookkeeping columns and keeps only players
    with enough playing time.

    Parameters
    ----------
    season : str
        Season label; sent as the ``season`` query parameter and stored in
        the ``SEASON`` column.
    min_games : int, optional
        Minimum value of ``GP`` a row needs to be kept.
    min_minutes : float, optional
        Minimum value of ``MIN`` a row needs to be kept.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up.

    Returns
    -------
    pandas.DataFrame
        Independent copy of the filtered rows (original row labels kept),
        with ``FTMissed``, ``FGMissed``, ``FG2M`` and ``SEASON`` added.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    # A column is removed when its name contains any of these fragments.
    drop_cols = ['_RANK','_FANTASY','CFPARA','CFID','TD3','DD2','TEAM_ABBREVIATION','_NAME','NICKNAME']
    params = {'season':season, 'SeasonType':'Regular+Season'}
    response = requests.get(url=link_dict['player_game'], headers=headers,
                            params=params, timeout=timeout)
    response.raise_for_status()
    result_set = response.json()['resultSets'][0]
    raw = pd.DataFrame(result_set['rowSet'], columns=result_set['headers'])

    player_game = raw.assign(
        FTMissed=raw['FTA']-raw['FTM'],
        FGMissed=raw['FGA']-raw['FGM'],
        FG2M=raw['FGM']-raw['FG3M'],
        SEASON=season,
    )
    # Fragments are escaped so they are always matched literally.
    drop_pattern = re.compile('|'.join(re.escape(fragment) for fragment in drop_cols))
    kept_columns = [col for col in player_game.columns if not drop_pattern.search(col)]
    player_game = player_game[kept_columns]

    # only keep players with enough games and minutes per game
    enough_playing_time = (player_game['GP']>=min_games) & (player_game['MIN']>=min_minutes)
    # .copy() hands callers a frame they can modify without pandas warnings.
    return player_game.loc[enough_playing_time].copy()

In [7]:
def get_player_hustle(season, timeout=30):
    """Fetch the player hustle-stats table for one season.

    Requests ``link_dict['player_hustle']`` with the shared ``headers`` and
    returns the first entry of ``resultSets`` as a DataFrame.

    Parameters
    ----------
    season : str
        Season label; sent as the ``season`` query parameter and stored in
        the ``SEASON`` column.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up.

    Returns
    -------
    pandas.DataFrame
        All columns delivered by the endpoint plus ``SEASON``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    params = {'season':season, 'SeasonType':'Regular+Season'}
    response = requests.get(url=link_dict['player_hustle'], headers=headers,
                            params=params, timeout=timeout)
    # Surface HTTP failures directly instead of as a JSON decoding error.
    response.raise_for_status()
    result_set = response.json()['resultSets'][0]
    player_hustle = pd.DataFrame(result_set['rowSet'], columns=result_set['headers'])
    player_hustle['SEASON'] = season
    return player_hustle

In [8]:
def make_or_concat(seasons, function):
    """Call ``function`` once per season and stack the results row-wise.

    Parameters
    ----------
    seasons : iterable
        Values handed to ``function`` one at a time, in order.
    function : callable
        Takes one season and returns a ``pandas.DataFrame``.

    Returns
    -------
    pandas.DataFrame
        All per-season frames concatenated in call order with a fresh
        0..n-1 index (also when only one season is given) and without
        sorting the columns.

    Raises
    ------
    ValueError
        If ``seasons`` is empty (the original failed with an
        ``UnboundLocalError`` in that case).
    """
    # Collect first and concatenate once: concatenating inside the loop
    # re-copies all previously collected rows on every iteration.
    frames = [function(season) for season in seasons]
    if not frames:
        raise ValueError("make_or_concat needs at least one season")
    return pd.concat(frames, ignore_index=True, sort=False)

In [None]:
# Download every table for all `seasons` (one HTTP request per season and
# table) and stack the per-season frames.
player_general = make_or_concat(seasons, get_player_general)
player_game = make_or_concat(seasons, get_player_game)
player_hustle = make_or_concat(seasons, get_player_hustle)
# NOTE(review): the disabled code below is the only place in the visible
# notebook that assigns `all_stats`, yet later cells read `all_stats`. On a
# fresh kernel those cells raise NameError unless it is defined elsewhere.
# all_stats = player_game\
#     .merge(player_general, how='inner', on=['PLAYER_ID'])\
#     .merge(player_hustle.drop('MIN',axis=1),how='inner',on=['PLAYER_ID','SEASON'])\
#     .drop('G',axis=1)
# drop_cols = ['_x','NICK','TEAM_ID','_y']
# drop_col_matches = [col for col in all_stats.columns
#     if re.search(r"(?=("+'|'.join(drop_cols)+r"))", col)]
# all_stats = all_stats.drop(drop_col_matches,axis=1) # drop matching columns
# all_stats

In [None]:
# Preview only: inner join of the game stats with the player index on
# player and season. The result is displayed, not stored.
player_game.merge(
    player_general,
    on=['PLAYER_ID','SEASON'],
    how='inner',
)

In [None]:
# Show the per-game stats ordered by player id.
player_game.sort_values('PLAYER_ID')

In [None]:
# Show the player index ordered by player id.
player_general.sort_values('PLAYER_ID')

In [None]:
# Show the hustle stats ordered by player id.
player_hustle.sort_values('PLAYER_ID')

In [None]:
# Remove the id and raw position columns from the combined stats table.
# NOTE(review): no live cell visible in this notebook assigns `all_stats`
# (its construction is commented out), so this raises NameError on a fresh
# kernel; re-running it also fails because the columns are already gone.
all_stats = all_stats.drop(['PLAYER_ID','POSITION'],axis=1)
all_stats

In [None]:
# List (without dropping) the `all_stats` columns whose name contains one of
# the fragments below.
drop_cols = ['_x','NICK','TEAM_ID','_y']
[
    column_name
    for column_name in all_stats.columns
    if re.search(r"(?=("+'|'.join(drop_cols)+r"))", column_name)
]

In [None]:
# Attach the hustle stats to `player_general`, clean up columns and encode
# position and height.
# NOTE(review): this cell overwrites `player_general` with a value derived
# from itself, so it is not re-runnable: after one run the POSITION column
# has been replaced by get_dummies and the `['POSITION']` lookup below raises
# KeyError on the second run.
# Inner join on player and season; the hustle table's own MIN column is left
# out before the join and the column G is dropped after it.
player_general = player_general.merge(
        player_hustle.drop('MIN',axis=1),
    how='inner',on=['PLAYER_ID','SEASON']).drop('G',axis=1)
# Remove every column whose name contains one of these fragments ('_x'/'_y'
# are the suffixes pandas adds to clashing non-key columns in a merge).
drop_cols = ['_x','NICK','TEAM_ID','_y']
player_general.drop([col for col in player_general.columns
    if re.search(r"(?=("+'|'.join(drop_cols)+r"))", col)],axis=1,inplace=True) # drop matching columns
# POSITION2 holds the position string split on '-' into a list of parts; it
# is consumed later by the MultiLabelBinarizer cell.
player_general['POSITION2'] = player_general['POSITION'].str.split('-')
# One indicator column per distinct POSITION string; POSITION itself is
# replaced by these columns.
player_general = pd.get_dummies(player_general, columns = ['POSITION'])
# HEIGHT 'a-b' becomes a*12 + b (presumably feet-inches to inches - confirm
# against the API output).
player_general['HEIGHT'] = player_general['HEIGHT'].str.split('-').str[0].astype(int)*12\
    +player_general['HEIGHT'].str.split('-').str[1].astype(int)
player_general

In [None]:
# Add one 0/1 column per individual position found in POSITION2 and derive
# shooting-efficiency features.
# NOTE(review): PTS/FTA/FGA/FG3A/FGM/FG3M are not among the columns selected
# by get_player_general; this cell presumably relies on `player_general`
# having been combined with the per-game stats in a cell that is not visible
# here - confirm on a fresh kernel.
mlb = MultiLabelBinarizer()
position_flags = pd.DataFrame(
    mlb.fit_transform(player_general['POSITION2']),
    columns=mlb.classes_,
    index=player_general.index,
)
player_general = player_general.merge(
    position_flags, how='inner', left_index=True, right_index=True
).drop(['POSITION2'],axis=1)
# True shooting percentage is PTS / (2 * (FGA + 0.44 * FTA)); the previous
# expression left out the factor 2 and therefore reported twice the value.
player_general['TRUE_SHOOTING_PCT'] = player_general['PTS']/(
    2*(player_general['FGA']+0.44*player_general['FTA']))
# Two-point attempts / makes are what remains after removing the threes.
player_general['FG2A'] = player_general['FGA'] - player_general['FG3A']
player_general['FG2M'] = player_general['FGM'] - player_general['FG3M']
player_general['FG2_PCT'] = player_general['FG2M']/player_general['FG2A']
player_general

In [None]:
# Correlation of every numeric feature with PLUS_MINUS (one-row table).
keep_index = ['PLUS_MINUS']
# Restrict to numeric/boolean columns explicitly: since pandas 2.0
# DataFrame.corr() no longer skips string columns (SEASON, names, ...)
# silently and raises on them instead.
corr_df = player_general.select_dtypes(include=['number','bool']).corr()
corr_df[corr_df.index.isin(keep_index)]

In [None]:
# Modelling table: `player_general` without identifiers, season/career
# bookkeeping, team record, playing time and body measurements.
predict_plus_minus = player_general.drop(
    columns=[
        'W_PCT', 'PLAYER_ID', 'DRAFT_YEAR', 'GP', 'W', 'L', 'MIN',
        'SEASON', 'PLAYER_NAME', 'TEAM_ABBREVIATION',
        'FROM_YEAR', 'TO_YEAR', 'HEIGHT', 'WEIGHT',
    ]
)
predict_plus_minus

In [None]:
# # Saving time for salary with code from here: https://medium.com/swlh/linking-nba-salary-to-performance-sample-player-analysis-with-python-2c568455b306
# r = requests.get('https://hoopshype.com/salaries/')
# r_html = r.text

# soup = BeautifulSoup(r_html, 'html.parser')

# salary_table = soup.find('table')
# length=len(salary_table.find_all("td"))

# player_names=[salary_table.find_all("td")[i].text.strip() for i in range(9,length,8)]

# column1=[salary_table.find_all("td")[i].text.strip() for i in range(10,length,8)]
# column2=[salary_table.find_all("td")[i].text.strip() for i in range(11,length,8)]
# column3=[salary_table.find_all("td")[i].text.strip() for i in range(12,length,8)]
# column4=[salary_table.find_all("td")[i].text.strip() for i in range(13,length,8)]
# column5=[salary_table.find_all("td")[i].text.strip() for i in range(14,length,8)]
# column6=[salary_table.find_all("td")[i].text.strip() for i in range(15,length,8)]
# df_dict={'player_names':player_names,
#         '2019/20':column1,
#         '2020/21':column2,
#         '2021/22':column3,
#         '2022/23':column4,
#         '2023/24':column5,
#         '2024/25':column6}
        
# salary_df = pd.DataFrame(df_dict)
# salary_df

In [None]:
def train_test_split_df(df,Y_val,test_size=0.4,random_state=42):
    """Split ``df`` into float32 train/test feature and label arrays.

    Rows containing any NaN are removed before splitting. Each part is
    ordered by its row label afterwards, so features and labels stay aligned
    while the shuffle order produced by ``train_test_split`` is not kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the feature columns and the target column.
    Y_val : str
        Name of the target column.
    test_size, random_state
        Forwarded to ``train_test_split``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(train_features, train_labels, test_features, test_labels)``,
        all converted to ``float32``.
    """
    def _as_float32(data):
        return np.asarray(data).astype(np.float32)

    train_part, test_part = train_test_split(
        df.dropna(), test_size=test_size, random_state=random_state)
    train_part = train_part.sort_index()
    test_part = test_part.sort_index()
    # pop() removes the target column from the feature frame and returns it.
    train_target = train_part.pop(Y_val)
    test_target = test_part.pop(Y_val)
    return (_as_float32(train_part), _as_float32(train_target),
            _as_float32(test_part), _as_float32(test_target))

In [None]:
# define a function to create the deep neural network model
def tf_model(norm, n_features=None):
    """Build and compile the dense regression network behind a normalizer.

    The network is ``norm`` -> Dense(n_features, relu) -> Dense(1), compiled
    with the Adam optimizer and mean squared error as loss and metric.

    Parameters
    ----------
    norm : tf.keras.layers.Layer
        Layer placed first in the model (the notebook passes an adapted
        ``Normalization`` layer).
    n_features : int, optional
        Number of input columns, also used as the hidden layer width. When
        omitted it is read from the global ``train_features`` so existing
        ``tf_model(normalizer)`` calls keep working; pass it explicitly to
        avoid depending on notebook state.

    Returns
    -------
    tf.keras.Sequential
        The compiled, untrained model.
    """
    if n_features is None:
        # Backward-compatible fallback on the global set by the split cells.
        n_features = train_features.shape[1]
    model = tf.keras.Sequential([
        norm,
        # NOTE(review): `input_dim` is kept from the original although this
        # is not the first layer of the model - confirm it is still wanted.
        tf.keras.layers.Dense(n_features, activation='relu', input_dim=n_features),
        tf.keras.layers.Dense(1),
    ])
    model.compile(loss='mean_squared_error', optimizer='adam',
                  metrics=['mean_squared_error'])
    return model

In [None]:
# Fit every model family on several random train/test splits. Fitted models
# are stored under '<seed><model name>'; the matching test arrays are stored
# under the seed.
all_models = {}
y_values = {}
x_values = {}
split_nums = [40,600,2432342,4645646,874,9846]
for i in split_nums:
    # Split the data
    train_features, train_labels, test_features, test_labels = train_test_split_df(predict_plus_minus,'PLUS_MINUS',random_state=i)
    # Normalize and run TF model
    # A fresh Normalization layer per split: a single shared layer would be
    # re-adapted on every iteration, so all previously stored TF models
    # (which hold a reference to that same layer) would end up using the
    # statistics of the last split, including rows from their own test set.
    normalizer = tf.keras.layers.Normalization()
    normalizer.adapt(train_features)
    tf_m = tf_model(normalizer)
    tf_m.fit(train_features, train_labels, epochs=50, verbose=False)
    # Run Dummy model
    dummy = DummyRegressor(strategy="mean")
    dummy.fit(train_features,train_labels)
    # Run Ridge model
    ridge = Ridge(alpha=1.0)
    ridge.fit(train_features,train_labels)
    # Run Linear model
    lin = LinearRegression()
    lin.fit(train_features,train_labels)
    # Run Decision Tree model
    dtr = tree.DecisionTreeRegressor(max_depth=10)
    dtr.fit(train_features,train_labels)
    all_models[str(i)+'tf_m'] = tf_m
    all_models[str(i)+'dummy'] = dummy
    all_models[str(i)+'ridge'] = ridge
    all_models[str(i)+'dtr'] = dtr
    all_models[str(i)+'lin'] = lin
    y_values[i] = test_labels
    x_values[i] = test_features
    print(i)

In [None]:
# Score every stored model on its own test split, collect the metrics in
# `asd` (key = model name + metric name, value = one entry per split) and
# draw an actual-vs-predicted scatter plot per model and split.
asd = {}
tf_frames = []  # Actual/Pred pairs of the TensorFlow model, one frame per split
for i in split_nums:
    for j in ['tf_m','dummy','ridge','dtr','lin']:
        preds = all_models[str(i)+j].predict(x_values[i]).flatten()
        r2 = r2_score(y_values[i],preds)
        # Note: the stored MSE is multiplied by 1000 before rounding.
        mse = round(1000*mean_squared_error(y_values[i],preds),1)
        # Share of predictions whose sign matches the sign of the target.
        acc = round(sum(np.sign(preds)==np.sign(y_values[i]))/len(preds),4)
        # labels=[0,1] guarantees a 2x2 matrix even when one class is absent
        # from both arrays, so the four-way unpacking cannot fail.
        tn, fp, fn, tp = confusion_matrix(np.where(y_values[i]>0,1,0),
                                          np.where(preds>0,1,0),
                                          labels=[0,1]).ravel()
        # A model that never predicts one of the classes (e.g. the constant
        # dummy model) has no precision for it: report NaN instead of
        # dividing by zero.
        posacc = round(tp/(fp+tp),2) if (fp+tp) > 0 else float('nan')
        negacc = round(tn/(fn+tn),2) if (fn+tn) > 0 else float('nan')
        for metric_name, metric_value in [('r2',r2),('mse',mse),('acc',acc),
                                          ('TN,FP,FN,TP',[tn,fp,fn,tp]),
                                          ('posacc',posacc),('negacc',negacc)]:
            asd.setdefault(j+metric_name,[]).append(metric_value)
        plt.figure(figsize=(12,12))
        plt.scatter(y_values[i],preds)
        title = str(i)+j+', R2 = '+str(round(r2,4))+', MSE = '+str(mse)+', ACC = '+ str(acc)
        plt.suptitle(title)
        plt.xlabel('Actual')
        plt.ylabel('Predicted')
        plt.show()
        # The TensorFlow model is stored under 'tf_m'; the previous check for
        # 'tf' never matched, so no predictions were ever collected.
        if j == 'tf_m':
            tf_frames.append(pd.DataFrame({'Actual':y_values[i],'Pred':preds}))
# Build the table once at the end (DataFrame.append was removed in pandas 2.0).
if tf_frames:
    dfasd = pd.concat(tf_frames, ignore_index=True)
else:
    dfasd = pd.DataFrame(columns=['Actual','Pred'])

In [None]:
# Single reference run of the TensorFlow model on the seed-42 split; the
# cell displays the resulting R2 on the test part.
(train_features, train_labels,
 test_features, test_labels) = train_test_split_df(
    predict_plus_minus, 'PLUS_MINUS', random_state=42)

normalizer = tf.keras.layers.Normalization()
normalizer.adapt(train_features)

tf_m = tf_model(normalizer)
tf_m.fit(train_features, train_labels, epochs=50, verbose=False)

preds = tf_m.predict(test_features).flatten()
r2 = r2_score(test_labels, preds)
r2

In [None]:
# Display the predictions most recently assigned to `preds`.
preds

In [None]:
# ToDo
# Make a function for each major processing step
    # Then make a combined function that chains them
# Try a deep learning model even though the data points are sparse.
# If that is not enough data, get multiple years of data
# Try using rank as a feature because it is inherently more normalized
# Use a linear regression on the rank features to gauge feature importance

In [None]:
# Rank-transform every column, scale the ranks by each column's largest rank
# and show how the ranked features correlate with the ranked PLUS_MINUS.
keep_index = ['PLUS_MINUS']
player_general_rank = player_general.rank()
player_general_rank_norm = player_general_rank.div(player_general_rank.max())
corr_df_rank = player_general_rank.corr()
corr_df_rank.loc[corr_df_rank.index.isin(keep_index)]

In [None]:
# define a function to create the deep neural network model
def tf_model_no_norm(n_features=None):
    """Build and compile the dense regression network without a normalizer.

    The network is Dense(n_features, relu) -> Dense(1), compiled with the
    Adam optimizer and mean squared error as loss and metric.

    Parameters
    ----------
    n_features : int, optional
        Number of input columns, also used as the hidden layer width. When
        omitted it is read from the global ``train_features`` so existing
        ``tf_model_no_norm()`` calls keep working; pass it explicitly to
        avoid depending on notebook state.

    Returns
    -------
    tf.keras.Sequential
        The compiled, untrained model.
    """
    if n_features is None:
        # Backward-compatible fallback on the global set by the split cells.
        n_features = train_features.shape[1]
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(n_features, activation='relu', input_dim=n_features),
        tf.keras.layers.Dense(1),
    ])
    model.compile(loss='mean_squared_error', optimizer='adam',
                  metrics=['mean_squared_error'])
    return model

In [None]:
# Same experiment as before, but on the rank-normalized table: fit every
# model family on several random splits and store models and test arrays.
predict_plus_minus_rank_norm = player_general_rank_norm.drop(
    ['W_PCT','PLAYER_ID','DRAFT_YEAR','GP','W','L','MIN',
     'SEASON','PLAYER_NAME','TEAM_ABBREVIATION',
     'FROM_YEAR','TO_YEAR','HEIGHT','WEIGHT'], axis=1)
all_models = {}
y_values = {}
x_values = {}
split_nums = [40,600,2432342,4645646,874,9846]
for i in split_nums:
    # Split the data
    train_features, train_labels, test_features, test_labels = train_test_split_df(predict_plus_minus_rank_norm,'PLUS_MINUS',random_state=i)
    # Run TF model. The inputs are already rank-normalized, so the model is
    # built without a Normalization layer; the layer that used to be adapted
    # here was never passed to any model and has been removed.
    tf_m = tf_model_no_norm()
    tf_m.fit(train_features, train_labels, epochs=50, verbose=False)
    # Run Dummy model
    dummy = DummyRegressor(strategy="mean")
    dummy.fit(train_features,train_labels)
    # Run Ridge model
    ridge = Ridge(alpha=1.0)
    ridge.fit(train_features,train_labels)
    # Run Linear model
    lin = LinearRegression()
    lin.fit(train_features,train_labels)
    # Run Decision Tree model
    dtr = tree.DecisionTreeRegressor(max_depth=10)
    dtr.fit(train_features,train_labels)
    all_models[str(i)+'tf_m'] = tf_m
    all_models[str(i)+'dummy'] = dummy
    all_models[str(i)+'ridge'] = ridge
    all_models[str(i)+'dtr'] = dtr
    all_models[str(i)+'lin'] = lin
    y_values[i] = test_labels
    x_values[i] = test_features
    print(i)

In [None]:
# Score the rank-based models on their test splits: R2 and (x1000) MSE per
# model and split are collected in `asd`, plus one scatter plot each.
asd = {}
tf_frames = []  # Actual/Pred pairs of the TensorFlow model, one frame per split
for i in split_nums:
    for j in ['tf_m','dummy','ridge','dtr','lin']:
        preds = all_models[str(i)+j].predict(x_values[i]).flatten()
        r2 = r2_score(y_values[i],preds)
        mse = round(1000*mean_squared_error(y_values[i],preds),1)
        asd.setdefault(j+'r2',[]).append(r2)
        asd.setdefault(j+'mse',[]).append(mse)
        plt.figure(figsize=(12,12))
        plt.scatter(y_values[i],preds)
        # No ACC in the title: this cell never computes `acc`, so the old
        # title printed a stale value left over from an earlier cell (or
        # raised NameError on a fresh kernel).
        title = str(i)+j+', R2 = '+str(round(r2,4))+', MSE = '+str(mse)
        plt.suptitle(title)
        plt.xlabel('Actual')
        plt.ylabel('Predicted')
        plt.show()
        # The TensorFlow model is stored under 'tf_m'; the previous check for
        # 'tf' never matched, so no predictions were ever collected.
        if j == 'tf_m':
            tf_frames.append(pd.DataFrame({'Actual':y_values[i],'Pred':preds}))
# Build the table once at the end (DataFrame.append was removed in pandas 2.0).
if tf_frames:
    dfasd = pd.concat(tf_frames, ignore_index=True)
else:
    dfasd = pd.DataFrame(columns=['Actual','Pred'])

In [None]:
# Display the rank-transformed player table.
player_general_rank

In [None]:
# Display the rank-normalized modelling table.
predict_plus_minus_rank_norm

In [None]:
# Per-minute version of the modelling table: every remaining column is
# divided by the player's MIN, then rank-normalized column by column.
player_general_per_min = (
    player_general
    .drop(columns=['W_PCT', 'PLAYER_ID', 'DRAFT_YEAR', 'GP', 'W', 'L', 'MIN',
                   'SEASON', 'PLAYER_NAME', 'TEAM_ABBREVIATION',
                   'FROM_YEAR', 'TO_YEAR', 'HEIGHT', 'WEIGHT'])
    .div(player_general['MIN'], axis=0)
)
# Columns with 'PCT' or 'POSITION' in their name are put back to their
# original (not per-minute) values.
unscaled_cols = [name for name in player_general_per_min.columns
                 if 'PCT' in name or 'POSITION' in name]
for name in unscaled_cols:
    player_general_per_min[name] = player_general[name]
# Rank once and scale by each column's largest rank.
per_min_rank = player_general_per_min.rank()
player_general_per_min_rank_norm = per_min_rank / per_min_rank.max()
player_general_per_min_rank_norm

In [None]:
# Gauge feature importance: regress the rank-normalized PLUS_MINUS on each
# rank-normalized per-minute feature on its own and plot the slopes.
lin = LinearRegression()
feature_cols = player_general_per_min_rank_norm.drop('PLUS_MINUS',axis=1).columns
importance = []
for i in feature_cols:
    # LinearRegression cannot handle NaN, so rows missing either the feature
    # or the target are left out of that feature's fit.
    pair = player_general_per_min_rank_norm[[i,'PLUS_MINUS']].dropna()
    lin.fit(pair[i].values.reshape(-1,1), pair['PLUS_MINUS'])
    importance.append(lin.coef_[0])
fig, ax = plt.subplots(figsize=(17,17))
ax.bar(feature_cols, importance)
# Rotate after the bars exist so the setting applies to the category labels.
ax.tick_params(axis='x', labelrotation=90)
ax.set_xlabel('Feature (per minute, rank-normalized)')
ax.set_ylabel('Slope of single-feature linear fit on PLUS_MINUS')
ax.set_title('Single-feature importance for PLUS_MINUS')
plt.show()