In [None]:
import requests
import json
import pandas as pd
from bs4 import BeautifulSoup
import re
from sklearn.preprocessing import MultiLabelBinarizer
# Show up to 500 columns when a DataFrame is displayed, so wide tables are not elided.
pd.options.display.max_columns = 500

In [None]:
# stats.nba.com endpoint URLs, keyed by a short name for the table they serve.
# Each stat URL already carries its full query string (including
# SeasonType=Regular+Season and PerMode=PerGame); anything passed via `params`
# to requests is appended on top of it.
link_dict = dict(
    # League-wide team stats.
    team_game='https://stats.nba.com/stats/leaguedashteamstats?Conference=&DateFrom=&DateTo=&Division=&GameScope=&GameSegment=&LastNGames=0&LeagueID=00&Location=&MeasureType=Base&Month=0&OpponentTeamID=0&Outcome=&PORound=0&PaceAdjust=N&PerMode=PerGame&Period=0&PlayerExperience=&PlayerPosition=&PlusMinus=N&Rank=N&SeasonSegment=&SeasonType=Regular+Season&ShotClockRange=&StarterBench=&TeamID=0&TwoWay=0&VsConference=&VsDivision=',
    # League-wide player stats.
    player_game='https://stats.nba.com/stats/leaguedashplayerstats?College=&Conference=&Country=&DateFrom=&DateTo=&Division=&DraftPick=&DraftYear=&GameScope=&GameSegment=&Height=&LastNGames=0&LeagueID=00&Location=&MeasureType=Base&Month=0&OpponentTeamID=0&Outcome=&PORound=0&PaceAdjust=N&PerMode=PerGame&Period=0&PlayerExperience=&PlayerPosition=&PlusMinus=N&Rank=N&SeasonSegment=&SeasonType=Regular+Season&ShotClockRange=&StarterBench=&TeamID=0&TwoWay=0&VsConference=&VsDivision=&Weight=',
    # League-wide player hustle stats.
    player_hustle='https://stats.nba.com/stats/leaguehustlestatsplayer?College=&Conference=&Country=&DateFrom=&DateTo=&Division=&DraftPick=&DraftYear=&GameScope=&Height=&LastNGames=0&LeagueID=00&Location=&Month=0&OpponentTeamID=0&Outcome=&PORound=0&PaceAdjust=N&PerMode=PerGame&PlayerExperience=&PlayerPosition=&PlusMinus=N&Rank=N&SeasonSegment=&SeasonType=Regular+Season&TeamID=0&VsConference=&VsDivision=&Weight=',
    # Player index.
    player_general='https://stats.nba.com/stats/playerindex?LeagueID=00',
)

In [None]:
# Browser-like request headers sent with every call to the NBA stats JSON API.
headers = {
    'Connection': 'keep-alive',
    'Accept': 'application/json, text/plain, */*',
    'x-nba-stats-token': 'true',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
    'x-nba-stats-origin': 'stats',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-Mode': 'cors',
    'Referer': 'https://stats.nba.com/',
    # 'br' is deliberately not advertised: requests can only decode Brotli when
    # an optional brotli package is installed, and otherwise .json() would fail
    # on a Brotli-compressed body. gzip/deflate are always decodable.
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-US,en;q=0.9',
}

In [None]:
# Extra query parameters that requests appends to each stats URL.
# The previous '&SeasonType': 'Regular+Season' entry was removed: requests
# URL-encodes keys and values, so it was sent as the meaningless parameter
# '%26SeasonType=Regular%2BSeason'. The stat URLs in link_dict that need a
# season type already carry SeasonType=Regular+Season in their query string.
# NOTE(review): the URL query strings use capitalised names (e.g. SeasonType);
# confirm the API also honours the lowercase 'season' key used here.
params = {'season': '2021-22'}

In [None]:
# Download the player index and keep only the biographical columns used later.
response = requests.get(
    url=link_dict['player_general'],
    headers=headers,
    params=params,
    timeout=30,  # without a timeout a stalled connection blocks the notebook forever
)
response.raise_for_status()  # surface HTTP errors here instead of as a JSON/KeyError below
r = response.json()

# The payload's first result set holds the table: column names under 'headers',
# rows under 'rowSet'.
result_set = r['resultSets'][0]
player_general = pd.DataFrame(result_set['rowSet'], columns=result_set['headers'])

# Subset and rename in one non-inplace chain; PERSON_ID becomes PLAYER_ID so it
# can serve as the join key against the stat tables.
player_general = (
    player_general[['PERSON_ID','POSITION','HEIGHT','WEIGHT','DRAFT_YEAR','FROM_YEAR','TO_YEAR']]
    .rename(columns={'PERSON_ID': 'PLAYER_ID'})
)
player_general

In [None]:
# Per-game player stats, restricted to players with a meaningful sample size.
min_games = 40     # minimum games played (GP) for a player to be kept
min_minutes = 15   # minimum minutes per game (MIN) for a player to be kept
# Substrings identifying columns to discard; matched anywhere in the column name.
drop_cols = ['_RANK','_FANTASY','CFPARA','CFID','TD3','DD2','TEAM_ABBREVIATION','_NAME']

response = requests.get(
    url=link_dict['player_game'],
    headers=headers,
    params=params,
    timeout=30,  # without a timeout a stalled connection blocks the notebook forever
)
response.raise_for_status()  # surface HTTP errors here instead of as a JSON/KeyError below
r = response.json()
result_set = r['resultSets'][0]
player_game = pd.DataFrame(result_set['rowSet'], columns=result_set['headers'])

# Derived stats: missed free throws, missed field goals, and made two-pointers.
player_game['FTMissed'] = player_game['FTA'] - player_game['FTM']
player_game['FGMissed'] = player_game['FGA'] - player_game['FGM']
player_game['FG2M'] = player_game['FGM'] - player_game['FG3M']

# Compile the drop pattern once instead of rebuilding it for every column.
drop_pattern = re.compile(r"(?=("+'|'.join(drop_cols)+r"))")
player_game = player_game.drop(
    columns=[col for col in player_game.columns if drop_pattern.search(col)]
)

# Only keep players with enough games and enough minutes per game.
enough_games = player_game['GP'] >= min_games
enough_minutes = player_game['MIN'] >= min_minutes
player_game = player_game[enough_games & enough_minutes]
player_game

In [None]:
# Attach the filtered per-game stats to the player index. The inner join on
# PLAYER_ID keeps only players that appear in both frames.
player_general = pd.merge(player_general, player_game, on='PLAYER_ID', how='inner')
player_general

In [None]:
# Download the player hustle table; no columns are dropped at this stage.
response = requests.get(
    url=link_dict['player_hustle'],
    headers=headers,
    params=params,
    timeout=30,  # without a timeout a stalled connection blocks the notebook forever
)
response.raise_for_status()  # surface HTTP errors here instead of as a JSON/KeyError below
r = response.json()
result_set = r['resultSets'][0]
player_hustle = pd.DataFrame(result_set['rowSet'], columns=result_set['headers'])
player_hustle

In [None]:
# Join the hustle stats onto the combined frame (inner join on PLAYER_ID) and
# discard the 'G' column.
player_general = pd.merge(
    player_general, player_hustle, on='PLAYER_ID', how='inner'
).drop(columns='G')

# Remove every column whose name contains one of these substrings.
# NOTE(review): merge suffixes columns present in both frames with _x/_y by
# default, so matching both suffixes removes BOTH copies of each overlapping
# column -- confirm that losing those columns entirely is intended.
drop_cols = ['_x','NICK','TEAM_ID','_y']
unwanted_name = re.compile(r"(?=("+'|'.join(drop_cols)+r"))")
player_general = player_general.drop(
    columns=[name for name in player_general.columns if unwanted_name.search(name)]
)

# Keep the hyphen-separated position as a list of its parts (binarized in a
# later cell), then one-hot encode the full position string itself.
player_general['POSITION2'] = player_general['POSITION'].str.split('-')
player_general = pd.get_dummies(player_general, columns=['POSITION'])

# Convert HEIGHT from a '<a>-<b>' string to the integer a*12 + b
# (presumably feet-inches to total inches -- verify against the data).
height_parts = player_general['HEIGHT'].str.split('-')
player_general['HEIGHT'] = (
    height_parts.str[0].astype(int) * 12 + height_parts.str[1].astype(int)
)
player_general

In [None]:
# Turn the POSITION2 lists into one indicator column per distinct label, aligned
# on the frame's index, then drop the list column now that it is encoded.
mlb = MultiLabelBinarizer()
position_flags = pd.DataFrame(
    mlb.fit_transform(player_general['POSITION2']),
    columns=mlb.classes_,
    index=player_general.index,
)
player_general = pd.merge(
    player_general,
    position_flags,
    how='inner',
    left_index=True,
    right_index=True,
).drop(columns=['POSITION2'])
player_general

In [None]:
# Correlation of every numeric feature with PLUS_MINUS.
keep_index = ['PLUS_MINUS']

# Restrict to numeric and boolean columns before correlating. On pandas >= 2.0
# DataFrame.corr() no longer drops non-numeric columns silently and raises if
# any remain; explicit selection behaves the same on old and new versions
# (bool is included so one-hot columns are kept either way).
numeric_features = player_general.select_dtypes(include=['number', 'bool'])
corr_df = numeric_features.corr()

# Show only the rows of the correlation matrix named in keep_index.
corr_df[corr_df.index.isin(keep_index)]

In [None]:
# TODO: try a deep learning model, even though the data points are sparse.
# If there is not enough data, pull multiple seasons of data.