# Player FG% vs. Team Winning Pct, First Half 2018-19

## Imports and Setup

In [1]:
# Data Handling and Processing
import pandas as pd
from sklearn import preprocessing

# Other
import os
import time
from tqdm import tqdm

from IPython.display import display

## Data Acquisition

### Player Single-Game FG%

First, need to isolate the players with at least 200FGA through the All-Star Break. From there, can use that player list to get the game logs for each player to facilitate the analysis.

#### Step 1: Isolate the players with at least 200FGA through the All-Star Break
[**LeagueDashPlayerStats docs**](https://github.com/swar/nba_api/blob/master/docs/nba_api/stats/endpoints/leaguedashplayerstats.md) | 
[**Parameters docs**](https://github.com/swar/nba_api/blob/master/docs/nba_api/stats/library/parameters.md#StatCategory)

In [2]:
step_1_exists = os.path.isfile('players_w_200fga.csv')
print(step_1_exists)

True


In [3]:
# Import the appropriate libraries
from nba_api.stats.endpoints import leaguedashplayerstats
from nba_api.stats.library.parameters import *

In [4]:
# UDF to get the aggregated player stats given a date range
def get_player_stats_df(
    date_from='2018-10-16',
    date_to='2019-02-14',
    team=0,
    opp_team=0,
    measure_type='Base',
    division=DivisionNullable.default,
    vs_division=DivisionNullable.default,
    conference=ConferenceNullable.default,
    vs_conference=ConferenceNullable.default,
    period=0,
    game_segment=GameSegmentNullable.default,
    home_away=LocationNullable.default,
    outcome=OutcomeNullable.default,
    player_position=PlayerPositionAbbreviationNullable.default,
    player_experience=PlayerExperienceNullable.default,
    starter_bench=StarterBenchNullable.default,
    shot_clock_range=ShotClockRangeNullable.default
    ):

    # Convert team name inputs to ID lists, if applicable
    if opp_team != 0:
        opp_team = [team_full_name_to_id(tm) for tm in opp_team]
    if team != 0:
        team = [team_full_name_to_id(tm) for tm in team]
    
    # Get data
    player_stats = leaguedashplayerstats.LeagueDashPlayerStats(
        last_n_games=0,
        season='2018-19',
        measure_type_detailed_defense=measure_type,
        month=0,
        opponent_team_id=opp_team,
        period=period,
        date_from_nullable=date_from,
        date_to_nullable=date_to,
        team_id_nullable=team,
        division_simple_nullable=division,
        vs_division_nullable=vs_division,
        conference_nullable=conference,
        vs_conference_nullable=vs_conference,
        game_segment_nullable=game_segment,
        location_nullable=home_away,
        outcome_nullable=outcome,
        player_position_abbreviation_nullable=player_position,
        player_experience_nullable=player_experience,
        starter_bench_nullable=starter_bench,
        shot_clock_range_nullable=shot_clock_range)
    
    # Convert to DataFrame and return
    header = player_stats.get_dict()['resultSets'][0]['headers']
    data = player_stats.get_dict()['resultSets'][0]['rowSet']
    return pd.DataFrame(data=data, columns=header)

In [5]:
# If data already exists, just read it in
# If not, pull it from the endpoint
if step_1_exists:
    # Read the data
    player_agg_stats_subset = pd.read_csv('players_w_200fga.csv', 
                                          sep='|', 
                                          encoding='utf-8')
    print('Data read')
else:
    # Pull the data
    player_agg_stats = get_player_stats_df(measure_type='Base')
    
    # Isolate the subset of fields and filter out players with fewer than 200 FGA
    col_subset = ['PLAYER_ID', 'PLAYER_NAME', 'TEAM_ID', 'TEAM_ABBREVIATION', 'FGA']
    player_agg_stats_subset = player_agg_stats[player_agg_stats.FGA >= 200].loc[:, col_subset]
    
    # Store as CSV to conserve endpoint calls for next time
    player_agg_stats_subset.to_csv('players_w_200fga.csv', 
                                   sep='|', 
                                   encoding='utf-8',
                                   index=False)
    
    print('Data pulled and saved')

Data read


#### Preview data

In [7]:
#table = TableDisplay(player_agg_stats_subset.head())
#table.setAlignmentProviderForColumn('m3', TableDisplayAlignmentProvider.CENTER_ALIGNMENT)
#table.setRendererForColumn("y10", TableDisplayCellRenderer.getDataBarsRenderer(False))
#table.setRendererForType(ColumnType.Double, TableDisplayCellRenderer.getDataBarsRenderer(True))
#display(table)
player_agg_stats_subset.head()

Unnamed: 0,PLAYER_ID,PLAYER_NAME,TEAM_ID,TEAM_ABBREVIATION,FGA
0,203932,Aaron Gordon,1610612753,ORL,727
1,201143,Al Horford,1610612738,BOS,489
2,202329,Al-Farouq Aminu,1610612757,POR,409
3,202692,Alec Burks,1610612758,SAC,460
4,203458,Alex Len,1610612737,ATL,423


#### What is total sum of players with at least 200 FGA through the All-Star Break?

In [8]:
len(player_agg_stats_subset)

280

#### What is the distribution of players per team with at least 200 FGA through the All-Star Break?

In [9]:
(player_agg_stats_subset.groupby('TEAM_ABBREVIATION')
                         .count()['PLAYER_ID']
                         .sort_values(ascending=False))

TEAM_ABBREVIATION
CHI    11
BKN    11
CHA    11
NYK    10
LAL    10
DEN    10
DAL    10
MIL    10
MIN    10
ATL    10
IND    10
PHI    10
PHX    10
POR    10
SAC    10
TOR    10
LAC    10
CLE     9
WAS     9
UTA     9
MIA     9
HOU     8
GSW     8
DET     8
NOP     8
OKC     8
BOS     8
SAS     8
MEM     8
ORL     7
Name: PLAYER_ID, dtype: int64

#### Step 2: Using the player list above, get the game logs for each player
[**PlayerGameLog docs**](https://github.com/swar/nba_api/blob/master/docs/nba_api/stats/endpoints/playergamelog.md)

In [10]:
# Check if data already exists
step_2_exists = os.path.isfile('playerFGP_vs_teamWP.csv')
print(step_2_exists)

True


In [11]:
# Import the appropriate libraries
from nba_api.stats.endpoints import playergamelog

In [12]:
# UDF to get a player's game logs given player ID
def get_player_game_logs(player_id):
    
    # Get game logs data
    p = playergamelog.PlayerGameLog(player_id=player_id, 
                                season_all='2018-19', 
                                season_type_all_star='Regular Season',
                                league_id_nullable='00')
    
    # Convert to DataFrame and return
    header = p.get_dict()['resultSets'][0]['headers']
    data = p.get_dict()['resultSets'][0]['rowSet']
    return pd.DataFrame(data=data, columns=header)

In [13]:
# If data already exists, just read it in
# If not, pull it from the endpoint
if step_2_exists:
    # Read the data
    player_game_logs_trim = pd.read_csv('playerFGP_vs_teamWP.csv', 
                                        sep='|', 
                                        encoding='utf-8')
    print('Data read')

else:
    
    # Iterate through the players and store game logs in global DataFrame
    global_player_game_logs = pd.DataFrame()

    for player in tqdm(player_agg_stats_subset.PLAYER_ID):
        player_df = get_player_game_logs(player)
        global_player_game_logs = global_player_game_logs.append(player_df)
        time.sleep(3)
        
    # Enrich with player information
    global_player_game_logs = global_player_game_logs.merge(player_agg_stats_subset, 
                                                            left_on='Player_ID', 
                                                            right_on='PLAYER_ID')
    # Pare down columns
    col_subset = ['SEASON_ID', 
                  'Player_ID', 
                  'PLAYER_NAME', 
                  'Game_ID', 
                  'GAME_DATE', 
                  'MATCHUP', 
                  'WL',
                  'MIN', 
                  'FGM', 
                  'FGA_x', 
                  'FG_PCT', 
                  'FG3M', 
                  'FG3A', 
                  'FG3_PCT', 
                  'FTM',
                  'FTA', 
                  'FT_PCT', 
                  'OREB', 
                  'DREB', 
                  'REB', 
                  'AST', 
                  'STL', 
                  'BLK', 
                  'TOV',
                  'PF', 
                  'PTS', 
                  'PLUS_MINUS',
                 ]
    global_player_game_logs = global_player_game_logs.loc[:, col_subset]  

    # Clean up column names
    new_col_names = {existing: existing.lower() for existing in col_subset}
    global_player_game_logs.rename(new_col_names, axis=1, inplace=True)
    global_player_game_logs.rename({'fga_x': 'fga'}, axis=1, inplace=True)
    
    # Save full dataset for possible later use
    global_player_game_logs.to_csv('player_game_logs_200FGA_preAS_201819_full.csv',
                                   sep='|',
                                   encoding='utf-8',
                                   index=False)    
    
    # Trim down columns even further for this use case
    col_subset = ['season_id', 
                  'player_id', 
                  'player_name', 
                  'game_id', 
                  'game_date',
                  'matchup', 
                  'wl',
                  'fgm', 
                  'fga', 
                  'fg_pct',]
    player_game_logs_trim = global_player_game_logs.loc[:, col_subset]  
    
    # Store as CSV for later use
    player_game_logs_trim.to_csv('playerFGP_vs_teamWP.csv',
                                 sep='|',
                                 encoding='utf-8',
                                 index=False)
    
    print('Data pulled and saved')

Data read


#### Preview Data

In [14]:
player_game_logs_trim.sample(5)

Unnamed: 0,season_id,player_id,player_name,game_id,game_date,matchup,wl,fgm,fga,fg_pct
11807,22018,203894,Shabazz Napier,21800853,"FEB 13, 2019",BKN @ CLE,W,4,11,0.364
12443,22018,1627752,Taurean Prince,21800128,"NOV 03, 2018",ATL vs. MIA,W,5,11,0.455
3023,22018,201565,Derrick Rose,21800148,"NOV 05, 2018",MIN @ LAC,L,8,20,0.4
84,22018,201143,Al Horford,21800283,"NOV 24, 2018",BOS @ DAL,L,3,11,0.273
5657,22018,1627759,Jaylen Brown,21800216,"NOV 16, 2018",BOS vs. TOR,W,3,11,0.273


#### Number of records

In [15]:
len(player_game_logs_trim)

13977

#### Verify number of players

In [16]:
player_game_logs_trim.player_id.nunique()

280

#### DataFrame info

In [17]:
player_game_logs_trim.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13977 entries, 0 to 13976
Data columns (total 10 columns):
season_id      13977 non-null int64
player_id      13977 non-null int64
player_name    13977 non-null object
game_id        13977 non-null int64
game_date      13977 non-null object
matchup        13977 non-null object
wl             13977 non-null object
fgm            13977 non-null int64
fga            13977 non-null int64
fg_pct         13977 non-null float64
dtypes: float64(1), int64(5), object(4)
memory usage: 1.1+ MB


In [18]:
player_game_logs_trim.describe()

Unnamed: 0,season_id,player_id,game_id,fgm,fga,fg_pct
count,13977.0,13977.0,13977.0,13977.0,13977.0,13977.0
mean,22018.0,729442.8,21800430.0,4.549331,9.835945,0.449468
std,0.0,697444.6,249.1648,3.109229,5.6079,0.206324
min,22018.0,1713.0,21800000.0,0.0,0.0,0.0
25%,22018.0,202339.0,21800210.0,2.0,6.0,0.333
50%,22018.0,203507.0,21800430.0,4.0,9.0,0.455
75%,22018.0,1627750.0,21800640.0,6.0,13.0,0.571
max,22018.0,1629066.0,21800870.0,21.0,38.0,1.0


## Analysis

In [19]:
# Convert WL column to numeric
def wl_to_numeric(val):
    if val is 'W':
        return 1
    else:
        return 0

player_game_logs_trim.loc[:, 'wl_ind'] = player_game_logs_trim.wl.apply(wl_to_numeric)

In [20]:
# Preview data
player_game_logs_trim.sample(5)

Unnamed: 0,season_id,player_id,player_name,game_id,game_date,matchup,wl,fgm,fga,fg_pct,wl_ind
13791,22018,201163,Wilson Chandler,21800667,"JAN 17, 2019",PHI @ IND,W,4,9,0.444,1
8373,22018,1628374,Lauri Markkanen,21800490,"DEC 23, 2018",CHI @ CLE,W,11,18,0.611,1
10299,22018,203943,Noah Vonleh,21800638,"JAN 13, 2019",NYK vs. PHI,L,0,3,0.0,0
209,22018,202692,Alec Burks,21800075,"OCT 27, 2018",UTA @ NOP,W,2,4,0.5,1
5220,22018,1628973,Jalen Brunson,21800821,"FEB 08, 2019",DAL vs. MIL,L,2,6,0.333,0


#### Develop correlation calculation for single player

In [21]:
# Test correlation on single player
filter = player_game_logs_trim.player_name == 'LeBron James'
lebron_logs = player_game_logs_trim[filter]
lebron_logs.head()

Unnamed: 0,season_id,player_id,player_name,game_id,game_date,matchup,wl,fgm,fga,fg_pct,wl_ind
8384,22018,2544,LeBron James,21800848,"FEB 12, 2019",LAL @ ATL,L,8,20,0.4,0
8385,22018,2544,LeBron James,21800835,"FEB 10, 2019",LAL @ PHI,L,8,16,0.5,0
8386,22018,2544,LeBron James,21800814,"FEB 07, 2019",LAL @ BOS,W,11,21,0.524,1
8387,22018,2544,LeBron James,21800798,"FEB 05, 2019",LAL @ IND,L,7,12,0.583,0
8388,22018,2544,LeBron James,21800769,"JAN 31, 2019",LAL @ LAC,W,9,22,0.409,1


In [22]:
# UDF to normalize comparison column
def norm_and_corr_calc(comp_col, corr_to_col):
    
    # Convert Series to DataFrame
    df = pd.DataFrame(comp_col, columns=[comp_col.name])
    
    # Create x, where x the 'scores' column's values as floats
    x = comp_col.values.reshape(-1, 1).astype(float)

    # Create a minimum and maximum processor object
    min_max_scaler = preprocessing.MinMaxScaler()

    # Create an object to transform the data to fit minmax processor
    x_scaled = min_max_scaler.fit_transform(x)

    # Run the normalizer on the dataframe
    comp_col_df = pd.DataFrame(x_scaled, columns=[comp_col.name + '_norm'])
    
    # Add corr_to_col to comp_col_df
    comp_col_df[corr_to_col.name] = corr_to_col.astype('float64').values
    
    # Return correlation matrix
    return comp_col_df.corr(method='spearman').iloc[0,1]

In [23]:
# FGM
norm_and_corr_calc(lebron_logs.fgm, lebron_logs.wl_ind)

0.215271093487328

In [24]:
# FGA
norm_and_corr_calc(lebron_logs.fga, lebron_logs.wl_ind)

0.14057948374044035

In [25]:
# FG%
norm_and_corr_calc(lebron_logs.fg_pct, lebron_logs.wl_ind)

0.1333536728431039

#### Scale up to all players on single team

In [26]:
# Test substring logic for single value
player_game_logs_trim.matchup.sample(1).values[0][0:3]

'TOR'

In [27]:
# Scale to all values
teams = list(set([val[:3] for val in player_game_logs_trim.matchup.unique()]))

In [28]:
# Test for single team
filter = player_game_logs_trim.matchup.str.startswith('LAL')
lakers_player_games = player_game_logs_trim[filter]
lakers_player_games.sample(10)

Unnamed: 0,season_id,player_id,player_name,game_id,game_date,matchup,wl,fgm,fga,fg_pct,wl_ind
1126,22018,1627742,Brandon Ingram,21800235,"NOV 18, 2018",LAL @ MIA,W,6,15,0.4,1
8464,22018,1628366,Lonzo Ball,21800082,"OCT 27, 2018",LAL @ SAS,L,2,8,0.25,0
8451,22018,1628366,Lonzo Ball,21800286,"NOV 25, 2018",LAL vs. ORL,L,4,9,0.444,0
6522,22018,1628404,Josh Hart,21800526,"DEC 28, 2018",LAL vs. LAC,L,5,10,0.5,0
1123,22018,1627742,Brandon Ingram,21800286,"NOV 25, 2018",LAL vs. ORL,L,7,14,0.5,0
7434,22018,203484,Kentavious Caldwell-Pope,21800596,"JAN 07, 2019",LAL @ DAL,W,1,5,0.2,1
8410,22018,2544,LeBron James,21800189,"NOV 11, 2018",LAL vs. ATL,W,10,20,0.5,1
4826,22018,1627826,Ivica Zubac,21800627,"JAN 11, 2019",LAL @ UTA,L,4,10,0.4,0
6527,22018,1628404,Josh Hart,21800453,"DEC 18, 2018",LAL @ BKN,L,2,10,0.2,0
6546,22018,1628404,Josh Hart,21800183,"NOV 10, 2018",LAL @ SAC,W,4,9,0.444,1


In [29]:
rows = []
for player in lakers_player_games.player_name.unique():
    
    # Isolate Player
    player_logs = lakers_player_games[lakers_player_games.player_name == player]

    # Calculate correlations
    fga_corr = norm_and_corr_calc(player_logs.fga, player_logs.wl_ind)
    fgm_corr = norm_and_corr_calc(player_logs.fga, player_logs.wl_ind)
    fgp_corr = norm_and_corr_calc(player_logs.fga, player_logs.wl_ind)
    
    # Append values
    rows.append([player, fga_corr, fgm_corr, fgp_corr])
    
lakers_corr_df = pd.DataFrame(rows, columns=['player', 'fga_corr', 'fgm_corr', 'fg_pct_corr'])

In [30]:
lakers_corr_df.dropna().sort_values('fg_pct_corr', ascending=False)

Unnamed: 0,player,fga_corr,fgm_corr,fg_pct_corr
7,LeBron James,0.140579,0.140579,0.140579
4,Kentavious Caldwell-Pope,0.077221,0.077221,0.077221
2,JaVale McGee,0.071138,0.071138,0.071138
8,Lonzo Ball,0.02058,0.02058,0.02058
1,Ivica Zubac,-0.100279,-0.100279,-0.100279
5,Kyle Kuzma,-0.209163,-0.209163,-0.209163
0,Brandon Ingram,-0.212332,-0.212332,-0.212332
6,Lance Stephenson,-0.258927,-0.258927,-0.258927
3,Josh Hart,-0.324799,-0.324799,-0.324799


#### Scale to all players on all teams

In [41]:
# Empty list to persist rows
rows = []

# Iterate through each team
for team in tqdm(teams):
    
    # Isolate player game logs for team
    filter = player_game_logs_trim.matchup.str.startswith(team)
    tm_df = player_game_logs_trim[filter]
    
    # For each player with game on team
    for player in tm_df.player_name.unique():

        # Isolate Player
        player_logs = tm_df[tm_df.player_name == player]

        # Calculate correlations
        fga_corr = norm_and_corr_calc(player_logs.fga, player_logs.wl_ind)
        fgm_corr = norm_and_corr_calc(player_logs.fgm, player_logs.wl_ind)
        fgp_corr = norm_and_corr_calc(player_logs.fg_pct, player_logs.wl_ind)

        # Append values
        rows.append([team, player, fga_corr, fgm_corr, fgp_corr, len(player_logs)])

corr_df = pd.DataFrame(rows, columns=['team', 'player', 'fga_corr', 'fgm_corr', 'fg_pct_corr', 'gp']).dropna()

100%|██████████| 30/30 [00:03<00:00,  9.43it/s]


In [42]:
corr_df[corr_df.gp >= 20].sort_values('fg_pct_corr', ascending=False).sample(15)

Unnamed: 0,team,player,fga_corr,fgm_corr,fg_pct_corr,gp
215,LAC,Tobias Harris,0.105085,0.252027,0.254523,55
105,UTA,Kyle Korver,0.247469,0.525569,0.540087,35
78,DEN,Trey Lyles,-0.139381,-0.167773,-0.204884,55
283,DAL,Devin Harris,-0.177245,0.078855,0.254726,45
85,OKC,Steven Adams,0.057414,-0.052965,-0.126474,55
318,DET,Andre Drummond,0.1553,0.185439,0.081605,53
5,BKN,Jarrett Allen,0.144197,0.182663,-0.038601,57
109,CHA,Cody Zeller,0.076414,0.18234,0.135175,40
257,GSW,Stephen Curry,-0.193115,0.020037,0.277973,46
274,IND,Domantas Sabonis,0.102039,0.24623,0.231022,56


In [34]:
#!jupyter nbextension enable --py --sys-prefix widgetsnbextension

In [44]:
import ipywidgets as widgets

# Dropdown to select team
team_dropdown = widgets.Dropdown(
                            options=['ALL TEAMS'] + sorted(teams),
                            value='ALL TEAMS',
                            description='Select Team: ',
                            disabled=False,
                        )

# Slider to select minimum number of games played
gp_min_slider = widgets.IntSlider(
                            value=25,
                            min=corr_df.gp.min(),
                            max=corr_df.gp.max(),
                            step=1,
                            description='Min. GP:',
                            disabled=False,
                            continuous_update=False,
                            orientation='horizontal',
                            readout=True,
                            readout_format='d',
                        )
"""
# Button to output DF
output_button = widgets.Button(
                            description='Update Data',
                            disabled=False,
                        )
# Function to be executed on button click
def on_button_clicked(b):
    team_filter = corr_df.team == team_dropdown.value
    min_gp_filter = corr_df.gp >= gp_min_slider.value
    corr_df[(team_filter) & (min_gp_filter)].sort_values('fg_pct_corr', ascending=False)
    
# Set button click action
output_button.on_click(on_button_clicked)
"""

def view(x='',y=25):
    team_filter = corr_df.team == x
    min_gp_filter = corr_df.gp >= y
         
    if x=='ALL TEAMS': return corr_df[min_gp_filter].sort_values('fg_pct_corr', ascending=False).head(15)
    corr_df[(team_filter) & (min_gp_filter)].sort_values('fg_pct_corr', ascending=False)
         
widgets.interactive(view, x=team_dropdown, y=gp_min_slider)

interactive(children=(Dropdown(description='Select Team: ', options=('ALL TEAMS', 'ATL', 'BKN', 'BOS', 'CHA', …

In [45]:
team_filter = corr_df.team == team_dropdown.value
min_gp_filter = corr_df.gp >= gp_min_slider.value
corr_df[(team_filter) & (min_gp_filter)].sort_values('fg_pct_corr', ascending=False)

Unnamed: 0,team,player,fga_corr,fgm_corr,fg_pct_corr,gp
154,PHI,Wilson Chandler,-0.231887,0.112379,0.244591,36
148,PHI,Landry Shamet,-0.011179,0.206302,0.24192,54
145,PHI,Jimmy Butler,-0.438663,-0.154097,0.167398,36
149,PHI,Mike Muscala,-0.032856,0.165081,0.131327,47
142,PHI,Furkan Korkmaz,-0.047758,0.001739,0.10948,46
143,PHI,JJ Redick,-0.232466,-0.147384,0.040393,54
146,PHI,Joel Embiid,-0.092388,0.008899,0.036569,54
140,PHI,Ben Simmons,-0.096657,-0.077879,0.008952,57
152,PHI,T.J. McConnell,0.032007,0.080097,-0.031696,52
