In [1]:
import matplotlib.pyplot as plt
import requests
import pandas as pd
import numpy as np
from json import loads

plt.style.use('seaborn')
%matplotlib inline

HEADERS = {
            'user-agent': ('Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'), 
            'Dnt': ('1'),
            'Accept-Encoding': ('gzip, deflate, sdch'),
            'Accept-Language': ('en'),
            'origin': ('http://stats.nba.com')
            }

In [2]:
def get_pbp(gameid):
    url = "http://stats.nba.com/stats/playbyplayv2?EndPeriod=10&EndRange=55800&GameID={}" +\
    "&RangeType=2&Season=2015-16&SeasonType=Regular+Season&StartPeriod=1&StartRange=0"
    page = requests.get(url.format(gameid), headers = HEADERS)
    
    dic = loads(page.text)
    pbp_headers = dic['resultSets'][0]['headers']
    pbp_data = dic['resultSets'][0]['rowSet']
    
    pbp_df = pd.DataFrame(pbp_data, columns=pbp_headers)
    return pbp_df

In [3]:
def gamelog(season):
    URL = 'http://stats.nba.com/stats/{}'
    endpoint = 'leaguegamelog'

    p_master=None

    seasonstr = str(season)+'-'+str(season+1)[2:]

    json =  requests.get(URL.format(endpoint),
                         params= {'LeagueID': '00',
                                  'Season': seasonstr,
                                  'SeasonType': 'Regular Season',
                                  'PlayerOrTeam': 'T',
    #                               'Counter': '1000',
                                  'Sorter': 'PTS',
                                  'Direction': 'DESC'
                                 },
                         headers = HEADERS
                        ).json()

    headers = json['resultSets'][0]['headers']
    values = json['resultSets'][0]['rowSet']

    df = pd.DataFrame(values, columns = headers)

    gamelist = df['GAME_ID'].unique()
    return gamelist

In [159]:
test = gamelog(2017)
pbp = get_pbp(test[-1])

In [160]:
# Get number of points for each team

# Mark all made field goals
cond_madefg = (pbp['EVENTMSGTYPE'] == 1)
pbp['SHOT_VALUE'] = np.where(cond_madefg, 2, 0)

# Mark all made 3 point field goals
cond_3pt = ((pbp['EVENTMSGTYPE'] == 1) & 
            (pbp['HOMEDESCRIPTION'].map(str) +
             pbp['VISITORDESCRIPTION'].map(str)).str.contains('3PT')
           )
pbp['SHOT_VALUE'] = np.where(cond_3pt, 3, pbp['SHOT_VALUE'])

# Mark all free throws
cond_ft = pbp['EVENTMSGTYPE'] == 3
pbp['SHOT_VALUE'] = np.where(cond_ft, 1, pbp['SHOT_VALUE'])

# Mark all plays that resulted in points for the away team
cond_away = pbp['VISITORDESCRIPTION'].str.contains('PTS\)')
pbp['AWAY_SHOT_VALUE'] = np.where(cond_away, pbp['SHOT_VALUE'], 0)

# Mark all plays that resulted in points for the home team
cond_home = pbp['HOMEDESCRIPTION'].str.contains('PTS\)')
pbp['HOME_SHOT_VALUE'] = np.where(cond_home, pbp['SHOT_VALUE'], 0)

In [161]:
# Mark each change of possession

# Made field goals
cond_madefg = (pbp['EVENTMSGTYPE'] == 1)
pbp['POSS_CHANGE'] = np.where(cond_madefg, 1, 0)

# Made free throws (2 of 2, 3 of 3)
cond_madeft = ((pbp['EVENTMSGTYPE'] == 3) &
           (pbp['EVENTMSGACTIONTYPE'].isin([12, 15])) &
           ((pbp['HOMEDESCRIPTION'].map(str) +
             pbp['VISITORDESCRIPTION'].map(str)).str.contains('PTS\)'))
          )
pbp['POSS_CHANGE'] = np.where(cond_madeft, 1, pbp['POSS_CHANGE'])

# Missed and-1 free throw
cond_missand1 = ((pbp['EVENTMSGTYPE'] == 3) &
                 (pbp['EVENTMSGACTIONTYPE'] == 10) &
                 ((pbp['HOMEDESCRIPTION'].map(str) +
                   pbp['VISITORDESCRIPTION'].map(str)).str.contains('MISS '))
                )
pbp['POSS_CHANGE'] = np.where(cond_missand1, -1, pbp['POSS_CHANGE'])

# Turnovers
cond_turnover = (pbp['EVENTMSGTYPE'] == 5)
pbp['POSS_CHANGE'] = np.where(cond_turnover, 1, pbp['POSS_CHANGE'])

# End of period
cond_period = (pbp['EVENTMSGTYPE'] == 13)
pbp['POSS_CHANGE'] = np.where(cond_period, 1, pbp['POSS_CHANGE'])

# Defensive rebounds

# Determine which team the event corresponds to
cond_hometeam = (pbp['PERSON1TYPE'].isin([2,4]))
pbp['EVENT_TEAM'] = np.where(cond_hometeam, 'HOME', None)

cond_awayteam = (pbp['PERSON1TYPE'].isin([3,5]))
pbp['EVENT_TEAM'] = np.where(cond_awayteam, 'AWAY', pbp['EVENT_TEAM'])

# Get the prior event
pbp['PRIOR_EVENT'] = pbp['EVENTMSGTYPE'].shift(1)
pbp['PRIOR_TEAM'] = pbp['EVENT_TEAM'].shift(1)

# The event preceding the rebound should always be a shot
# Count the defensive rebounds by counting where the rebound and previous event are different teams
cond_defreb = ((pbp['EVENT_TEAM'] != pbp['PRIOR_TEAM']) &
               (pbp['EVENTMSGTYPE'] == 4) &
               (pbp['PRIOR_EVENT'].isin([2,3]))
              )
pbp['POSS_CHANGE'] = np.where(cond_defreb, 1, pbp['POSS_CHANGE'])

In [162]:
# Remove changes in possession which occur with less than 4 seconds in period
pbp['PLAY_CLOCK'] = pd.to_timedelta('00:' + pbp['PCTIMESTRING'].map(str)).dt.total_seconds()
idx = pbp.loc[((pbp['POSS_CHANGE'] != 0) &
               (pbp['EVENTMSGTYPE'] != 13) &
               (pbp['PLAY_CLOCK'] <= 4)
              )].groupby('PERIOD').tail(1).index
pbp.loc[idx, 'POSS_CHANGE'] = 0
pbp['TOTAL_POSS'] = pbp['POSS_CHANGE'].cumsum()

In [163]:
# Get which players are on the court
# Create a column for every player that played in the game
teams = set(pbp['PLAYER1_TEAM_ID'].unique())
players = set(list(pbp['PLAYER1_ID'].unique()) +
              list(pbp['PLAYER2_ID'].unique()) +
              list(pbp['PLAYER3_ID'].unique()))
players = list(players - teams - set([0]))
periods = pbp['PERIOD'].unique()

In [170]:
# Substitution events
total_time = 0
for player_id in players:
    pbp[player_id] = 0
#     player_id = players[2]
    play_time = 0
    for period in periods:
#         period = periods[3]

        period_df = pbp.loc[pbp['PERIOD'] == period]
        period_players = set(list(period_df['PLAYER1_ID'].unique()) +
                             list(period_df['PLAYER2_ID'].unique()) +
                             list(period_df['PLAYER3_ID'].unique()))
        period_players = list(period_players - teams - set([0]))
        if player_id not in period_players:
            continue
            
        subs = pbp.loc[(((pbp['PLAYER2_ID'] == player_id) | (pbp['PLAYER1_ID'] == player_id)) &
                        (pbp['EVENTMSGTYPE'] == 8) &
                        (pbp['PERIOD'] == period)), ['PLAYER2_ID', 'PLAY_CLOCK']].values
        period_length = pbp.loc[pbp['PERIOD'] == period, 'PLAY_CLOCK'].max()
        
        if len(subs) == 0:
            subs = np.insert(subs, 0, [player_id, period_length+1], axis=0)
        elif subs[0][0] != player_id:
            subs = np.insert(subs, 0, [player_id, period_length+1], axis=0)

        for sub in subs:
            # Entering the game
            if sub[0] == player_id:
                pbp.loc[(pbp['PLAY_CLOCK'] < sub[1]) & (pbp['PERIOD'] == period), player_id] = 1
                play_time += sub[1]
            
            # Exiting the game
            else:
                pbp.loc[(pbp['PLAY_CLOCK'] < sub[1]) & (pbp['PERIOD'] == period), player_id] = 0
                play_time -= sub[1]
    print("{} played {} minutes".format(player_id, play_time/60))
    total_time += play_time
print(total_time/60)

202692 played 22.183333333333334 minutes
203079 played 32.43333333333333 minutes
201609 played 32.85 minutes
203918 played 34.68333333333333 minutes
201937 played 22.883333333333333 minutes
1626196 played 34.55 minutes
202324 played 24.4 minutes
202327 played 10.15 minutes
203482 played 17.283333333333335 minutes
1628378 played 35.93333333333333 minutes
204060 played 31.316666666666666 minutes
201949 played 22.9 minutes
203497 played 35.31666666666667 minutes
201961 played 15.266666666666667 minutes
1626159 played 21.716666666666665 minutes
202355 played 30.783333333333335 minutes
204020 played 32.55 minutes
200757 played 15.7 minutes
201973 played 7.766666666666667 minutes
480.666666667


In [165]:
pbp['NUM_PLAYERS'] = pbp[players].sum(axis=1)
pbp.groupby(players)[['HOME_SHOT_VALUE', 'AWAY_SHOT_VALUE', 'POSS_CHANGE']].sum().reset_index()