In [1]:
from nba_api.stats.endpoints import leaguegamelog
import pandas as pd
import numpy as np

In [2]:
# Build the nba_api LeagueGameLog endpoint object for season '2018'
# (presumably the stats.nba.com request is issued here — confirm).
games = leaguegamelog.LeagueGameLog(season = '2018')

In [3]:
# Take the first frame returned by the endpoint and wrap it as the working DataFrame.
df = pd.DataFrame(games.get_data_frames()[0])

In [None]:
# Display the unfiltered game log.
df

In [None]:
# Restrict the game log to the team/game identifiers and the box-score statistics
df = df.loc[:, [
    'TEAM_ID', 'TEAM_ABBREVIATION', 'TEAM_NAME',
    'GAME_ID', 'GAME_DATE', 'MATCHUP', 'WL', 'MIN',
    'FGM', 'FGA', 'FG_PCT',
    'FG3M', 'FG3A', 'FG3_PCT',
    'FTM', 'FTA', 'FT_PCT',
    'OREB', 'DREB', 'REB',
    'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'PLUS_MINUS',
]]

In [4]:
# Distinct TEAM_NAME values found in the game log.
teams = df['TEAM_NAME'].unique()

In [None]:
# Show the distinct team names.
teams

In [None]:
# Empty dict; nothing in the visible cells reads or fills it afterwards.
stat_dict = {}

In [None]:
# Split the log by venue: rows whose MATCHUP contains "vs." go to home_df and
# rows containing "@" go to away_df. regex = False makes the match literal;
# as a regular expression the "." in "vs." would match any character.
home_df = df[df['MATCHUP'].str.contains("vs.", regex = False)]
away_df = df[df['MATCHUP'].str.contains("@", regex = False)]

In [None]:
# Pair each home row with the away row sharing its GAME_ID; columns present in
# both frames receive the _H (home) / _A (away) suffix.
merged = home_df.merge(away_df, on = "GAME_ID",suffixes = ("_H", "_A"))

In [None]:
# GAME_DATE exists in both inputs of the merge, so it only survives with a
# suffix (GAME_DATE_H / GAME_DATE_A); order the games by the home copy.
merged = merged.sort_values(by = ['GAME_DATE_H'])

In [20]:
#Can use this to get means and standard deviations
def generate_game_stats(game_df, teams):
    """Build one row per game holding both sides' trailing five-game averages.

    For every team in ``teams`` the rows of ``game_df`` are ordered by
    ``GAME_DATE`` and each box-score statistic is replaced by the mean of the
    team's five previous games (``rolling(5).mean().shift(1)``), so a row
    never includes the game it describes.  The first five games of each team
    have no complete five-game history and are dropped.  The per-team rows are
    then split into home (MATCHUP contains "vs.") and away (MATCHUP contains
    "@") rows and joined on ``GAME_ID``.

    Parameters
    ----------
    game_df : pandas.DataFrame
        One row per team per game. Must contain ``TEAM_NAME``, ``GAME_ID``,
        ``GAME_DATE``, ``MATCHUP``, ``WL`` and the 19 statistic columns
        listed in ``stat_cols`` below.
    teams : iterable of str
        Team names (values of ``TEAM_NAME``) to include.

    Returns
    -------
    pandas.DataFrame
        One row per game where both sides have history, sorted by
        ``GAME_DATE_H``; home columns carry the ``_H`` suffix and away
        columns the ``_A`` suffix.

    Raises
    ------
    ValueError
        If ``teams`` is empty.
    """
    stat_cols = [
        'FGM', 'FGA', 'FG_PCT', 'FG3M',
        'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST',
        'STL', 'BLK', 'TOV', 'PF', 'PTS', 'PLUS_MINUS']
    key_cols = ['GAME_DATE', 'TEAM_NAME', 'MATCHUP', 'WL']

    per_team_frames = []
    for team in teams:
        # Use the frame that was passed in, not a notebook-level global.
        team_games = (
            game_df[game_df['TEAM_NAME'] == team]
            .sort_values(by = ['GAME_DATE'])
            .set_index('GAME_ID')
        )
        key_data = team_games[key_cols].iloc[5:]
        # shift(1) pushes each window down one row so a game only sees earlier games.
        trailing_means = (
            team_games[stat_cols].rolling(5).mean().shift(periods = 1).iloc[5:]
        )
        per_team_frames.append(pd.concat([trailing_means, key_data], axis = 1))

    if not per_team_frames:
        raise ValueError("teams must contain at least one team name")

    # Concatenate once instead of growing a DataFrame inside the loop.
    all_teams_stats_df = pd.concat(per_team_frames, axis = 0)

    # regex = False: match "vs." literally ("." is a wildcard in a regex).
    is_home = all_teams_stats_df['MATCHUP'].str.contains("vs.", regex = False)
    is_away = all_teams_stats_df['MATCHUP'].str.contains("@", regex = False)
    home_df = all_teams_stats_df[is_home]
    away_df = all_teams_stats_df[is_away]

    # GAME_ID is the index of both frames; merge accepts an index level name in `on`.
    merged_game_stats_df = home_df.merge(away_df, on = "GAME_ID", suffixes = ("_H", "_A"))
    merged_game_stats_df = merged_game_stats_df.sort_values(by = ['GAME_DATE_H'])

    return merged_game_stats_df

In [39]:
#Function to get population mean and standard deviations - might be stupid but I think it makes sense
#Make sure to use this on train and test set separately 
#After this, make these stats for each game their own matrices, so I could do ((game_matrix - mean_matrix) / std_matrix)
def generate_population_statistics(stats_df, num_teams = 30):
    """Compute, for every game, league-wide mean and std of the team averages.

    For game ``i`` the frame is scanned outwards from row ``i``, alternating
    one row backwards (``i``, ``i-1``, ...) and one row forwards (``i+1``,
    ``i+2``, ...).  Within a row the home side is inspected before the away
    side.  The first statistic row met for each distinct team is kept until
    ``num_teams`` teams have been collected or the frame is exhausted, and the
    mean and population standard deviation (``ddof = 0``) of the kept rows are
    recorded for that game.

    The scan stops when both directions run off the frame, so a frame holding
    fewer than ``num_teams`` distinct teams still terminates; the statistics
    are then taken over the teams that were found.

    Parameters
    ----------
    stats_df : pandas.DataFrame
        Output of ``generate_game_stats``: one row per game with
        ``TEAM_NAME_H`` / ``TEAM_NAME_A`` and the 19 statistic columns in
        ``_H`` and ``_A`` variants.
    num_teams : int, default 30
        Number of distinct teams to sample around each game.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``game_means`` and ``game_stds``, each of shape
        ``(len(stats_df), 19)``, rows aligned with ``stats_df``.

    Raises
    ------
    ValueError
        If ``num_teams`` is smaller than 1.
    """
    stat_names = [
        'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM',
        'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL',
        'BLK', 'TOV', 'PF', 'PTS', 'PLUS_MINUS']

    if num_teams < 1:
        raise ValueError("num_teams must be at least 1")

    num_games = stats_df.shape[0]
    num_stats = len(stat_names)

    game_means = np.empty((num_games, num_stats))
    game_stds = np.empty((num_games, num_stats))
    if num_games == 0:
        return game_means, game_stds

    # Extract names and statistics once as arrays; per-row DataFrame lookups
    # inside the scan are far slower. Order matters: home first, then away.
    sides = [
        (
            stats_df['TEAM_NAME_' + suffix].to_numpy(),
            stats_df[[name + '_' + suffix for name in stat_names]].to_numpy(dtype = float),
        )
        for suffix in ('H', 'A')
    ]

    for i in range(num_games):
        seen_teams = set()
        sampled_rows = []
        back = i
        fwd = i + 1

        # Stop when enough teams are collected or both directions are exhausted.
        while len(seen_teams) < num_teams and (back >= 0 or fwd < num_games):
            candidates = []
            if back >= 0:
                candidates.append(back)
                back -= 1
            if fwd < num_games:
                candidates.append(fwd)
                fwd += 1

            for row in candidates:
                for team_names, team_stats in sides:
                    team = team_names[row]
                    if team in seen_teams or len(seen_teams) >= num_teams:
                        continue
                    seen_teams.add(team)
                    sampled_rows.append(team_stats[row])

        population = np.vstack(sampled_rows)
        game_means[i] = population.mean(axis = 0)
        game_stds[i] = population.std(axis = 0)

    return game_means, game_stds
    

In [21]:
# One row per game with both sides' trailing five-game averages.
stats_df = generate_game_stats(df, teams)

In [40]:
# arr1 holds the per-game means and arr2 the per-game standard deviations
# (the function returns them in that order, one row per row of stats_df).
arr1, arr2 = generate_population_statistics(stats_df)

In [30]:
# Spot-check two of the home-side trailing averages in the first row.
stats_df.iloc[0][['FGM_H', 'FGA_H']]

FGM_H    39.6
FGA_H    94.6
Name: 0021800068, dtype: object

In [23]:
# Preview the first 20 rows of the merged per-game frame.
stats_df.head(20)

Unnamed: 0_level_0,FGM_H,FGA_H,FG_PCT_H,FG3M_H,FG3A_H,FG3_PCT_H,FTM_H,FTA_H,FT_PCT_H,OREB_H,...,STL_A,BLK_A,TOV_A,PF_A,PTS_A,PLUS_MINUS_A,GAME_DATE_A,TEAM_NAME_A,MATCHUP_A,WL_A
GAME_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21800068,39.6,94.6,0.4178,11.0,31.6,0.3474,16.2,21.4,0.7628,10.8,...,7.8,5.6,17.6,25.4,119.4,9.8,2018-10-26,Golden State Warriors,GSW @ NYK,W
21800081,44.2,93.2,0.4758,16.0,41.8,0.3844,16.2,21.2,0.7474,11.4,...,6.6,6.0,13.8,18.4,102.8,-8.2,2018-10-27,Orlando Magic,ORL @ MIL,L
21800077,40.0,91.0,0.4384,7.2,22.2,0.3172,18.2,22.6,0.8108,13.0,...,7.2,5.2,14.6,22.6,110.2,8.2,2018-10-27,Indiana Pacers,IND @ CLE,W
21800078,42.0,92.8,0.4512,11.8,34.4,0.338,18.2,24.2,0.7646,9.2,...,6.4,7.2,12.0,19.2,116.8,7.8,2018-10-27,Charlotte Hornets,CHA @ PHI,L
21800083,39.8,85.6,0.4668,13.2,33.6,0.3808,14.4,19.6,0.732,10.0,...,8.4,5.4,16.2,22.8,123.4,13.8,2018-10-28,Golden State Warriors,GSW @ BKN,W
21800084,40.6,92.4,0.4386,13.2,40.0,0.3324,18.8,25.0,0.7588,11.4,...,7.8,5.2,16.2,21.4,112.4,5.8,2018-10-28,Utah Jazz,UTA @ DAL,W
21800086,39.2,84.8,0.4634,8.2,24.4,0.3362,26.0,30.2,0.8638,9.0,...,7.8,7.8,13.6,27.2,116.8,-6.0,2018-10-28,Washington Wizards,WAS @ LAC,L
21800091,39.0,82.4,0.4738,11.6,28.4,0.4102,18.4,24.0,0.763,7.4,...,8.6,5.6,15.4,21.4,122.6,14.8,2018-10-29,Golden State Warriors,GSW @ CHI,W
21800092,44.0,92.6,0.477,15.2,41.0,0.3686,17.4,21.8,0.78,10.4,...,7.4,5.8,15.0,21.8,117.0,10.6,2018-10-29,Toronto Raptors,TOR @ MIL,L
21800087,43.8,86.4,0.512,10.4,22.6,0.4448,13.8,19.8,0.719,8.0,...,6.6,6.2,13.8,21.8,122.4,5.2,2018-10-29,Portland Trail Blazers,POR @ IND,W


In [None]:
# NOTE(review): `stats_arr` is never assigned in the visible cells, so on a
# fresh kernel this raises NameError — presumably a leftover; confirm or delete.
stats_arr

In [None]:
#Function to flatten stats dict and find averages. Would show average and std dev. of team attributes for 5th,
# 6th... 82nd game

# Number of distinct team names found in the log.
num_teams = teams.size

In [None]:
#test: build the trailing five-game averages for a single team
team = 'Oklahoma City Thunder'
temp_df = (
    df[df['TEAM_NAME'] == team]
    .sort_values(by = ['GAME_DATE'])
    .set_index('GAME_ID')
    [['FGM', 'FGA', 'FG_PCT',
      'FG3M', 'FG3A', 'FG3_PCT',
      'FTM', 'FTA', 'FT_PCT',
      'OREB', 'DREB', 'REB',
      'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'PLUS_MINUS']]
)
# Mean of the five preceding games for every row; the first five rows are dropped
final_stats_df = temp_df.rolling(window = 5).mean().shift(1).iloc[5:]
single_stats_arr = final_stats_df.to_numpy()
# Uninitialised buffer with fixed dimensions 30 x 77 x 19
all_stats_arr = np.empty(shape = (30, 77, 19))


In [None]:
# Scratch version of the per-team step, run for one team only.
# Neither dict is used again in the visible cells.
stats_calc_dict = {}
descriptive_dict = {}
# Fixed-size, uninitialised buffer: 30 x 77 x 19.
all_stats_arr = np.empty((30, 77, 19))
team = 'Oklahoma City Thunder'
temp_df = df[df['TEAM_NAME'] == team]
temp_df = temp_df.sort_values(by = ['GAME_DATE'])
temp_df = temp_df.set_index('GAME_ID')
# Capture the descriptive columns now: temp_df is narrowed to the statistic
# columns in the next statement. The first five rows are skipped to match
# the trailing averages computed below.
key_data = temp_df[['TEAM_NAME', 'MATCHUP', 'WL']].iloc[5:]
temp_df = temp_df[[
           'FGM', 'FGA', 'FG_PCT', 'FG3M',
           'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST',
           'STL', 'BLK', 'TOV', 'PF', 'PTS', 'PLUS_MINUS']]
# Mean of the five preceding games for every row (window of 5 shifted down by one).
final_stats_df = temp_df.rolling(5).mean().shift(periods = 1).iloc[5:]
single_stats_arr = final_stats_df.to_numpy()
# This assignment only succeeds if single_stats_arr fits a (77, 19) slot.
all_stats_arr[0] = single_stats_arr
        
# Same expression as final_stats_df above, computed a second time.
final_organized_stats = temp_df.rolling(5).mean().shift(periods = 1).iloc[5:]
# Put the trailing averages and the descriptive columns side by side.
stats_with_key = pd.concat([final_organized_stats, key_data], axis = 1)

In [None]:
# Start from an empty frame that per-team results are appended to.
all_teams_stats_df = pd.DataFrame()

In [None]:
# Append the single-team result as extra rows. Re-running this cell appends
# the same rows again, because the frame is rebuilt from itself.
all_teams_stats_df = pd.concat([all_teams_stats_df, stats_with_key], axis = 0)

In [None]:
# Reset temp_df to all columns of the selected team's rows and preview ten of them.
temp_df = df[df['TEAM_NAME'] == team]
temp_df.head(10)

In [None]:
# Display the trailing averages together with their descriptive columns.
stats_with_key

In [None]:
# NOTE(review): if the `temp_df = df[...]` cell above ran last, temp_df still
# contains the text columns (TEAM_NAME, MATCHUP, ...) and is not sorted by
# date; recent pandas versions may refuse to average those — confirm the
# intended input is the numeric-only, date-sorted frame.
final_organized_stats = temp_df.rolling(5).mean().shift(periods = 1).iloc[5:]

In [None]:
# Display the trailing five-game averages.
final_organized_stats

In [None]:
# Check the (rows, columns) shape of the trailing averages as an array.
final_stats_df.to_numpy().shape

In [None]:
# Turn one team's entry of dict2 into a frame, one row per top-level key.
# `dict2` is only assigned in the cell that follows this one.
pd.DataFrame.from_dict(dict2['Oklahoma City Thunder'], orient = 'index')

In [None]:
# NOTE(review): `generate_team_stats` is not defined in any visible cell, so
# this raises NameError on a fresh kernel — presumably an earlier name of
# generate_game_stats (which returns a single frame, not two values); confirm.
dict1, dict2 = generate_team_stats(df, teams)

In [None]:
# Display the second value returned by the call above.
dict2

In [None]:
# All rows of the game log for one team (no date sort applied here).
temp_df = df[df['TEAM_NAME'] == 'Oklahoma City Thunder']

In [None]:
# Move GAME_ID from a column into the index. Not re-runnable as is: a second
# run finds no GAME_ID column left to use.
temp_df = temp_df.set_index('GAME_ID')

In [None]:
# GAME_ID was moved into the index by the preceding set_index cell, so it is
# no longer a column; read the ids from the index instead.
game_ids = temp_df.index.to_series()

In [None]:
# Keep only the 19 box-score statistic columns.
temp_df = temp_df[[
           'FGM', 'FGA', 'FG_PCT', 'FG3M',
           'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST',
           'STL', 'BLK', 'TOV', 'PF', 'PTS', 'PLUS_MINUS']]
# Show, for each row, the mean of the five preceding rows (first five rows dropped).
# NOTE(review): no GAME_DATE sort happens in this cell sequence — confirm the
# rows are already chronological.
temp_df.rolling(5).mean().shift(periods = 1).iloc[5:]

In [None]:
# From the sixth position onward, show the id found five rows earlier.
game_ids.shift(periods = 5).iloc[5:]

In [None]:
# NOTE(review): the column selection a few cells above does not keep
# GAME_DATE, so when cells run top to bottom this raises KeyError; the sort
# has to happen before the columns are narrowed.
temp_df = temp_df.sort_values(by = ['GAME_DATE'])

In [None]:
# Narrow temp_df to the 19 box-score statistic columns.
temp_df = temp_df[[
       'FGM', 'FGA', 'FG_PCT', 'FG3M',
       'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST',
       'STL', 'BLK', 'TOV', 'PF', 'PTS', 'PLUS_MINUS']]

In [None]:
# Inspect the last ten rows of the shifted five-row rolling means.
temp_df.rolling(5).mean().shift(periods = 1).tail(10)

In [None]:
# Preview the first five rows of the game log.
df.head(5)

In [None]:
#Split data frames into home and away teams
# Rows whose MATCHUP contains "vs." go to home_df, rows containing "@" to away_df.
# regex = False makes the match literal; as a regular expression the "." in
# "vs." would match any character.
home_df = df[df['MATCHUP'].str.contains("vs.", regex = False)]
away_df = df[df['MATCHUP'].str.contains("@", regex = False)]

In [None]:
# Pair each home row with the away row sharing its GAME_ID; columns present in
# both frames receive the _H (home) / _A (away) suffix.
merged = home_df.merge(away_df, on = "GAME_ID",suffixes = ("_H", "_A"))

In [None]:
# Display the merged home/away frame.
merged

In [None]:
#make list of all team names, for loop filter by team n times, and make the rolling dict values?
# Can then go in and replace values/make new df with the normalized vals