In [27]:
from nba_api.stats.endpoints import leaguegamelog
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import time
import xgboost as xgb
import json

In [26]:
# Show every column when a DataFrame is displayed (no truncation of wide frames).
pd.options.display.max_columns = None

In [5]:
def get_advanced_stats(year):
    """Load the advanced box-score table stored in ``f"{year}.json"``.

    The file is expected to contain a ``resultSets`` list whose first entry
    has ``headers`` (column names) and ``rowSet`` (row values).

    Parameters
    ----------
    year : int or str
        Interpolated into the file name ``f"{year}.json"``; the path is
        resolved relative to the current working directory.

    Returns
    -------
    pandas.DataFrame
        One row per ``rowSet`` entry, with columns named after ``headers``.

    Raises
    ------
    FileNotFoundError
        If ``f"{year}.json"`` does not exist.
    """
    # Use a context manager so the file handle is closed deterministically.
    with open(f"{year}.json", encoding="utf-8") as stats_file:
        data = json.load(stats_file)

    result_set = data['resultSets'][0]
    advanced_stats = pd.DataFrame(result_set['rowSet'], columns=result_set['headers'])
    return advanced_stats

In [6]:
#Can use this to get means and standard deviations
def generate_game_stats(df, teams, year):
    """Build one row per game holding each side's rolling pre-game averages.

    For every team, the team's rows in ``df`` are merged with the advanced
    stats loaded by ``get_advanced_stats(year)``, ordered by ``GAME_DATE``,
    and averaged over the previous 7 games (the current game is excluded via
    a one-row shift). The first 7 games of each team have no complete window
    and are dropped. Home rows (``MATCHUP`` contains ``"vs."``) and away rows
    (``MATCHUP`` contains ``"@"``) are then joined on ``GAME_ID``.

    Parameters
    ----------
    df : pandas.DataFrame
        Game log with at least ``TEAM_NAME``, ``TEAM_ID``, ``GAME_ID``,
        ``GAME_DATE``, ``MATCHUP``, ``WL`` and the basic box-score columns.
    teams : iterable
        ``TEAM_NAME`` values to process.
    year : int or str
        Forwarded to ``get_advanced_stats``.

    Returns
    -------
    pandas.DataFrame
        Home columns suffixed ``_H``, away columns suffixed ``_A``, sorted by
        ``GAME_DATE_H``. Only games where both sides survived the 7-game
        warm-up are present (the join uses the default merge behaviour).

    Raises
    ------
    ValueError
        If ``teams`` is empty.
    """
    window = 7
    key_columns = ['GAME_DATE', 'TEAM_NAME', 'MATCHUP', 'WL']
    stat_columns = [
        'FGM', 'FGA', 'FG_PCT', 'FG3M',
        'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST',
        'STL', 'BLK', 'TOV', 'PF', 'PTS', 'PLUS_MINUS', 'E_OFF_RATING', 'OFF_RATING', 'E_DEF_RATING',
        'DEF_RATING', 'E_NET_RATING', 'NET_RATING', 'AST_PCT', 'AST_TO',
        'AST_RATIO', 'OREB_PCT', 'DREB_PCT', 'REB_PCT', 'TM_TOV_PCT', 'EFG_PCT',
        'TS_PCT', 'E_PACE', 'PACE', 'PACE_PER40', 'POSS', 'PIE', 'GP_RANK',
        'W_RANK', 'L_RANK', 'W_PCT_RANK', 'MIN_RANK', 'OFF_RATING_RANK',
        'DEF_RATING_RANK', 'NET_RATING_RANK', 'AST_PCT_RANK', 'AST_TO_RANK',
        'AST_RATIO_RANK', 'OREB_PCT_RANK', 'DREB_PCT_RANK', 'REB_PCT_RANK',
        'TM_TOV_PCT_RANK', 'EFG_PCT_RANK', 'TS_PCT_RANK', 'PACE_RANK',
        'PIE_RANK']

    # The advanced stats depend only on the year, so load them once instead
    # of re-reading the JSON file for every team.
    advanced_stats = get_advanced_stats(year)

    per_team_frames = []
    for team in teams:
        team_games = df[df['TEAM_NAME'] == team].merge(
            advanced_stats, on=["GAME_ID", "TEAM_ID"], suffixes=("", "_y"))
        # Columns present in both inputs come back twice; keep the game-log copy.
        duplicated = [col for col in team_games.columns if col.endswith('_y')]
        team_games = (
            team_games
            .drop(columns=duplicated)
            .sort_values(by=['GAME_DATE'])
            .set_index('GAME_ID')
        )

        key_data = team_games[key_columns].iloc[window:]
        # shift(1) makes each row describe the games *before* it, so the
        # features never include the outcome of the game being predicted.
        rolling_stats = (
            team_games[stat_columns]
            .rolling(window)
            .mean()
            .shift(periods=1)
            .iloc[window:]
        )
        per_team_frames.append(pd.concat([rolling_stats, key_data], axis=1))

    if not per_team_frames:
        raise ValueError("teams is empty; there are no game statistics to build")

    # Concatenate once rather than growing a frame inside the loop.
    all_teams_stats_df = pd.concat(per_team_frames, axis=0)

    # Match the markers literally: with the default regex mode the '.' in
    # "vs." would match any character.
    matchup = all_teams_stats_df['MATCHUP']
    home_df = all_teams_stats_df[matchup.str.contains("vs.", regex=False)]
    away_df = all_teams_stats_df[matchup.str.contains("@", regex=False)]

    merged_game_stats_df = home_df.merge(away_df, on="GAME_ID", suffixes=("_H", "_A"))
    merged_game_stats_df = merged_game_stats_df.sort_values(by=['GAME_DATE_H'])

    return merged_game_stats_df

In [7]:
# Per-game "population" mean / standard deviation of the basic box-score stats.
# Intended to be applied to the train and test sets separately; the results
# feed a z-score of the form (game_matrix - mean_matrix) / std_matrix.
def _collect_team_stats(row, sides, seen_teams, pop_stats_arr):
    """Store the stats of each not-yet-seen team in ``row`` into ``pop_stats_arr``."""
    for name_col, stat_cols in sides:
        team = row[name_col]
        # Skip teams already sampled, and never write past the buffer.
        if team in seen_teams or len(seen_teams) >= pop_stats_arr.shape[0]:
            continue
        pop_stats_arr[len(seen_teams)] = row[stat_cols].to_numpy(dtype=float)
        seen_teams.add(team)


def generate_population_statistics(stats_df, num_teams=30):
    """Compute, for every game, league-wide mean/std of the 19 basic stats.

    For game ``i`` the rows sharing the same calendar year (first four
    characters of ``GAME_DATE_H``) are searched outward from that game,
    alternating one row backwards and one row forwards, and the first row in
    which each team appears contributes that team's stat vector (home columns
    if the team is home in that row, away columns otherwise). The search
    stops once ``num_teams`` distinct teams were found or the year's rows are
    exhausted. Mean and (population) standard deviation are taken over the
    collected team vectors.

    Parameters
    ----------
    stats_df : pandas.DataFrame
        Needs ``GAME_DATE_H`` (string), ``TEAM_NAME_H``, ``TEAM_NAME_A`` and
        the 19 basic stat columns with ``_H`` and ``_A`` suffixes.
    num_teams : int, default 30
        Number of distinct teams to sample per game.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``game_means`` and ``game_stds``, each of shape ``(len(stats_df), 19)``.
    """
    base_cols = ['FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM',
                 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL',
                 'BLK', 'TOV', 'PF', 'PTS', 'PLUS_MINUS']
    # (team-name column, stat columns) per side; home is examined first.
    sides = (
        ('TEAM_NAME_H', [f"{col}_H" for col in base_cols]),
        ('TEAM_NAME_A', [f"{col}_A" for col in base_cols]),
    )
    n_stats = len(base_cols)
    n_games = stats_df.shape[0]

    game_means = np.empty((n_games, n_stats))
    game_stds = np.empty((n_games, n_stats))

    game_dates = stats_df['GAME_DATE_H']
    # year -> (rows of that year, position of every row inside that subset).
    # The filter only depends on the year, so it is computed once per year.
    year_cache = {}

    for i in range(n_games):
        year = game_dates.iloc[i][0:4]
        if year not in year_cache:
            mask = game_dates.str.contains(year, regex=False, na=False).to_numpy(dtype=bool)
            # Positions are derived from the mask rather than from index
            # labels, so duplicate index labels cannot select the wrong row.
            year_cache[year] = (stats_df[mask], np.cumsum(mask) - 1)
        inter_stats_df, positions = year_cache[year]

        n_rows = inter_stats_df.shape[0]
        it1 = int(positions[i])   # walks backwards, starting at game i itself
        it2 = it1 + 1             # walks forwards
        seen_teams = set()
        pop_stats_arr = np.empty((num_teams, n_stats))

        # Stop when enough teams were found OR both cursors ran off the data;
        # without the second condition a year with fewer than `num_teams`
        # teams would loop forever.
        while len(seen_teams) < num_teams and (it1 >= 0 or it2 < n_rows):
            if it1 >= 0:
                _collect_team_stats(inter_stats_df.iloc[it1], sides, seen_teams, pop_stats_arr)
                it1 -= 1

            if it2 < n_rows:
                # Both sides are read from the year subset: `it2` is a
                # position in `inter_stats_df`, not in `stats_df`.
                _collect_team_stats(inter_stats_df.iloc[it2], sides, seen_teams, pop_stats_arr)
                it2 += 1

        # Only the filled rows are meaningful (the buffer starts uninitialised).
        sampled = pop_stats_arr[:len(seen_teams)]
        game_means[i] = np.mean(sampled, axis=0)
        game_stds[i] = np.std(sampled, axis=0)

    return game_means, game_stds
    

In [8]:
#Function to get z scores and finalize attributes as (Home minus Away)
def normalize_and_standardize(stats_df, game_means, game_stds):
    """Z-score the 19 basic stats per side and return home-minus-away features.

    Columns are selected by name, so the result does not depend on where the
    stat columns sit inside ``stats_df``. The arithmetic is done on plain
    arrays because the home and away frames carry different column labels
    (``_H`` / ``_A``) and label alignment would otherwise produce NaNs.

    Parameters
    ----------
    stats_df : pandas.DataFrame
        Needs the 19 basic stat columns with ``_H`` and ``_A`` suffixes plus
        ``WL_H``, ``TEAM_NAME_A`` and ``TEAM_NAME_H``.
    game_means, game_stds : array-like, shape (len(stats_df), 19)
        Per-game mean and standard deviation, in the same stat order as
        below. A zero standard deviation yields inf/NaN in the output.

    Returns
    -------
    final : pandas.DataFrame
        19 feature columns (``<STAT>_DIFF``) followed by ``WL_H``, indexed
        like ``stats_df``.
    matchup : pandas.DataFrame
        The ``TEAM_NAME_A`` and ``TEAM_NAME_H`` columns of ``stats_df``.
    """
    base_cols = ['FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM',
                 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL',
                 'BLK', 'TOV', 'PF', 'PTS', 'PLUS_MINUS']

    home_stats = stats_df[[f"{col}_H" for col in base_cols]].to_numpy(dtype=float)
    away_stats = stats_df[[f"{col}_A" for col in base_cols]].to_numpy(dtype=float)
    means = np.asarray(game_means, dtype=float)
    stds = np.asarray(game_stds, dtype=float)

    #standardize home and away stats
    home_z = np.divide(home_stats - means, stds)
    away_z = np.divide(away_stats - means, stds)

    final = pd.DataFrame(
        np.subtract(home_z, away_z),
        index=stats_df.index,
        columns=[f"{col}_DIFF" for col in base_cols],
    )
    # Assign by position so a non-unique index cannot break the alignment.
    final['WL_H'] = stats_df['WL_H'].to_numpy()

    matchup = stats_df[['TEAM_NAME_A', 'TEAM_NAME_H']]

    return final, matchup

In [47]:
def normalize_and_standardize_sklearn(stats_df):
    """Return standardized home-minus-away features, the home W/L column and team names.

    The 51 listed stats are read for both sides (``_H`` and ``_A`` columns),
    subtracted (home minus away) and passed through a freshly fitted
    ``StandardScaler``.

    Returns
    -------
    final : numpy.ndarray, shape (n_games, 51)
        Scaled feature differences.
    win_loss_home : numpy.ndarray, shape (n_games, 1)
        Values of the ``WL_H`` column (kept as a column vector).
    matchup : pandas.DataFrame
        The ``TEAM_NAME_A`` and ``TEAM_NAME_H`` columns.

    NOTE(review): the scaler is fitted on every row passed in; when the
    caller splits into train/test afterwards, test rows influence the
    scaling statistics - confirm whether that is intended.
    """
    feature_names = [
        'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT',
        'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'PLUS_MINUS',
        'OFF_RATING', 'DEF_RATING', 'NET_RATING',
        'AST_PCT', 'AST_TO', 'AST_RATIO', 'OREB_PCT', 'DREB_PCT', 'REB_PCT',
        'TM_TOV_PCT', 'EFG_PCT', 'TS_PCT',
        'PACE', 'PACE_PER40', 'POSS', 'PIE',
        'GP_RANK', 'W_RANK', 'L_RANK', 'W_PCT_RANK', 'MIN_RANK',
        'OFF_RATING_RANK', 'DEF_RATING_RANK', 'NET_RATING_RANK',
        'AST_PCT_RANK', 'AST_TO_RANK', 'AST_RATIO_RANK',
        'OREB_PCT_RANK', 'DREB_PCT_RANK', 'REB_PCT_RANK',
        'TS_PCT_RANK', 'PIE_RANK',
    ]
    home_columns = [name + '_H' for name in feature_names]
    away_columns = [name + '_A' for name in feature_names]

    home_stats = stats_df[home_columns].to_numpy()
    away_stats = stats_df[away_columns].to_numpy()
    win_loss_home = stats_df[['WL_H']].to_numpy()
    matchup = stats_df[['TEAM_NAME_A', 'TEAM_NAME_H']]

    # Difference first, then scale each difference column to zero mean / unit variance.
    difference = home_stats - away_stats
    final = StandardScaler().fit_transform(difference)

    return final, win_loss_home, matchup

In [48]:
#Include num_years previous years plus current season games
def generate_full_train_test(num_years):
    """Build a chronological 70/30 train/test split from per-game z-scores.

    Game logs are fetched for the seasons ``2022 - num_years`` up to and
    including 2021, converted with ``generate_game_stats``, concatenated in
    season order, split 70/30 by row position, and each part is standardized
    with its own population statistics.

    Parameters
    ----------
    num_years : int
        Number of seasons to include; must be at least 1.

    Returns
    -------
    train_X, train_Y, test_X, test_Y
        Feature frames (first 19 columns of the standardized frame) and the
        matching home win/loss column (column 19).

    Raises
    ------
    ValueError
        If ``num_years`` is smaller than 1.
    """
    end_year = 2022
    train_fraction = 0.7
    n_features = 19

    if num_years < 1:
        raise ValueError("num_years must be at least 1")

    season_frames = []
    for season in range(end_year - num_years, end_year):
        # Pause between requests (presumably to stay under the API rate limit).
        time.sleep(1)
        games = leaguegamelog.LeagueGameLog(season = str(season))
        df = pd.DataFrame(games.get_data_frames()[0])
        # Keep only rows that already have a result.
        df = df[df['WL'].notnull()]
        teams = df['TEAM_NAME'].unique()
        # generate_game_stats needs the season to load the matching advanced stats.
        season_frames.append(generate_game_stats(df, teams, season))

    # Concatenate once instead of growing a frame inside the loop.
    all_years_stats_df = pd.concat(season_frames, axis = 0)

    #Split data into train and test and transform
    split_at = int(all_years_stats_df.shape[0] * train_fraction)
    train_stats = all_years_stats_df.iloc[:split_at, :]
    test_stats = all_years_stats_df.iloc[split_at:, :]

    game_means_train, game_stds_train = generate_population_statistics(train_stats)
    game_means_test, game_stds_test = generate_population_statistics(test_stats)

    #Finalize and prepare train / test
    final_train, matchup_train = normalize_and_standardize(train_stats, game_means_train, game_stds_train)
    final_test, matchup_test = normalize_and_standardize(test_stats, game_means_test, game_stds_test)

    #Prepare actual train and test
    train_X = final_train.iloc[:, :n_features]
    train_Y = final_train.iloc[:, n_features]

    test_X = final_test.iloc[:, :n_features]
    test_Y = final_test.iloc[:, n_features]

    return train_X, train_Y, test_X, test_Y

In [55]:
def generate_full_train_test_sklearn(num_years):
    """Build a chronological 70/30 train/test split of scaled feature differences.

    Game logs are fetched for the seasons ``2022 - num_years`` up to and
    including 2021, converted with ``generate_game_stats``, concatenated in
    season order, transformed by ``normalize_and_standardize_sklearn`` and
    split 70/30 by row position.

    Parameters
    ----------
    num_years : int
        Number of seasons to include; must be at least 1.

    Returns
    -------
    train_X, train_Y, test_X, test_Y : numpy.ndarray
        Feature matrices and the matching home win/loss targets. The targets
        keep the column-vector shape returned by
        ``normalize_and_standardize_sklearn``.

    Raises
    ------
    ValueError
        If ``num_years`` is smaller than 1.

    NOTE(review): scaling happens before the split, so the test rows take
    part in fitting the scaler - confirm whether that is intended.
    """
    end_year = 2022
    train_fraction = 0.7

    if num_years < 1:
        raise ValueError("num_years must be at least 1")

    season_frames = []
    for season in range(end_year - num_years, end_year):
        # Pause between requests (presumably to stay under the API rate limit).
        time.sleep(1)
        games = leaguegamelog.LeagueGameLog(season = str(season))
        df = pd.DataFrame(games.get_data_frames()[0])
        # Keep only rows that already have a result.
        df = df[df['WL'].notnull()]
        teams = df['TEAM_NAME'].unique()
        season_frames.append(generate_game_stats(df, teams, season))

    # Concatenate once instead of growing a frame inside the loop.
    all_years_stats_df = pd.concat(season_frames, axis = 0)

    attributes, target, matchup = normalize_and_standardize_sklearn(all_years_stats_df)

    # attributes and target have one row per game, so one split point serves both.
    split_at = int(attributes.shape[0] * train_fraction)
    train_X = attributes[:split_at]
    test_X = attributes[split_at:]
    train_Y = target[:int(target.shape[0] * train_fraction)]
    test_Y = target[int(target.shape[0] * train_fraction):]

    return train_X, train_Y, test_X, test_Y

In [56]:
# Build the train/test arrays from ten seasons (2012 through 2021).
# This fetches one game log per season via LeagueGameLog and sleeps one
# second between seasons, so the cell is slow to re-run.
train_X, train_Y, test_X, test_Y = generate_full_train_test_sklearn(10)

In [57]:
# Inspect the scaled training feature matrix (NumPy abbreviates the display).
train_X

array([[-0.44787057,  1.52856637, -1.77867061, ...,  1.5324116 ,
         2.28906467,  2.07625367],
       [-0.6590288 , -1.01020143, -0.0837882 , ...,  1.78399735,
        -0.04268458,  0.5955098 ],
       [-0.6590288 ,  0.12477712, -0.92308093, ..., -2.03489637,
         0.44168846,  0.30064939],
       ...,
       [-0.44787057, -0.77125857,  0.1158494 , ...,  0.08897016,
        -0.73013131, -1.08405255],
       [-0.06778575, -0.95046571,  0.6292032 , ...,  1.64422749,
        -1.2094979 , -0.80002614],
       [-1.92597818, -1.48808713, -1.12679276, ...,  1.3205332 ,
         0.1306634 ,  1.09972988]])

In [None]:
# Scratch cell: pull the 2021 league game log into a DataFrame for manual inspection.
games = leaguegamelog.LeagueGameLog(season = '2021')
games_df = pd.DataFrame(games.get_data_frames()[0])
# Disabled follow-up, kept for reference. It would not run as written: it reads
# `df` rather than `games_df`, and generate_game_stats also takes a `year` argument.
#teams = df['TEAM_NAME'].unique()
#stats_df = generate_game_stats(df, teams)

In [None]:
# Grid search over logistic-regression hyper-parameters.
# The grid is split in two because not every solver supports every penalty:
# 'newton-cg', 'lbfgs' and 'sag' only handle 'l2', while 'liblinear' and
# 'saga' handle both 'l1' and 'l2'. A single flat grid would schedule
# unsupported solver/penalty pairs whose fits fail.
lr = LogisticRegression()
c_values = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
max_iter_values = [20, 50, 100, 200, 500, 1000]
grid_values = [
    {'penalty': ['l2'], 'C': c_values, 'solver': ['newton-cg', 'lbfgs', 'sag'], 'max_iter': max_iter_values},
    {'penalty': ['l1', 'l2'], 'C': c_values, 'solver': ['liblinear', 'saga'], 'max_iter': max_iter_values},
]
clf = GridSearchCV(lr, param_grid=grid_values)
# train_Y is a column vector; flatten it to the 1-D shape estimators expect.
clf.fit(train_X, np.ravel(train_Y))

In [73]:
#Fit model
# train_Y is a column vector; flattening it avoids the column_or_1d
# conversion warning raised when a 2-D target is passed to fit().
clf = LogisticRegression(random_state=0, C=0.01).fit(train_X, np.ravel(train_Y))
y_pred = clf.predict(test_X)

  y = column_or_1d(y, warn=True)


In [74]:
# Accuracy of the fitted classifier on the training split
clf.score(X=train_X, y=train_Y)

0.6500722448443452

In [75]:
# Accuracy of the stored predictions on the held-out split
accuracy_score(y_true=test_Y, y_pred=y_pred)

0.6233527428746553

In [68]:
#XGBoost Test
xgb_cl = xgb.XGBClassifier()
# Fit
# train_Y is a column vector; flattening it avoids the column_or_1d
# conversion warnings emitted while the labels are processed.
xgb_cl.fit(train_X, np.ravel(train_Y))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)




XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=12,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [69]:
# Score the XGBoost classifier on the held-out split
y_pred = xgb_cl.predict(test_X)
accuracy_score(y_true=test_Y, y_pred=y_pred)

0.5982224946368373

In [24]:
# Scratch check: merge one team's 2018 game log with the 2018 advanced stats,
# mirroring the per-team merge done inside generate_game_stats.
games = leaguegamelog.LeagueGameLog(season='2018')
df = pd.DataFrame(games.get_data_frames()[0])

advanced_stats = get_advanced_stats(2018)
team_df = df[df['TEAM_NAME'] == 'San Antonio Spurs']

temp_df = team_df.merge(advanced_stats, on=["GAME_ID", "TEAM_ID"], suffixes=("", "_y"))
# Columns that exist in both inputs come back with a '_y' copy; discard those.
to_drop = [column for column in temp_df.columns if column.endswith('_y')]
temp_df = temp_df.drop(columns=to_drop)