In [1]:
from nba_api.stats.endpoints import leaguegamelog
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from scipy.stats import norm
import time
import xgboost as xgb
import json

  from pandas import MultiIndex, Int64Index


In [2]:
pd.set_option("display.max_columns", None)

In [133]:
def get_advanced_stats(year):
    reg_data = json.load(open(f"{year}.json"))
    playoff_data = json.load(open(f"{year}_p.json"))
    advanced_stats_reg = pd.DataFrame(reg_data['resultSets'][0]['rowSet'], columns = reg_data['resultSets'][0]['headers'])
    advanced_stats_playoff = pd.DataFrame(playoff_data['resultSets'][0]['rowSet'], columns = playoff_data['resultSets'][0]['headers'])
   
    advanced_stats = pd.concat([advanced_stats_reg, advanced_stats_playoff], axis = 0)
    
    return advanced_stats    

In [140]:
#Can use this to get means and standard deviations
def generate_game_stats(df, teams, year): 
    #all_stats_arr = np.empty((30, 77, 19))
    all_teams_stats_df = pd.DataFrame()
    for i, team in enumerate(teams):
        temp_df = df[df['TEAM_NAME'] == team]
        advanced_stats = get_advanced_stats(year)
        #Merge with temp 
        temp_df = temp_df.merge(advanced_stats, on = ["GAME_ID", "TEAM_ID"], suffixes = ("", "_y"))
        to_drop = [x for x in temp_df if x.endswith('_y')]
        temp_df.drop(to_drop, axis=1, inplace=True)
        
        temp_df = temp_df.sort_values(by = ['GAME_DATE'])
        temp_df = temp_df.set_index('GAME_ID')
        key_data = temp_df[['GAME_DATE', 'TEAM_NAME', 'MATCHUP', 'WL']].iloc[7:]
        temp_df = temp_df[[
           'FGM', 'FGA', 'FG_PCT', 'FG3M',
           'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST',
           'STL', 'BLK', 'TOV', 'PF', 'PTS', 'PLUS_MINUS', 'E_OFF_RATING', 'OFF_RATING', 'E_DEF_RATING',
           'DEF_RATING', 'E_NET_RATING', 'NET_RATING', 'AST_PCT', 'AST_TO',
           'AST_RATIO', 'OREB_PCT', 'DREB_PCT', 'REB_PCT', 'TM_TOV_PCT', 'EFG_PCT',
           'TS_PCT', 'E_PACE', 'PACE', 'PACE_PER40', 'POSS', 'PIE', 'GP_RANK',
           'W_RANK', 'L_RANK', 'W_PCT_RANK', 'MIN_RANK', 'OFF_RATING_RANK',
           'DEF_RATING_RANK', 'NET_RATING_RANK', 'AST_PCT_RANK', 'AST_TO_RANK',
           'AST_RATIO_RANK', 'OREB_PCT_RANK', 'DREB_PCT_RANK', 'REB_PCT_RANK',
           'TM_TOV_PCT_RANK', 'EFG_PCT_RANK', 'TS_PCT_RANK', 'PACE_RANK',
           'PIE_RANK']]
        
        
        
        final_organized_stats = temp_df.rolling(7).mean().shift(periods = 1).iloc[7:]#.to_dict('index')
        stats_with_key = pd.concat([final_organized_stats, key_data], axis = 1)
     
        all_teams_stats_df = pd.concat([all_teams_stats_df, stats_with_key], axis = 0)
        
    home_df = all_teams_stats_df[all_teams_stats_df['MATCHUP'].str.contains("vs.")]
    away_df = all_teams_stats_df[all_teams_stats_df['MATCHUP'].str.contains("@")]
    merged_game_stats_df = home_df.merge(away_df, on = "GAME_ID",suffixes = ("_H", "_A"))
    merged_game_stats_df = merged_game_stats_df.sort_values(by = ['GAME_DATE_H'])
      
    return merged_game_stats_df

In [5]:
#Function to get population mean and standard deviations - might be stupid but I think it makes sense
#Make sure to use this on train and test set separately 
#After this, make these stats for each game their own matrices, so I could do (game_matrix - mean_matrix / std_matrix)
def generate_population_statistics(stats_df):
    game_means = np.empty((stats_df.shape[0], 19))
    game_stds = np.empty((stats_df.shape[0], 19))
    
    for i in range(stats_df.shape[0]):
        inter_stats_df = stats_df[stats_df['GAME_DATE_H'].str.contains(stats_df.iloc[i]['GAME_DATE_H'][0:4])]
        inter_index = stats_df.index[i]
        it1 = np.where(inter_stats_df.index == inter_index)[0][0]
        team_dict = {}
        pop_stats_arr = np.empty((30, 19))
        it2 = it1+1
        team_it = 0
    
        while len(team_dict.keys()) < 30:

            if it1 >= 0:

                if inter_stats_df.iloc[it1]['TEAM_NAME_H'] not in team_dict.keys():  
                    team_dict[inter_stats_df.iloc[it1]['TEAM_NAME_H']] = ''

                    pop_stats_arr[team_it] = inter_stats_df.iloc[it1][['FGM_H', 'FGA_H', 'FG_PCT_H', 'FG3M_H', 'FG3A_H', 'FG3_PCT_H', 'FTM_H',
                   'FTA_H', 'FT_PCT_H', 'OREB_H', 'DREB_H', 'REB_H', 'AST_H', 'STL_H',
                   'BLK_H', 'TOV_H', 'PF_H', 'PTS_H', 'PLUS_MINUS_H']].to_numpy()
                    team_it += 1

                if inter_stats_df.iloc[it1]['TEAM_NAME_A'] not in team_dict.keys():
                    team_dict[inter_stats_df.iloc[it1]['TEAM_NAME_A']] = ''

                    pop_stats_arr[team_it] = inter_stats_df.iloc[it1][['FGM_A', 'FGA_A', 'FG_PCT_A',
                   'FG3M_A', 'FG3A_A', 'FG3_PCT_A', 'FTM_A', 'FTA_A', 'FT_PCT_A', 'OREB_A',
                   'DREB_A', 'REB_A', 'AST_A', 'STL_A', 'BLK_A', 'TOV_A', 'PF_A', 'PTS_A',
                   'PLUS_MINUS_A']].to_numpy()
                    team_it += 1

                it1 -= 1    

            if it2 < inter_stats_df.shape[0]:
                if inter_stats_df.iloc[it2]['TEAM_NAME_H'] not in team_dict.keys():  
                    team_dict[inter_stats_df.iloc[it2]['TEAM_NAME_H']] = ''

                    pop_stats_arr[team_it] = inter_stats_df.iloc[it2][['FGM_H', 'FGA_H', 'FG_PCT_H', 'FG3M_H', 'FG3A_H', 'FG3_PCT_H', 'FTM_H',
                   'FTA_H', 'FT_PCT_H', 'OREB_H', 'DREB_H', 'REB_H', 'AST_H', 'STL_H',
                   'BLK_H', 'TOV_H', 'PF_H', 'PTS_H', 'PLUS_MINUS_H']].to_numpy()
                    team_it += 1

                if inter_stats_df.iloc[it2]['TEAM_NAME_A'] not in team_dict.keys():
                    team_dict[inter_stats_df.iloc[it2]['TEAM_NAME_A']] = ''

                    pop_stats_arr[team_it] = stats_df.iloc[it2][['FGM_A', 'FGA_A', 'FG_PCT_A',
                   'FG3M_A', 'FG3A_A', 'FG3_PCT_A', 'FTM_A', 'FTA_A', 'FT_PCT_A', 'OREB_A',
                   'DREB_A', 'REB_A', 'AST_A', 'STL_A', 'BLK_A', 'TOV_A', 'PF_A', 'PTS_A',
                   'PLUS_MINUS_A']].to_numpy()
                    team_it += 1

                it2 += 1
                
        mean_stats = np.mean(pop_stats_arr, axis = 0)        
        std_stats = np.std(pop_stats_arr, axis = 0)
        
        game_means[i] = mean_stats
        game_stds[i] = std_stats
        
    return game_means, game_stds    
    

In [6]:
def normalize_and_standardize_sklearn(stats_df):
    columns = ['FGM', 'FGA', 'FG_PCT', 'FG3M',
           'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST',
           'STL', 'BLK', 'TOV', 'PF', 'PTS', 'PLUS_MINUS', 'E_OFF_RATING', 'OFF_RATING', 'E_DEF_RATING',
           'DEF_RATING', 'E_NET_RATING', 'NET_RATING', 'AST_PCT', 'AST_TO',
           'AST_RATIO', 'OREB_PCT', 'DREB_PCT', 'REB_PCT', 'TM_TOV_PCT', 'EFG_PCT',
           'TS_PCT', 'E_PACE', 'PACE', 'PACE_PER40', 'POSS', 'PIE', 'GP_RANK',
           'W_RANK', 'L_RANK', 'W_PCT_RANK', 'MIN_RANK', 'OFF_RATING_RANK',
           'DEF_RATING_RANK', 'NET_RATING_RANK', 'AST_PCT_RANK', 'AST_TO_RANK',
           'AST_RATIO_RANK', 'OREB_PCT_RANK', 'DREB_PCT_RANK', 'REB_PCT_RANK',
           'TM_TOV_PCT_RANK', 'EFG_PCT_RANK', 'TS_PCT_RANK', 'PACE_RANK',
           'PIE_RANK']
    home_columns = [var + "_H" for var in columns]
    away_columns = [var + "_A" for var in columns]
    home_stats = np.empty((stats_df.shape[0], 51))
    away_stats = np.empty((stats_df.shape[0], 51))
    
    home_stats = np.array(stats_df[home_columns])
    away_stats = np.array(stats_df[away_columns])
    ####
    ##NEED TO FIX
    w_l_dict = {"W" : 1, "L" : 0}
    w_l_df = stats_df.replace({"WL_H": w_l_dict})
    print(w_l_df.columns)
    win_loss_home = np.array(w_l_df["WL_H"])
    matchups = stats_df[['TEAM_NAME_A', 'TEAM_NAME_H']]
    
    #standardize home and away stats 
    scaler = StandardScaler()
    final = np.subtract(home_stats, away_stats)
    final = scaler.fit_transform(final)
    final = pd.DataFrame(final, columns = columns)
   
    return final, win_loss_home, matchups

In [132]:
def generate_full_train_test(num_years):
    start_year = 2021 - num_years
    all_years_stats_df = pd.DataFrame()
    #For each year, 
    for i in range(start_year, 2021):
        time.sleep(1)
        games_reg = leaguegamelog.LeagueGameLog(season = str(i))
        df_reg = pd.DataFrame(games_reg.get_data_frames()[0])
        
        time.sleep(1)
        games_playoff = leaguegamelog.LeagueGameLog(season = str(i), season_type_all_star = "Playoffs")
        df_playoff = pd.DataFrame(games_playoff.get_data_frames()[0])
        df = pd.concat([df_reg, df_playoff], axis = 0)
        
        df = df[df['WL'].notnull()]
        teams = df['TEAM_NAME'].unique()
        stats_df = generate_game_stats(df, teams, i)
        
        all_years_stats_df = pd.concat([all_years_stats_df, stats_df], axis = 0)
        
    attributes, target, matchups = normalize_and_standardize_sklearn(all_years_stats_df)
    
    train_X = attributes[:int(attributes.shape[0]*0.7)]
    test_X = attributes[int(attributes.shape[0]*0.7):]
    train_Y = target[:int(target.shape[0]*0.7)]
    test_Y = target[int(target.shape[0]*0.7):]
    
    matchups_train = matchups[:int(matchups.shape[0]*0.7)]
    matchups_test = matchups[int(matchups.shape[0]*0.7):]
    
    #return train_X, train_Y, test_X, test_Y
    return train_X, np.ravel(train_Y), test_X, np.ravel(test_Y), matchups_train, matchups_test

In [8]:
#From David Dale on stackoverflow
def logit_pvalue(model, x):
    """ Calculate z-scores for scikit-learn LogisticRegression.
    parameters:
        model: fitted sklearn.linear_model.LogisticRegression with intercept and large C
        x:     matrix on which the model was fit
    This function uses asymtptics for maximum likelihood estimates.
    """
    p = model.predict_proba(x)
    n = len(p)
    m = len(model.coef_[0]) + 1
    coefs = np.concatenate([model.intercept_, model.coef_[0]])
    x_full = np.matrix(np.insert(np.array(x), 0, 1, axis = 1))
    ans = np.zeros((m, m))
    for i in range(n):
        ans = ans + np.dot(np.transpose(x_full[i, :]), x_full[i, :]) * p[i,1] * p[i, 0]
    vcov = np.linalg.inv(np.matrix(ans))
    se = np.sqrt(np.diag(vcov))
    t =  coefs/se  
    p = (1 - norm.cdf(abs(t))) * 2
    return p

In [59]:
#Get rid of multicolinear variables and keep significant ones
def eliminate_attributes(train_X, test_X):
    X = pd.DataFrame(train_X)
    vif_data = pd.DataFrame()
    vif_data["feature"] = X.columns
    vif_data["VIF"] = [variance_inflation_factor(X.values, i)
                              for i in range(len(X.columns))]
    #print(vif_data)

    feat_tokeep = list(vif_data[vif_data["VIF"] < 100]["feature"])
    
    #Modify training data to remove most multi colinear variables
    train_X = train_X[feat_tokeep]
    test_X = test_X[feat_tokeep]
    
    return train_X, test_X

In [51]:
def predict_odds(train_X, train_Y, test_X, matchups_test):
    clf = LogisticRegression(random_state=0, C=0.1).fit(train_X, train_Y)
    pred_prob = clf.predict_proba(test_X)
   
    pred_prob = pd.DataFrame(pred_prob, columns = ["Away", "Home"])    
    
    winners = []
    for i in range(pred_prob.shape[0]):
        if pred_prob.iloc[i, 1] >= 0.5:
            winners.append(matchups_test.iloc[i, 1])
            
        else:
            winners.append(matchups_test.iloc[i, 0])
        
    winners = pd.DataFrame(winners, columns = ["Winner"])
    return pd.concat([pred_prob, winners], axis = 1)

In [None]:
#Get statistics for scheduled games to make predictions
def get_upcoming_games():
    

In [32]:
games = leaguegamelog.LeagueGameLog(season = str(2021))
df = pd.DataFrame(games.get_data_frames()[0])

In [12]:
train_Y

array([0, 1, 0, ..., 0, 1, 0], dtype=int64)

In [141]:
train_X, train_Y, test_X, test_Y, matchups_train, matchups_test = generate_full_train_test(10)

Index(['FGM_H', 'FGA_H', 'FG_PCT_H', 'FG3M_H', 'FG3A_H', 'FG3_PCT_H', 'FTM_H',
       'FTA_H', 'FT_PCT_H', 'OREB_H',
       ...
       'REB_PCT_RANK_A', 'TM_TOV_PCT_RANK_A', 'EFG_PCT_RANK_A',
       'TS_PCT_RANK_A', 'PACE_RANK_A', 'PIE_RANK_A', 'GAME_DATE_A',
       'TEAM_NAME_A', 'MATCHUP_A', 'WL_A'],
      dtype='object', length=124)


In [142]:
train_X_mod, test_X_mod = eliminate_attributes(train_X, test_X)

  vif = 1. / (1. - r_squared_i)
  return 1 - self.ssr/self.uncentered_tss


In [143]:
train_X_mod

Unnamed: 0,FG3A,FG3_PCT,FT_PCT,STL,BLK,PF,AST_TO,OREB_PCT,DREB_PCT,PIE,MIN_RANK,OFF_RATING_RANK,DEF_RATING_RANK,NET_RATING_RANK,AST_PCT_RANK,AST_TO_RANK,AST_RATIO_RANK,OREB_PCT_RANK,DREB_PCT_RANK,REB_PCT_RANK,TM_TOV_PCT_RANK,PACE_RANK,PIE_RANK
0,1.238485,0.094342,-0.988927,-0.812962,-0.671221,-0.114279,0.124047,-0.593033,0.115476,-0.531993,0.005246,-0.324383,0.540838,0.184101,0.013774,-0.281443,-0.070567,0.479275,-0.108758,0.535620,-0.496840,0.819196,0.253932
1,0.389576,-1.238867,-1.070694,0.505862,-0.333790,-0.164088,-0.163410,-0.698685,-0.153270,-1.168438,-1.715908,0.429853,0.486928,0.744662,0.439380,-0.009488,0.744245,0.536486,0.100269,0.982687,-0.826928,0.818135,0.741887
2,0.498411,1.112660,-1.079081,0.066254,-2.189659,0.084958,0.064160,-0.284645,0.091961,-0.410449,0.005246,0.197755,0.202267,0.316361,-0.043185,-0.026671,-0.177212,0.041285,0.059706,-0.044475,0.457921,-0.344212,0.313851
3,-0.067529,0.807927,0.698825,1.824686,0.003641,0.533241,-0.498776,0.189360,-0.173426,0.919899,0.005246,0.091960,-1.634808,-1.187016,0.322570,0.652721,0.904512,-0.113152,-0.156630,-0.209270,0.309774,0.683855,-0.922630
4,-1.700046,-0.835095,0.440945,-1.106034,-1.346082,-1.708173,-0.052619,1.260154,0.088602,-0.702153,0.005246,0.363446,0.172737,0.354062,-0.134263,-0.039228,-0.430044,-0.960996,-0.125568,-0.952507,0.791080,1.405147,0.459193
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8023,-0.350498,0.914583,0.472394,-1.618910,-1.599155,1.529424,-0.238268,1.171635,-0.099521,-0.094436,-0.782401,-1.904143,1.630378,-0.423121,0.000474,-0.314157,-0.812586,-1.318016,0.272753,-1.113663,0.506393,-0.096882,0.032693
8024,-0.394032,-0.083419,0.199837,1.385078,-0.671221,-0.662180,0.438453,0.726185,0.595861,1.872358,0.792892,-0.325034,-2.543044,-2.141266,-1.212156,-0.776447,-1.452154,-1.020708,-0.561528,-1.785090,0.828629,-1.383955,-2.348394
8025,1.020816,-0.497349,-0.848456,1.165274,0.003641,-0.612371,-0.914989,-0.692974,-1.087164,-0.618178,0.005246,0.462405,0.429584,0.630634,0.474077,1.517822,0.987191,0.955093,1.424594,1.313934,1.027979,-1.815456,0.634955
8026,1.173185,0.264485,-0.821200,1.238542,-0.333790,1.529424,-0.370019,0.229336,0.105398,-1.758477,0.792892,1.151210,0.470446,0.987859,-1.304968,0.420750,-0.551666,-0.211317,-0.191346,0.369501,1.580969,0.422194,1.530052


In [144]:
#Fit model
clf = LogisticRegression(random_state=0, C=0.1).fit(train_X_mod, train_Y)
y_pred = clf.predict(test_X_mod)

In [145]:
#Train accuracy
clf.score(train_X_mod, train_Y)

0.6444942700548082

In [146]:
#Test accuracy
accuracy_score(test_Y, y_pred)

0.6278326554328879

In [99]:
predict_odds(train_X_mod, train_Y, test_X_mod, matchups_test)

Unnamed: 0,Away,Home,Winner
0,0.508612,0.491388,Atlanta Hawks
1,0.204215,0.795785,Charlotte Hornets
2,0.584471,0.415529,Golden State Warriors
3,0.336322,0.663678,Boston Celtics
4,0.244689,0.755311,San Antonio Spurs
...,...,...,...
3522,0.286825,0.713175,Dallas Mavericks
3523,0.270386,0.729614,Philadelphia 76ers
3524,0.363707,0.636293,New Orleans Pelicans
3525,0.262435,0.737565,Brooklyn Nets


In [None]:
#Grid Search
lr = LogisticRegression()
grid_values = {'penalty': ['l1','l2'], 'C': [0.001,0.01,0.1,1,10,100,1000], 'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'], 'max_iter': [20, 50, 100, 200, 500, 1000]}
clf = GridSearchCV(lr, param_grid=grid_values)
clf.fit(train_X, train_Y)

In [None]:
clf.best_params_

In [103]:
#Test
games = leaguegamelog.LeagueGameLog(season = '2021', season_type_all_star = "Playoffs")
games_df = pd.DataFrame(games.get_data_frames()[0])
#teams = df['TEAM_NAME'].unique()
#stats_df = generate_game_stats(df, teams)

In [129]:
#XGBoost Test
xgb_cl = xgb.XGBClassifier(n_estimators = 10)
# Fit
xgb_cl.fit(train_X_mod, train_Y)



  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=10, n_jobs=12,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [130]:
# Predict
y_pred = xgb_cl.predict(test_X_mod)
accuracy_score(test_Y, y_pred)

0.553030303030303

In [None]:
games = leaguegamelog.LeagueGameLog(season = '2018')
df = pd.DataFrame(games.get_data_frames()[0])



team_df = df[df['TEAM_NAME'] == 'San Antonio Spurs']
advanced_stats = get_advanced_stats(2018)
temp_df = team_df.merge(advanced_stats, on = ["GAME_ID", "TEAM_ID"], suffixes = ("", "_y"))
to_drop = [x for x in temp_df if x.endswith('_y')]
temp_df.drop(to_drop, axis=1, inplace=True)

In [None]:
#Things to do
# Add playoff data
# Add ELO Rating
# Add player-based statistics