In [1]:
from nba_api.stats.endpoints import leaguegamelog
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from scipy.stats import norm
import time
import xgboost as xgb
import json

  from pandas import MultiIndex, Int64Index


In [2]:
pd.set_option("display.max_columns", None)

In [3]:
def get_advanced_stats(year):
    data = json.load(open(f"{year}.json"))
    advanced_stats = pd.DataFrame(data['resultSets'][0]['rowSet'], columns = data['resultSets'][0]['headers'])    
    return advanced_stats    

In [4]:
#Can use this to get means and standard deviations
def generate_game_stats(df, teams, year): 
    #all_stats_arr = np.empty((30, 77, 19))
    all_teams_stats_df = pd.DataFrame()
    for i, team in enumerate(teams):
        temp_df = df[df['TEAM_NAME'] == team]
        advanced_stats = get_advanced_stats(year)
        #Merge with temp 
        temp_df = temp_df.merge(advanced_stats, on = ["GAME_ID", "TEAM_ID"], suffixes = ("", "_y"))
        to_drop = [x for x in temp_df if x.endswith('_y')]
        temp_df.drop(to_drop, axis=1, inplace=True)
        
        temp_df = temp_df.sort_values(by = ['GAME_DATE'])
        temp_df = temp_df.set_index('GAME_ID')
        key_data = temp_df[['GAME_DATE', 'TEAM_NAME', 'MATCHUP', 'WL']].iloc[7:]
        temp_df = temp_df[[
           'FGM', 'FGA', 'FG_PCT', 'FG3M',
           'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST',
           'STL', 'BLK', 'TOV', 'PF', 'PTS', 'PLUS_MINUS', 'E_OFF_RATING', 'OFF_RATING', 'E_DEF_RATING',
           'DEF_RATING', 'E_NET_RATING', 'NET_RATING', 'AST_PCT', 'AST_TO',
           'AST_RATIO', 'OREB_PCT', 'DREB_PCT', 'REB_PCT', 'TM_TOV_PCT', 'EFG_PCT',
           'TS_PCT', 'E_PACE', 'PACE', 'PACE_PER40', 'POSS', 'PIE', 'GP_RANK',
           'W_RANK', 'L_RANK', 'W_PCT_RANK', 'MIN_RANK', 'OFF_RATING_RANK',
           'DEF_RATING_RANK', 'NET_RATING_RANK', 'AST_PCT_RANK', 'AST_TO_RANK',
           'AST_RATIO_RANK', 'OREB_PCT_RANK', 'DREB_PCT_RANK', 'REB_PCT_RANK',
           'TM_TOV_PCT_RANK', 'EFG_PCT_RANK', 'TS_PCT_RANK', 'PACE_RANK',
           'PIE_RANK']]
        
        
        
        final_organized_stats = temp_df.rolling(7).mean().shift(periods = 1).iloc[7:]#.to_dict('index')
        stats_with_key = pd.concat([final_organized_stats, key_data], axis = 1)
     
        all_teams_stats_df = pd.concat([all_teams_stats_df, stats_with_key], axis = 0)
        
    home_df = all_teams_stats_df[all_teams_stats_df['MATCHUP'].str.contains("vs.")]
    away_df = all_teams_stats_df[all_teams_stats_df['MATCHUP'].str.contains("@")]
    merged_game_stats_df = home_df.merge(away_df, on = "GAME_ID",suffixes = ("_H", "_A"))
    merged_game_stats_df = merged_game_stats_df.sort_values(by = ['GAME_DATE_H'])
      
    return merged_game_stats_df

In [5]:
#Function to get population mean and standard deviations - might be stupid but I think it makes sense
#Make sure to use this on train and test set separately 
#After this, make these stats for each game their own matrices, so I could do (game_matrix - mean_matrix / std_matrix)
def generate_population_statistics(stats_df):
    game_means = np.empty((stats_df.shape[0], 19))
    game_stds = np.empty((stats_df.shape[0], 19))
    
    for i in range(stats_df.shape[0]):
        inter_stats_df = stats_df[stats_df['GAME_DATE_H'].str.contains(stats_df.iloc[i]['GAME_DATE_H'][0:4])]
        inter_index = stats_df.index[i]
        it1 = np.where(inter_stats_df.index == inter_index)[0][0]
        team_dict = {}
        pop_stats_arr = np.empty((30, 19))
        it2 = it1+1
        team_it = 0
    
        while len(team_dict.keys()) < 30:

            if it1 >= 0:

                if inter_stats_df.iloc[it1]['TEAM_NAME_H'] not in team_dict.keys():  
                    team_dict[inter_stats_df.iloc[it1]['TEAM_NAME_H']] = ''

                    pop_stats_arr[team_it] = inter_stats_df.iloc[it1][['FGM_H', 'FGA_H', 'FG_PCT_H', 'FG3M_H', 'FG3A_H', 'FG3_PCT_H', 'FTM_H',
                   'FTA_H', 'FT_PCT_H', 'OREB_H', 'DREB_H', 'REB_H', 'AST_H', 'STL_H',
                   'BLK_H', 'TOV_H', 'PF_H', 'PTS_H', 'PLUS_MINUS_H']].to_numpy()
                    team_it += 1

                if inter_stats_df.iloc[it1]['TEAM_NAME_A'] not in team_dict.keys():
                    team_dict[inter_stats_df.iloc[it1]['TEAM_NAME_A']] = ''

                    pop_stats_arr[team_it] = inter_stats_df.iloc[it1][['FGM_A', 'FGA_A', 'FG_PCT_A',
                   'FG3M_A', 'FG3A_A', 'FG3_PCT_A', 'FTM_A', 'FTA_A', 'FT_PCT_A', 'OREB_A',
                   'DREB_A', 'REB_A', 'AST_A', 'STL_A', 'BLK_A', 'TOV_A', 'PF_A', 'PTS_A',
                   'PLUS_MINUS_A']].to_numpy()
                    team_it += 1

                it1 -= 1    

            if it2 < inter_stats_df.shape[0]:
                if inter_stats_df.iloc[it2]['TEAM_NAME_H'] not in team_dict.keys():  
                    team_dict[inter_stats_df.iloc[it2]['TEAM_NAME_H']] = ''

                    pop_stats_arr[team_it] = inter_stats_df.iloc[it2][['FGM_H', 'FGA_H', 'FG_PCT_H', 'FG3M_H', 'FG3A_H', 'FG3_PCT_H', 'FTM_H',
                   'FTA_H', 'FT_PCT_H', 'OREB_H', 'DREB_H', 'REB_H', 'AST_H', 'STL_H',
                   'BLK_H', 'TOV_H', 'PF_H', 'PTS_H', 'PLUS_MINUS_H']].to_numpy()
                    team_it += 1

                if inter_stats_df.iloc[it2]['TEAM_NAME_A'] not in team_dict.keys():
                    team_dict[inter_stats_df.iloc[it2]['TEAM_NAME_A']] = ''

                    pop_stats_arr[team_it] = stats_df.iloc[it2][['FGM_A', 'FGA_A', 'FG_PCT_A',
                   'FG3M_A', 'FG3A_A', 'FG3_PCT_A', 'FTM_A', 'FTA_A', 'FT_PCT_A', 'OREB_A',
                   'DREB_A', 'REB_A', 'AST_A', 'STL_A', 'BLK_A', 'TOV_A', 'PF_A', 'PTS_A',
                   'PLUS_MINUS_A']].to_numpy()
                    team_it += 1

                it2 += 1
                
        mean_stats = np.mean(pop_stats_arr, axis = 0)        
        std_stats = np.std(pop_stats_arr, axis = 0)
        
        game_means[i] = mean_stats
        game_stds[i] = std_stats
        
    return game_means, game_stds    
    

In [6]:
def normalize_and_standardize_sklearn(stats_df):
    columns = ['FGM', 'FGA', 'FG_PCT', 'FG3M',
           'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST',
           'STL', 'BLK', 'TOV', 'PF', 'PTS', 'PLUS_MINUS', 'E_OFF_RATING', 'OFF_RATING', 'E_DEF_RATING',
           'DEF_RATING', 'E_NET_RATING', 'NET_RATING', 'AST_PCT', 'AST_TO',
           'AST_RATIO', 'OREB_PCT', 'DREB_PCT', 'REB_PCT', 'TM_TOV_PCT', 'EFG_PCT',
           'TS_PCT', 'E_PACE', 'PACE', 'PACE_PER40', 'POSS', 'PIE', 'GP_RANK',
           'W_RANK', 'L_RANK', 'W_PCT_RANK', 'MIN_RANK', 'OFF_RATING_RANK',
           'DEF_RATING_RANK', 'NET_RATING_RANK', 'AST_PCT_RANK', 'AST_TO_RANK',
           'AST_RATIO_RANK', 'OREB_PCT_RANK', 'DREB_PCT_RANK', 'REB_PCT_RANK',
           'TM_TOV_PCT_RANK', 'EFG_PCT_RANK', 'TS_PCT_RANK', 'PACE_RANK',
           'PIE_RANK']
    home_columns = [var + "_H" for var in columns]
    away_columns = [var + "_A" for var in columns]
    home_stats = np.empty((stats_df.shape[0], 51))
    away_stats = np.empty((stats_df.shape[0], 51))
    
    home_stats = np.array(stats_df[home_columns])
    away_stats = np.array(stats_df[away_columns])
    ####
    ##NEED TO FIX
    w_l_dict = {"W" : 1, "L" : 0}
    w_l_df = stats_df.replace({"WL_H": w_l_dict})
    print(w_l_df.columns)
    win_loss_home = np.array(w_l_df["WL_H"])
    matchups = stats_df[['TEAM_NAME_A', 'TEAM_NAME_H']]
    
    #standardize home and away stats 
    scaler = StandardScaler()
    final = np.subtract(home_stats, away_stats)
    final = scaler.fit_transform(final)
    final = pd.DataFrame(final, columns = columns)
   
    return final, win_loss_home, matchups

In [7]:
def generate_full_train_test(num_years):
    start_year = 2022 - num_years
    all_years_stats_df = pd.DataFrame()
    #For each year, 
    for i in range(start_year, 2022):
        time.sleep(1)
        games = leaguegamelog.LeagueGameLog(season = str(i))
        df = pd.DataFrame(games.get_data_frames()[0])
        df = df[df['WL'].notnull()]
        teams = df['TEAM_NAME'].unique()
        stats_df = generate_game_stats(df, teams, i)
        
        all_years_stats_df = pd.concat([all_years_stats_df, stats_df], axis = 0)
        
    attributes, target, matchups = normalize_and_standardize_sklearn(all_years_stats_df)
    
    train_X = attributes[:int(attributes.shape[0]*0.7)]
    test_X = attributes[int(attributes.shape[0]*0.7):]
    train_Y = target[:int(target.shape[0]*0.7)]
    test_Y = target[int(target.shape[0]*0.7):]
    
    matchups_train = matchups[:int(matchups.shape[0]*0.7)]
    matchups_test = matchups[int(matchups.shape[0]*0.7):]
    
    #return train_X, train_Y, test_X, test_Y
    return train_X, np.ravel(train_Y), test_X, np.ravel(test_Y), matchups_train, matchups_test

In [8]:
#From David Dale on stackoverflow
def logit_pvalue(model, x):
    """ Calculate z-scores for scikit-learn LogisticRegression.
    parameters:
        model: fitted sklearn.linear_model.LogisticRegression with intercept and large C
        x:     matrix on which the model was fit
    This function uses asymtptics for maximum likelihood estimates.
    """
    p = model.predict_proba(x)
    n = len(p)
    m = len(model.coef_[0]) + 1
    coefs = np.concatenate([model.intercept_, model.coef_[0]])
    x_full = np.matrix(np.insert(np.array(x), 0, 1, axis = 1))
    ans = np.zeros((m, m))
    for i in range(n):
        ans = ans + np.dot(np.transpose(x_full[i, :]), x_full[i, :]) * p[i,1] * p[i, 0]
    vcov = np.linalg.inv(np.matrix(ans))
    se = np.sqrt(np.diag(vcov))
    t =  coefs/se  
    p = (1 - norm.cdf(abs(t))) * 2
    return p

In [9]:
#Get rid of multicolinear variables and keep significant ones
def eliminate_attributes(train_X, test_X):
    X = pd.DataFrame(train_X)
    vif_data = pd.DataFrame()
    vif_data["feature"] = X.columns
    vif_data["VIF"] = [variance_inflation_factor(X.values, i)
                              for i in range(len(X.columns))]
    #print(vif_data)

    feat_tokeep = list(vif_data[vif_data["VIF"] < 5000]["feature"])
    
    #Modify training data to remove most multi colinear variables
    train_X = train_X[feat_tokeep]
    test_X = test_X[feat_tokeep]
    
    return train_X, test_X

In [26]:
def predict_odds(train_X, train_Y, test_X, matchups_test):
    clf = LogisticRegression(random_state=0, C=0.1).fit(train_X, train_Y)
    pred_prob = clf.predict_proba(test_X)
   
    pred_prob = pd.DataFrame(pred_prob, columns = ["Away", "Home"])    
    
    winners = []
    for i in range(pred_prob.shape[0]):
        if pred_prob.iloc[i, 1] >= 0.5:
            winners.append(matchups_test.iloc[i, 1])
            
        else:
            winners.append(matchups_test.iloc[i, 0])
        
    winners = pd.DataFrame(winners, columns = ["Winner"])
    return pd.concat([pred_prob, winners], axis = 1)

In [None]:
def get_upcoming_games():
    

In [32]:
games = leaguegamelog.LeagueGameLog(season = str(2021))
df = pd.DataFrame(games.get_data_frames()[0])

In [33]:
df

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS,PLUS_MINUS,VIDEO_AVAILABLE
0,22021,1610612747,LAL,Los Angeles Lakers,0022100002,2021-10-19,LAL vs. GSW,L,240,45,95,0.474,15,42,0.357,9,19,0.474,5,40,45,21,7,4,18,25,114,-7,1
1,22021,1610612744,GSW,Golden State Warriors,0022100002,2021-10-19,GSW @ LAL,W,240,41,93,0.441,14,39,0.359,25,30,0.833,9,41,50,30,9,2,17,18,121,7,1
2,22021,1610612751,BKN,Brooklyn Nets,0022100001,2021-10-19,BKN @ MIL,L,240,37,84,0.440,17,32,0.531,13,23,0.565,5,39,44,19,3,9,13,17,104,-23,1
3,22021,1610612749,MIL,Milwaukee Bucks,0022100001,2021-10-19,MIL vs. BKN,W,240,48,105,0.457,17,45,0.378,14,18,0.778,13,41,54,25,8,9,8,19,127,23,1
4,22021,1610612754,IND,Indiana Pacers,0022100003,2021-10-20,IND @ CHA,L,240,42,90,0.467,17,47,0.362,21,24,0.875,8,43,51,29,2,10,17,24,122,-1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2455,22021,1610612738,BOS,Boston Celtics,0022101223,2022-04-10,BOS @ MEM,W,240,54,99,0.545,18,48,0.375,13,13,1.000,14,42,56,34,5,2,15,20,139,29,1
2456,22021,1610612755,PHI,Philadelphia 76ers,0022101228,2022-04-10,PHI vs. DET,W,240,46,88,0.523,5,25,0.200,21,23,0.913,10,32,42,25,13,6,11,23,118,12,1
2457,22021,1610612765,DET,Detroit Pistons,0022101228,2022-04-10,DET @ PHI,L,240,38,83,0.458,11,34,0.324,19,29,0.655,15,27,42,26,4,4,20,16,106,-12,1
2458,22021,1610612750,MIN,Minnesota Timberwolves,0022101224,2022-04-10,MIN vs. CHI,L,240,46,91,0.505,11,31,0.355,17,22,0.773,9,23,32,30,7,9,13,23,120,-4,1


In [12]:
train_Y

array([0, 1, 0, ..., 0, 1, 0], dtype=int64)

In [11]:
train_X, train_Y, test_X, test_Y, matchups_train, matchups_test = generate_full_train_test(10)
train_X, test_X = eliminate_attributes(train_X, test_X)

Index(['FGM_H', 'FGA_H', 'FG_PCT_H', 'FG3M_H', 'FG3A_H', 'FG3_PCT_H', 'FTM_H',
       'FTA_H', 'FT_PCT_H', 'OREB_H',
       ...
       'REB_PCT_RANK_A', 'TM_TOV_PCT_RANK_A', 'EFG_PCT_RANK_A',
       'TS_PCT_RANK_A', 'PACE_RANK_A', 'PIE_RANK_A', 'GAME_DATE_A',
       'TEAM_NAME_A', 'MATCHUP_A', 'WL_A'],
      dtype='object', length=124)


  vif = 1. / (1. - r_squared_i)
  return 1 - self.ssr/self.uncentered_tss


In [27]:
predict_odds(train_X, train_Y, test_X, matchups_test)

Unnamed: 0,Away,Home,Winner
0,0.363878,0.636122,Chicago Bulls
1,0.351844,0.648156,LA Clippers
2,0.384905,0.615095,Philadelphia 76ers
3,0.595627,0.404373,Milwaukee Bucks
4,0.287498,0.712502,Indiana Pacers
...,...,...,...
3258,0.517776,0.482224,Boston Celtics
3259,0.359625,0.640375,New Orleans Pelicans
3260,0.241426,0.758574,Philadelphia 76ers
3261,0.290880,0.709120,Charlotte Hornets


In [28]:
#Fit model
clf = LogisticRegression(random_state=0, C=0.1).fit(train_X, train_Y)
y_pred = clf.predict(test_X)

In [29]:
#Train accuracy
clf.score(train_X, train_Y)

0.6525679758308157

In [30]:
#Test accuracy
accuracy_score(test_Y, y_pred)

0.6236592093165798

In [None]:
#Grid Search
lr = LogisticRegression()
grid_values = {'penalty': ['l1','l2'], 'C': [0.001,0.01,0.1,1,10,100,1000], 'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'], 'max_iter': [20, 50, 100, 200, 500, 1000]}
clf = GridSearchCV(lr, param_grid=grid_values)
clf.fit(train_X, train_Y)

In [None]:
clf.best_params_

In [None]:
#Test
games = leaguegamelog.LeagueGameLog(season = '2021')
games_df = pd.DataFrame(games.get_data_frames()[0])
#teams = df['TEAM_NAME'].unique()
#stats_df = generate_game_stats(df, teams)

In [None]:
#XGBoost Test
xgb_cl = xgb.XGBClassifier()
# Fit
xgb_cl.fit(train_X, train_Y)

In [None]:
# Predict
y_pred = xgb_cl.predict(test_X)
accuracy_score(test_Y, y_pred)

In [None]:
games = leaguegamelog.LeagueGameLog(season = '2018')
df = pd.DataFrame(games.get_data_frames()[0])



team_df = df[df['TEAM_NAME'] == 'San Antonio Spurs']
advanced_stats = get_advanced_stats(2018)
temp_df = team_df.merge(advanced_stats, on = ["GAME_ID", "TEAM_ID"], suffixes = ("", "_y"))
to_drop = [x for x in temp_df if x.endswith('_y')]
temp_df.drop(to_drop, axis=1, inplace=True)

In [None]:
#Things to do
# Add playoff data
# Add ELO Rating
# Add player-based statistics