In [3]:
from nba_api.stats.endpoints import leaguegamelog
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import time
import xgboost as xgb

In [4]:
#Can use this to get means and standard deviations
def generate_game_stats(df, teams): 
    #all_stats_arr = np.empty((30, 77, 19))
    all_teams_stats_df = pd.DataFrame()
    for i, team in enumerate(teams):
        temp_df = df[df['TEAM_NAME'] == team]
        temp_df = temp_df.sort_values(by = ['GAME_DATE'])
        temp_df = temp_df.set_index('GAME_ID')
        key_data = temp_df[['GAME_DATE', 'TEAM_NAME', 'MATCHUP', 'WL']].iloc[5:]
        temp_df = temp_df[[
           'FGM', 'FGA', 'FG_PCT', 'FG3M',
           'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST',
           'STL', 'BLK', 'TOV', 'PF', 'PTS', 'PLUS_MINUS']]
        #final_stats_df = temp_df.rolling(5).mean().shift(periods = 1).iloc[5:]#.to_dict('list')
        #single_stats_arr = final_stats_df.to_numpy()
        #all_stats_arr[i] = single_stats_arr
        
        final_organized_stats = temp_df.rolling(5).mean().shift(periods = 1).iloc[5:]#.to_dict('index')
        stats_with_key = pd.concat([final_organized_stats, key_data], axis = 1)
     
        all_teams_stats_df = pd.concat([all_teams_stats_df, stats_with_key], axis = 0)
        
    home_df = all_teams_stats_df[all_teams_stats_df['MATCHUP'].str.contains("vs.")]
    away_df = all_teams_stats_df[all_teams_stats_df['MATCHUP'].str.contains("@")]
    merged_game_stats_df = home_df.merge(away_df, on = "GAME_ID",suffixes = ("_H", "_A"))
    merged_game_stats_df = merged_game_stats_df.sort_values(by = ['GAME_DATE_H'])
      
    return merged_game_stats_df

In [5]:
#Function to get population mean and standard deviations - might be stupid but I think it makes sense
#Make sure to use this on train and test set separately 
#After this, make these stats for each game their own matrices, so I could do (game_matrix - mean_matrix / std_matrix)
def generate_population_statistics(stats_df):
    game_means = np.empty((stats_df.shape[0], 19))
    game_stds = np.empty((stats_df.shape[0], 19))
    
    for i in range(stats_df.shape[0]):
        inter_stats_df = stats_df[stats_df['GAME_DATE_H'].str.contains(stats_df.iloc[i]['GAME_DATE_H'][0:4])]
        inter_index = stats_df.index[i]
        it1 = np.where(inter_stats_df.index == inter_index)[0][0]
        team_dict = {}
        pop_stats_arr = np.empty((30, 19))
        it2 = it1+1
        team_it = 0
    
        while len(team_dict.keys()) < 30:

            if it1 >= 0:

                if inter_stats_df.iloc[it1]['TEAM_NAME_H'] not in team_dict.keys():  
                    team_dict[inter_stats_df.iloc[it1]['TEAM_NAME_H']] = ''

                    pop_stats_arr[team_it] = inter_stats_df.iloc[it1][['FGM_H', 'FGA_H', 'FG_PCT_H', 'FG3M_H', 'FG3A_H', 'FG3_PCT_H', 'FTM_H',
                   'FTA_H', 'FT_PCT_H', 'OREB_H', 'DREB_H', 'REB_H', 'AST_H', 'STL_H',
                   'BLK_H', 'TOV_H', 'PF_H', 'PTS_H', 'PLUS_MINUS_H']].to_numpy()
                    team_it += 1

                if inter_stats_df.iloc[it1]['TEAM_NAME_A'] not in team_dict.keys():
                    team_dict[inter_stats_df.iloc[it1]['TEAM_NAME_A']] = ''

                    pop_stats_arr[team_it] = inter_stats_df.iloc[it1][['FGM_A', 'FGA_A', 'FG_PCT_A',
                   'FG3M_A', 'FG3A_A', 'FG3_PCT_A', 'FTM_A', 'FTA_A', 'FT_PCT_A', 'OREB_A',
                   'DREB_A', 'REB_A', 'AST_A', 'STL_A', 'BLK_A', 'TOV_A', 'PF_A', 'PTS_A',
                   'PLUS_MINUS_A']].to_numpy()
                    team_it += 1

                it1 -= 1    

            if it2 < inter_stats_df.shape[0]:
                if inter_stats_df.iloc[it2]['TEAM_NAME_H'] not in team_dict.keys():  
                    team_dict[inter_stats_df.iloc[it2]['TEAM_NAME_H']] = ''

                    pop_stats_arr[team_it] = inter_stats_df.iloc[it2][['FGM_H', 'FGA_H', 'FG_PCT_H', 'FG3M_H', 'FG3A_H', 'FG3_PCT_H', 'FTM_H',
                   'FTA_H', 'FT_PCT_H', 'OREB_H', 'DREB_H', 'REB_H', 'AST_H', 'STL_H',
                   'BLK_H', 'TOV_H', 'PF_H', 'PTS_H', 'PLUS_MINUS_H']].to_numpy()
                    team_it += 1

                if inter_stats_df.iloc[it2]['TEAM_NAME_A'] not in team_dict.keys():
                    team_dict[inter_stats_df.iloc[it2]['TEAM_NAME_A']] = ''

                    pop_stats_arr[team_it] = stats_df.iloc[it2][['FGM_A', 'FGA_A', 'FG_PCT_A',
                   'FG3M_A', 'FG3A_A', 'FG3_PCT_A', 'FTM_A', 'FTA_A', 'FT_PCT_A', 'OREB_A',
                   'DREB_A', 'REB_A', 'AST_A', 'STL_A', 'BLK_A', 'TOV_A', 'PF_A', 'PTS_A',
                   'PLUS_MINUS_A']].to_numpy()
                    team_it += 1

                it2 += 1
                
        mean_stats = np.mean(pop_stats_arr, axis = 0)        
        std_stats = np.std(pop_stats_arr, axis = 0)
        
        game_means[i] = mean_stats
        game_stds[i] = std_stats
        
    return game_means, game_stds    
    

In [6]:
#Function to get z scores and finalize attributes as (Home minus Away)
def normalize_and_standardize(stats_df, game_means, game_stds):
   
    home_stats = np.empty((stats_df.shape[0], 19))
    away_stats = np.empty((stats_df.shape[0], 19))
    
    home_stats = stats_df.iloc[:, 0:19]
    away_stats = stats_df.iloc[:, 23:42]
    win_loss_home = stats_df.iloc[:, 22]
    matchup = stats_df.iloc[:, [43, 20]]
    
    #standardize home and away stats 
    home_z = np.divide((home_stats - game_means), game_stds)
    
    away_z = np.divide((away_stats - game_means), game_stds)
    
    final = np.subtract(home_z, away_z)
    
    final = pd.DataFrame(final)

    final = pd.concat([final, win_loss_home], axis = 1)
    
    return final, matchup

In [7]:
def normalize_and_standardize_sklearn(stats_df):
    
    home_stats = np.empty((stats_df.shape[0], 19))
    away_stats = np.empty((stats_df.shape[0], 19))
    
    home_stats = stats_df.iloc[:, 0:19]
    away_stats = stats_df.iloc[:, 23:42]
    win_loss_home = stats_df.iloc[:, 22]
    matchup = stats_df.iloc[:, [43, 20]]
    
    #standardize home and away stats 
    scaler = StandardScaler()
    home_z = scaler.fit_transform(home_stats)
    
    away_z = scaler.fit_transform(away_stats)
    
    final = np.subtract(home_z, away_z)
    
    final = pd.DataFrame(final)

    #final = pd.concat([final, win_loss_home], axis = 1)
    
    return final, win_loss_home, matchup

In [22]:
#Include num_years previous years plus current season games
def generate_full_train_test(num_years):
    start_year = 2022 - num_years
    all_years_stats_df = pd.DataFrame()
    #For each year, 
    for i in range(start_year, 2022):
        time.sleep(1)
        games = leaguegamelog.LeagueGameLog(season = str(i))
        df = pd.DataFrame(games.get_data_frames()[0])
        df = df[df['WL'].notnull()]
        teams = df['TEAM_NAME'].unique()
        stats_df = generate_game_stats(df, teams)
        
        all_years_stats_df = pd.concat([all_years_stats_df, stats_df], axis = 0)
        
    
    #Split data into train and test and transform
    train_stats = all_years_stats_df.iloc[:int(all_years_stats_df.shape[0]*0.7), :]
    test_stats = all_years_stats_df.iloc[int(all_years_stats_df.shape[0]*0.7):, :]

    game_means_train, game_stds_train = generate_population_statistics(train_stats)
    game_means_test, game_stds_test = generate_population_statistics(test_stats)
    
    #Finalize and prepare train / test
    final_train, matchup_train = normalize_and_standardize(train_stats, game_means_train, game_stds_train)
    final_test, matchup_test = normalize_and_standardize(test_stats, game_means_test, game_stds_test)
    
    #Prepare actual train and test
    train_X = final_train.iloc[:,:19]
    train_Y = final_train.iloc[:,19]

    test_X = final_test.iloc[:, :19]
    test_Y = final_test.iloc[:,19]
    
    return train_X, train_Y, test_X, test_Y

In [20]:
def generate_full_train_test_sklearn(num_years):
    start_year = 2022 - num_years
    all_years_stats_df = pd.DataFrame()
    #For each year, 
    for i in range(start_year, 2022):
        time.sleep(1)
        games = leaguegamelog.LeagueGameLog(season = str(i))
        df = pd.DataFrame(games.get_data_frames()[0])
        df = df[df['WL'].notnull()]
        teams = df['TEAM_NAME'].unique()
        stats_df = generate_game_stats(df, teams)
        
        all_years_stats_df = pd.concat([all_years_stats_df, stats_df], axis = 0)
        
    
    #Split data into train and test and transform
    train_stats = all_years_stats_df.iloc[:int(all_years_stats_df.shape[0]*0.7), :]
    test_stats = all_years_stats_df.iloc[int(all_years_stats_df.shape[0]*0.7):, :]

    #Finalize and prepare train / test
    train_X, train_Y, matchup_train = normalize_and_standardize_sklearn(train_stats)
    test_X, test_Y, matchup_test = normalize_and_standardize_sklearn(test_stats)
    
    return train_X, train_Y, test_X, test_Y

In [29]:
train_X, train_Y, test_X, test_Y = generate_full_train_test(5)

  final = np.subtract(home_z, away_z)
  final = np.subtract(home_z, away_z)


In [23]:
train_X, train_Y, test_X, test_Y = generate_full_train_test_sklearn(5)

In [15]:
#Test
games = leaguegamelog.LeagueGameLog(season = '2021')
df = pd.DataFrame(games.get_data_frames()[0])
teams = df['TEAM_NAME'].unique()
stats_df = generate_game_stats(df, teams)

In [35]:
#Fit model
clf = LogisticRegression(random_state=420).fit(train_X, train_Y)
y_pred = clf.predict(test_X)

In [36]:
#Train accuracy
clf.score(train_X, train_Y)

0.625692429438143

In [37]:
#Test accuracy
accuracy_score(test_Y, y_pred)

0.6002460024600246

In [None]:
dumby_check = np.repeat('W', 1626)

In [None]:
#Winners edge is ~ 5%
accuracy_score(test_Y, dumby_check)

In [None]:
y_pred.shape

In [33]:
#XGBoost Test
xgb_cl = xgb.XGBClassifier()
# Fit
xgb_cl.fit(train_X, train_Y)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=12,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [34]:
# Predict
y_pred = xgb_cl.predict(test_X)
accuracy_score(test_Y, y_pred)

0.5596555965559655

In [None]:
#Pull and store data
#Modify this to concat yearly statistics for larger training set
games = leaguegamelog.LeagueGameLog(season = '2018')
df = pd.DataFrame(games.get_data_frames()[0])
teams = df['TEAM_NAME'].unique()

In [None]:
#Get rolling averages and join like games into (home, away)
stats_df = generate_game_stats(df, teams)

In [None]:
np.where(stats_df.index == '0021801228')[0][0]

In [None]:
stats_df['GAME_DATE_H'].str.contains(stats_df.iloc[2]['GAME_DATE_H'][0:4]) 

In [None]:
#Split data into train and test and transform
train_stats = stats_df.iloc[:int(stats_df.shape[0]*0.7), :]
test_stats = stats_df.iloc[int(stats_df.shape[0]*0.7):, :]

game_means_train, game_stds_train = generate_population_statistics(train_stats)
game_means_test, game_stds_test = generate_population_statistics(test_stats)

In [None]:
#Finalize and prepare train / test
final_train, matchup_train = normalize_and_standardize(train_stats, game_means_train, game_stds_train)
final_test, matchup_test = normalize_and_standardize(test_stats, game_means_test, game_stds_test)

In [None]:
#Prepare actual train and test
train_X = final_train.iloc[:,:19]
train_Y = final_train.iloc[:,19]

test_X = final_test.iloc[:, :19]
test_Y = final_test.iloc[:,19]

In [None]:
train_X

In [None]:
#Fit model
clf = LogisticRegression(random_state=420).fit(train_X, train_Y)
y_pred = clf.predict(test_X)

In [None]:
#Train accuracy
clf.score(train_X, train_Y)

In [None]:
#Test accuracy
accuracy_score(test_Y, y_pred)

In [None]:
clf.predict([[-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3]])

In [None]:
train_X

In [None]:
#Next steps
#See if more data helps
#See if more attributes help (advanced stats)
#See if different rolling averages have different performance
#Try ML (XGBoost)

In [None]:
stats_df.iloc[:, [20,43]]

In [None]:
stats_df.columns

In [None]:
game_means, game_stds = generate_population_statistics(stats_df)

In [None]:
game_means

In [None]:
final, matchup = normalize_and_standardize(stats_df, game_means, game_stds)

In [None]:
matchup

In [None]:
#Prepare training and test data



In [None]:
stats_arr

In [None]:
#Function to flatten stats dict and find averages. Would show average and std dev. of team atttributes for 5th, 
# 6th... 82nd game

num_teams = teams.size

In [None]:
#test
team = 'Oklahoma City Thunder'
temp_df = df[df['TEAM_NAME'] == team]
temp_df = temp_df.sort_values(by = ['GAME_DATE'])
temp_df = temp_df.set_index('GAME_ID')
temp_df = temp_df[[
           'FGM', 'FGA', 'FG_PCT', 'FG3M',
           'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST',
           'STL', 'BLK', 'TOV', 'PF', 'PTS', 'PLUS_MINUS']]
final_stats_df = temp_df.rolling(5).mean().shift(periods = 1).iloc[5:]
single_stats_arr = final_stats_df.to_numpy()
all_stats_arr = np.empty((30, 77, 19))


In [None]:
stats_calc_dict = {}
descriptive_dict = {}
all_stats_arr = np.empty((30, 77, 19))
team = 'Oklahoma City Thunder'
temp_df = df[df['TEAM_NAME'] == team]
temp_df = temp_df.sort_values(by = ['GAME_DATE'])
temp_df = temp_df.set_index('GAME_ID')
key_data = temp_df[['TEAM_NAME', 'MATCHUP', 'WL']].iloc[5:]
temp_df = temp_df[[
           'FGM', 'FGA', 'FG_PCT', 'FG3M',
           'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST',
           'STL', 'BLK', 'TOV', 'PF', 'PTS', 'PLUS_MINUS']]
final_stats_df = temp_df.rolling(5).mean().shift(periods = 1).iloc[5:]
single_stats_arr = final_stats_df.to_numpy()
all_stats_arr[0] = single_stats_arr
        
final_organized_stats = temp_df.rolling(5).mean().shift(periods = 1).iloc[5:]
stats_with_key = pd.concat([final_organized_stats, key_data], axis = 1)

In [None]:
all_teams_stats_df = pd.DataFrame()

In [None]:
all_teams_stats_df = pd.concat([all_teams_stats_df, stats_with_key], axis = 0)

In [None]:
temp_df = df[df['TEAM_NAME'] == team]
temp_df.head(10)

In [None]:
stats_with_key

In [None]:
final_organized_stats = temp_df.rolling(5).mean().shift(periods = 1).iloc[5:]

In [None]:
final_organized_stats

In [None]:
final_stats_df.to_numpy().shape

In [None]:
df.from_dict(dict2['Oklahoma City Thunder'], orient = 'index')

In [None]:
dict1, dict2 = generate_team_stats(df, teams)

In [None]:
dict2

In [None]:
temp_df = df[df['TEAM_NAME'] == 'Oklahoma City Thunder']

In [None]:
temp_df = temp_df.set_index('GAME_ID')

In [None]:
game_ids = temp_df['GAME_ID']

In [None]:
temp_df = temp_df[[
           'FGM', 'FGA', 'FG_PCT', 'FG3M',
           'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST',
           'STL', 'BLK', 'TOV', 'PF', 'PTS', 'PLUS_MINUS']]
temp_df.rolling(5).mean().shift(periods = 1).iloc[5:]

In [None]:
game_ids.shift(periods = 5).iloc[5:]

In [None]:
temp_df = temp_df.sort_values(by = ['GAME_DATE'])

In [None]:
temp_df = temp_df[[
       'FGM', 'FGA', 'FG_PCT', 'FG3M',
       'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST',
       'STL', 'BLK', 'TOV', 'PF', 'PTS', 'PLUS_MINUS']]

In [None]:
temp_df.rolling(5).mean().shift(periods = 1).tail(10)

In [None]:
df.head(5)

In [None]:
#Split data frames into home and away teams
home_df = df[df['MATCHUP'].str.contains("vs.")]
away_df = df[df['MATCHUP'].str.contains("@")]

In [None]:
merged = home_df.merge(away_df, on = "GAME_ID",suffixes = ("_H", "_A"))

In [None]:
merged

In [None]:
#make list of all team names, for loop filter by team n times, and make the rolling dict values?
# Can then go in and replace values/make new df with the normalized vals