In [1]:
import json
import os
import re
import time
from datetime import datetime, timedelta
from urllib.error import HTTPError

import bs4 as bs
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import sklearn

from sklearn.compose import make_column_transformer
# new import statements
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures, StandardScaler

from sklearn.decomposition import PCA

import random
stats = ['mp', 'fg', 'fga', 'fg3', 'fg3a', 'ft', 'fta', 'orb', 'drb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts', 'plus_minus']

# DF which holds data required for prediction is called 'predict_data'

In [44]:
def get_all_data(start_date, end_date):
    '''
    This function returns a dataframe of all data of games from start_date to end_date
    '''
    start = time.time()
    dates = pd.date_range(start_date, end_date, freq='MS').strftime("%Y-%B").tolist() # get months from which to retrieve data. 
    dates = [date.lower() for date in dates]
    all_dataframes = []
    missing_data = []

    for date in dates:
        path = 'data/' + date.split('-')[0] + '/' + date + '.csv'
        try:
            # Get all dataframes for each month.
            with open(path, 'r') as f:
                all_dataframes.append(pd.read_csv(f))
        except Exception as e:
            missing_data.append(date)
            continue

    if all_dataframes != []:
        df = pd.concat(all_dataframes)
    else:
        df = pd.DataFrame()
    df.columns = df.columns.str.strip() # vscode extension that adds whitespace for readability in csv. this undos it. 
    for col in df.columns:
        if type(df[col]) == str:
            df[col] = df[col].str.strip()

    df.fillna(0, inplace=True)

    return df

# get_all_data('2020-10-10', '2023-10-10')

In [None]:
player_averages = {} # year: {player: avg}

In [45]:
def get_player_avg(player_name, start_date, end_date, data):
    '''
    Gets a player's average stats over the timeframe from start_date to end_date. Note that this runtime is linearly proportional to the size of data, 
    so it is recommended to pass in a dataframe of only the relevant data. We may try binary searching through the dataframe, so that we get O(log n + k) runtime. 
    Where n is the size of data, and k is the number of games played by the player in the start_date to end_date timeframe. Data should already be sorted. 

    player_name: Name of player
    start_date: Start date. YYYY-MM-DD. datetime object
    end_date: End date. YYYY-MM-DD. datetime object
    data: Dataframe of all data/all data which may be relevant to previous func call.
    returns: A pandas series of the player's average stats over the timeframe.
    '''
    player_data = data[data['player'] == player_name]

    start_date = str(start_date)
    end_date = str(end_date)
    
    # compare dates so that we only get data from the timeframe we want.
    player_data = player_data[player_data['game_date'] >= start_date]
    player_data = player_data[player_data['game_date'] <= end_date]

    # get average stats
    global stats
    player_data = player_data[stats]

    return player_data.mean().to_frame().transpose().fillna(0)

get_player_avg('LeBron James', datetime(2019, 1, 1), datetime(2019, 12, 31), get_all_data('2019-01-01', '2023-12-31'))

Unnamed: 0,mp,fg,fga,fg3,fg3a,ft,fta,orb,drb,ast,stl,blk,tov,pf,pts,plus_minus
0,35.763782,9.865385,20.096154,2.288462,6.480769,3.653846,5.673077,1.096154,6.884615,11.038462,1.5,0.576923,3.711538,1.923077,25.673077,7.153846


In [51]:
def create_row_for_player_game(player, game, data, order_by):
    '''
    player: player name
    game: link to game. 
    data: raw data for all games in past (year)
    player_averages: dictionary of player averages for the past year. Reused for efficiency.
    returns a single row of data aggregated from the game. Includes stats from both teams. 
    '''
    num_chosen = 5 # Number of players to choose from each team. 
    global stats

    start = time.time()
    
    game_df = data[data['link'].str.strip() == game] # vscode extension that adds whitespace for readability in csv. strip undos it.
    same_team_abbr = game_df[game_df['player'].str.strip() == player]['team'].iloc[0] # same shit
    opp_team_abbr = game_df[game_df['team'].str.strip() != same_team_abbr]['team'].iloc[0]
    
    game_date = game_df['game_date'].iloc[0]
    day_before_game_date = datetime.strptime(game_date, '%Y-%m-%d') - timedelta(days=1) # Used to get average stats for all players over past year. 
    year_before_game_date = datetime.strptime(game_date, '%Y-%m-%d') - timedelta(days=365)

    opp_team_roster = list(game_df[game_df['team'] == opp_team_abbr]['player'])
    same_team_roster = list(game_df[game_df['team'] == same_team_abbr]['player'])
   
    # This section is veryyyy slow. Mostly because there are a lot of func calls. 
    opp_team_avgs = []
    same_team_avgs = []
    for player_name in opp_team_roster:
        player_avg = get_player_avg(player_name, year_before_game_date, day_before_game_date, data)
        opp_team_avgs.append(player_avg) 
    for player_name in same_team_roster:
        player_avg = get_player_avg(player_name, year_before_game_date, day_before_game_date, data)
        same_team_avgs.append(player_avg)
    opp_team_avgs = pd.concat(opp_team_avgs).nlargest(num_chosen, order_by)
    same_team_avgs = pd.concat(same_team_avgs).nlargest(num_chosen, order_by)

    avgs_and_player_game_stats = []
    
    opp_team_avg_wide = opp_team_avgs.stack().reset_index(drop=True).to_frame().transpose()
    new_columns = list(opp_team_avgs.columns)*num_chosen
    new_columns = ['opp' + str(((i)//16)+1) + '_avg_' + str(col) for i, col in enumerate(new_columns)]
    opp_team_avg_wide.columns = new_columns
    avgs_and_player_game_stats.append(opp_team_avg_wide)

    same_team_avg_wide = same_team_avgs.stack().reset_index(drop=True).to_frame().transpose()
    new_columns = list(same_team_avgs.columns)*num_chosen
    new_columns = ['same_avg' + str(((i)//16)+1) + '_avg_' + str(col) for i, col in enumerate(new_columns)]
    same_team_avg_wide.columns = new_columns
    avgs_and_player_game_stats.append(same_team_avg_wide)

    # Add player performance in this game to the avg stats
    player_df = game_df[game_df['player'].str.strip() == player]
    avgs_and_player_game_stats.append(player_df)
    avgs_and_player_game_stats = pd.concat(avgs_and_player_game_stats, axis = 0)
    avgs_and_player_game_stats = pd.DataFrame(avgs_and_player_game_stats.mean(numeric_only=True)).transpose() # mean to remove all nans because we'll have 10 rows due to column inconsistency.
    return avgs_and_player_game_stats
# data = get_all_data('2020-10-10', '2022-12-13')
# create_row_for_player_game('LeBron James', '/boxscores/202212130LAL.html', data, 'mp')

0.13040947914123535


Unnamed: 0,opp1_avg_mp,opp1_avg_fg,opp1_avg_fga,opp1_avg_fg3,opp1_avg_fg3a,opp1_avg_ft,opp1_avg_fta,opp1_avg_orb,opp1_avg_drb,opp1_avg_ast,...,orb,drb,trb,ast,stl,blk,tov,pf,pts,plus_minus
0,37.263468,9.232323,20.080808,3.191919,8.69697,5.909091,6.939394,1.080808,6.505051,4.949495,...,0.0,9.0,9.0,9.0,2.0,1.0,4.0,4.0,33.0,10.0


In [15]:
predict_data = None


In [5]:
def create_predict_data(player, game_date, order_by):
    '''
    This function will take in a player's name and return a dataframe of their stats and other stats which can be used to predict. 
    Predictors for now, include average stats of opponent players with x most minutes, and average stats of same team with x most minutes.
    Only uses data from the past year.
    player: player full name
    game_date: 'YYYY-MM-DD'
    order_by: Stat by which to order players for consistency against different teams. Position is also a good one but that sounds hard. 
    # return: dataframe with predictors and stats
    '''
    global predict_data
    
    # 2 years because we want average stats for all players, potentially including stats from the previous year. 
    start_date = datetime.strptime(
        game_date, '%Y-%m-%d') - pd.DateOffset(years=2)
    raw_data = get_all_data(start_date, game_date)

    # Select only data for games in which this player played.
    player_data = raw_data[(raw_data['player'] == player)]
    games = set(player_data['link'])

    game_dfs = []
    for game in games:
        game_dfs.append(create_row_for_player_game(
            player, game, raw_data, order_by))

    games_df = pd.DataFrame(pd.concat(game_dfs, axis=0).dropna(axis=0))

    # Keep track of these flags so that data does not have to be recreated constantly by predict.
    # This function takes a while to run (15sec) sooooo.
    games_df.player = player
    games_df.game_date = game_date
    games_df.order_by = order_by

    predict_data = games_df

In [6]:
def add_over_under_flag(stat, over_under):
    '''
    Add or mutate stat flag column to dataframe based on stat and over/under
    game_df: Dataframe formatted by create_predict_data
    stat: Stat to add or mutate flag column for
    over_under: Value which determines whether flag is 1 or 0
    '''
    global predict_data
    predict_data[f'{stat}_flag'] = predict_data[stat].apply(lambda x: 1 if int(x) > over_under else 0)

    flag_attribute = f'{stat}_over_under'
    predict_data.flag_attribute = over_under

In [18]:
def predict(player, game_date, order_by, stat, over_under):
    '''
    Predicts a stat for a given player or team on a given game date.
    # data: dataframe holding training data. Keep this out of the function to reduce runtime.
    # name: Name of player or abbreviation of team
    # game_date: Date of game. Format: 'YYYY-MM-DD'
    # stat: Stat to predict
    # order_by: Stat by which to order players for consistency against different teams
    # player_or_team: 'player' or 'team'
    # returns: predicted stat for that player/team on that date. 
    '''
   
    global predict_data
    global stats

    # See if new creation of data is necessary. Data is stored in predict_data, which might already hold 
    flag_attribute = f'{stat}_over_under'
    # Sometimes we might just be missing over/under. Other times the whole df might be wrong.
    if predict_data is not None and hasattr(predict_data, 'player') and hasattr(predict_data, 'game_date') and hasattr(predict_data, 'order_by') and predict_data.player == player and predict_data.game_date == game_date and predict_data.order_by == order_by:
        # The dataframe is correct.
        if hasattr(predict_data, flag_attribute) and predict_data.flag_attribute == over_under:
            # Dataframe correct, and over/under is correct.
            pass
        else:
            # Dataframe correct, but over/under is incorrect.
            add_over_under_flag(stat, over_under)
    else:
        create_predict_data(
            player, game_date, order_by)
        add_over_under_flag(stat, over_under)

    # 5 is the number of players for each team. 2 is the number of teams.
    xcols = predict_data.columns[:len(stats)*5*2]
    ycol = [f'{stat}_flag']  # stat to predict

    pipeline = Pipeline([
        ("pf", PolynomialFeatures(degree=1, include_bias=False)),
        ("std", StandardScaler()),
        ("lr", LogisticRegression(fit_intercept=False, max_iter=1000)),
    ])

    scores = cross_val_score(
        pipeline, predict_data[xcols], np.array(predict_data[ycol]).ravel(), cv=10)

    return (scores.mean(), scores.std())


predict('LeBron James', '2023-10-1', 'pts', 'drb', 9)


(0.5813186813186814, 0.09404436437509744)

In [19]:
all_games = get_all_data('1980-10-1', '2023-10-1')
all_players = tuple(all_games['player'])

In [21]:
def test_stat(stat):
    global stats
    global all_games
    global all_players
    global predict_data
    global cv_scores

    if os.path.exists(f'./test/{stat}.csv'):
        cv_scores = pd.read_csv(f'./test/{stat}.csv')
    else:
        cv_scores = pd.DataFrame(columns = ['player', 'game_date', 'order_by', 'stat', 'cross_val_score_mean', 'cross_val_score_std'], index = [0])

    # Nested lists are typically terrible, but we have limited range, and this reduces run-time by preventing the recreation of dataframes. 
    # Randomly select a player.
    for player in random.sample(all_players, 1):
        # Randomly select some games played by that player. 
        all_player_games = all_games[all_games['player'] == player]
        for game in random.sample(list(all_player_games['link']), 1):
            game_date = all_games[all_games['link'] == game]['game_date'].iloc[0]
            for order_by in stats:
                # Randomly select over/under values for that player. Represented by some mean + std. Runtime of this is very cheap, so why not lol.
                over_under_vals = np.random.normal(all_player_games[stat].mean(), all_player_games[stat].std()/2, 5)
                over_under_vals = [int(x) for x in over_under_vals]
                over_under_vals = [0 if x < 0 else x for x in over_under_vals]
                for over_under in over_under_vals: 
                    cv_score, cv_score_std = predict(player, game_date, order_by, stat, over_under)
                    cv_score = pd.DataFrame({'player': [player], 'game_date': [game_date], 'order_by': [order_by], 'stat': [stat], 'over_under': [over_under], 'cross_val_score_mean': [cv_score], 'cross_val_score_std': [cv_score_std]})
                    cv_scores = pd.concat([cv_scores, cv_score], axis=0)


    # Write cv_scores to csv.
    cv_scores.to_csv(f'cv_scores_{stat}.csv')
            
            

test_stat('pts')   

         player   game_date order_by stat  cross_val_score_mean  \
0  Jrue Holiday  2022-02-26       mp  pts              0.504412   

   cross_val_score_std  
0             0.156609  
         player   game_date order_by stat  cross_val_score_mean  \
0  Jrue Holiday  2022-02-26       mp  pts               0.45625   

   cross_val_score_std  
0              0.11787  
         player   game_date order_by stat  cross_val_score_mean  \
0  Jrue Holiday  2022-02-26       mp  pts              0.519853   

   cross_val_score_std  
0             0.097855  
         player   game_date order_by stat  cross_val_score_mean  \
0  Jrue Holiday  2022-02-26       mp  pts              0.492279   

   cross_val_score_std  
0             0.101931  
         player   game_date order_by stat  cross_val_score_mean  \
0  Jrue Holiday  2022-02-26       mp  pts              0.500735   

   cross_val_score_std  
0             0.117034  


KeyboardInterrupt: 

In [13]:
def test_model():
    '''
    Randomly sample player/player games, stat/stat overunder, and order_bys to see the best combinations for each stat. 
    '''
    global stats
    for stat in stats:
        optimize_stat(stat)
        

In [14]:
p = PCA()
xcols = data.columns[:-20]
ycol = 'drb'

W = p.fit_transform(data[xcols])
C = p.components_

p.explained_variance_ratio_

array([2.45608150e-01, 1.41280193e-01, 9.38738115e-02, 8.32967052e-02,
       5.05639740e-02, 3.92894561e-02, 3.23269079e-02, 2.99778411e-02,
       2.75314568e-02, 2.53510515e-02, 2.04198971e-02, 1.78216107e-02,
       1.67579338e-02, 1.54490960e-02, 1.31566138e-02, 1.19472328e-02,
       1.07315143e-02, 9.38861357e-03, 8.49556125e-03, 8.24508786e-03,
       7.64420943e-03, 6.74919855e-03, 6.46615923e-03, 6.29811189e-03,
       5.47461840e-03, 5.37269036e-03, 4.51669987e-03, 4.36277743e-03,
       3.94039993e-03, 3.61903193e-03, 3.49327885e-03, 3.41893945e-03,
       2.96327606e-03, 2.57609437e-03, 2.44094566e-03, 2.33250576e-03,
       2.04776448e-03, 1.96629615e-03, 1.77067351e-03, 1.60893485e-03,
       1.47832794e-03, 1.40357904e-03, 1.30021929e-03, 1.15528188e-03,
       1.10408860e-03, 1.04579260e-03, 9.73936635e-04, 9.21789131e-04,
       8.26877613e-04, 7.46328788e-04, 6.79485007e-04, 6.16134008e-04,
       6.07011959e-04, 5.65010038e-04, 4.83964386e-04, 4.49195212e-04,
      

In [29]:
a = ['a', 'b', 'c']
print(a[:2])

['a', 'b']
