In [15]:
import json
import os
import random
import re
import time
import warnings
from datetime import datetime, timedelta
from urllib.error import HTTPError

import bs4 as bs
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import sklearn

from sklearn.compose import make_column_transformer
from sklearn.decomposition import PCA
# new import statements
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures, StandardScaler
# Box-score column names that get_player_avg averages per player.
# The functions below read this list as a module-level global.
stats = ['mp', 'fg', 'fga', 'fg3', 'fg3a', 'ft', 'fta', 'orb', 'drb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts', 'plus_minus']

In [3]:
def get_all_data(start_date, end_date):
    '''
    Load every monthly CSV that overlaps the window from start_date to end_date.

    Files are read from 'data/<year>/<year>-<month name>.csv' (lower-case month name).
    Whole months are returned; rows are NOT filtered down to the exact day range.

    # start_date: First day of interest. Anything pd.Timestamp accepts (string or datetime).
    # end_date: Last day of interest. Anything pd.date_range accepts.
    # returns: One dataframe with all rows of the months that could be read. Column names
    #          and string values are whitespace-stripped, missing values are filled with 0,
    #          and the index is a fresh 0..n-1 range.
    # raises: ValueError if not a single month file could be read.
    '''
    # freq='MS' only yields month starts that fall inside [start, end]. Snap the start to
    # day 1 so that a mid-month start_date does not silently drop its own month.
    first_month = pd.Timestamp(start_date).normalize().replace(day=1)
    months = [month.lower() for month in pd.date_range(first_month, end_date, freq='MS').strftime("%Y-%B")]

    monthly_frames = []
    skipped_months = []
    for month in months:
        path = 'data/' + month.split('-')[0] + '/' + month + '.csv'
        try:
            with open(path, 'r') as f:
                frame = pd.read_csv(f)
        except (OSError, UnicodeDecodeError, pd.errors.EmptyDataError, pd.errors.ParserError):
            # Best effort: a missing or unreadable month is skipped, but reported below.
            skipped_months.append(month)
            continue
        # vscode extension adds whitespace for readability in csv. Undo it per file, before
        # concatenating, so 'pts' and ' pts ' do not end up as two different columns.
        frame.columns = frame.columns.str.strip()
        monthly_frames.append(frame)

    if not monthly_frames:
        raise ValueError(
            f'No data files could be read for {start_date} to {end_date} '
            f'({len(skipped_months)} month file(s) missing or unreadable under data/).'
        )
    if skipped_months:
        preview = ', '.join(skipped_months[:5]) + (', ...' if len(skipped_months) > 5 else '')
        warnings.warn(
            f'get_all_data: skipped {len(skipped_months)} of {len(months)} month file(s): {preview}',
            stacklevel=2,
        )

    df = pd.concat(monthly_frames, ignore_index=True)

    # Strip the padding from string cells as well. Non-string cells (NaN, numbers inside a
    # mixed column) are passed through untouched instead of being turned into NaN.
    for col in df.select_dtypes(exclude='number').columns:
        df[col] = df[col].map(lambda value: value.strip() if isinstance(value, str) else value)

    return df.fillna(0)

# raw = get_all_data('2022-9-1', '2023-2-28')
# raw

In [4]:
def get_player_avg(player_name, start_date, end_date, data):
    '''
    Gets a player's average stats over the timeframe from start_date to end_date (both inclusive).
    # player_name: Name of player, matched exactly against data['player'].
    # start_date: Start date. Date string or datetime object.
    # end_date: End date. Date string or datetime object.
    # data: Dataframe of all data. Needs 'player', 'game_date' and every column listed in the global `stats`.
    # returns: A one-row dataframe with the mean of each `stats` column over the timeframe.
    #          All zeros when the player has no games in the timeframe.
    '''
    # Compare real timestamps, not strings. String comparison breaks in two ways:
    # str(datetime) appends ' 00:00:00', which sorts AFTER the bare date and so excludes the
    # start day, and non-padded dates such as '2023-5-17' do not sort chronologically.
    window_start = pd.to_datetime(start_date)
    window_end = pd.to_datetime(end_date)

    player_data = data[data['player'] == player_name]
    # Unparseable dates (e.g. the 0 that fillna puts into empty cells) become NaT and fall outside any window.
    game_dates = pd.to_datetime(player_data['game_date'].astype(str).str.strip(), errors='coerce')
    in_window = (game_dates >= window_start) & (game_dates <= window_end)

    # get average stats
    return player_data.loc[in_window, stats].mean().to_frame().transpose().fillna(0)

# get_player_avg('LeBron James', '2020-10-10', '2023-5-17', raw)

In [5]:
def create_row_for_player_game(player, game, data, order_by):
    '''
    Build one feature row for `player` in `game`.

    player: player name
    game: link to game.
    data: raw data for all games in past (year)
    order_by: stat column used to pick and rank the top players of each team.
    returns a single-row dataframe. Columns, in order:
      - 'opp<k>_avg_<stat>': trailing averages of the opposing team's k-th ranked player,
      - 'same_avg<k>_avg_<stat>': the same for the player's own team,
      - the numeric columns of the player's own box-score row for this game.
    Averages cover the 365 days before the game up to the day before it.
    raises IndexError if the player has no row in the game.
    '''
    num_chosen = 5 # Number of players to choose from each team.

    game_df = data[data['link'].str.strip() == game] # vscode extension that adds whitespace for readability in csv. strip undos it.
    # Strip once and use the stripped values for every comparison, so a padded team
    # abbreviation can never be compared against an unpadded one.
    players = game_df['player'].str.strip()
    teams = game_df['team'].str.strip()

    is_player = players == player
    if not is_player.any():
        raise IndexError(f'{player!r} has no row in game {game!r}')
    same_team_abbr = teams[is_player].iloc[0]
    opp_team_abbr = teams[teams != same_team_abbr].iloc[0]

    game_day = datetime.strptime(str(game_df['game_date'].iloc[0]).strip(), '%Y-%m-%d')
    day_before_game_date = game_day - timedelta(days=1)
    year_before_game_date = game_day - timedelta(days=365)

    def _top_averages_wide(roster, prefix):
        '''Average every roster member, keep the top num_chosen by order_by, flatten to one row.'''
        averages = pd.concat(
            [get_player_avg(name, year_before_game_date, day_before_game_date, data) for name in roster],
            ignore_index=True,
        ).nlargest(num_chosen, order_by)
        # Name columns from the frame's real shape instead of assuming 16 stats and exactly
        # num_chosen players; a short roster simply yields fewer columns.
        names = [
            prefix + str(rank) + '_avg_' + str(col)
            for rank in range(1, len(averages) + 1)
            for col in averages.columns
        ]
        return pd.DataFrame([averages.to_numpy().ravel()], columns=names)

    # Rosters keep the names exactly as stored so that get_player_avg can match them in `data`.
    opp_team_roster = list(game_df.loc[teams == opp_team_abbr, 'player'])
    same_team_roster = list(game_df.loc[teams == same_team_abbr, 'player'])

    avgs_and_player_game_stats = pd.concat(
        [
            _top_averages_wide(opp_team_roster, 'opp'),
            _top_averages_wide(same_team_roster, 'same_avg'),
            game_df[is_player], # player performance in this game
        ],
        axis=0,
    )
    # The three pieces share no columns, so the stacked frame is mostly NaN. Taking the
    # column mean collapses it to one row and drops the non-numeric columns.
    return pd.DataFrame(avgs_and_player_game_stats.mean(numeric_only=True)).transpose()

# data = get_all_data('2021-2-2', '2023-2-2')
# a = create_row_for_player_game('LeBron James', '/boxscores/202210180GSW.html', data, 'drb')
# a


In [8]:
def create_data_for_player_prediction(player, game_date, order_by, stat, over_under):
    '''
    Build the training frame for one player: one row per game the player appears in.
    Predictors for now, include average stats of opponent players ranked by order_by, and average stats of same team ranked by order_by.
    Loads the two years of data before game_date, because each game row needs averages from the year before it.
    player: player full name
    game_date: 'YYYY-MM-DD'
    order_by: Stat by which to order players for consistency against different teams. Position is also a good one but that sounds hard.
    stat: Column to turn into the binary target.
    over_under: Threshold for the target: 1 if the integer part of `stat` is greater than over_under, else 0.
    # return: dataframe with predictors and stats, with `stat` replaced by the 0/1 target.
    #         The arguments are also stored on the frame as plain attributes (see below).
    # raises: ValueError if the player has no game in the loaded data.
    '''
    # Get all data to be used for prediction.
    # 2 years because we need avg stats for games played 1 yr ago.
    start_date = datetime.strptime(game_date, '%Y-%m-%d') - pd.DateOffset(years=2)
    raw_data = get_all_data(start_date, game_date)

    # Select only games in which this player played. drop_duplicates keeps first-seen order,
    # so the row order is reproducible between kernels (a set() of strings is not).
    games = raw_data.loc[raw_data['player'] == player, 'link'].drop_duplicates().tolist()
    if not games:
        raise ValueError(f'No games found for {player!r} in the two years before {game_date}')

    game_rows = [create_row_for_player_game(player, game, raw_data, order_by) for game in games]

    # Rows with any NaN (e.g. a game whose column set differs from the others) are dropped.
    game_df = pd.concat(game_rows, axis=0).dropna(axis=0).reset_index(drop=True)

    # astype(int) truncates toward zero, exactly like int(x) in the comparison it replaces.
    game_df[stat] = (game_df[stat].astype(int) > over_under).astype(int)

    # Keep track of these flags so that data does not have to be recreated constantly by predict.
    # This function takes a while to run (15sec) sooooo.
    # NOTE(review): these are ordinary Python attributes on the object, not pandas metadata;
    # presumably they do not survive operations that return a new frame - verify before relying on them.
    game_df.player = player
    game_df.game_date = game_date
    game_df.order_by = order_by
    game_df.stat = stat
    game_df.over_under = over_under

    return game_df

In [9]:
# Build the training frame once and keep it in the global `data`; predict() reuses it
# while the flags stored on it match the arguments it is called with.
# NOTE(review): over_under is 8 here but the predict() call further down passes 9, so that
# call rebuilds the frame anyway - confirm which threshold is intended.
data = create_data_for_player_prediction('LeBron James', '2023-10-1', 'pts', 'drb', 8)

In [131]:
def _cached_frame_matches(frame, expected_flags):
    '''Return True when every expected flag is stored on `frame` as an attribute with an equal value.'''
    for flag, wanted in expected_flags.items():
        stored = getattr(frame, flag, None)
        # A dataframe column that happens to share the flag's name comes back as a Series;
        # that is data, not a cache flag, and must not be compared with ==.
        if isinstance(stored, (pd.Series, pd.DataFrame)):
            return False
        if stored is None or stored != wanted:
            return False
    return True


def predict(player, game_date, order_by, stat, over_under, random_state=None):
    '''
    Fits an over/under classifier for one player and stat, and reports how well it scores.
    Reuses the global `data` frame when it was built for the same arguments, otherwise rebuilds it.
    # player: Name of player
    # game_date: Date of game. Format: 'YYYY-MM-DD'
    # order_by: Stat by which to order players for consistency against different teams
    # stat: Stat to predict (already encoded as 0/1 in the training frame)
    # over_under: Threshold the stat is compared against
    # random_state: Optional seed for the train/test split. None (default) keeps the split random.
    # returns: mean 10-fold cross-validation score. The held-out score and this mean are also printed.
    '''
    # See if new creation of data is necessary.
    global data
    expected_flags = {
        'player': player,
        'game_date': game_date,
        'order_by': order_by,
        'stat': stat,
        'over_under': over_under,
    }
    # globals().get avoids a NameError when no `data` frame exists yet (fresh kernel).
    cached = globals().get('data')
    if cached is None or not _cached_frame_matches(cached, expected_flags):
        data = create_data_for_player_prediction(player, game_date, order_by, stat, over_under)

    # Features are the team-average columns, recognised by the names create_row_for_player_game
    # gives them. The player's own stats for that game can't be used for prediction.
    xcols = [col for col in data.columns if re.match(r'(?:opp|same_avg)\d+_avg_', str(col))]
    if not xcols:
        # Frame was not built by the helper above: fall back to "everything but the last 20 columns".
        xcols = list(data.columns[:-20])

    pipeline = Pipeline([
        ("pf", PolynomialFeatures(degree=1, include_bias=False)),
        ("std", StandardScaler()),
        ("lr", LogisticRegression(fit_intercept=False, max_iter=1000)),
    ])

    train, test = train_test_split(data, test_size=0.25, random_state=random_state)

    pipeline.fit(train[xcols], train[stat].to_numpy())
    print(pipeline.score(test[xcols], test[stat].to_numpy()))

    scores = cross_val_score(pipeline, data[xcols], data[stat].to_numpy(), cv=10)
    print(scores.mean())
    return scores.mean()

# Evaluate the 'drb' over/under 9 model for LeBron James; predict() prints the held-out
# score followed by the mean 10-fold cross-validation score.
predict('LeBron James', '2023-10-1', 'pts', 'drb', 9)


0.5294117647058824
0.5450549450549451


In [10]:
# Empty results table, one row per evaluated (player, game_date, order_by, stat) combination.
# Nothing in the visible cells writes to it yet.
cv_scores = pd.DataFrame(columns = ['player', 'game_date', 'order_by', 'stat', 'cross_val_score'])

In [11]:
# Load every month file that exists between October 1980 and October 2023.
all_games = get_all_data('1980-10-1', '2023-10-1')
# One entry per row of all_games, so a name repeats once for every game row it appears in.
all_players = tuple(all_games['player'])

In [18]:
def test_stat(stat):
    # stats = ['mp', 'fg', 'fga', 'fg3', 'fg3a', 'ft', 'fta', 'orb', 'drb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts', 'plus_minus']
    global stats
    global all_games
    global all_players
    for stat in stats:
        for i in range(10):
            # Randomly select a player, and do a couple games with over/under values which represents that players mean performance.
            player = all_players[random.randint(1, len(all_players) - 1)]
            

# Run test_stat for points.
test_stat('pts')   

In [None]:
def optimize_model():
    '''
    Run optimize_stat once for every name in the global `stats` list, in list order.
    Intended to find the best order_by variable for each stat.
    NOTE(review): optimize_stat is not defined in the visible cells - confirm it exists.
    '''
    for stat_name in stats:
        optimize_stat(stat_name)
        

In [136]:
# Principal-component analysis of the training frame's feature columns.
p = PCA()  # no arguments, so scikit-learn's defaults apply
xcols = data.columns[:-20]  # everything except the last 20 columns (assumed to be the player's own game stats - TODO confirm)
ycol = 'drb'  # not used again in this cell

W = p.fit_transform(data[xcols])  # fits the PCA and returns the transformed data
C = p.components_  # component vectors of the fitted PCA

# NOTE(review): data[xcols] goes in without the StandardScaler step that predict() applies - confirm that is intended.
p.explained_variance_ratio_  # last expression of the cell, so the notebook displays it

array([2.74419356e-01, 1.34248513e-01, 9.77373287e-02, 5.68839094e-02,
       4.41762686e-02, 3.63980889e-02, 3.37267210e-02, 3.11827308e-02,
       2.85892634e-02, 2.29903368e-02, 2.01163173e-02, 1.90506478e-02,
       1.75054277e-02, 1.47883655e-02, 1.38121769e-02, 1.26674711e-02,
       1.05795147e-02, 9.84247357e-03, 9.32091581e-03, 8.59661908e-03,
       7.58761649e-03, 7.27779503e-03, 7.21862980e-03, 6.21784878e-03,
       6.07192866e-03, 5.08030447e-03, 4.90792786e-03, 4.48145676e-03,
       4.24421204e-03, 3.99119977e-03, 3.84304051e-03, 3.49459086e-03,
       2.91071325e-03, 2.76938065e-03, 2.62182244e-03, 2.30436546e-03,
       2.24192094e-03, 2.03450419e-03, 1.90097899e-03, 1.66368375e-03,
       1.57771639e-03, 1.48910912e-03, 1.31379090e-03, 1.28299017e-03,
       1.19231359e-03, 1.11638623e-03, 1.06432961e-03, 9.32349434e-04,
       8.41237776e-04, 7.65862890e-04, 7.44691078e-04, 6.84083814e-04,
       6.37237959e-04, 5.50624249e-04, 5.04912629e-04, 4.49154128e-04,
      