In [2]:
import json
import os
import random
import re
import time
from datetime import datetime, timedelta
from urllib.error import HTTPError

import bs4 as bs
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import sklearn
from sklearn.compose import make_column_transformer
from sklearn.decomposition import PCA
# new import statements
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import (accuracy_score, balanced_accuracy_score,
                             confusion_matrix, precision_score, recall_score)
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (OneHotEncoder, PolynomialFeatures,
                                   StandardScaler)
from sklearn.model_selection import cross_val_predict

stats = ['mp', 'fg', 'fga', 'fg3', 'fg3a', 'ft', 'fta', 'orb',
         'drb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts', 'plus_minus']

from sklearn.metrics import log_loss

from sklearn import model_selection
from sklearn.metrics import classification_report

from unidecode import unidecode
import csv

In [3]:
# Global variables which are initialized here, but take on no relevant values until later 
raw_data = None # Raw data. Eventually should hold past 2 years of data, ending on the game date we are predicting 
predict_data = None # Data that we are predicting on. Should hold the past year of data, ending on the game date we are predicting
stats = ['mp', 'fg', 'fga', 'fg3', 'fg3a', 'ft', 'fta', 'orb', 'drb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts', 'plus_minus'] # Stats that we are predicting/using to predict 

In [4]:
def get_all_data(start_date, end_date):
    '''
    This function returns a dataframe of all data of games from start_date to end_date
    start_date: datetime object. 
    end_date: datetime object. 
    '''
    # For some reason start_date is exclusive?
    start_date = start_date - timedelta(days=31)
    dates = pd.date_range(start_date, end_date, freq='MS').strftime("%Y-%m").tolist() # get months from which to retrieve data. 
    all_dataframes = []

    for date in dates:
        path = 'data2/' + date.split('-')[0] + '/' + date + '.csv'
        try:
            # Get all dataframes for each month.
            with open(path, 'r') as f:
                all_dataframes.append(pd.read_csv(f))
        except Exception as e:
            continue

    df = pd.concat(all_dataframes)
   
    df.columns = df.columns.str.strip() # vscode extension that adds whitespace for readability in csv. this undos it. 
    for col in df.columns:
        if type(df[col]) == str:
            df[col] = df[col].str.strip()
    df.fillna(0, inplace=True)

    df['player'] = df['player'].apply(lambda x: unidecode(x)) # remove accents from names

    # Sort by dates to make sure that it's pretty :))
    df['game_date'] = pd.to_datetime(df['game_date'])
    df.sort_values(by='game_date', inplace=True)

    return df

# start_date = pd.to_datetime('2019-10-01')
# end_date = pd.to_datetime('2020-10-01')

# get_all_data(start_date, end_date)


In [5]:
def get_player_avgs(player_names, start_date, end_date):
    '''
    Gets a player's average stats from start_date to end_date.
    player_names: [player_name1, player_name2, ...]
    start_date: datetime object
    end_date: datetime object
    '''
    global stats
    global raw_data
    
    if raw_data is None:
        print('raw_data is None. Please run get_all_data() first.')
        return None

    player_data = raw_data[raw_data['player'].isin(player_names)]
    
    # compare dates so that we only get data from the timeframe we want.
    player_data = player_data[player_data['game_date'] >= start_date]
    player_data = player_data[player_data['game_date'] <= end_date]

    return player_data.groupby('player').mean()[stats]

# start_date = pd.to_datetime('2019-10-01')
# end_date = pd.to_datetime('2020-10-01')
# raw_data = get_all_data(start_date, end_date)
# a = get_player_avgs(['Luka Doncic', 'Kristaps Porzingis'], start_date, end_date)


In [6]:
def create_row_for_player_game(player: str, game_link: str, order_by: str):
    '''
    player: player name
    game: link to game. 
    order_by: Stat by which to order opponent stats. 
    returns a single row of data aggregated from the game.
    '''
    num_chosen = 5  # Number of players to choose from each team. Arbitrary, but there are 5 players per game. Ideally we would use position, but how?
    global stats
    global raw_data

    # vscode extension that adds whitespace for readability in csv. strip undos it.
    game_df = raw_data[raw_data['link'].str.strip() == game_link]
    same_team_abbr = game_df[game_df['player'].str.strip(
    ) == player]['team'].iloc[0]  # same shit
    opp_team_abbr = game_df[game_df['team'].str.strip()
                            != same_team_abbr]['team'].iloc[0]

    game_date = game_df['game_date'].iloc[0]
    # Can't include game in game calculations
    day_before_game_date = game_date - timedelta(days=1)
    year_before_game_date = game_date - timedelta(days=365)

    opp_team_roster = list(game_df[game_df['team'] == opp_team_abbr]['player'])    

    opp_team_avgs = get_player_avgs(opp_team_roster, year_before_game_date, day_before_game_date)
    
    opp_team_avgs = opp_team_avgs.nlargest(num_chosen, order_by)

    if len(opp_team_avgs) == 0:
        return
    
    game_row = []
    # This part is pretty janky. I'm sure there's a better way to do this. But.... its fast enough so I'm not gonna bother.
    # Kinda just pivots the dataframes so that they're wide instead of long. idk?
    opp_team_avg_wide = opp_team_avgs.stack().reset_index(
        drop=True).to_frame().transpose()
    new_columns = list(opp_team_avgs.columns)*num_chosen
    new_columns = ['opp' + str(((i)//16)+1) + '_avg_' + str(col)
                   for i, col in enumerate(new_columns)]
    opp_team_avg_wide.columns = new_columns
    game_row.append(opp_team_avg_wide)

    # Add player performance in this game to the avg stats
    player_df = game_df[game_df['player'].str.strip() == player]
    new_columns = list(player_df.columns)
    new_columns = ['player_' + str(col) for col in new_columns]
    player_df.columns = new_columns
    game_row.append(player_df)

    game_row = pd.concat(game_row, axis=0)
    # mean to remove all nans because we'll have 10 rows due to column inconsistency.
    
    game_row = pd.DataFrame(
        game_row.mean(numeric_only=True)).transpose()
    
    # Add some values for potential use. 
    game_row['game_date'] = game_date
    game_row['link'] = game_link
    return game_row

start_date = datetime(2021, 10, 19)
end_date = datetime(2023, 4, 9)
raw_data = get_all_data(start_date, end_date)
create_row_for_player_game('Luka Doncic', '/boxscores/202211100WAS.html', 'pts')


Unnamed: 0,opp1_avg_mp,opp1_avg_fg,opp1_avg_fga,opp1_avg_fg3,opp1_avg_fg3a,opp1_avg_ft,opp1_avg_fta,opp1_avg_orb,opp1_avg_drb,opp1_avg_ast,...,player_trb,player_ast,player_stl,player_blk,player_tov,player_pf,player_pts,player_plus_minus,game_date,link
0,32.736029,6.441176,14.088235,1.867647,5.514706,2.573529,3.5,0.970588,7.073529,3.455882,...,9.0,6.0,2.0,0.0,5.0,2.0,22.0,-6.0,2022-11-10,/boxscores/202211100WAS.html


In [7]:
def create_predict_data(player: str, game_date: datetime, order_by: str):
    '''
    This function will take in a player's name and return a dataframe of their stats and other stats which can be used to predict. 
    Predictors for now, include average stats of opponent players with x most minutes, and average stats of same team with x most minutes.
    Only uses data from the past year.
    player: player full name
    game_date: datetime
    order_by: Stat by which to order opponent stats. 
    return: dataframe with predictors and stats
    '''
    global predict_data
    global raw_data

    # Check to see if data needs to be recreated
    if predict_data is not None:
        if hasattr(predict_data, 'player') and hasattr(predict_data, 'game_date_attr') and hasattr(predict_data, 'order_by'):
            if predict_data.player == player and predict_data.game_date_attr == game_date and predict_data.order_by == order_by:
                return # Nothing needs to be done. 
            
    # 2 years because we want average stats for all players, potentially including stats from the previous year. 
    raw_start_date = game_date - timedelta(days=365*2) # Potentially we are using games from 1 yr ago, and we want players avg stats for a year before that. 
    raw_data = get_all_data(raw_start_date, game_date)

    # Get all games for this player in the past year
    start_date = game_date - timedelta(days=365)
    player_data = raw_data[(raw_data['player'] == player)]
    player_data = player_data[(player_data['game_date'] >= start_date) & (player_data['game_date'] <= game_date)]
    games = set(player_data['link'])

    game_dfs = []
    for game in games:
        game_dfs.append(create_row_for_player_game(player, game, order_by))
    games_df = pd.DataFrame(pd.concat(game_dfs, axis=0).dropna(axis=0))

    # Keep track of these flags so that data does not have to be recreated constantly by predict.
    games_df.player = player
    games_df.game_date_attr = game_date # Can't use game_date because that overrides the column
    games_df.order_by = order_by

    games_df = games_df.sort_values(by='game_date', ascending=True) # make it sexyyyyy

    predict_data = games_df

game_date = datetime(2022, 10, 20)
create_predict_data('LeBron James', game_date, 'pts')
predict_data

Unnamed: 0,opp1_avg_mp,opp1_avg_fg,opp1_avg_fga,opp1_avg_fg3,opp1_avg_fg3a,opp1_avg_ft,opp1_avg_fta,opp1_avg_orb,opp1_avg_drb,opp1_avg_ast,...,player_trb,player_ast,player_stl,player_blk,player_tov,player_pf,player_pts,player_plus_minus,game_date,link
0,33.564035,8.821053,18.694737,1.810526,5.421053,5.021053,5.715789,0.526316,3.778947,4.136842,...,2.0,5.0,2.0,0.0,5.0,4.0,25.0,-6.0,2021-10-22,/boxscores/202110220LAL.html
0,33.472454,7.416667,16.208333,1.291667,4.138889,4.361111,5.972222,0.972222,3.138889,7.388889,...,6.0,6.0,2.0,2.0,3.0,1.0,19.0,-1.0,2021-10-24,/boxscores/202110240LAL.html
0,30.789865,6.256757,14.054054,0.864865,2.864865,1.500000,1.878378,0.837838,6.054054,5.337838,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2021-10-26,/boxscores/202110260SAS.html
0,28.546377,6.956522,13.804348,1.695652,4.217391,4.391304,5.456522,0.434783,3.500000,4.847826,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2021-10-27,/boxscores/202110270OKC.html
0,31.079224,7.753425,16.219178,1.438356,3.945205,4.452055,5.506849,0.863014,1.917808,3.726027,...,3.0,8.0,3.0,1.0,7.0,1.0,26.0,9.0,2021-10-29,/boxscores/202110290LAL.html
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,33.571341,7.414634,15.926829,2.353659,6.170732,2.670732,3.012195,0.500000,2.573171,7.609756,...,10.0,12.0,1.0,1.0,3.0,0.0,38.0,9.0,2022-03-21,/boxscores/202203210CLE.html
0,35.473563,8.712644,18.816092,2.965517,7.735632,1.896552,2.574713,0.850575,3.620690,5.011494,...,9.0,5.0,0.0,1.0,4.0,1.0,39.0,-9.0,2022-03-27,/boxscores/202203270NOP.html
0,35.475287,8.701149,18.793103,2.965517,7.689655,1.873563,2.620690,0.850575,3.586207,5.000000,...,8.0,4.0,1.0,0.0,6.0,3.0,38.0,-2.0,2022-04-01,/boxscores/202204010LAL.html
0,34.591667,8.569767,19.348837,4.372093,11.383721,4.441860,4.965116,0.523256,4.709302,6.197674,...,15.0,8.0,0.0,0.0,5.0,2.0,31.0,-10.0,2022-10-18,/boxscores/202210180GSW.html


In [8]:
def add_over_under_flag(stat, over_under):
    '''
    Add or mutate stat flag column to dataframe based on stat and over/under. Helper function for predict
    game_df: Dataframe formatted by create_predict_data
    stat: Stat to add or mutate flag column for
    over_under: Value which determines whether flag is 1 or 0
    '''
    global predict_data
    predict_data[f'{stat}_flag'] = predict_data[stat].apply(lambda x: 1 if int(x) > over_under else 0)

In [9]:
def predict_helper(player: str, game_date: str, order_by: str, stat: str, over_under: int):
    '''
    Seperates training data into opponent team and same team to optimize order_by combinations.
    player: player full name. romanized no accents using unidecode
    game_date: YYYY-MM-DD
    order_by_opp: Stat by which to order players for consistency against different teams. Opponent team
    order_by_same: Stat by which to order players for consistency against same team. Same team
    stat: Stat to predict
    over_under: Value which determines whether flag is 1 or 0
    '''
    global predict_data
    global stats

    game_date = datetime.strptime(game_date, '%Y-%m-%d')
    create_predict_data(player, game_date, order_by)
    add_over_under_flag(f'player_{stat}', over_under)

    model = Pipeline([
        ("pf", PolynomialFeatures(degree=2, include_bias=False)),
        ("std", StandardScaler()),
        ("lr", LogisticRegression(fit_intercept=False, max_iter=1000)),
        ])
    
    X = predict_data.filter(regex=r'opp\d_avg_')
    Y = predict_data[f'player_{stat}_flag']

    cv_score = cross_val_score(model, X, Y, cv=5) # Accuracy is about the same as f1 due to roughly 50/50 distribution with over/unders. So scoring method is fine. hopefully
    prediction = model.fit(X, Y).predict(X)[-1]
    actual = predict_data[predict_data['game_date'] == game_date][f'player_{stat}_flag'].iloc[0]

    results = {'order_by': [order_by], 'cv_score': [cv_score.mean()], 'cv_score_std': [cv_score.std()], 'prediction': [prediction], 'actual': [actual]}

    return pd.DataFrame(results)
    

predict_helper('LeBron James', '2022-12-25', 'blk', 'pts', 27)

Unnamed: 0,order_by,cv_score,cv_score_std,prediction,actual
0,blk,0.64359,0.157259,1,1


In [10]:
def predict(player, game_date, stat, over_under):
    '''
    Predicts a stat for a given player or team on a given game date.
    player: Name of player
    game_date: Date of game. Format: 'YYYY-MM-DD'
    order_by: Stat by which to order players for consistency against different teams
    stat: Stat to predict
    over_under: Value which determines whether stat flag is 1 or 0
    '''
    global predict_data
    global stats

    # Run's predict_half once for each stat, to see whats best for each opp/same
    scores = []
    for order_by in stats:
        scores.append(predict_helper(player, game_date, order_by, stat, over_under))
    
    scores = pd.concat(scores, axis=0).reset_index(drop=True)

    # Select best order_by for opp and same
    best = scores.sort_values(by='cv_score', ascending=False).iloc[0]

    # Run predict_half again with best order_by for opp and same
    return best.to_frame().transpose()
    

# results = predict('LeBron James', '2023-5-22', 'pts', 28)



In [11]:
def random_testing():
    '''
    Randomly tests player/game/stat combinations to see how accurate predictions are. 
    '''
    global stats
    global raw_data

    # Use data only from 2022-2023 season.
    start = datetime(2022, 10, 1)
    end = datetime(2023, 6, 1)
    selection_data = get_all_data(start, end)
    raw_data = get_all_data(start - timedelta(days=365), end)

    
    while True:    
        try:                        
            # Randomly select player, game, and stat. Only good players though
            stat = random.choice(stats)
            players = random.sample(list(selection_data['player'].unique()), k = 5)
            avgs = get_player_avgs(players, start, end)
            avgs = avgs.sort_values(by=stat, ascending=False).reset_index()
            # Select best player
            player = avgs.iloc[0]['player']
            # Select random game played by this player
            game_date = random.choice(selection_data[selection_data['player'] == player]['game_date'].unique())
            game_date = np.datetime_as_string(game_date, unit='D')
            # over under is this players average stats over the past year.
            over_under = int(avgs[avgs['player'] == player][stat].iloc[0]+1)

            # Predict stat, and prediction to df
            results = predict(player, game_date, stat, over_under)

            results['player'] = player
            results['game_date'] = game_date
            results['stat'] = stat
            results['over_under'] = over_under

            results = results.reset_index(drop=True)
            new_order = ['player', 'game_date', 'stat', 'over_under', 'order_by', 'cv_score', 'cv_score_std', 'prediction', 'actual']
            results = results[new_order]

            results.to_csv('random_testing.csv', mode='a', header=False)
        except:
            pass


random_testing()


Traceback (most recent call last):
  File "/usr/lib/python3/dist-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/lib/python3/dist-packages/sklearn/pipeline.py", line 335, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/usr/lib/python3/dist-packages/sklearn/linear_model/_logistic.py", line 1372, in fit
    raise ValueError("This solver needs samples of at least 2 classes"
ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0

Traceback (most recent call last):
  File "/usr/lib/python3/dist-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/lib/python3/dist-packages/sklearn/pipeline.py", line 335, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/usr/lib/python3/dist-packages/sklearn/linear_mode