In [17]:
import json
import os
import random
import re
import time
from datetime import datetime, timedelta
from urllib.error import HTTPError

import bs4 as bs
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import sklearn
from sklearn.compose import make_column_transformer
from sklearn.decomposition import PCA
# new import statements
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import (accuracy_score, balanced_accuracy_score,
                             confusion_matrix, precision_score, recall_score)
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (OneHotEncoder, PolynomialFeatures,
                                   StandardScaler)

stats = ['mp', 'fg', 'fga', 'fg3', 'fg3a', 'ft', 'fta', 'orb',
         'drb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts', 'plus_minus']

from sklearn.metrics import log_loss

from sklearn import model_selection
from sklearn.metrics import classification_report
# DF which holds data required for prediction is called 'predict_data'

# How tf do I work with branches in jupyter notebook?????

In [2]:
def get_all_data(start_date, end_date):
    '''
    This function returns a dataframe of all data of games from start_date to end_date
    '''
    start = time.time()
    dates = pd.date_range(start_date, end_date, freq='MS').strftime("%Y-%B").tolist() # get months from which to retrieve data. 
    dates = [date.lower() for date in dates]
    all_dataframes = []
    missing_data = []

    for date in dates:
        path = 'data/' + date.split('-')[0] + '/' + date + '.csv'
        try:
            # Get all dataframes for each month.
            with open(path, 'r') as f:
                all_dataframes.append(pd.read_csv(f))
        except Exception as e:
            missing_data.append(date)
            continue

    if all_dataframes != []:
        df = pd.concat(all_dataframes)
    else:
        df = pd.DataFrame()
    df.columns = df.columns.str.strip() # vscode extension that adds whitespace for readability in csv. this undos it. 
    for col in df.columns:
        if type(df[col]) == str:
            df[col] = df[col].str.strip()

    df.fillna(0, inplace=True)

    return df

# get_all_data('2020-10-10', '2023-10-10')

In [3]:
def get_player_avg(player_name, start_date, end_date, data):
    '''
    Gets a player's average stats during that year (not season, because that would make my life harder). 

    player_name: Name of player
    start_date: YYYY-MM-DD
    end_date: YYYY-MM-DD
    data: Dataframe of all data/all data which may be relevant to previous func call.
    '''
    global stats
    # start = time.time()

    # if year in player_averages:
    #     if player_name in player_averages[year]:
    #         return player_averages[year][player_name]
    # else:
    #     player_averages[year] = {}

    player_data = data[data['player'] == player_name]
    
    # compare dates so that we only get data from the timeframe we want.
    player_data = player_data[player_data['game_date'] >= start_date]
    player_data = player_data[player_data['game_date'] <= end_date]

    # get average stats
    player_data = player_data[stats]
 
    # player_averages[year][player_name] = 
    return player_data.mean().to_frame().transpose().fillna(0)

# data = get_all_data('2019-01-01', '2023-12-31')
# get_player_avg('LeBron James', 2021, data)

In [4]:
def create_row_for_player_game(player, game, data, order_by):
    '''
    player: player name
    game: link to game. 
    data: raw data for all games in past (year)
    player_averages: dictionary of player averages for the past year. Reused for efficiency.
    returns a single row of data aggregated from the game. Includes stats from both teams. 
    '''
    num_chosen = 5  # Number of players to choose from each team.
    global stats

    start = time.time()

    # vscode extension that adds whitespace for readability in csv. strip undos it.
    game_df = data[data['link'].str.strip() == game]
    same_team_abbr = game_df[game_df['player'].str.strip(
    ) == player]['team'].iloc[0]  # same shit
    opp_team_abbr = game_df[game_df['team'].str.strip()
                            != same_team_abbr]['team'].iloc[0]

    game_date = game_df['game_date'].iloc[0]
    # Can't include game in game calculations
    day_before_game_date = str(datetime.strptime(
        game_date, '%Y-%m-%d') - timedelta(days=1))
    year_before_game_date = str(datetime.strptime(
        game_date, '%Y-%m-%d') - timedelta(days=365))

    opp_team_roster = list(game_df[game_df['team'] == opp_team_abbr]['player'])
    same_team_roster = list(
        game_df[game_df['team'] == same_team_abbr]['player'])

    # This section is veryyyy slow. Mostly because there are a lot of func calls.
    opp_team_avgs = []
    same_team_avgs = []
    for player_name in opp_team_roster:
        player_avg = get_player_avg(
            player_name, year_before_game_date, day_before_game_date, data)
        opp_team_avgs.append(player_avg)
    for player_name in same_team_roster:
        player_avg = get_player_avg(
            player_name, year_before_game_date, day_before_game_date, data)
        same_team_avgs.append(player_avg)
    opp_team_avgs = pd.concat(opp_team_avgs).nlargest(num_chosen, order_by)
    same_team_avgs = pd.concat(same_team_avgs).nlargest(num_chosen, order_by)

    avgs_and_player_game_stats = []

    opp_team_avg_wide = opp_team_avgs.stack().reset_index(
        drop=True).to_frame().transpose()
    new_columns = list(opp_team_avgs.columns)*num_chosen
    new_columns = ['opp' + str(((i)//16)+1) + '_avg_' + str(col)
                   for i, col in enumerate(new_columns)]
    opp_team_avg_wide.columns = new_columns
    avgs_and_player_game_stats.append(opp_team_avg_wide)

    same_team_avg_wide = same_team_avgs.stack().reset_index(
        drop=True).to_frame().transpose()
    new_columns = list(same_team_avgs.columns)*num_chosen
    new_columns = ['same_avg' + str(((i)//16)+1) + '_avg_' + str(col)
                   for i, col in enumerate(new_columns)]
    same_team_avg_wide.columns = new_columns
    avgs_and_player_game_stats.append(same_team_avg_wide)

    # Add player performance in this game to the avg stats
    player_df = game_df[game_df['player'].str.strip() == player]
    avgs_and_player_game_stats.append(player_df)
    avgs_and_player_game_stats = pd.concat(avgs_and_player_game_stats, axis=0)
    # mean to remove all nans because we'll have 10 rows due to column inconsistency.
    avgs_and_player_game_stats = pd.DataFrame(
        avgs_and_player_game_stats.mean(numeric_only=True)).transpose()
    return avgs_and_player_game_stats

# data = get_all_data('2020-10-10', '2022-12-13')
# create_row_for_player_game('LeBron James', '/boxscores/202212130LAL.html', data, 'mp')


In [5]:
def create_predict_data(player, game_date, order_by):
    '''
    This function will take in a player's name and return a dataframe of their stats and other stats which can be used to predict. 
    Predictors for now, include average stats of opponent players with x most minutes, and average stats of same team with x most minutes.
    Only uses data from the past year.
    player: player full name
    game_date: 'YYYY-MM-DD'
    order_by: Stat by which to order players for consistency against different teams. Position is also a good one but that sounds hard. 
    # return: dataframe with predictors and stats
    '''
    global predict_data
    
    # 2 years because we want average stats for all players, potentially including stats from the previous year. 
    start_date = datetime.strptime(
        game_date, '%Y-%m-%d') - pd.DateOffset(years=2)
    raw_data = get_all_data(start_date, game_date)

    # Select only data for games in which this player played.
    player_data = raw_data[(raw_data['player'] == player)]
    games = set(player_data['link'])

    game_dfs = []
    for game in games:
        game_dfs.append(create_row_for_player_game(
            player, game, raw_data, order_by))

    games_df = pd.DataFrame(pd.concat(game_dfs, axis=0).dropna(axis=0))

    # Keep track of these flags so that data does not have to be recreated constantly by predict.
    # This function takes a while to run (15sec) sooooo.
    games_df.player = player
    games_df.game_date = game_date
    games_df.order_by = order_by

    predict_data = games_df

In [6]:
def add_over_under_flag(stat, over_under):
    '''
    Add or mutate stat flag column to dataframe based on stat and over/under
    game_df: Dataframe formatted by create_predict_data
    stat: Stat to add or mutate flag column for
    over_under: Value which determines whether flag is 1 or 0
    '''
    global predict_data
    predict_data[f'{stat}_flag'] = predict_data[stat].apply(lambda x: 1 if int(x) > over_under else 0)

    flag_attribute = f'{stat}_over_under'
    predict_data.flag_attribute = over_under

In [8]:
predict_data = None

In [46]:
def predict(player, game_date, order_by, stat, over_under):
    '''
    Predicts a stat for a given player or team on a given game date.
    # data: dataframe holding training data. Keep this out of the function to reduce runtime.
    # name: Name of player or abbreviation of team
    # game_date: Date of game. Format: 'YYYY-MM-DD'
    # stat: Stat to predict
    # order_by: Stat by which to order players for consistency against different teams
    # player_or_team: 'player' or 'team'
    # returns: predicted stat for that player/team on that date. 
    '''
    global predict_data
    global stats

    # See if new creation of data is necessary. Data is stored in predict_data, which might already hold 
    flag_attribute = f'{stat}_over_under'
    # Sometimes we might just be missing over/under. Other times the whole df might be wrong.
    if predict_data is not None and hasattr(predict_data, 'player') and hasattr(predict_data, 'game_date') and hasattr(predict_data, 'order_by') and predict_data.player == player and predict_data.game_date == game_date and predict_data.order_by == order_by:
        # The dataframe is correct.
        if hasattr(predict_data, flag_attribute) and predict_data.flag_attribute == over_under:
            # Dataframe correct, and over/under is correct.
            pass
        else:
            # Dataframe correct, but over/under is incorrect.
            add_over_under_flag(stat, over_under)
    else:
        create_predict_data(
            player, game_date, order_by)
        add_over_under_flag(stat, over_under)

    # Preprocess data
    processed_data = predict_data.copy()
    processed_data = processed_data.dropna(axis=0)
    deg = 1
    PolynomialFeatures(degree=deg).fit_transform(processed_data)
    StandardScaler().fit_transform(processed_data)

    X = processed_data.filter(regex='_avg_')
    Y = processed_data[f'{stat}_flag']

    kfold = model_selection.KFold(n_splits=10, random_state=7, shuffle = True)
    model = LogisticRegression(max_iter = 3000)

    # Create dataframe with regression evaluation. 
    # Classification accuracy. Ratio of correct predictions to total predictions.
    scoring = 'accuracy'
    results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
    accuracy_mean, accuracy_std = results.mean(), results.std()

    # Log loss. 
    scoring = 'neg_log_loss'
    results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
    log_loss_mean, log_loss_std = results.mean(), results.std()

    # ROC AUC
    scoring = 'roc_auc'
    results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
    roc_auc_mean, roc_auc_std = results.mean(), results.std()

    # Classification report
    X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, Y, test_size=0.33, random_state=7)
    model.fit(X_train, Y_train)
    predictions = model.predict(X_test)
    report = classification_report(Y_test, predictions, output_dict=True)
    
    # no report temporarily
    dict_metrics = {"order_by": order_by, "accuracy_mean": [accuracy_mean], "accuracy_std": [accuracy_std], "log_loss_mean": [log_loss_mean], "log_loss_std": [log_loss_std], "roc_auc_mean": [roc_auc_mean], "roc_auc_std": [roc_auc_std]}
    return pd.DataFrame(dict_metrics)
    

results = predict('LeBron James', '2023-10-1', 'pts', 'pts', 28)
results


Unnamed: 0,order_by,accuracy_mean,accuracy_std,log_loss_mean,log_loss_std,roc_auc_mean,roc_auc_std
0,pts,0.581319,0.098497,-1.993513,0.774043,0.595383,0.122534


In [47]:
def optimize_predict(player, game_date, order_by, stat, over_under):
    '''
    Outputs evaluation metrics for different order by values. So that the user can choose the best order by value.
    '''
    global stats
    metrics = []
    for i, order_by in enumerate(stats):
        print(f'{i}/{len(stats)}')
        metrics.append(predict(player, game_date, order_by, stat, over_under))

    return pd.concat(metrics, axis=0)

metrics = optimize_predict('LeBron James', '2023-10-1', 'pts', 'pts', 28)


0/16
1/16
2/16
3/16
4/16
5/16
6/16
7/16
8/16
9/16
10/16
11/16
12/16
13/16
14/16
15/16


In [48]:
metrics

Unnamed: 0,order_by,accuracy_mean,accuracy_std,log_loss_mean,log_loss_std,roc_auc_mean,roc_auc_std
0,mp,0.603297,0.139712,-1.705094,0.68869,0.590105,0.13818
0,fg,0.545055,0.057143,-1.766203,0.793087,0.567361,0.097125
0,fga,0.597253,0.136791,-1.719095,0.452705,0.592058,0.12563
0,fg3,0.541209,0.154595,-1.930357,0.552342,0.507239,0.182234
0,fg3a,0.500549,0.092185,-2.052112,0.503284,0.483977,0.134656
0,ft,0.568132,0.108791,-1.830669,0.449322,0.577577,0.160385
0,fta,0.575824,0.159432,-1.606808,0.901006,0.63574,0.186065
0,orb,0.487363,0.133598,-2.343868,1.196131,0.504955,0.106249
0,drb,0.602747,0.140192,-1.633438,0.729476,0.5897,0.194332
0,ast,0.634615,0.123578,-1.194424,0.496001,0.671482,0.160188


In [11]:
def test_stat(order_by, year):
    '''
    Tests regression statistics for a given orderby, and all stat combinations. 
    order_by: wut u think
    year: Year to test. YYYY
    '''
    global stats
    global all_players
    global predict_data
    global cv_scores

    if os.path.exists(f'./test/{order_by}/{year}.csv'):
        cv_scores = pd.read_csv(f'./test/{order_by}/{year}.csv')
    else:
        cv_scores = pd.DataFrame(columns = ['player', 'game_date', 'order_by', 'stat', 'cross_val_score_mean', 'cross_val_score_std'], index = [0])

    raw_data = get_all_data(f'{year-2}-12-31', f'{year}-12-31')
    raw_data_this_year = raw_data[raw_data['game_date'].str.match(f'{year}')]
    # Randomly select some players/games from this year. 
    while True: # Want some actual games from this player. I want do while loop :(
        player = random.sample(list(raw_data_this_year['player'].unique()), 1)[0]
        all_player_games = raw_data_this_year[raw_data_this_year['player'] == player]
        if len(all_player_games) > 20:
            if player not in cv_scores['player'].unique():        
                break

    for game in random.sample(list(all_player_games['link']), 1):
        game_date = raw_data_this_year[raw_data_this_year['link'] == game]['game_date'].iloc[0] 
        for stat in stats:
            # Select over/under values for that player. Represented by some mean + std. 
            mean = all_player_games[stat].mean()
            std = all_player_games[stat].std()
            over_under_vals = [mean-std/2, mean, mean+std/2]
            over_under_vals = [round(x*2)/2 for x in over_under_vals] # round to nearest half
            over_under_vals = [0 if x < 0 else x for x in over_under_vals]
            for over_under in over_under_vals: 
                # Should be handled in predict but why not?
                # create_predict_data(player, game_date, order_by)
                # add_over_under_flag(stat, over_under)
                cv_score, cv_score_std = predict(player, game_date, order_by, stat, over_under)
                cv_score = pd.DataFrame({'player': [player], 'game_date': [game_date], 'order_by': [order_by], 'stat': [stat], 'over_under': [over_under], 'cross_val_score_mean': [cv_score], 'cross_val_score_std': [cv_score_std]})
                cv_scores = pd.concat([cv_scores, cv_score], axis=0)

    cv_scores = cv_scores.dropna()
    cv_scores.to_csv(f'./test/{order_by}/{year}.csv', index=False)

test_stat('mp', 2020)

KeyboardInterrupt: 

In [None]:
for order_by in stats:
    if not os.path.exists(f'./test/{order_by}'):
        os.mkdir(f'./test/{order_by}')

In [None]:
for order_by in stats:
    for i in range(20):
        try:
            print(f'Testing {order_by}, {i}')
            test_stat(order_by, 2020)
        except Exception as e:
            print(e)
            time.sleep(5)
            pass

Testing mp, 0


KeyboardInterrupt: 

In [None]:
p = PCA()
xcols = data.columns[:-20]
ycol = 'drb'

W = p.fit_transform(data[xcols])
C = p.components_

p.explained_variance_ratio_

NameError: name 'data' is not defined