In [1]:
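# Bryce Testing: predict the game point spread (home_pts - away_pts) from PCA-reduced game stats with a small Keras MLP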

In [3]:
from os import listdir as ls

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

In [4]:
# Read the four two-season game-stat shards and stack them row-wise into one
# frame. Row labels from the shards are kept as-is (no renumbering).
shard_paths = [
    '[2015, 2016]_game_stats.parquet.gzip',
    '[2017, 2018]_game_stats.parquet.gzip',
    '[2019, 2020]_game_stats.parquet.gzip',
    '[2021, 2022]_game_stats.parquet.gzip',
]
v1, v2, v3, v4 = (pd.read_parquet(shard_path) for shard_path in shard_paths)
season_reg = pd.concat([v1, v2, v3, v4], axis=0)

# Also write the combined frame to CSV next to the notebook.
season_reg.to_csv("season_reg.csv")

# Distinct home teams and arenas; these become the one-hot category lists.
teams = season_reg['home_team'].unique()
arenas = season_reg['arena'].unique()

In [5]:
class OneHotEncoder:
    """Turn a category label into a fixed-length one-hot integer vector.

    The position of each category in the sequence given at construction
    time is the position of its 1 in the encoded vector.

    Attributes:
        categories: The sequence passed to the constructor, stored unchanged.
        category_to_index: Dict mapping each category to its position.
        num_categories: ``len(categories)``; the length of every encoded vector.
    """

    def __init__(self, categories):
        """Record the categories and precompute the label -> position lookup.

        Args:
            categories: Sized iterable of hashable labels.
        """
        self.categories = categories
        self.num_categories = len(categories)
        self.category_to_index = dict(zip(categories, range(self.num_categories)))

    def encode(self, category):
        """Return the one-hot vector for ``category``.

        Args:
            category: A label that was present in ``categories``.

        Returns:
            1-D integer numpy array of length ``num_categories`` holding a
            single 1 at the category's position and 0 everywhere else.

        Raises:
            ValueError: If ``category`` is not one of the known categories.
        """
        # Guard clause: reject unknown labels before allocating anything.
        if category not in self.category_to_index:
            raise ValueError(f"Category '{category}' not found in the provided categories.")

        one_hot = np.zeros(self.num_categories, dtype=int)
        hot_position = self.category_to_index[category]
        one_hot[hot_position] = 1
        return one_hot
# Notebook-level encoders used by pre_process: one over the distinct home-team
# names and one over the distinct arena names taken from season_reg.
team_encoder = OneHotEncoder(categories=teams)
arena_encoder = OneHotEncoder(categories=arenas)

In [6]:
# Source table for the rosters[season][team] lookup built below;
# get_top_n_pers reads its 'season', 'team_id' and 'per' columns.
# Presumably one row per player-season (TODO confirm against the CSV).
SEASONS_PER_CSV = 'seasons_per_s.csv'
seasons_pers = pd.read_csv(SEASONS_PER_CSV)

def get_top_n_pers(seasons_pers, n, team_ids=None):
    """Collect the ``n`` largest 'per' values for every team in every season.

    Args:
        seasons_pers: DataFrame with at least the columns 'season',
            'team_id' and 'per'.
        n: Maximum number of 'per' values to keep per team and season.
        team_ids: Iterable of team identifiers to build entries for. When
            omitted, the notebook-level ``teams`` array is used, which is
            what this function always did before the parameter existed.

    Returns:
        Nested dict ``roster_pers[season][team]`` -> 1-D numpy array of that
        team's 'per' values in descending order, at most ``n`` long. A team
        with no rows in a season gets an empty array (it is still present as
        a key).
    """
    if team_ids is None:
        # Backward-compatible default: the distinct home teams of season_reg.
        team_ids = teams

    roster_pers = {}
    for season in seasons_pers['season'].unique():
        # Sort once per season instead of once per (season, team) pair; the
        # per-team filter below keeps the descending order.
        season_rows = seasons_pers[seasons_pers['season'] == season].sort_values(
            by='per', ascending=False
        )
        roster_pers[season] = {
            team: season_rows.loc[season_rows['team_id'] == team, 'per'].iloc[:n].values
            for team in team_ids
        }
    return roster_pers

# Up to ten highest 'per' values per team and season: rosters[season][team].
ROSTER_SIZE = 10
rosters = get_top_n_pers(seasons_pers, ROSTER_SIZE)

In [7]:
# Pipeline: slice one season, fit the PCA, and build model features/targets
def get_season(season_string, games=None):
    """Return the games of one season with numeric and date columns typed.

    Args:
        season_string: Season label such as '2019-20'. Characters 2-3 give
            the start year and the last two characters give the end year;
            both are prefixed with '20', so only seasons in the 2000s work.
        games: Optional DataFrame of games to slice. Defaults to the
            notebook-level ``season_reg``. Must contain 'date', 'home_team',
            'away_team' and 'arena'; every other column is converted with
            ``pd.to_numeric``.

    Returns:
        A new DataFrame (the input is not modified) containing the rows whose
        'date' falls between October 1 of the start year and April 30 of the
        end year, inclusive.
    """
    if games is None:
        games = season_reg

    cleaned_reg_season = games.copy()
    date_features = ['date']
    string_features = ['home_team', 'away_team', 'arena']
    non_numeric = set(string_features + date_features)
    numeric_features = [c for c in cleaned_reg_season.columns if c not in non_numeric]
    cleaned_reg_season[numeric_features] = cleaned_reg_season[numeric_features].apply(pd.to_numeric)
    cleaned_reg_season[date_features] = cleaned_reg_season[date_features].apply(pd.to_datetime)

    start_date = f'20{season_string[2:4]}-10'
    # A 'YYYY-MM' string is compared as the first instant of that month, so
    # the old `date <= 'YYYY-04'` bound kept April 1 only and dropped the rest
    # of April. Use an exclusive bound on May to keep the whole of April.
    end_date_exclusive = f'20{season_string[-2:]}-05'

    in_season = (
        (cleaned_reg_season["date"] >= start_date)
        & (cleaned_reg_season["date"] < end_date_exclusive)
    )
    return cleaned_reg_season[in_season]

def run_pca(season='2019-20', n_components=30):
    """Fit a PCA on the numeric game statistics of one season.

    Args:
        season: Season label passed to ``get_season`` (e.g. '2019-20').
        n_components: Number of principal components to keep. Defaults to 30,
            the value that was previously hard-coded.

    Returns:
        The fitted ``PCA`` instance. It is fitted on every column of the
        season frame except 'date', 'home_team', 'away_team' and 'arena'.
    """
    season_games = get_season(season)
    season_numeric = season_games.drop(columns=["date", "home_team", "away_team", "arena"])
    pca = PCA(n_components=n_components)
    pca.fit(season_numeric)
    return pca

def pre_process(season, pca, data_all):
    """Build the feature matrix and spread target for one season.

    Args:
        season: Season label such as '2019-20'. It is passed to
            ``get_season`` and, on the ``data_all`` path, used as the key
            into the notebook-level ``rosters`` dict.
        pca: Fitted object exposing ``transform``. Only used when
            ``data_all`` is falsy.
        data_all: Falsy -> return the PCA projection of the numeric columns.
            Truthy -> return the raw numeric columns together with the
            one-hot team/arena vectors and roster 'per' arrays.

    Returns:
        Tuple ``(features, spread)`` where ``spread`` is an ``(n_games, 1)``
        array of ``home_pts - away_pts``.

    NOTE(review): 'home_pts' and 'away_pts' stay inside the numeric columns,
    so on both paths the features are derived from the same columns the
    target is computed from. Confirm this is intended.

    NOTE(review): on the ``data_all`` path every encoded/roster column holds
    one numpy array per row, so ``.values`` yields an object array with
    nested arrays rather than a flat numeric matrix. It has to be flattened
    before being fed to a model.
    """
    season_games = get_season(season)
    categorical_columns = ["date", "home_team", "away_team", "arena"]
    # Drop the non-numeric columns; what is left feeds the PCA / raw features.
    season_numeric = season_games.drop(columns=categorical_columns)
    season_categorical = season_games[categorical_columns]
    spread = pd.DataFrame(season_numeric["home_pts"] - season_numeric["away_pts"], columns=["spread"])

    if not data_all:
        # The projection is only needed on this path, so compute it here.
        return pca.transform(season_numeric), spread.values

    # One-hot encoded teams (one vector per row).
    home_team = season_categorical['home_team'].apply(team_encoder.encode)
    away_team = season_categorical['away_team'].apply(team_encoder.encode)
    # One-hot encoded arena.
    arena = season_categorical['arena'].apply(arena_encoder.encode)
    # Top 'per' values of each side's roster for this season; None when the
    # team has no entry in rosters[season].
    home_pers = season_categorical['home_team'].map(lambda team: rosters[season].get(team))
    away_pers = season_categorical['away_team'].map(lambda team: rosters[season].get(team))

    features = pd.concat([home_pers, home_team, season_numeric, arena, away_team, away_pers], axis=1)
    return features.values, spread.values

In [8]:
# Fit the PCA on the 2019-20 season only, then project both seasons with that
# same fitted PCA so their feature columns line up.
TRAIN_SEASON = '2019-20'
EVAL_SEASON = '2020-21'

pca = run_pca(TRAIN_SEASON)

# data_all=False -> PCA components as features; pass data_all=True instead to
# get the raw stats plus encoded teams/arena/roster columns.
x_2020, y_2020 = pre_process(TRAIN_SEASON, pca, data_all=False)
x_2021, y_2021 = pre_process(EVAL_SEASON, pca, data_all=False)

x_2020.shape

(971, 30)

In [12]:
# Fully connected regressor: one feature row in, one predicted spread out.
tf.keras.backend.clear_session()
# Seed Python, NumPy and TensorFlow so weight init and training are repeatable.
tf.keras.utils.set_random_seed(42)

model = tf.keras.Sequential([
    tf.keras.layers.Dense(36, activation="relu"),
    tf.keras.layers.Dense(250, activation="relu"),
    tf.keras.layers.Dense(250, activation="relu"),
    tf.keras.layers.Dense(25, activation="relu"),
    tf.keras.layers.Dense(25, activation="relu"),
    tf.keras.layers.Dense(1, activation="linear")
])

# The target is a continuous spread, so "accuracy" (a classification metric)
# carries no information here; track mean absolute error instead.
model.compile(loss='mean_squared_error',
              optimizer="adam",
              metrics=["mean_absolute_error", "mean_squared_error"])

# Take the input width from the training features rather than hard-coding it,
# so the model follows whatever pre_process produced.
model.build(input_shape=(None, x_2020.shape[1]))

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 36)                1116      
                                                                 
 dense_1 (Dense)             (None, 250)               9250      
                                                                 
 dense_2 (Dense)             (None, 250)               62750     
                                                                 
 dense_3 (Dense)             (None, 25)                6275      
                                                                 
 dense_4 (Dense)             (None, 25)                650       
                                                                 
 dense_5 (Dense)             (None, 1)                 26        
                                                                 
Total params: 80067 (312.76 KB)
Trainable params: 80067 

In [None]:
# Train on the 2019-20 features; 30% of the rows are held out for validation.
EPOCHS = 10
VALIDATION_FRACTION = 0.3

history = model.fit(
    x=x_2020,
    y=y_2020,
    epochs=EPOCHS,
    validation_split=VALIDATION_FRACTION,
)

Epoch 1/10


In [None]:
# Compare predicted and observed spreads on the 2020-21 season.
# x_2021 / y_2021 come from the pre_process cell; the names previously used
# here (reg_2021, spread_2021) are not defined anywhere in the notebook.
predictions = pd.DataFrame(model.predict(x_2021).flatten())

predicted_values = predictions[0].values
observed_values = y_2021.flatten()

# Use one common value range so both histograms share the same bin edges and
# their bar heights can be compared directly.
value_range = (
    min(predicted_values.min(), observed_values.min()),
    max(predicted_values.max(), observed_values.max()),
)

# Plot histogram
plt.hist(predicted_values, bins=40, range=value_range, label='Predicted', alpha=0.5)
plt.hist(observed_values, bins=40, range=value_range, label='Observed', alpha=0.5)
plt.xlabel('Values')
plt.ylabel('Frequency')
plt.title('Histogram of Predicted vs Observed Values')
plt.legend()
plt.show()