In [1]:
import pandas as pd
import numpy as np
import json
import pymc as pm
from typing import Dict
import argparse

# Import standardScaler and normalize
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize

def load_and_process_team_data(filepath: str, sport: str) -> pd.DataFrame:
    print(f"Loading data from {filepath}")
    with open(filepath) as f:
        data = json.load(f)

    print("Processing team data")
    data = pd.DataFrame.from_dict(data["response"])
    data["player"] = data["player"].map(lambda x: x["firstname"] + " " + x["lastname"])
    data["game"] = data["game"].map(lambda x: x["id"])
    if sport.lower() == "nba":
        data = data[["game", "player", "points", "assists", "steals", "blocks", "totReb", "turnovers"]]
    print(f"Processed team data with {len(data)} rows")
    return data

def load_and_process_match_data(data: pd.DataFrame) -> pd.DataFrame:
    print("Sorting and grouping match data")
    data.sort_values(by=["game"], inplace=True)
    data = data.groupby("game").sum()
    data.drop(columns=["player"], inplace=True)
    print(f"Processed match data with {len(data)} rows")
    return data

def load_and_process_player_data(data: pd.DataFrame) -> Dict[str, pd.DataFrame]:
    print("Processing player data")
    players = data.groupby("player", as_index=False)
    player_data = {}
    for player, player_df in players:
        print(f"Processing data for player: {player}")
        player_df = player_df.drop(columns=["player"])
        player_df = player_df.set_index("game")
        player_df = player_df.sort_index()
        player_data[player] = player_df
    print(f"Processed data for {len(player_data)} players")
    return player_data

def make_lags(data: pd.Series, lag: int, name: str) -> pd.DataFrame:
    print(f"Creating lags for {name} with lag={lag}")
    lagged_df = pd.concat([data.shift(i) for i in range(lag + 1)], axis=1)
    lagged_df.columns = [f"{name}_lag_{i}" if i > 0 else f"{name}_current" for i in range(lag + 1)]
    print(f"Created lags for {name}, resulting in {len(lagged_df)} rows")
    return lagged_df.dropna().reset_index(drop=True)

def make_model(data: pd.DataFrame) -> pm.Model:
    print("Creating model")
   
    model = pm.Model()
    with model:
        current_columns = [col for col in data.columns if "current" in col]
        lagged_columns = [col for col in data.columns if "lag" in col]

        intercept = pm.Normal("intercept", 0.0, 1.0, shape=len(current_columns))
        coefficients = {
            current: pm.Normal(f"{current}_coeffs", 0.0, 1.0, shape=len(lagged_columns))
            for current in current_columns
        }

        mu = {
            current: pm.math.exp(intercept[i] + pm.math.dot(data[lagged_columns].values, coefficients[current]))
            for i, current in enumerate(current_columns)
        }

        for i, current in enumerate(current_columns):
            pm.Poisson(f"{current}_obs", mu=mu[current], observed=data[current].values)

    print("Model created")
    return model

def fit_model(num_samples: int, model: pm.Model):
    print(f"Fitting model with {num_samples} samples")
    with model:
        return pm.sample(draws=num_samples, tune=1000, cores=4, chains=4)

def get_posterior_predictive(model, trace):
    print(f"Getting posterior predictive with ")
    with model:
        return pm.sample_posterior_predictive(trace)


def run_pipeline_players(filepath: str):
    print(f"Running pipeline for file: {filepath}")
    sport = "nba" if "nba" in filepath.lower() else "nfl"
    team_data = load_and_process_team_data(filepath, sport)
    player_data = load_and_process_player_data(team_data)

    for player in player_data.keys():
        print(f"Processing player: {player}")
        player_df = player_data[player]

        model_data = pd.DataFrame()

        for col in player_df.columns:
            lagged_df = make_lags(player_df[col], 5, col)
            model_data = pd.concat([model_data, lagged_df], axis=1)
        if model_data.empty:
            continue
        model = make_model(model_data)
        trace = fit_model(1000, model)

        posterior_predictive = get_posterior_predictive(model, trace)

        points_pred = np.array(posterior_predictive.posterior_predictive["points_current_obs"]).reshape(-1)
        assists_pred = np.array(posterior_predictive.posterior_predictive["assists_current_obs"]).reshape(-1)
        steals_pred = np.array(posterior_predictive.posterior_predictive["steals_current_obs"]).reshape(-1)
        blocks_pred = np.array(posterior_predictive.posterior_predictive["blocks_current_obs"]).reshape(-1)
        turnovers_pred = np.array(posterior_predictive.posterior_predictive["turnovers_current_obs"]).reshape(-1)
        totReb_pred = np.array(posterior_predictive.posterior_predictive["totReb_current_obs"]).reshape(-1)

        posterior_predictive_df = pd.DataFrame({
            "points": points_pred,
            "assists": assists_pred,
            "steals": steals_pred,
            "blocks": blocks_pred,
            "turnovers": turnovers_pred,
            "totReb": totReb_pred
        })

        write_filepath = filepath[:-5]

        posterior_predictive_df.to_csv(f"{write_filepath}_{player}_posterior_predictive.csv")


run_pipeline_players("../data/nba/2024/celtics/team_stats.json")


Running pipeline for file: ../data/nba/2024/celtics/team_stats.json
Loading data from ../data/nba/2024/celtics/team_stats.json
Processing team data
Processed team data with 229 rows
Processing player data
Processing data for player: Al Horford
Processing data for player: Anton Watson
Processing data for player: Baylor Scheierman
Processing data for player: D. Skapintsev
Processing data for player: Derrick White
Processing data for player: Drew Peterson
Processing data for player: JD Davison
Processing data for player: Jaden Springer
Processing data for player: Jaylen Brown
Processing data for player: Jayson Tatum
Processing data for player: Jordan Walsh
Processing data for player: Jrue Holiday
Processing data for player: Lonnie Walker IV
Processing data for player: Luke Kornet
Processing data for player: Neemias Queta
Processing data for player: Payton Pritchard
Processing data for player: Ron Harper Jr.
Processing data for player: Sam Hauser
Processing data for player: Xavier Tillman


Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [intercept, points_current_coeffs, assists_current_coeffs, steals_current_coeffs, blocks_current_coeffs, totReb_current_coeffs, turnovers_current_coeffs]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 125 seconds.
Sampling: [assists_current_obs, blocks_current_obs, points_current_obs, steals_current_obs, totReb_current_obs, turnovers_current_obs]


Getting posterior predictive with 


Processing player: Anton Watson
Creating lags for points with lag=5
Created lags for points, resulting in 3 rows
Creating lags for assists with lag=5
Created lags for assists, resulting in 3 rows
Creating lags for steals with lag=5
Created lags for steals, resulting in 3 rows
Creating lags for blocks with lag=5
Created lags for blocks, resulting in 3 rows
Creating lags for totReb with lag=5
Created lags for totReb, resulting in 3 rows
Creating lags for turnovers with lag=5
Created lags for turnovers, resulting in 3 rows
Processing player: Baylor Scheierman
Creating lags for points with lag=5
Created lags for points, resulting in 8 rows
Creating lags for assists with lag=5
Created lags for assists, resulting in 8 rows
Creating lags for steals with lag=5
Created lags for steals, resulting in 8 rows
Creating lags for blocks with lag=5
Created lags for blocks, resulting in 8 rows
Creating lags for totReb with lag=5
Created lags for totReb, resulting in 8 rows
Creating lags for turnovers wi

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [intercept, points_current_coeffs, assists_current_coeffs, steals_current_coeffs, blocks_current_coeffs, totReb_current_coeffs, turnovers_current_coeffs]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 20 seconds.
There were 88 divergences after tuning. Increase `target_accept` or reparameterize.
Sampling: [assists_current_obs, blocks_current_obs, points_current_obs, steals_current_obs, totReb_current_obs, turnovers_current_obs]


Getting posterior predictive with 


Processing player: D. Skapintsev
Creating lags for points with lag=5
Created lags for points, resulting in 3 rows
Creating lags for assists with lag=5
Created lags for assists, resulting in 3 rows
Creating lags for steals with lag=5
Created lags for steals, resulting in 3 rows
Creating lags for blocks with lag=5
Created lags for blocks, resulting in 3 rows
Creating lags for totReb with lag=5
Created lags for totReb, resulting in 3 rows
Creating lags for turnovers with lag=5
Created lags for turnovers, resulting in 3 rows
Processing player: Derrick White
Creating lags for points with lag=5
Created lags for points, resulting in 19 rows
Creating lags for assists with lag=5
Created lags for assists, resulting in 19 rows
Creating lags for steals with lag=5
Created lags for steals, resulting in 19 rows
Creating lags for blocks with lag=5
Created lags for blocks, resulting in 19 rows
Creating lags for totReb with lag=5
Created lags for totReb, resulting in 19 rows
Creating lags for turnovers 

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [intercept, points_current_coeffs, assists_current_coeffs, steals_current_coeffs, blocks_current_coeffs, totReb_current_coeffs, turnovers_current_coeffs]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 228 seconds.
There were 320 divergences after tuning. Increase `target_accept` or reparameterize.
Sampling: [assists_current_obs, blocks_current_obs, points_current_obs, steals_current_obs, totReb_current_obs, turnovers_current_obs]


Getting posterior predictive with 


Processing player: Drew Peterson
Creating lags for points with lag=5
Created lags for points, resulting in 6 rows
Creating lags for assists with lag=5
Created lags for assists, resulting in 6 rows
Creating lags for steals with lag=5
Created lags for steals, resulting in 6 rows
Creating lags for blocks with lag=5
Created lags for blocks, resulting in 6 rows
Creating lags for totReb with lag=5
Created lags for totReb, resulting in 6 rows
Creating lags for turnovers with lag=5
Created lags for turnovers, resulting in 6 rows
Creating model
Model created
Fitting model with 1000 samples


Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [intercept, points_current_coeffs, assists_current_coeffs, steals_current_coeffs, blocks_current_coeffs, totReb_current_coeffs, turnovers_current_coeffs]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 39 seconds.
There were 418 divergences after tuning. Increase `target_accept` or reparameterize.
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details


Getting posterior predictive with 


Sampling: [assists_current_obs, blocks_current_obs, points_current_obs, steals_current_obs, totReb_current_obs, turnovers_current_obs]


Processing player: JD Davison
Creating lags for points with lag=5
Created lags for points, resulting in 8 rows
Creating lags for assists with lag=5
Created lags for assists, resulting in 8 rows
Creating lags for steals with lag=5
Created lags for steals, resulting in 8 rows
Creating lags for blocks with lag=5
Created lags for blocks, resulting in 8 rows
Creating lags for totReb with lag=5
Created lags for totReb, resulting in 8 rows
Creating lags for turnovers with lag=5
Created lags for turnovers, resulting in 8 rows
Creating model
Model created
Fitting model with 1000 samples


Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [intercept, points_current_coeffs, assists_current_coeffs, steals_current_coeffs, blocks_current_coeffs, totReb_current_coeffs, turnovers_current_coeffs]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 35 seconds.
There were 123 divergences after tuning. Increase `target_accept` or reparameterize.
Sampling: [assists_current_obs, blocks_current_obs, points_current_obs, steals_current_obs, totReb_current_obs, turnovers_current_obs]


Getting posterior predictive with 


Processing player: Jaden Springer
Creating lags for points with lag=5
Created lags for points, resulting in 7 rows
Creating lags for assists with lag=5
Created lags for assists, resulting in 7 rows
Creating lags for steals with lag=5
Created lags for steals, resulting in 7 rows
Creating lags for blocks with lag=5
Created lags for blocks, resulting in 7 rows
Creating lags for totReb with lag=5
Created lags for totReb, resulting in 7 rows
Creating lags for turnovers with lag=5
Created lags for turnovers, resulting in 7 rows
Creating model
Model created
Fitting model with 1000 samples


Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [intercept, points_current_coeffs, assists_current_coeffs, steals_current_coeffs, blocks_current_coeffs, totReb_current_coeffs, turnovers_current_coeffs]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 18 seconds.
There were 64 divergences after tuning. Increase `target_accept` or reparameterize.
Sampling: [assists_current_obs, blocks_current_obs, points_current_obs, steals_current_obs, totReb_current_obs, turnovers_current_obs]


Getting posterior predictive with 


Processing player: Jaylen Brown
Creating lags for points with lag=5
Created lags for points, resulting in 15 rows
Creating lags for assists with lag=5
Created lags for assists, resulting in 15 rows
Creating lags for steals with lag=5
Created lags for steals, resulting in 15 rows
Creating lags for blocks with lag=5
Created lags for blocks, resulting in 15 rows
Creating lags for totReb with lag=5
Created lags for totReb, resulting in 15 rows
Creating lags for turnovers with lag=5
Created lags for turnovers, resulting in 15 rows
Creating model
Model created
Fitting model with 1000 samples


Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [intercept, points_current_coeffs, assists_current_coeffs, steals_current_coeffs, blocks_current_coeffs, totReb_current_coeffs, turnovers_current_coeffs]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 236 seconds.
There were 65 divergences after tuning. Increase `target_accept` or reparameterize.
Chain 0 reached the maximum tree depth. Increase `max_treedepth`, increase `target_accept` or reparameterize.
Chain 1 reached the maximum tree depth. Increase `max_treedepth`, increase `target_accept` or reparameterize.
Chain 3 reached the maximum tree depth. Increase `max_treedepth`, increase `target_accept` or reparameterize.
Sampling: [assists_current_obs, blocks_current_obs, points_current_obs, steals_current_obs, totReb_current_obs, turnovers_current_obs]


Getting posterior predictive with 


Processing player: Jayson Tatum
Creating lags for points with lag=5
Created lags for points, resulting in 19 rows
Creating lags for assists with lag=5
Created lags for assists, resulting in 19 rows
Creating lags for steals with lag=5
Created lags for steals, resulting in 19 rows
Creating lags for blocks with lag=5
Created lags for blocks, resulting in 19 rows
Creating lags for totReb with lag=5
Created lags for totReb, resulting in 19 rows
Creating lags for turnovers with lag=5
Created lags for turnovers, resulting in 19 rows
Creating model
Model created
Fitting model with 1000 samples


Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [intercept, points_current_coeffs, assists_current_coeffs, steals_current_coeffs, blocks_current_coeffs, totReb_current_coeffs, turnovers_current_coeffs]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 248 seconds.
Chain 0 reached the maximum tree depth. Increase `max_treedepth`, increase `target_accept` or reparameterize.
Chain 1 reached the maximum tree depth. Increase `max_treedepth`, increase `target_accept` or reparameterize.
Chain 2 reached the maximum tree depth. Increase `max_treedepth`, increase `target_accept` or reparameterize.
Chain 3 reached the maximum tree depth. Increase `max_treedepth`, increase `target_accept` or reparameterize.
Sampling: [assists_current_obs, blocks_current_obs, points_current_obs, steals_current_obs, totReb_current_obs, turnovers_current_obs]


Getting posterior predictive with 


Processing player: Jordan Walsh
Creating lags for points with lag=5
Created lags for points, resulting in 17 rows
Creating lags for assists with lag=5
Created lags for assists, resulting in 17 rows
Creating lags for steals with lag=5
Created lags for steals, resulting in 17 rows
Creating lags for blocks with lag=5
Created lags for blocks, resulting in 17 rows
Creating lags for totReb with lag=5
Created lags for totReb, resulting in 17 rows
Creating lags for turnovers with lag=5
Created lags for turnovers, resulting in 17 rows
Creating model
Model created
Fitting model with 1000 samples


Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [intercept, points_current_coeffs, assists_current_coeffs, steals_current_coeffs, blocks_current_coeffs, totReb_current_coeffs, turnovers_current_coeffs]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 40 seconds.
There were 3 divergences after tuning. Increase `target_accept` or reparameterize.
Sampling: [assists_current_obs, blocks_current_obs, points_current_obs, steals_current_obs, totReb_current_obs, turnovers_current_obs]


Getting posterior predictive with 


Processing player: Jrue Holiday
Creating lags for points with lag=5
Created lags for points, resulting in 17 rows
Creating lags for assists with lag=5
Created lags for assists, resulting in 17 rows
Creating lags for steals with lag=5
Created lags for steals, resulting in 17 rows
Creating lags for blocks with lag=5
Created lags for blocks, resulting in 17 rows
Creating lags for totReb with lag=5
Created lags for totReb, resulting in 17 rows
Creating lags for turnovers with lag=5
Created lags for turnovers, resulting in 17 rows
Creating model
Model created
Fitting model with 1000 samples


Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [intercept, points_current_coeffs, assists_current_coeffs, steals_current_coeffs, blocks_current_coeffs, totReb_current_coeffs, turnovers_current_coeffs]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 135 seconds.
There were 1160 divergences after tuning. Increase `target_accept` or reparameterize.
Sampling: [assists_current_obs, blocks_current_obs, points_current_obs, steals_current_obs, totReb_current_obs, turnovers_current_obs]


Getting posterior predictive with 


Processing player: Lonnie Walker IV
Creating lags for points with lag=5
Created lags for points, resulting in 4 rows
Creating lags for assists with lag=5
Created lags for assists, resulting in 4 rows
Creating lags for steals with lag=5
Created lags for steals, resulting in 4 rows
Creating lags for blocks with lag=5
Created lags for blocks, resulting in 4 rows
Creating lags for totReb with lag=5
Created lags for totReb, resulting in 4 rows
Creating lags for turnovers with lag=5
Created lags for turnovers, resulting in 4 rows
Processing player: Luke Kornet
Creating lags for points with lag=5
Created lags for points, resulting in 17 rows
Creating lags for assists with lag=5
Created lags for assists, resulting in 17 rows
Creating lags for steals with lag=5
Created lags for steals, resulting in 17 rows
Creating lags for blocks with lag=5
Created lags for blocks, resulting in 17 rows
Creating lags for totReb with lag=5
Created lags for totReb, resulting in 17 rows
Creating lags for turnovers

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [intercept, points_current_coeffs, assists_current_coeffs, steals_current_coeffs, blocks_current_coeffs, totReb_current_coeffs, turnovers_current_coeffs]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 73 seconds.
There were 9 divergences after tuning. Increase `target_accept` or reparameterize.
Sampling: [assists_current_obs, blocks_current_obs, points_current_obs, steals_current_obs, totReb_current_obs, turnovers_current_obs]


Getting posterior predictive with 


Processing player: Neemias Queta
Creating lags for points with lag=5
Created lags for points, resulting in 19 rows
Creating lags for assists with lag=5
Created lags for assists, resulting in 19 rows
Creating lags for steals with lag=5
Created lags for steals, resulting in 19 rows
Creating lags for blocks with lag=5
Created lags for blocks, resulting in 19 rows
Creating lags for totReb with lag=5
Created lags for totReb, resulting in 19 rows
Creating lags for turnovers with lag=5
Created lags for turnovers, resulting in 19 rows
Creating model
Model created
Fitting model with 1000 samples


Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [intercept, points_current_coeffs, assists_current_coeffs, steals_current_coeffs, blocks_current_coeffs, totReb_current_coeffs, turnovers_current_coeffs]
