In [None]:
import pandas as pd
import numpy as np
import json
import pymc as pm
from typing import Dict
import argparse

# Import StandardScaler and normalize
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize

def load_and_process_team_data(filepath: str, sport: str) -> pd.DataFrame:
    """Load a team-stats JSON file and flatten it to one row per player and game.

    The JSON document must have a top-level ``"response"`` entry. Each record
    in it needs a ``"player"`` mapping with ``"firstname"`` and ``"lastname"``
    and a ``"game"`` mapping with ``"id"``.

    Args:
        filepath: Path of the JSON file to read.
        sport: Sport identifier, compared case-insensitively. For ``"nba"``
            the frame is reduced to the game, player and box-score columns;
            for any other value all columns are kept.

    Returns:
        A DataFrame whose ``"player"`` column holds ``"<firstname> <lastname>"``
        and whose ``"game"`` column holds the game id.
    """
    print(f"Loading data from {filepath}")
    # JSON text is UTF-8; an explicit encoding keeps names with non-ASCII
    # characters intact regardless of the platform's default encoding.
    with open(filepath, encoding="utf-8") as f:
        payload = json.load(f)

    print("Processing team data")
    data = pd.DataFrame.from_dict(payload["response"])
    # Collapse the nested player/game mappings into plain scalar columns.
    data["player"] = data["player"].map(lambda x: x["firstname"] + " " + x["lastname"])
    data["game"] = data["game"].map(lambda x: x["id"])
    if sport.lower() == "nba":
        nba_columns = ["game", "player", "points", "assists", "steals", "blocks", "totReb", "turnovers"]
        data = data[nba_columns]
    print(f"Processed team data with {len(data)} rows")
    return data

def load_and_process_match_data(data: pd.DataFrame) -> pd.DataFrame:
    """Aggregate per-player rows into per-game totals.

    Args:
        data: Frame with a ``"game"`` column, a ``"player"`` column and the
            stat columns to be summed. It is not modified.

    Returns:
        A new DataFrame indexed by ``"game"`` (ascending) holding the sum of
        every remaining column for that game.

    Raises:
        KeyError: If ``data`` has no ``"player"`` column.
    """
    print("Sorting and grouping match data")
    # Drop the name column before summing so strings are never aggregated,
    # and let groupby order the game keys itself (it sorts them by default)
    # instead of sorting the caller's frame in place.
    totals = data.drop(columns=["player"]).groupby("game").sum()
    print(f"Processed match data with {len(totals)} rows")
    return totals

def load_and_process_player_data(data: pd.DataFrame) -> Dict[str, pd.DataFrame]:
    """Split the team frame into one game-indexed frame per player.

    Args:
        data: Frame with ``"player"`` and ``"game"`` columns plus stat columns.

    Returns:
        A dict mapping each player name to that player's rows, with the
        ``"player"`` column removed, ``"game"`` as the index, and rows ordered
        by ascending game.
    """
    print("Processing player data")
    per_player = {}
    for name, rows in data.groupby("player", as_index=False):
        print(f"Processing data for player: {name}")
        # The name is already the dict key, so the column is redundant here.
        per_player[name] = rows.drop(columns=["player"]).set_index("game").sort_index()
    print(f"Processed data for {len(per_player)} players")
    return per_player

def make_lags(data: pd.Series, lag: int, name: str) -> pd.DataFrame:
    """Build a frame of the series alongside its first ``lag`` shifted copies.

    Args:
        data: Series to lag.
        lag: Number of lagged copies to create.
        name: Prefix for the generated column names.

    Returns:
        A frame with columns ``{name}_current``, ``{name}_lag_1`` ...
        ``{name}_lag_{lag}``. Rows containing any missing value are removed
        and the index is renumbered from zero.
    """
    print(f"Creating lags for {name} with lag={lag}")
    shifted = []
    labels = []
    for step in range(lag + 1):
        shifted.append(data.shift(step))
        # Step 0 is the unshifted series; every later step is a lag.
        labels.append(f"{name}_current" if step == 0 else f"{name}_lag_{step}")
    frame = pd.concat(shifted, axis=1)
    frame.columns = labels
    # The row count is logged before incomplete rows are dropped.
    print(f"Created lags for {name}, resulting in {len(frame)} rows")
    complete_rows = frame.dropna()
    return complete_rows.reset_index(drop=True)

def make_model(data: pd.DataFrame) -> pm.Model:
    """Build a Poisson regression of each "current" column on all "lag" columns.

    Columns whose name contains ``"current"`` are treated as observed targets;
    columns whose name contains ``"lag"`` form the shared predictor matrix.
    Every target gets its own entry of a Normal(0, 1) intercept vector and its
    own Normal(0, 1) coefficient vector, and is observed through a Poisson
    likelihood whose mean is ``exp(intercept + predictors @ coefficients)``.

    Args:
        data: Frame holding the target and predictor columns.

    Returns:
        The constructed, unsampled PyMC model.
    """
    print("Creating model")

    targets = [name for name in data.columns if "current" in name]
    predictors = [name for name in data.columns if "lag" in name]
    # The predictor matrix is the same for every target.
    design = data[predictors].values

    with pm.Model() as model:
        intercept = pm.Normal("intercept", 0.0, 1.0, shape=len(targets))

        # One coefficient vector per target, registered in target order.
        weights = {}
        for target in targets:
            weights[target] = pm.Normal(f"{target}_coeffs", 0.0, 1.0, shape=len(predictors))

        # Exponentiating the linear predictor keeps the Poisson mean positive.
        rates = {}
        for position, target in enumerate(targets):
            linear = intercept[position] + pm.math.dot(design, weights[target])
            rates[target] = pm.math.exp(linear)

        for target in targets:
            pm.Poisson(f"{target}_obs", mu=rates[target], observed=data[target].values)

    print("Model created")
    return model

def fit_model(num_samples: int, model: pm.Model, *, tune: int = 1000, cores: int = 4, chains: int = 4):
    """Sample from the posterior of ``model``.

    Args:
        num_samples: Number of draws to keep per chain (passed as ``draws``).
        model: The PyMC model to sample.
        tune: Number of tuning iterations per chain.
        cores: Number of chains to run in parallel.
        chains: Number of chains to sample.

    Returns:
        Whatever ``pm.sample`` returns for these settings.
        NOTE(review): presumably an ArviZ ``InferenceData`` with current
        PyMC defaults; confirm for the installed version.
    """
    print(f"Fitting model with {num_samples} samples")
    with model:
        return pm.sample(draws=num_samples, tune=tune, cores=cores, chains=chains)

def get_posterior_predictive(model, trace):
    """Draw posterior predictive samples for ``model`` given a fitted ``trace``.

    Args:
        model: The PyMC model that produced ``trace``.
        trace: Posterior samples to condition on, as accepted by
            ``pm.sample_posterior_predictive``.

    Returns:
        The result of ``pm.sample_posterior_predictive(trace)`` evaluated in
        the context of ``model``.
    """
    # The previous message was an f-string with no placeholder that ended in
    # a dangling "with "; log a complete sentence instead.
    print("Getting posterior predictive samples")
    with model:
        return pm.sample_posterior_predictive(trace)


def run_pipeline_players(filepath: str, lag: int = 5, num_samples: int = 1000):
    """Fit a lagged Poisson model per player and write posterior predictive draws.

    For every player in the stats file, each stat column is expanded into
    current/lagged columns, a model is built and sampled, and the flattened
    posterior predictive draws are written to
    ``<filepath without extension>_<player>_posterior_predictive.csv``.
    Players with no complete lagged rows are skipped.

    Args:
        filepath: Path to the team-stats JSON file. The sport is taken to be
            ``"nba"`` when that substring occurs in the path (case-insensitive)
            and ``"nfl"`` otherwise.
        lag: Number of lagged games used as predictors for each stat.
        num_samples: Number of posterior draws per chain.

    Raises:
        KeyError: If the posterior predictive lacks any of the expected
            ``<stat>_current_obs`` variables.
    """
    import os  # local import: only needed here to split off the file extension

    print(f"Running pipeline for file: {filepath}")
    sport = "nba" if "nba" in filepath.lower() else "nfl"
    team_data = load_and_process_team_data(filepath, sport)
    player_data = load_and_process_player_data(team_data)

    # Strip the actual extension rather than assuming it is exactly the five
    # characters of ".json".
    write_filepath, _ = os.path.splitext(filepath)

    # Stats written to the CSV, in output column order.
    stats = ("points", "assists", "steals", "blocks", "turnovers", "totReb")

    for player, player_df in player_data.items():
        print(f"Processing player: {player}")

        # Build all lagged frames first and concatenate once instead of
        # growing a DataFrame column block by column block.
        lagged_frames = [make_lags(player_df[col], lag, col) for col in player_df.columns]
        if not lagged_frames:
            continue
        model_data = pd.concat(lagged_frames, axis=1)
        if model_data.empty:
            # Not enough games to form a single complete lagged row.
            continue

        model = make_model(model_data)
        trace = fit_model(num_samples, model)

        posterior_predictive = get_posterior_predictive(model, trace)
        draws = posterior_predictive.posterior_predictive

        # Flatten every observed variable's draws into one long column.
        posterior_predictive_df = pd.DataFrame({
            stat: np.array(draws[f"{stat}_current_obs"]).reshape(-1)
            for stat in stats
        })

        posterior_predictive_df.to_csv(f"{write_filepath}_{player}_posterior_predictive.csv")


# Entry point: run the per-player pipeline on the stats file at this path.
# NOTE(review): the call is not guarded by `if __name__ == "__main__":`, so it
# also executes whenever this file is imported.
run_pipeline_players("../data/nba/2023/celtics/team_stats.json")
