In [43]:
import pickle
import plotly.express as px
from plotly.graph_objs import Figure
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Utility Functions

In [44]:
def create_figure(df, x_column, y_column) -> Figure:
    x_data = df[x_column]
    y_data = df[y_column]

    min_value = y_data.min()
    max_value = y_data.max()
    season_value = y_data.iloc[-1]
    
    fig = px.line(x=x_data, y=y_data, markers=True)
    fig.update_layout(
        xaxis_title=x_column,
        yaxis_title=y_column,
        title=f'{df['Name'].iloc[0]} {y_column} over time'
    )
    fig.add_hline(y=min_value, line_dash="dot", line_color="black", annotation_text=f'Min: {min_value}', annotation_position="bottom right")
    fig.add_hline(y=max_value, line_dash="dot", line_color="black", annotation_text=f'Max: {max_value}', annotation_position="top right")
    
    fig.add_annotation(text=f"End of Season Value {season_value}", x=x_data.iloc[-12], y=season_value-season_value*0.05, showarrow=False)
    
    fig.update_yaxes(nticks=20)
    return fig

In [74]:
import plotly.graph_objects as go


def create_stability_animation(df, x_column, y_column, thresholds, log):
    x_data = list(df[x_column])
    y_data = list(df[y_column])
    x_axis = [x_data[0], x_data[-1]]
    y_axis = [min(y_data)-0.05*max(y_data), max(y_data)+0.05*max(y_data)]

    for i, (x_val, y_val) in enumerate(zip(x_data, y_data)):

        fig = go.Figure()
        fig.update_layout(
            title=f'{df["Name"].iloc[0]} {y_column} over time',
            xaxis_title=x_column,
            yaxis_title=y_column,
            xaxis_range=x_axis,
            yaxis_range=y_axis
        )
        if i < len(log):
            fig.add_annotation(text=f"Game: {log[i]['Game']}<br>Mean: {round(log[i]['Window Mean'],3)}<br>Std Dev: {round(log[i]['Window Std'],3)}", x=x_val, y=0.5, showarrow=False, align="left", xanchor="left")
            fig.add_annotation(text=f"Std Dev ROC: {round(log[i]['Std ROC'],3)}", x=x_val, y=0.1, showarrow=False, align="left", xanchor="left")
            shape_x_ref = x_val
            line_thickness = 2
            line_dash = "dot"
            line_color = "red"
        else:
            fig.add_annotation(text=f"<b>Game: {log[-1]['Game']}<br>Mean: {round(log[-1]['Window Mean'],3)}<br>Std Dev: {round(log[-1]['Window Std'],3)}</b>", x=x_data[len(log)], y=0.5, showarrow=False, align="left", xanchor="left")
            fig.add_annotation(text=f"<b>Std Dev ROC: {round(log[-1]['Std ROC'])}</b>", x=x_data[len(log)], y=0.1, showarrow=False, align="left", xanchor="left")
            shape_x_ref = x_data[len(log)]
            line_thickness = 4
            line_dash = "solid"
            line_color = "green"
            fig.add_shape(type="line",
                            x0=shape_x_ref, y0=log[-1]['Window Mean'], x1=x_data[-1], y1=log[-1]['Window Mean'],  # Start and end points of the line
                            line=dict(color="black", width=1, dash='dot')  # Line style
                         )
        fig.add_trace(go.Scatter(x=x_data[:i+1], y=y_data[:i+1], mode='lines+markers',
                                 line=dict(width=2), marker=dict(size=5)))
        fig.add_trace(go.Scatter(x=[x_val], y=[y_val], mode='markers', marker=dict(color='red', size=5)))
        fig.add_shape(type='rect',
                      x0=shape_x_ref-thresholds['window']['size'], y0=0, x1=shape_x_ref, y1=1,
                      xref='x', yref='paper', line=dict(color='red', width=0), fillcolor='red', opacity=0.2)
        
        fig.add_vline(x=shape_x_ref, line_dash=line_dash, line_color=line_color, line_width=line_thickness)
        fig.add_vline(x=shape_x_ref-thresholds['window']['size'], line_dash="dot", line_color="red")

        fig.add_shape(type='rect',
                    x0=shape_x_ref-thresholds['window']['size']-thresholds['window']['lag'], y0=0, x1=shape_x_ref-thresholds['window']['lag'], y1=1,
                    xref='x', yref='paper', line=dict(color='red', width=0), fillcolor='blue', opacity=0.2)
        fig.add_vline(x=shape_x_ref-thresholds['window']['lag'], line_dash="dot", line_color="blue")
        fig.add_vline(x=shape_x_ref-thresholds['window']['lag']-thresholds['window']['size'], line_dash="dot", line_color="blue")
        
        fig.write_image(f'./images/{i}.png')

    return

In [46]:
def compute_league_variance(dfs:dict):
    """
    Compute the season stat variance of each stat across the league 
    """
    stat_columns = ['BA','OBP','SLG','OPS','wOBA','K%','BB%','ISO','wRC+']
    player_values = {}
    for key, df in dfs.items():
        player_values[key] = df[stat_columns].iloc[-1].to_dict()
    
    return pd.DataFrame(player_values).T

# Stability Algorithms

In [47]:
def sliding_window_stability(dfs:dict, stat:str, thresholds:dict, player='all', log_stat='OPS') -> dict:
    """
    Works by computing the standard deviation for a given stat over a sliding window of games
    so that we can see how a player's performance stabilizes over time
    """
    window_size = thresholds['window']['size'] # games
    window_lag = thresholds['window']['lag'] # games
    lagged_std = 0
    log = []
    log_flag = False
    stability_columns = [f'Season {stat}', f'Game Threshold {stat}', f'{stat} Mean', f'{stat} Std']
    stability = pd.DataFrame(index=dfs.keys(), columns=stability_columns, dtype=float).fillna(-1)

    for player_id, df in dfs.items():
        i = 0
        stability.at[player_id,f'Season {stat}'] = df[stat].iloc[-1]
        stability.at[player_id,f'Game Threshold {stat}'] = df['G'].iloc[-1]
        stability.at[player_id,f'{stat} Mean'] = -1
        stability.at[player_id,f'{stat} Std'] = -1

        while i+window_size < len(df):
            window_stats = df.iloc[i:i+window_size][stat]
            window_std = window_stats.std()
            window_mean = window_stats.mean()
            mean_diff = abs(window_mean - stability.at[player_id,f'Season {stat}']) / stability.at[player_id,f'Season {stat}'] # % difference
            window_std_roc = abs(window_std - lagged_std)

            if player_id == player and stat == log_stat:
                log.append({
                    'Game': i+window_size,
                    'Window Mean': window_mean,
                    'Window Std': window_std,
                    'Mean Diff': mean_diff,
                    'Std ROC': window_std_roc
                })
                log_flag = True

            if window_std < thresholds[stat]['std'] and mean_diff < thresholds[stat]['mean'] and window_std_roc < thresholds[stat]['std_roc']:
                stability.at[player_id, f'Game Threshold {stat}'] = i+window_size
                stability.at[player_id, f'{stat} Mean'] = window_mean
                stability.at[player_id, f'{stat} Std'] = window_std
                break
            i+=1
            # if i-window_lag > 0:
            lagged_window = df.iloc[i-window_lag:i-window_lag+window_size][stat]
            lagged_std = lagged_window.std()
    if log_flag:
        return stability, log
    else:
        return stability


The above code computes stability off of when at a certain point a window gets below a standard deviation threshhold and a difference between the current value and the season mean. I worry that this is not a comprehensive way to measure stability since you could essentially get lucky by finding a temporarily stable period of time that is relatively close to the player's season value. 

Ideas to get around this:
* After you find the first window match (let's say at 65 games), compute how many "false" windows follow it (how many don't satisfy the threshold criteria). If the number of false windows is below another threshold, then keep the first window match, else move to the next one
* Create a measure of the std deviation rate of change - negative values means the value is getting more stable, then as it approaches 0 it's getting as stable as it will get

Will start by implementing the second option

# Analysis

In [48]:
player_dfs_2023 = pickle.load(open('2023_player_dfs.pkl', 'rb'))
player_dfs_2023.keys()

dict_keys([682928, 547989, 660670, 642715, 645277, 624413, 641313, 606115, 571448, 668227, 650333, 595879, 605137, 641355, 643217, 605141, 666182, 593428, 664761, 608324, 600869, 592192, 682998, 592206, 656305, 661388, 621043, 641487, 630105, 605204, 650559, 646240, 553869, 650490, 592273, 669242, 642731, 664034, 518692, 670770, 666969, 665926, 502671, 663757, 665489, 666971, 664023, 671739, 663647, 669720, 683002, 571771, 606192, 663538, 663697, 673962, 673490, 680757, 518934, 596019, 663993, 592518, 606466, 656716, 641857, 643446, 669004, 608841, 593160, 571970, 607043, 663457, 660271, 621566, 681546, 542303, 670623, 665161, 521692, 663728, 608070, 592663, 592669, 668804, 663586, 673357, 677594, 667670, 642708, 660688, 668939, 467793, 623993, 656941, 608369, 543760, 642086, 669257, 624585, 665742, 543807, 668715, 663886, 681082, 664702, 553993, 669261, 621020, 657041, 679529, 650402, 678662, 663656, 457759, 607208, 662139, 683734, 657077, 663837, 683011, 664774, 572233, 677951, 59288

In [49]:
df = compute_league_variance(player_dfs_2023)
df.describe()

Unnamed: 0,BA,OBP,SLG,OPS,wOBA,K%,BB%,ISO,wRC+
count,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0
mean,0.261774,0.334398,0.444383,0.778714,0.417922,0.207879,0.089364,0.182616,17.833764
std,0.025463,0.029555,0.060298,0.081808,0.044471,0.052094,0.029938,0.050986,3.557708
min,0.197,0.267,0.296,0.582,0.297691,0.055105,0.032759,0.05071,8.215267
25%,0.248,0.315,0.407,0.717,0.386575,0.170608,0.065123,0.153518,15.326027
50%,0.262,0.328,0.441,0.778,0.417373,0.209756,0.088123,0.178752,17.789855
75%,0.275,0.352,0.475,0.818,0.441756,0.238532,0.105008,0.20948,19.740458
max,0.354,0.416,0.654,1.066,0.559221,0.326996,0.186441,0.350101,29.137716


In [50]:
# Thresholds set by global accuracy threshold to final season value
thresholds = {
    'window':{
        'size': 10,
        'lag': 5
    },
    'BA': {
        'std': 0.015,
        'std_roc': 0.002,
        'mean': 0.05
    },
    'OBP': {
        'std': 0.02,
        'std_roc': 0.003,
        'mean': 0.05
    },
    'SLG': {
        'std': 0.04,
        'std_roc': 0.006,
        'mean': 0.05
    },
    'OPS': {
        'std': 0.05,
        'std_roc': 0.008,
        'mean': 0.05
    },
    'wOBA': {
        'std': 0.03,
        'std_roc': 0.006,
        'mean': 0.05
    },
    'K%': {
        'std': 0.03,
        'std_roc': 0.002,
        'mean': 0.05
    },
    'BB%': {
        'std': 0.03,
        'std_roc': 0.002,
        'mean': 0.1 
    },
    'ISO': {
        'std': 0.03,
        'std_roc': 0.002,
        'mean': 0.15
    },
    'wRC+': {
        'std': 1.5,
        'std_roc': 0.15,
        'mean': 0.05
    }
}

In [52]:
player_key = 641355
stability_df = pd.DataFrame()
stat_columns = ['BA','OBP','SLG','OPS','wOBA','K%','BB%','ISO','wRC+']
for stat in stat_columns:
    print(f"Computing stability for {stat}")
    if stat == 'OPS':
        stability, log = sliding_window_stability(player_dfs_2023, stat, thresholds, player=player_key, log_stat='OPS')
    else:
        stability = sliding_window_stability(player_dfs_2023, stat, thresholds)
    stability_df = pd.concat([stability_df, stability], axis=1)

Computing stability for BA
Computing stability for OBP
Computing stability for SLG
Computing stability for OPS
Computing stability for wOBA
Computing stability for K%
Computing stability for BB%
Computing stability for ISO
Computing stability for wRC+


In [53]:
log

[{'Game': 10,
  'Window Mean': 0.44489999999999996,
  'Window Std': 0.21439913453390821,
  'Mean Diff': 0.495005675368899,
  'Std ROC': 0.20314648052597653},
 {'Game': 11,
  'Window Mean': 0.4902,
  'Window Std': 0.21649008188726696,
  'Mean Diff': 0.44358683314415437,
  'Std ROC': nan},
 {'Game': 12,
  'Window Mean': 0.5498000000000001,
  'Window Std': 0.1844474029213869,
  'Mean Diff': 0.37593643586833136,
  'Std ROC': nan},
 {'Game': 13,
  'Window Mean': 0.6144999999999999,
  'Window Std': 0.0936284026232306,
  'Mean Diff': 0.30249716231555057,
  'Std ROC': nan},
 {'Game': 14,
  'Window Mean': 0.6472,
  'Window Std': 0.0836471424762642,
  'Mean Diff': 0.2653802497162316,
  'Std ROC': nan},
 {'Game': 15,
  'Window Mean': 0.6704,
  'Window Std': 0.11282651382641502,
  'Mean Diff': 0.23904653802497164,
  'Std ROC': 0.10157262070749319},
 {'Game': 16,
  'Window Mean': 0.6897,
  'Window Std': 0.12584562853838913,
  'Mean Diff': 0.2171396140749149,
  'Std ROC': 0.09064445334887783},
 {'Ga

In [54]:
stability_df['BA Error'] = ((stability_df['BA Mean'] - stability_df['Season BA']) / stability_df['Season BA']).abs()
stability_df['OBP Error'] = ((stability_df['OBP Mean'] - stability_df['Season OBP']) / stability_df['Season OBP']).abs()
stability_df['SLG Error'] = ((stability_df['SLG Mean'] - stability_df['Season SLG']) / stability_df['Season SLG']).abs() 
stability_df['OPS Error'] = ((stability_df['OPS Mean'] - stability_df['Season OPS']) / stability_df['Season OPS']).abs() 
stability_df['wOBA Error'] = ((stability_df['wOBA Mean'] - stability_df['Season wOBA']) / stability_df['Season wOBA']).abs()
stability_df['K% Error'] = ((stability_df['K% Mean'] - stability_df['Season K%']) / stability_df['Season K%']).abs() 
stability_df['BB% Error'] = ((stability_df['BB% Mean'] - stability_df['Season BB%']) / stability_df['Season BB%']).abs()
stability_df['ISO Error'] = ((stability_df['ISO Mean'] - stability_df['Season ISO']) / stability_df['Season ISO']).abs()
stability_df['wRC+ Error'] = ((stability_df['wRC+ Mean'] - stability_df['Season wRC+']) / stability_df['Season wRC+']).abs()
stability_df.mean()

Season BA               0.261774
Game Threshold BA      58.105263
BA Mean                 0.263303
BA Std                  0.007441
Season OBP              0.334398
Game Threshold OBP     51.879699
OBP Mean                0.336944
OBP Std                 0.008487
Season SLG              0.444383
Game Threshold SLG     58.067669
SLG Mean                0.441796
SLG Std                 0.015891
Season OPS              0.778714
Game Threshold OPS     53.766917
OPS Mean                0.779061
OPS Std                 0.023067
Season wOBA             0.417922
Game Threshold wOBA    52.105263
wOBA Mean               0.418416
wOBA Std                0.013154
Season K%               0.207879
Game Threshold K%      60.533835
K% Mean                 0.208178
K% Std                  0.007430
Season BB%              0.089364
Game Threshold BB%     55.180451
BB% Mean                0.091786
BB% Std                 0.005848
Season ISO              0.182616
Game Threshold ISO     48.977444
ISO Mean  

In [55]:
player_key = 641355
fig = create_figure(player_dfs_2023[player_key], x_column='G', y_column='OPS')
fig.add_vline(x=stability_df.loc[player_key]['Game Threshold OPS'], line_dash="dot", line_color="black", annotation_text=f'OPS Stabilized (Est: {round(stability_df.loc[player_key]['OPS Mean'],3)})', annotation_position="bottom right")
fig.show()

In [75]:
create_stability_animation(player_dfs_2023[player_key], x_column='G', y_column='OPS', thresholds=thresholds, log=log)

In [None]:
# Plot distribution of OPS for all players in player_dfs_2023
sns.set(style="whitegrid")

ops = [df['OPS'].iloc[-1] for df in player_dfs_2023.values()]
plt.figure(figsize=(10,6))
sns.histplot(ops, kde=True, bins=100, color='blue')
plt.title('Distribution of OPS at end of season 2023')
plt.xlabel('OPS')
plt.show()
