In [1]:
import pickle
import plotly.express as px
from plotly.graph_objs import Figure
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import os
import shutil


# Utility Functions

In [2]:
def create_figure(df, x_column, y_column) -> Figure:
    x_data = df[x_column]
    y_data = df[y_column]

    min_value = y_data.min()
    max_value = y_data.max()
    season_value = y_data.iloc[-1]
    
    fig = px.line(x=x_data, y=y_data, markers=True)
    fig.update_layout(
        xaxis_title=x_column,
        yaxis_title=y_column,
        title=f'{df['Name'].iloc[0]} {y_column} over time'
    )
    fig.add_hline(y=min_value, line_dash="dot", line_color="black", annotation_text=f'Min: {min_value}', annotation_position="bottom right")
    fig.add_hline(y=max_value, line_dash="dot", line_color="black", annotation_text=f'Max: {max_value}', annotation_position="top right")
    
    fig.add_annotation(text=f"End of Season Value {season_value}", x=x_data.iloc[-12], y=season_value-season_value*0.05, showarrow=False)
    
    fig.update_yaxes(nticks=20)
    return fig

In [45]:
def create_stability_animation(df, x_column, y_column, thresholds, log):
    directory = './images'
    # remove files from ./images
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:
            print('Failed to delete %s. Reason: %s' % (file_path, e))
    
    x_data = list(range(1,len(df[x_column])))
    y_data = list(df[y_column])
    x_axis = [x_data[0], x_data[-1]]
    y_axis = [min(y_data)-0.05*max(y_data), max(y_data)+0.05*max(y_data)]

    for i, (x_val, y_val) in enumerate(zip(x_data, y_data)):
        fig = go.Figure()
        fig.update_layout(
            title=f'{df["Name"].iloc[0]} {y_column} over time',
            xaxis_title='Player Game',
            yaxis_title=y_column,
            xaxis_range=x_axis,
            yaxis_range=y_axis,
            showlegend=False,
            template= "simple_white"
        )
        k = i-(thresholds['window']['size']-1)
        if k < len(log) and i >= thresholds['window']['size']-1:
            fig.add_annotation(text=f"Game: {log[k]['Game']}<br>Mean: {round(log[k]['Window Mean'],3)}<br>Std Dev: {round(log[k]['Window Std'],3)}", x=x_val, y=(y_axis[1]-y_axis[0])/2+y_axis[0], showarrow=False, align="left", xanchor="left")
            fig.add_annotation(text=f"Std Dev ROC: {round(log[k]['Std ROC'],3)}", x=x_val, y=y_axis[0]+y_axis[0]*0.1, showarrow=False, align="left", xanchor="left")
            shape_x_ref = x_data[i]
            line_thickness = 2
            line_dash = "dot"
            line_color = "red"
        elif i < thresholds['window']['size']-1:
            shape_x_ref = -1
            line_thickness = 2
            line_dash = "dot"
            line_color = "red"
        else:
            fig.add_annotation(text=f"<b>Game: {log[-1]['Game']}<br>Mean: {round(log[-1]['Window Mean'],3)}<br>Std Dev: {round(log[-1]['Window Std'],3)}</b>", x=x_data[len(log)+thresholds['window']['size']], y=(y_axis[1]-y_axis[0])/2+y_axis[0], showarrow=False, align="left", xanchor="left")
            fig.add_annotation(text=f"<b>Std Dev ROC: {round(log[-1]['Std ROC'])}</b>", x=x_data[len(log)+thresholds['window']['size']], y=y_axis[0]+y_axis[0]*0.1, showarrow=False, align="left", xanchor="left")
            shape_x_ref = x_data[len(log)+thresholds['window']['size']]
            line_thickness = 4
            line_dash = "solid"
            line_color = "green"
            fig.add_shape(type="line",
                            x0=shape_x_ref, y0=log[-1]['Window Mean'], x1=x_data[-1], y1=log[-1]['Window Mean'],  # Start and end points of the line
                            line=dict(color="black", width=1, dash='dot')  # Line style
                         )
        fig.add_trace(go.Scatter(x=x_data[:i+1], y=y_data[:i+1], mode='lines+markers',
                                 line=dict(width=2), marker=dict(size=5)))
        fig.add_trace(go.Scatter(x=[x_val], y=[y_val], mode='markers', marker=dict(color='red', size=5)))
        fig.add_shape(type='rect',
                      x0=shape_x_ref-(thresholds['window']['size']-1), y0=0, x1=shape_x_ref, y1=1,
                      xref='x', yref='paper', line=dict(color='red', width=0), fillcolor='red', opacity=0.2)
        
        fig.add_vline(x=shape_x_ref, line_dash=line_dash, line_color=line_color, line_width=line_thickness)

        fig.add_shape(type='rect',
                    x0=shape_x_ref-(thresholds['window']['size']-1)-thresholds['window']['lag'], y0=0, x1=shape_x_ref-thresholds['window']['lag'], y1=1,
                    xref='x', yref='paper', line=dict(color='red', width=0), fillcolor='blue', opacity=0.2)
        
        fig.write_image(f'./images/{i+1}.png')
    return

In [4]:
def compute_league_variance(dfs:dict):
    """
    Compute the season stat variance of each stat across the league 
    """
    stat_columns = ['BA','OBP','SLG','OPS','wOBA','K%','BB%','ISO','wRC+']
    player_values = {}
    for key, df in dfs.items():
        player_values[key] = df[stat_columns].iloc[-1].to_dict()
    
    return pd.DataFrame(player_values).T

# Stability Algorithms

In [5]:
def sliding_window_stability(dfs:dict, stat:str, thresholds:dict, player='all', log_stat='OPS') -> dict:
    """
    Works by computing the standard deviation for a given stat over a sliding window of games
    so that we can see how a player's performance stabilizes over time
    """
    window_size = thresholds['window']['size'] # games
    window_lag = thresholds['window']['lag'] # games
    lagged_std = 0
    log = []
    log_flag = False
    stability_columns = [f'Season {stat}', f'Game Threshold {stat}', f'{stat} Mean', f'{stat} Std']
    stability = pd.DataFrame(index=dfs.keys(), columns=stability_columns, dtype=float).fillna(-1)

    for player_id, df in dfs.items():
        i = 0
        stability.at[player_id,f'Season {stat}'] = df[stat].iloc[-1]
        stability.at[player_id,f'Game Threshold {stat}'] = df['G'].iloc[-1]
        stability.at[player_id,f'{stat} Mean'] = -1
        stability.at[player_id,f'{stat} Std'] = -1

        while i+window_size < len(df):
            window_stats = df.iloc[i:i+window_size][stat]
            window_std = window_stats.std()
            window_mean = window_stats.mean()
            mean_diff = abs(window_mean - stability.at[player_id,f'Season {stat}']) / stability.at[player_id,f'Season {stat}'] # % difference
            window_std_roc = abs(window_std - lagged_std)

            if player_id == player and stat == log_stat:
                log.append({
                    'Player Game': i+window_size,
                    'Window Mean': window_mean,
                    'Window Std': window_std,
                    'Mean Diff': mean_diff,
                    'Std ROC': window_std_roc
                })
                log_flag = True

            if window_std < thresholds[stat]['std'] and mean_diff < thresholds[stat]['mean'] and window_std_roc < thresholds[stat]['std_roc']:
                stability.at[player_id, f'Game Threshold {stat}'] = i+window_size
                stability.at[player_id, f'{stat} Mean'] = window_mean
                stability.at[player_id, f'{stat} Std'] = window_std
                break
            i+=1
            # if i-window_lag > 0:
            lagged_window = df.iloc[i-window_lag:i-window_lag+window_size][stat]
            lagged_std = lagged_window.std()
    if log_flag:
        return stability, log
    else:
        return stability


The above code computes stability off of when at a certain point a window gets below a standard deviation threshhold and a difference between the current value and the season mean. I worry that this is not a comprehensive way to measure stability since you could essentially get lucky by finding a temporarily stable period of time that is relatively close to the player's season value. 

Ideas to get around this:
* After you find the first window match (let's say at 65 games), compute how many "false" windows follow it (how many don't satisfy the threshold criteria). If the number of false windows is below another threshold, then keep the first window match, else move to the next one
* Create a measure of the std deviation rate of change - negative values means the value is getting more stable, then as it approaches 0 it's getting as stable as it will get

Will start by implementing the second option

# Analysis

In [6]:
player_dfs_2023 = pickle.load(open('2023_player_dfs.pkl', 'rb'))
player_dfs_2023.keys()

dict_keys([682928, 547989, 660670, 642715, 645277, 624413, 641313, 606115, 571448, 668227, 650333, 595879, 605137, 641355, 643217, 605141, 666182, 593428, 664761, 608324, 600869, 592192, 682998, 592206, 656305, 661388, 621043, 641487, 630105, 605204, 650559, 646240, 553869, 650490, 592273, 669242, 642731, 664034, 518692, 670770, 666969, 665926, 502671, 663757, 665489, 666971, 664023, 671739, 663647, 669720, 683002, 571771, 606192, 663538, 663697, 673962, 673490, 680757, 518934, 596019, 663993, 592518, 606466, 656716, 641857, 643446, 669004, 608841, 593160, 571970, 607043, 663457, 660271, 621566, 681546, 542303, 670623, 665161, 521692, 663728, 608070, 592663, 592669, 668804, 663586, 673357, 677594, 667670, 642708, 660688, 668939, 467793, 623993, 656941, 608369, 543760, 642086, 669257, 624585, 665742, 543807, 668715, 663886, 681082, 664702, 553993, 669261, 621020, 657041, 679529, 650402, 678662, 663656, 457759, 607208, 662139, 683734, 657077, 663837, 683011, 664774, 572233, 677951, 59288

In [7]:
df = compute_league_variance(player_dfs_2023)
df.describe()

Unnamed: 0,BA,OBP,SLG,OPS,wOBA,K%,BB%,ISO,wRC+
count,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0
mean,0.261774,0.334398,0.444383,0.778714,0.417922,0.207879,0.089364,0.182616,17.833764
std,0.025463,0.029555,0.060298,0.081808,0.044471,0.052094,0.029938,0.050986,3.557708
min,0.197,0.267,0.296,0.582,0.297691,0.055105,0.032759,0.05071,8.215267
25%,0.248,0.315,0.407,0.717,0.386575,0.170608,0.065123,0.153518,15.326027
50%,0.262,0.328,0.441,0.778,0.417373,0.209756,0.088123,0.178752,17.789855
75%,0.275,0.352,0.475,0.818,0.441756,0.238532,0.105008,0.20948,19.740458
max,0.354,0.416,0.654,1.066,0.559221,0.326996,0.186441,0.350101,29.137716


In [35]:
# Thresholds set by global accuracy threshold to final season value
thresholds = {
    'window':{
        'size': 20,
        'lag': 10
    },
    'BA': {
        'std': 0.015,
        'std_roc': 0.002,
        'mean': 0.05
    },
    'OBP': {
        'std': 0.02,
        'std_roc': 0.003,
        'mean': 0.05
    },
    'SLG': {
        'std': 0.04,
        'std_roc': 0.006,
        'mean': 0.05
    },
    'OPS': {
        'std': 0.05,
        'std_roc': 0.008,
        'mean': 0.05
    },
    'wOBA': {
        'std': 0.03,
        'std_roc': 0.006,
        'mean': 0.05
    },
    'K%': {
        'std': 0.03,
        'std_roc': 0.002,
        'mean': 0.05
    },
    'BB%': {
        'std': 0.03,
        'std_roc': 0.002,
        'mean': 0.1 
    },
    'ISO': {
        'std': 0.03,
        'std_roc': 0.002,
        'mean': 0.15
    },
    'wRC+': {
        'std': 1.5,
        'std_roc': 0.15,
        'mean': 0.05
    }
}

In [36]:
player_key = 595879
stability_df = pd.DataFrame()
stat_columns = ['BA','OBP','SLG','OPS','wOBA','K%','BB%','ISO','wRC+']
for stat in stat_columns:
    print(f"Computing stability for {stat}")
    if stat == 'OPS':
        stability, log = sliding_window_stability(player_dfs_2023, stat, thresholds, player=player_key, log_stat='OPS')
    else:
        stability = sliding_window_stability(player_dfs_2023, stat, thresholds)
    stability_df = pd.concat([stability_df, stability], axis=1)

Computing stability for BA
Computing stability for OBP
Computing stability for SLG
Computing stability for OPS
Computing stability for wOBA
Computing stability for K%
Computing stability for BB%
Computing stability for ISO
Computing stability for wRC+


In [45]:
stability_df['BA Error'] = ((stability_df['BA Mean'] - stability_df['Season BA']) / stability_df['Season BA']).abs()
stability_df['OBP Error'] = ((stability_df['OBP Mean'] - stability_df['Season OBP']) / stability_df['Season OBP']).abs()
stability_df['SLG Error'] = ((stability_df['SLG Mean'] - stability_df['Season SLG']) / stability_df['Season SLG']).abs() 
stability_df['OPS Error'] = ((stability_df['OPS Mean'] - stability_df['Season OPS']) / stability_df['Season OPS']).abs() 
stability_df['wOBA Error'] = ((stability_df['wOBA Mean'] - stability_df['Season wOBA']) / stability_df['Season wOBA']).abs()
stability_df['K% Error'] = ((stability_df['K% Mean'] - stability_df['Season K%']) / stability_df['Season K%']).abs() 
stability_df['BB% Error'] = ((stability_df['BB% Mean'] - stability_df['Season BB%']) / stability_df['Season BB%']).abs()
stability_df['ISO Error'] = ((stability_df['ISO Mean'] - stability_df['Season ISO']) / stability_df['Season ISO']).abs()
stability_df['wRC+ Error'] = ((stability_df['wRC+ Mean'] - stability_df['Season wRC+']) / stability_df['Season wRC+']).abs()

In [46]:
fig = create_figure(player_dfs_2023[player_key], x_column='G', y_column='OPS')
fig.add_vline(x=stability_df.loc[player_key]['Game Threshold OPS'], line_dash="dot", line_color="black", annotation_text=f'OPS Stabilized (Est: {round(stability_df.loc[player_key]['OPS Mean'],3)})', annotation_position="bottom right")
fig.show()

In [46]:
create_stability_animation(player_dfs_2023[player_key], x_column='G', y_column='OPS', thresholds=thresholds, log=log)

In [33]:
log

[{'Game': 10,
  'Window Mean': 0.3373,
  'Window Std': 0.18786404658688685,
  'Mean Diff': 0.4311973018549747,
  'Std ROC': 0.17873229489901837},
 {'Game': 11,
  'Window Mean': 0.2803,
  'Window Std': 0.07066988986359986,
  'Mean Diff': 0.527318718381113,
  'Std ROC': nan},
 {'Game': 12,
  'Window Mean': 0.26789999999999997,
  'Window Std': 0.042432822725390826,
  'Mean Diff': 0.5482293423271501,
  'Std ROC': nan},
 {'Game': 13,
  'Window Mean': 0.28459999999999996,
  'Window Std': 0.07150322448051634,
  'Mean Diff': 0.5200674536256324,
  'Std ROC': nan},
 {'Game': 14,
  'Window Mean': 0.30289999999999995,
  'Window Std': 0.09970896538314786,
  'Mean Diff': 0.4892074198988196,
  'Std ROC': nan},
 {'Game': 15,
  'Window Mean': 0.3286,
  'Window Std': 0.11456798078967012,
  'Mean Diff': 0.44586846543001685,
  'Std ROC': 0.07329606579721673},
 {'Game': 16,
  'Window Mean': 0.35540000000000005,
  'Window Std': 0.12064474939074454,
  'Mean Diff': 0.40067453625632365,
  'Std ROC': 0.04997485

In [47]:
#Create mp4 from images
!ffmpeg -framerate 10 -i ./images/%d.png -c:v libx264 -profile:v high -crf 20 -pix_fmt yuv420p output.mp4 -y

ffmpeg version 5.1.4-0+deb12u1 Copyright (c) 2000-2023 the FFmpeg developers
  built with gcc 12 (Debian 12.2.0-14)
  configuration: --prefix=/usr --extra-version=0+deb12u1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libglslang --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librist --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtheora --enable-libtwolame --enable-libvidstab --enab

In [None]:
# Plot distribution of OPS for all players in player_dfs_2023
sns.set(style="whitegrid")

ops = [df['OPS'].iloc[-1] for df in player_dfs_2023.values()]
plt.figure(figsize=(10,6))
sns.histplot(ops, kde=True, bins=100, color='blue')
plt.title('Distribution of OPS at end of season 2023')
plt.xlabel('OPS')
plt.show()
