In [1]:
import textwrap

import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE

def load_data(data_path='https://raw.githubusercontent.com/clkruse/clkruse.github.io/master/10-projects/baseball-explorer/fangraphs-batter-stats-simple.csv'):
    """Read the batter stats CSV and turn percentage strings into fractions.

    Args:
        data_path: Anything ``pd.read_csv`` accepts (URL, local path, or
            file-like object). Defaults to the CSV URL used by this notebook.

    Returns:
        A DataFrame in which every text column that contains a '%' sign and
        parses cleanly as numbers once the sign is stripped has been replaced
        by floats divided by 100 (e.g. '12.5%' -> 0.125). Every other column
        is returned exactly as ``pd.read_csv`` produced it.
    """
    df = pd.read_csv(data_path)
    for c in df.columns:
        col = df[c]
        # Only text columns can hold '12.5%'-style values.
        if not (pd.api.types.is_object_dtype(col) or pd.api.types.is_string_dtype(col)):
            continue
        try:
            # Skip text columns without a '%' so that plain labels (or numeric-looking
            # text) are never silently divided by 100.
            if not col.str.contains('%', regex=False, na=False).any():
                continue
            converted = col.str.replace('%', '', regex=False).astype(float) / 100
        except (AttributeError, ValueError, TypeError):
            # Column mixes '%' with non-numeric text (or is not really text): leave it as is.
            continue
        df[c] = converted
    return df

def format_figure(fig, size=1000):
    """Apply this notebook's shared scatter styling to ``fig`` and return it.

    Args:
        fig: Figure object exposing ``update_traces``, ``update_layout``,
            ``update_xaxes`` and ``update_yaxes`` (e.g. a plotly figure).
        size: Height and width of the square figure, passed to the layout.

    Returns:
        The same ``fig`` object, after the styling calls have been applied.
    """
    grid_color = '#dddddd'
    # Identical settings for both axes: hide tick labels and titles, and draw
    # the axis line and zero line in the grid colour.
    axis_style = dict(
        showticklabels=False,
        title_text='',
        showline=True,
        linewidth=1,
        linecolor=grid_color,
        zeroline=True,
        zerolinewidth=1,
        zerolinecolor=grid_color,
    )

    # small markers
    fig.update_traces(marker=dict(size=3))
    # square canvas, no legend, slim margins, background and grid colours
    fig.update_layout(
        height=size,
        width=size,
        showlegend=False,
        margin=dict(l=30, r=20, t=60, b=20),
        plot_bgcolor='#ebebeb',
        paper_bgcolor='#fafafa',
        xaxis_gridcolor=grid_color,
        yaxis_gridcolor=grid_color,
    )
    fig.update_xaxes(**axis_style)
    fig.update_yaxes(**axis_style)
    return fig

In [None]:
# Load the batter stats table; load_data() reads the CSV from its hard-coded URL,
# so this cell needs network access.
data = load_data()

In [None]:
# Split the table: identifying / non-feature columns are set aside, everything
# else becomes the feature matrix used for the embedding and clustering.
non_feature_columns = ['Name', 'Team', 'playerid', 'Dol']
stats = data.drop(columns=non_feature_columns)
names, teams = data['Name'], data['Team']

In [None]:
# fill any NaN values with 0
stats = stats.fillna(0)
# normalize stats such that each column has a mean of 0 and a standard deviation of 1.
# A constant column has a standard deviation of 0; dividing by it would yield NaN,
# which TSNE and KMeans reject. Such columns are divided by 1 instead, leaving them all 0.
stats_std = stats.std().replace(0, 1)
stats = (stats - stats.mean()) / stats_std

In [None]:
# Project the normalized stats into two dimensions with t-SNE (seeded for repeatability)
tsne = TSNE(n_components=2, random_state=0)
tsne_embeddings = tsne.fit_transform(stats.to_numpy())

In [None]:

# Group the players with k-means. The model is fit on the normalized stats,
# not on the 2-D t-SNE coordinates.
n_clusters = 8
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init='auto').fit(stats)
clusters = kmeans.labels_

In [None]:

# Assemble the table that is plotted: one row per player, positioned by its t-SNE coordinates.
plotly_df = pd.DataFrame(tsne_embeddings, columns=['x', 'y'])
# Hover title: the player name cut to 100 characters, wrapped at 50 characters, with
# newlines turned into <br>. textwrap is imported in the notebook's top import cell;
# without that import this line raises NameError on a fresh kernel.
plotly_df['text'] = [textwrap.fill(str(f)[:100], width=50).replace('\n', '<br>') for f in names]
plotly_df['cluster'] = [str(c) for c in clusters]
plotly_df['WAR'] = data['WAR']
plotly_df['Dollars'] = data['Dol']

# Scatter of the embedding coloured by WAR; the colour range is padded by 1 on each side.
# (To colour by k-means cluster instead, pass color='cluster' and drop range_color.)
fig = px.scatter(
    plotly_df,
    x='x', y='y',
    range_color=[data['WAR'].min() - 1, data['WAR'].max() + 1],
    color='WAR',
    hover_name='text',
    hover_data={'text': False, 'Dollars': True, 'cluster': True, 'x': False, 'y': False},
)

# Styling is applied inline rather than through format_figure() because that helper
# also shrinks the markers to size 3, which this plot does not do.
size = 1000
grid_color = '#dddddd'
axis_style = dict(
    showticklabels=False,   # turn off axis labels
    title_text='',          # turn off axis titles
    showline=True, linewidth=1, linecolor=grid_color,
    zeroline=True, zerolinewidth=1, zerolinecolor=grid_color,
)
fig.update_layout(
    height=size, width=size,
    showlegend=False,
    margin=dict(l=30, r=20, t=60, b=20),
    plot_bgcolor='#ebebeb', paper_bgcolor='#fafafa',
    xaxis_gridcolor=grid_color, yaxis_gridcolor=grid_color,
)
fig.update_xaxes(**axis_style)
fig.update_yaxes(**axis_style)

# save the figure as an html file
fig.write_html('./hitter-tsne.html', include_plotlyjs='cdn')
fig.show()

In [None]:
# Fetch data through pybaseball's statcast() for the date range 2023-06-01 to 2023-06-30.
# The result is kept as statcast_data and used by the cells below.
from pybaseball import statcast
statcast_data = statcast(start_dt="2023-06-01", end_dt="2023-06-30")

In [None]:
# Columns of statcast_data used as features for the pitch embedding.
# Currently left out: release_pos_x / release_pos_y / release_pos_z, zone,
# plate_x, plate_z and release_extension.
pitching_column_names = [
    'release_speed',
    'pfx_x', 'pfx_z',
    'vx0', 'vy0', 'vz0',
    'ax', 'ay', 'az',
    'release_spin_rate',
    'spin_axis',
]


In [None]:
# Keep only the feature columns and replace missing values with 0
pitch_data = statcast_data.loc[:, pitching_column_names].fillna(0)
pitch_data

In [None]:
# z-score every feature column (mean 0, standard deviation 1).
# A constant column has a standard deviation of 0; dividing by it would yield NaN,
# which TSNE and KMeans reject. Such columns are divided by 1 instead, leaving them all 0.
pitch_data_std = pitch_data.std().replace(0, 1)
pitch_data_norm = (pitch_data - pitch_data.mean()) / pitch_data_std


In [None]:
# Embed the normalized pitch features in two dimensions with t-SNE (seeded)
tsne = TSNE(n_components=2, random_state=0)
pitch_embeddings = tsne.fit_transform(pitch_data_norm.to_numpy())
# k-means on the normalized pitch features (not on the t-SNE coordinates)
n_clusters = 8
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init='auto').fit(pitch_data_norm)
clusters = kmeans.labels_

In [None]:
# Preview, for the first row only, the fields that create_video_url reads when it builds a link
statcast_data.iloc[0][['game_date', 'inning', 'pitcher', 'inning_topbot', 'balls', 'strikes', 'outs_when_up']].to_dict()

In [None]:
from urllib.parse import quote

def create_video_url(row):
    """Build an MLB video-search URL for one row of pitch data.

    Args:
        row: Mapping-like row (e.g. a pandas Series) providing 'game_date',
            'inning', 'pitcher', 'inning_topbot', 'balls', 'strikes' and
            'outs_when_up'. 'game_date' may be a Timestamp, a date/datetime,
            or a date string.

    Returns:
        The search URL as a string: the base URL followed by the percent-encoded
        filters joined with '+AND+' and the '+Order+By+Timestamp+DESC' suffix.

    Raises:
        KeyError: if any of the required fields is missing from ``row``.
    """
    base_url = "https://www.mlb.com/video/?utm=mlbfilmroom-redirect&q="

    # pd.Timestamp accepts Timestamps, date/datetime objects and date strings alike,
    # so every supported input ends up formatted as YYYY-MM-DD.
    game_date = pd.Timestamp(row['game_date']).strftime('%Y-%m-%d')

    # Map the half-inning label onto the search values. Simply appending "TOM" to the
    # upper-cased label turned 'Top' into the invalid 'TOPTOM'.
    half = str(row['inning_topbot']).strip().upper()
    top_bottom = {'TOP': 'TOP', 'BOT': 'BOTTOM', 'BOTTOM': 'BOTTOM'}.get(half, half)

    # %3D is '=', %5B / %5D are '[' / ']', %22 is a double quote.
    query_params = [
        f"Date%3D%5B%22{game_date}%22%5D",
        f"Inning%3D%5B{row['inning']}%5D",
        f"PitcherId%3D%5B{row['pitcher']}%5D",
        f"TopBottom%3D%5B%22{top_bottom}%22%5D",
        f"Balls%3D%5B{row['balls']}%5D",
        f"Strikes%3D%5B{row['strikes']}%5D",
        f"Outs%3D%5B{row['outs_when_up']}%5D",
    ]

    query = base_url + "+AND+".join(query_params) + "+Order+By+Timestamp+DESC"
    # the query must not contain literal spaces
    return query.replace(' ', '')

In [None]:
# Build the video-search URL for every row, then store it wrapped in an anchor
# tag so it renders as a clickable "Video" link
video_urls = statcast_data.apply(create_video_url, axis=1)
statcast_data['url'] = video_urls.apply(lambda url: f'<a href="{url}" target="_blank">Video</a>')

In [None]:
import webbrowser

# Attach the 2-D t-SNE coordinates and the k-means labels to the pitch table
statcast_data['x'], statcast_data['y'] = pitch_embeddings[:, 0], pitch_embeddings[:, 1]
statcast_data['cluster'] = list(map(str, clusters))

# Points are coloured by this column; it is cast to float first and the colour
# range is padded by 1 on either side of its min/max
color_field = 'release_spin_rate'
statcast_data[color_field] = statcast_data[color_field].astype(float)
color_values = statcast_data[color_field]

# Columns shown in the hover box; the raw x/y coordinates are hidden
hover_columns = [
    'cluster',
    'pitch_name',
    'release_speed',
    'release_spin_rate',
    'zone',
    'p_throws',
    'description',
    'home_team',
]
hover_data = {column: True for column in hover_columns}
hover_data.update({'x': False, 'y': False})

fig = px.scatter(
    statcast_data,
    x='x',
    y='y',
    range_color=[color_values.min() - 1, color_values.max() + 1],
    color=color_field,
    hover_name='player_name',
    hover_data=hover_data,
)
fig = format_figure(fig)
# Export a standalone HTML copy (plotly.js loaded from the CDN), then display inline
fig.write_html('./pitch-only-stats-tsne.html', include_plotlyjs='cdn')
fig.show()