In [1]:
import polars as pl
import requests

In [2]:
# Resolve the download URL for the Statcast-era pitches dataset (2015-2023)
# through the Hugging Face datasets API. The JSON response is indexed with
# [0], i.e. the first listed entry is used (presumably the first parquet
# shard of the train split -- confirm if the dataset grows to several shards).
_parquet_listing = requests.get(
    "https://huggingface.co/api/datasets/Jensen-holm/statcast-era-pitches/parquet/default/train",
    timeout=30,  # requests never times out by default; don't hang the kernel
)
# Surface HTTP failures directly instead of as an opaque error when indexing
# into an error payload.
_parquet_listing.raise_for_status()
PARQUET_URL = _parquet_listing.json()[0]

print(PARQUET_URL)

https://huggingface.co/api/datasets/Jensen-holm/statcast-era-pitches/parquet/default/train/0.parquet


In [3]:
# Read the parquet file behind PARQUET_URL into an in-memory polars DataFrame.
statcast_era_pitches: pl.DataFrame = pl.read_parquet(source=PARQUET_URL)

# Preview three randomly sampled rows; the printed table header also lists
# each column's dtype, which shows what we're working with.
print(statcast_era_pitches.sample(n=3))

shape: (3, 92)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ pitch_typ ┆ game_date ┆ release_s ┆ release_p ┆ … ┆ of_fieldi ┆ spin_axis ┆ delta_hom ┆ delta_ru │
│ e         ┆ ---       ┆ peed      ┆ os_x      ┆   ┆ ng_alignm ┆ ---       ┆ e_win_exp ┆ n_exp    │
│ ---       ┆ str       ┆ ---       ┆ ---       ┆   ┆ ent       ┆ f32       ┆ ---       ┆ ---      │
│ str       ┆           ┆ f32       ┆ f32       ┆   ┆ ---       ┆           ┆ f32       ┆ f32      │
│           ┆           ┆           ┆           ┆   ┆ str       ┆           ┆           ┆          │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ SL        ┆ 2022-08-1 ┆ 84.199997 ┆ -3.27     ┆ … ┆ Standard  ┆ 114.0     ┆ 0.0       ┆ 0.051    │
│           ┆ 6 00:00:0 ┆           ┆           ┆   ┆           ┆           ┆           ┆          │
│           ┆ 0.0000000 ┆           ┆           ┆   ┆           ┆           

In [4]:
# Drop every pitch that is missing any release-position coordinate (x, y, z)
# or its pitch type. All three axes must be listed: the release-point
# aggregation further down reads release_pos_x, release_pos_y and
# release_pos_z, and groups by pitch_type.
statcast_era_pitches = statcast_era_pitches.drop_nulls(
    subset=[
        "release_pos_x",
        "release_pos_y",
        "release_pos_z",
        "pitch_type",
    ]
)

## Clustering pitch release positions in 3D Space

In [11]:
# Axes of the release position; each maps to a `release_pos_<axis>` column.
locations = ["x", "y", "z"]

# Names of the per-group statistics, in the order they are pivoted:
# all variances first, then all means.
pivot_values = (
    [f"{loc}_variance" for loc in locations] +
    [f"{loc}_mean" for loc in locations]
)

# Aggregation expressions keyed by output column name. They are generated
# from `locations` so they always match `pivot_values`. `.var()` is used
# directly rather than `.std() ** 2`, which avoids a sqrt-then-square round
# trip and its extra floating-point rounding.
release_stat_exprs = {
    **{
        f"{loc}_variance": pl.col(f"release_pos_{loc}").var()
        for loc in locations
    },
    **{
        f"{loc}_mean": pl.col(f"release_pos_{loc}").mean()
        for loc in locations
    },
}

# Per (pitcher, game, pitch type): variance and mean of the release point on
# each axis, then spread into one column per (statistic, pitch type) pair.
pitcher_release_clusters: pl.DataFrame = (
    statcast_era_pitches.group_by("pitcher", "game_pk", "pitch_type")
    .agg(**release_stat_exprs)
    .pivot(
        values=pivot_values,
        # NOTE(review): "pitch_type" is both an index column and the pivot
        # column, so the result keeps one row per (pitcher, game, pitch type)
        # and only that pitch type's columns are non-null in each row. If one
        # row per (pitcher, game) is intended, drop "pitch_type" from `index`.
        # Left unchanged here because it alters the frame's schema -- confirm
        # against the cells that consume this frame.
        index=("pitcher", "pitch_type", "game_pk"),
        columns="pitch_type",
        aggregate_function=None,
    )
)

pitcher_release_clusters.glimpse()

Rows: 525432
Columns: 117
$ pitcher                  <i32> 595234, 570632, 456696, 431148, 475115, 593974, 571561, 592135, 608349, 542882
$ pitch_type               <str> 'CU', 'SI', 'FF', 'SI', 'SL', 'SL', 'FF', 'SL', 'KC', 'FF'
$ game_pk                  <i32> 490827, 415934, 531027, 413684, 414914, 530094, 491523, 632569, 490882, 530487
$ x_variance_pitch_type_CU <f32> 0.0035904771648347378, None, None, None, None, None, None, None, None, None
$ x_variance_pitch_type_SI <f32> None, None, None, 0.015708647668361664, None, None, None, None, None, None
$ x_variance_pitch_type_FF <f32> None, None, 0.03644999489188194, None, None, None, 0.00561470678076148, None, None, 0.005990439094603062
$ x_variance_pitch_type_SL <f32> None, None, None, None, 0.03432677313685417, 0.0008333316654898226, None, 0.007091660518199205, None, None
$ x_variance_pitch_type_KC <f32> None, None, None, None, None, None, None, None, 0.01159048080444336, None
$ x_variance_pitch_type_FS <f32> None, None, None, None,