### Reading data

The data for this project is located in [this Hugging Face dataset](https://huggingface.co/datasets/Jensen-holm/statcast-era-pitches) that I made. It contains every single pitch from the modern Statcast era up through last year (2015-2023).

In [1]:
import polars as pl

# Parquet export of the statcast-era-pitches dataset, served through the
# Hugging Face datasets API.
STATCAST_ERA_PITCHES_URL: str = "https://huggingface.co/api/datasets/Jensen-holm/statcast-era-pitches/parquet/default/train/0.parquet"

# Load the whole parquet file eagerly into an in-memory DataFrame.
statcast_era_df: pl.DataFrame = pl.read_parquet(source=STATCAST_ERA_PITCHES_URL)

# Display three randomly chosen rows as a quick sanity check of the schema.
statcast_era_df.sample(n=3)

pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,spin_dir,spin_rate_deprecated,break_angle_deprecated,break_length_deprecated,zone,des,game_type,stand,p_throws,home_team,away_team,type,hit_location,bb_type,balls,strikes,game_year,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,outs_when_up,inning,inning_topbot,…,effective_speed,release_spin_rate,release_extension,game_pk,pitcher.1,fielder_2.1,fielder_3,fielder_4,fielder_5,fielder_6,fielder_7,fielder_8,fielder_9,release_pos_y,estimated_ba_using_speedangle,estimated_woba_using_speedangle,woba_value,woba_denom,babip_value,iso_value,launch_speed_angle,at_bat_number,pitch_number,pitch_name,home_score,away_score,bat_score,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,spin_axis,delta_home_win_exp,delta_run_exp
str,str,f32,f32,f32,str,i32,i32,str,str,i64,i64,i64,i64,f32,str,str,str,str,str,str,str,f32,str,i32,i32,i32,f32,f32,f32,f32,i32,i32,i32,i32,f32,str,…,f32,i64,f32,i32,i64,i64,i32,i32,i32,i32,i32,i32,i32,f32,f32,f32,f32,f32,f32,f32,f32,i32,i32,str,i32,i32,i32,i64,i32,i32,i32,i64,str,str,f32,f32,f32
"""FF""","""2017-05-09 00:…",95.699997,-2.39,5.93,"""Senzatela, Ant…",519203,622608,,"""foul""",,,,,8.0,"""Anthony Rizzo …","""R""","""L""","""R""","""COL""","""CHC""","""S""",,,2,0,2017,0.06,1.02,-0.11,2.13,,,,2,1.0,"""Top""",…,96.300003,2145,6.4,490574,622608,452672,448602,518934,571448,596115,435622,453568,471865,54.07,,,,,,,,3,3,"""4-Seam Fastbal…",0,0,0,0,0,0,0,0,"""Infield shift""","""Standard""",176.0,0.0,-0.022
"""SL""","""2017-04-21 00:…",85.900002,-3.36,5.77,"""Roark, Tanner""",608061,543699,,"""foul""",,,,,6.0,"""T.J. Rivera gr…","""R""","""R""","""R""","""NYM""","""WSH""","""S""",,,1,1,2017,-0.06,0.47,0.63,2.36,,,,1,7.0,"""Bot""",…,84.300003,2355,5.6,490350,543699,446653,452252,502517,543685,594694,502317,594809,547180,54.939999,,,,,,,,53,3,"""Slider""",3,3,3,3,3,3,3,3,"""Standard""","""Standard""",190.0,0.0,-0.045
"""ST""","""2022-07-22 00:…",82.099998,-2.37,5.46,"""Darvish, Yu""",621512,506433,,"""called_strike""",,,,,12.0,"""Tomas Nido str…","""R""","""R""","""R""","""NYM""","""SD""","""S""",,,0,0,2022,1.49,-0.1,0.86,2.6,,,,1,3.0,"""Bot""",…,82.099998,2754,6.6,662481,506433,543592,543333,630105,592518,682928,595777,663757,608577,53.889999,,,,,,,,22,1,"""Sweeper""",0,0,0,0,0,0,0,0,"""Standard""","""Standard""",58.0,0.0,-0.024


# Goal

I want to be able to measure a pitcher's ability to tunnel pitches in an at-bat. This will entail computing distances in 2D space for a few different metrics and combining them into one overarching tunnel score.

- Computing the distance between the horizontal and vertical movement of different pitches (high score = better?)
- Computing the distance between the release positions (x, y, and z) of different pitches (low score = better)
- Estimating where the ball would have ended up without spin, and comparing that to where other pitches would have ended up without spin (low score = better). If two pitches had very different movement but would have ended up in similar spots without spin, the pitches started out on similar trajectories but broke very differently, which is a very good thing.



In [2]:
# Columns that feed the tunnel score. Positions and movement are expressed in
# feet from the catcher's point of view.
TUNNEL_COLS: list[str] = [
    # type of pitch: FF, FC, CU, etc ...
    "pitch_type",
    # release position of the ball: horizontal, then vertical
    "release_pos_x",
    "release_pos_z",
    # pitch movement: horizontal, then vertical
    "pfx_x",
    "pfx_z",
    # position of the ball when it crossed the plate: horizontal, then vertical
    "plate_x",
    "plate_z",
]

# Keep only the pitches that have a value for every tunnel column.
statcast_era_pitches = statcast_era_df.drop_nulls(TUNNEL_COLS)

In [3]:
# Grouping by these keys gives fine-grained data on each pitcher: every row of
# the result describes one pitch type thrown by one pitcher in one at-bat.
GROUP_COLS = ["pitcher", "game_pk", "pitch_type", "at_bat_number"]

pitcher_release_clusters: pl.DataFrame = (
    # Aggregate the null-filtered frame (not the raw one) so that pitches
    # missing any tunnel metric do not leak into the statistics.
    statcast_era_pitches.group_by(GROUP_COLS).agg(
        # horizontal & vertical movement
        # (sample variance, ddof=1: undefined for groups with a single pitch)
        h_move_variance=pl.col("pfx_x").var(),
        v_move_variance=pl.col("pfx_z").var(),
        h_move_mean=pl.col("pfx_x").mean(),
        v_move_mean=pl.col("pfx_z").mean(),
        # release position
        h_release_variance=pl.col("release_pos_x").var(),
        v_release_variance=pl.col("release_pos_z").var(),
        h_release_mean=pl.col("release_pos_x").mean(),
        v_release_mean=pl.col("release_pos_z").mean(),
        # Other data we want to know about the pitch. These are summarised
        # inside the aggregation instead of being joined back from the
        # per-pitch rows: joining on the group keys matches every pitch in the
        # group and would repeat each aggregated row once per pitch.
        # NOTE(review): spin_axis is an angle; the arithmetic mean is only
        # meaningful when a group's values do not straddle the wrap-around
        # point -- confirm this is acceptable for the tunnel score.
        spin_axis=pl.col("spin_axis").mean(),
        release_spin_rate=pl.col("release_spin_rate").mean(),
        # Both teams are fixed for a given game_pk, so any row's value will do.
        home_team=pl.col("home_team").first(),
        away_team=pl.col("away_team").first(),
    )
)

# Preview three random rows, showing only the group keys and the movement
# statistics.
pitcher_release_clusters.select(
    [
        *GROUP_COLS,
        "h_move_variance",
        "v_move_variance",
        "h_move_mean",
        "v_move_mean",
    ]
).sample(n=3)

pitcher,game_pk,pitch_type,at_bat_number,h_move_variance,v_move_variance,h_move_mean,v_move_mean
i32,i32,str,i32,f32,f32,f32,f32
572362,530742,"""FF""",82,0.08405,0.02,0.605,1.38
519326,529524,"""SI""",75,0.019633,0.0309,-1.446667,0.69
543408,414852,"""FF""",46,0.01945,0.00812,-0.74,1.352
