# Tunnel Score Exploratory Data Analysis

Jensen Holm
Sep. 2024

In [1]:
import pybaseball
import polars as pl
import datetime
from utils import euclidean_distance

# Query window: fixed start date, open-ended through the day the cell is run.
# NOTE: because the end bound is "today", re-running on a later date pulls more rows.
query_start = "2024-03-01"
query_end = datetime.datetime.today().strftime('%Y-%m-%d')

# pybaseball hands back a pandas frame; convert it to polars for the rest of the notebook.
orig_df = pl.from_pandas(
    pybaseball.statcast(start_dt=query_start, end_dt=query_end)
)

orig_df.head(3)

This is a large query, it may take a moment to complete
Skipping offseason dates


100%|██████████| 173/173 [00:57<00:00,  3.01it/s]
  final_data = pd.concat(dataframe_list, axis=0).convert_dtypes(convert_string=False)


pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,spin_dir,spin_rate_deprecated,break_angle_deprecated,break_length_deprecated,zone,des,game_type,stand,p_throws,home_team,away_team,type,hit_location,bb_type,balls,strikes,game_year,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,outs_when_up,inning,inning_topbot,…,release_extension,game_pk,pitcher.1,fielder_2.1,fielder_3,fielder_4,fielder_5,fielder_6,fielder_7,fielder_8,fielder_9,release_pos_y,estimated_ba_using_speedangle,estimated_woba_using_speedangle,woba_value,woba_denom,babip_value,iso_value,launch_speed_angle,at_bat_number,pitch_number,pitch_name,home_score,away_score,bat_score,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,spin_axis,delta_home_win_exp,delta_run_exp,bat_speed,swing_length
str,datetime[ns],f64,f64,f64,str,i64,i64,str,str,i64,i64,i64,i64,i64,str,str,str,str,str,str,str,i64,str,i64,i64,i64,f64,f64,f64,f64,i64,i64,i64,i64,i64,str,…,f64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,f64,f64,f64,f64,i64,i64,i64,i64,i64,i64,str,i64,i64,i64,i64,i64,i64,i64,i64,str,str,i64,f64,f64,f64,f64
"""FF""",2024-09-01 00:00:00,87.8,-3.46,5.87,"""Floro, Dylan""",681624,571670,"""strikeout""","""called_strike""",,,,,3,"""Andy Pages called out on strik…","""R""","""R""","""R""","""AZ""","""LAD""","""S""",2.0,,1,2,2024,-0.63,0.88,0.72,3.34,,605131,,2,9,"""Top""",…,6.6,747157,571670,645444,656976,641645,553993,621028,678489,664983,682998,53.88,,0.0,0.0,1.0,0.0,0.0,,81,4,"""4-Seam Fastball""",14,3,3,14,3,14,3,14,"""Standard""","""Standard""",234,0.0,-0.232,,
"""SI""",2024-09-01 00:00:00,90.0,-3.07,5.91,"""Floro, Dylan""",681624,571670,,"""ball""",,,,,12,"""Andy Pages called out on strik…","""R""","""R""","""R""","""AZ""","""LAD""","""B""",,,0,2,2024,-1.41,0.45,0.79,3.78,,605131,,2,9,"""Top""",…,6.6,747157,571670,645444,656976,641645,553993,621028,678489,664983,682998,53.87,,,,,,,,81,3,"""Sinker""",14,3,3,14,3,14,3,14,"""Standard""","""Standard""",228,0.0,0.019,,
"""FF""",2024-09-01 00:00:00,87.1,-3.57,5.8,"""Floro, Dylan""",681624,571670,,"""foul""",,,,,5,"""Andy Pages called out on strik…","""R""","""R""","""R""","""AZ""","""LAD""","""S""",,,0,1,2024,-0.5,1.26,0.0,2.89,,605131,,2,9,"""Top""",…,6.5,747157,571670,645444,656976,641645,553993,621028,678489,664983,682998,54.01,,,,,,,,81,2,"""4-Seam Fastball""",14,3,3,14,3,14,3,14,"""Standard""","""Standard""",229,0.0,-0.076,70.02346,7.03039


## Tying pitches to their previous ones

Pairing each pitch with the one thrown immediately before it (by the same pitcher, in the same at-bat) lets us compare the two and calculate a tunnel score.

In [3]:
# First sort all the pitches. Sorting in descending order means that, inside an
# at-bat, every pitch is immediately followed by the pitch thrown just before it.

SORT_COLS = [
    "game_date",
    "pitcher",
    "at_bat_number",
    "pitch_number",
]

shifted_df = orig_df.sort(SORT_COLS, descending=True)

# Window used to look up the previous pitch. `game_pk` must be part of it:
# at_bat_number is a per-game counter, so without the game id a pitcher's
# at-bat N from one game shares a window with at-bat N from every other game,
# and the first pitch of an at-bat would be paired with a pitch from a
# different game instead of getting a null.
OVER_COLS = ["game_pk", "pitcher", "at_bat_number"]

# Shift every original column up by one row within its window so each row also
# carries the previous pitch's data in `prev_<column>`. All shifts are built in
# a single with_columns call rather than one call per column.
shifted_df = shifted_df.with_columns(
    [
        pl.col(col_name)
        .shift(-1)
        .over(OVER_COLS)
        .alias(f"prev_{col_name}")
        for col_name in orig_df.columns
    ]
)

shifted_df.select(SORT_COLS).head()

game_date,pitcher,at_bat_number,pitch_number
datetime[ns],i64,i64,i64
2024-09-01 00:00:00,702674,46,1
2024-09-01 00:00:00,702674,45,6
2024-09-01 00:00:00,702674,45,5
2024-09-01 00:00:00,702674,45,4
2024-09-01 00:00:00,702674,45,3


## Adding Tunnel Score Columns


In [5]:
# Step 1: where would each pitch (and the one before it) have crossed the plate
# with its movement removed? That is plate location minus pfx on each axis.
no_move_exprs = {
    "plate_x_no_move": pl.col("plate_x") - pl.col("pfx_x"),
    "plate_z_no_move": pl.col("plate_z") - pl.col("pfx_z"),
    "prev_plate_x_no_move": pl.col("prev_plate_x") - pl.col("prev_pfx_x"),
    "prev_plate_z_no_move": pl.col("prev_plate_z") - pl.col("prev_pfx_z"),
}

tunnel_distance_df: pl.DataFrame = shifted_df.with_columns(**no_move_exprs)

# Step 2: three pitch-vs-previous-pitch distances. Each entry maps the output
# column to the (x1, y1, x2, y2) input columns passed to euclidean_distance
# (imported from utils; presumably the straight-line distance between the two
# points -- confirm against utils).
#   tunnel_distance  - between the two pitches' no-movement plate locations
#   actual_distance  - between the two pitches' actual plate locations
#   release_distance - between the two pitches' release points
distance_inputs = {
    "tunnel_distance": (
        "plate_x_no_move",
        "plate_z_no_move",
        "prev_plate_x_no_move",
        "prev_plate_z_no_move",
    ),
    "actual_distance": (
        "plate_x",
        "plate_z",
        "prev_plate_x",
        "prev_plate_z",
    ),
    "release_distance": (
        "release_pos_x",
        "release_pos_z",
        "prev_release_pos_x",
        "prev_release_pos_z",
    ),
}

# A second with_columns call is required because these expressions read the
# *_no_move columns created in step 1.
tunnel_score_df: pl.DataFrame = tunnel_distance_df.with_columns(
    **{
        out_col: euclidean_distance(
            x1=pl.col(x1_col),
            y1=pl.col(y1_col),
            x2=pl.col(x2_col),
            y2=pl.col(y2_col),
        )
        for out_col, (x1_col, y1_col, x2_col, y2_col) in distance_inputs.items()
    }
)

tunnel_score_df.select([*SORT_COLS, *distance_inputs]).head()

game_date,pitcher,at_bat_number,pitch_number,tunnel_distance,actual_distance,release_distance
datetime[ns],i64,i64,i64,f64,f64,f64
2024-09-01 00:00:00,702674,46,1,,,
2024-09-01 00:00:00,702674,45,6,3.571582,2.080096,0.578014
2024-09-01 00:00:00,702674,45,5,1.861961,1.896233,0.155242
2024-09-01 00:00:00,702674,45,4,1.372953,0.47927,0.678823
2024-09-01 00:00:00,702674,45,3,0.372156,2.24058,0.554707


In [6]:
# Add the tunnel score:
#     tunnel_score = (tunnel_distance / actual_distance) - release_distance
#
# The division is guarded. Two consecutive pitches can cross the plate at the
# same recorded location, which makes actual_distance 0 and the ratio inf (or
# NaN when tunnel_distance is also 0). Those rows get a null score instead, so
# they cannot distort later aggregates. Rows with no previous pitch already have
# null distances and stay null.
tunnel_score_df = tunnel_score_df.with_columns(
    tunnel_score=pl.when(pl.col("actual_distance") > 0)
    .then(
        (pl.col("tunnel_distance") / pl.col("actual_distance"))
        - pl.col("release_distance")
    )
    .otherwise(None),
)

tunnel_score_df.select(SORT_COLS + ["tunnel_score"]).head()

game_date,pitcher,at_bat_number,pitch_number,tunnel_score
datetime[ns],i64,i64,i64,f64
2024-09-01 00:00:00,702674,46,1,
2024-09-01 00:00:00,702674,45,6,1.139014
2024-09-01 00:00:00,702674,45,5,0.826685
2024-09-01 00:00:00,702674,45,4,2.185851
2024-09-01 00:00:00,702674,45,3,-0.388609
