# Tunnel Score Exploratory Data Analysis

Jensen Holm
Sep. 2024

In [1]:
import polars as pl
import pybaseball
import datetime

from utils import pl_euclidean_distance, get_player_headshot

orig_df = pl.from_pandas(
    pybaseball.statcast(
        start_dt="2024-03-01",
        end_dt=datetime.datetime.today().strftime("%Y-%m-%d"),
))

orig_df.head(3)

This is a large query, it may take a moment to complete
Skipping offseason dates


100%|██████████| 173/173 [00:59<00:00,  2.91it/s]
  final_data = pd.concat(dataframe_list, axis=0).convert_dtypes(convert_string=False)


pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,spin_dir,spin_rate_deprecated,break_angle_deprecated,break_length_deprecated,zone,des,game_type,stand,p_throws,home_team,away_team,type,hit_location,bb_type,balls,strikes,game_year,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,outs_when_up,inning,inning_topbot,…,release_extension,game_pk,pitcher.1,fielder_2.1,fielder_3,fielder_4,fielder_5,fielder_6,fielder_7,fielder_8,fielder_9,release_pos_y,estimated_ba_using_speedangle,estimated_woba_using_speedangle,woba_value,woba_denom,babip_value,iso_value,launch_speed_angle,at_bat_number,pitch_number,pitch_name,home_score,away_score,bat_score,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,spin_axis,delta_home_win_exp,delta_run_exp,bat_speed,swing_length
str,datetime[ns],f64,f64,f64,str,i64,i64,str,str,i64,i64,i64,i64,i64,str,str,str,str,str,str,str,i64,str,i64,i64,i64,f64,f64,f64,f64,i64,i64,i64,i64,i64,str,…,f64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,f64,f64,f64,f64,i64,i64,i64,i64,i64,i64,str,i64,i64,i64,i64,i64,i64,i64,i64,str,str,i64,f64,f64,f64,f64
"""FF""",2024-09-02 00:00:00,96.2,-1.0,6.1,"""Honeywell, Brent""",656976,641703,"""field_out""","""hit_into_play""",,,,,4,"""Pavin Smith lines out sharply …","""R""","""L""","""R""","""AZ""","""LAD""","""X""",7.0,"""line_drive""",2,2,2024,-0.62,1.47,-0.48,2.62,,605137,,2,9,"""Bot""",…,6.2,747155,641703,669257,518692,621035,571771,500743,606192,669242,681624,54.34,0.611,0.631,0.0,1.0,0.0,0.0,4.0,87,5,"""4-Seam Fastball""",6,11,6,11,11,6,6,11,"""Strategic""","""Standard""",206,-0.001,-0.28,70.34365,6.63676
"""SC""",2024-09-02 00:00:00,79.7,-1.13,6.01,"""Honeywell, Brent""",656976,641703,,"""ball""",,,,,13,"""Pavin Smith lines out sharply …","""R""","""L""","""R""","""AZ""","""LAD""","""B""",,,1,2,2024,-0.61,-0.35,-0.35,1.2,,605137,,2,9,"""Bot""",…,6.2,747155,641703,669257,518692,621035,571771,500743,606192,669242,681624,54.29,,,,,,,,87,4,"""Screwball""",6,11,6,11,11,6,6,11,"""Infield shade""","""Standard""",304,0.0,0.048,,
"""SC""",2024-09-02 00:00:00,81.6,-1.21,6.06,"""Honeywell, Brent""",656976,641703,,"""called_strike""",,,,,8,"""Pavin Smith lines out sharply …","""R""","""L""","""R""","""AZ""","""LAD""","""S""",,,1,1,2024,-0.57,-0.35,-0.27,1.81,,605137,,2,9,"""Bot""",…,6.2,747155,641703,669257,518692,621035,571771,500743,606192,669242,681624,54.29,,,,,,,,87,3,"""Screwball""",6,11,6,11,11,6,6,11,"""Strategic""","""Standard""",286,0.0,-0.073,,


## Tying pitches to their previous ones

This will allow us to compare each pitch to the one before it, giving us the ability to calculate tunnel score.

In [2]:
# first we need to sort all the pitches

SORT_COLS = [
    "game_date",
    "pitcher",
    "at_bat_number",
    "pitch_number",
]

shifted_df = orig_df.sort(SORT_COLS, descending=True)

OVER_COLS = ["at_bat_number", "pitcher"]
# now we can shift the data by one row to get the previous pitch data for each pitch in one row
for col_name in shifted_df.columns:
    shifted_df = shifted_df.with_columns(
        pl.col(col_name)
        .shift(-1)
        .over(OVER_COLS)
        .alias(f"prev_{col_name}")
    )

shifted_df.select(SORT_COLS).head()

game_date,pitcher,at_bat_number,pitch_number
datetime[ns],i64,i64,i64
2024-09-02 00:00:00,687924,24,8
2024-09-02 00:00:00,687924,24,7
2024-09-02 00:00:00,687924,24,6
2024-09-02 00:00:00,687924,24,5
2024-09-02 00:00:00,687924,24,4


## Adding Tunnel Score

```math
TunnelScore = log_{2}( (\frac{TunnelDistance}{ActualDistance}) - releaseDistance)
```

Where ...

**ReleaseDistance** = euclidean distance between the release points of current pitch and the previous pitch

**ActualDistance** = euclidean distance between where the current pitch ended up over the plate, and the previous pitch 

**TunnelDistance** = euclidean distance between where the current pitch would have ended up without movement, and the same for the previous pitch 


In [3]:
# compute where the pitch and the previous one would have been if they had no movement
tunnel_distance_df: pl.DataFrame = shifted_df.with_columns(
    plate_x_no_move=pl.col("plate_x") - pl.col("pfx_x"), # x coord of the pitch if it had no movement
    plate_z_no_move=pl.col("plate_z") - pl.col("pfx_z"), # z coord of the pitch if it had no movement
    prev_plate_x_no_move=pl.col("prev_plate_x") - pl.col("prev_pfx_x"), # x coord of the previous pitch if it had no movement
    prev_plate_z_no_move=pl.col("prev_plate_z") - pl.col("prev_pfx_z"), # z coord of the previous pitch if it had no movement
)

tunnel_score_df: pl.DataFrame = tunnel_distance_df.with_columns(
    # tunnel_distance is the distance between the pitch and the previous pitch if they had no movement
    tunnel_distance=pl_euclidean_distance( 
        x1=pl.col("plate_x_no_move"), 
        y1=pl.col("plate_z_no_move"), 
        x2=pl.col("prev_plate_x_no_move"), 
        y2=pl.col("prev_plate_z_no_move"),
    ),

    # actual_distance is the distance between the pitch and the previous pitch
    actual_distance=pl_euclidean_distance(
        x1=pl.col("plate_x"),
        y1=pl.col("plate_z"),
        x2=pl.col("prev_plate_x"),
        y2=pl.col("prev_plate_z"),
    ),

    # release_distance is the distance between the release point of the pitch and the previous pitch
    release_distance=pl_euclidean_distance(
        x1=pl.col("release_pos_x"),
        y1=pl.col("release_pos_z"),
        x2=pl.col("prev_release_pos_x"),
        y2=pl.col("prev_release_pos_z"),
    ),
)

TUNNEL_COLS = [
    "tunnel_distance",
    "actual_distance",
    "release_distance",
]

tunnel_score_df.select(SORT_COLS + TUNNEL_COLS).head()

game_date,pitcher,at_bat_number,pitch_number,tunnel_distance,actual_distance,release_distance
datetime[ns],i64,i64,i64,f64,f64,f64
2024-09-02 00:00:00,687924,24,8,1.4796,2.51,0.3129
2024-09-02 00:00:00,687924,24,7,2.3241,2.3776,0.0969
2024-09-02 00:00:00,687924,24,6,13.2781,4.1104,0.0876
2024-09-02 00:00:00,687924,24,5,7.3661,3.1025,0.1429
2024-09-02 00:00:00,687924,24,4,4.6009,1.9689,0.1641


In [4]:
# now we can add tunnel score column using the tunnel_distance, actual_distance, and release_distance
tunnel_score_df = tunnel_score_df.with_columns(
    tunnel_score=(pl.col("tunnel_distance") / pl.col("actual_distance")) - pl.col("release_distance").log(base=2),
)

TUNNEL_COLS += ["tunnel_score"]

tunnel_score_df.select(SORT_COLS + TUNNEL_COLS).head()

game_date,pitcher,at_bat_number,pitch_number,tunnel_distance,actual_distance,release_distance,tunnel_score
datetime[ns],i64,i64,i64,f64,f64,f64,f64
2024-09-02 00:00:00,687924,24,8,1.4796,2.51,0.3129,2.265709
2024-09-02 00:00:00,687924,24,7,2.3241,2.3776,0.0969,4.344858
2024-09-02 00:00:00,687924,24,6,13.2781,4.1104,0.0876,6.743292
2024-09-02 00:00:00,687924,24,5,7.3661,3.1025,0.1429,5.181169
2024-09-02 00:00:00,687924,24,4,4.6009,1.9689,0.1641,4.94414
