# BSBL-Tomorrow Pre-Processing

Author: Jensen Holm <br>
April 2024

In [1]:
import requests

# get the url for our dataset of all statcast era pitches (2015-2023)
# from the huggingface API
PARQUET_URL = requests.get(
    "https://huggingface.co/api/datasets/Jensen-holm/statcast-era-pitches/parquet/default/train",
).json()[0]

print(PARQUET_URL)

https://huggingface.co/api/datasets/Jensen-holm/statcast-era-pitches/parquet/default/train/0.parquet


In [2]:
import polars as pl

# load the dataset into a polars DataFrame
statcast_era_pitches: pl.DataFrame = pl.read_parquet(PARQUET_URL)

# print columns and their types so we can see what we're working with
statcast_era_pitches.glimpse()

Rows: 5479763
Columns: 92
$ pitch_type                      <str> 'FF', 'FC', 'FF', 'FC', 'FF', 'FF', 'FF', 'FF', 'FC', 'KC'
$ game_date                       <str> '2015-11-01 00:00:00.000000000', '2015-11-01 00:00:00.000000000', '2015-11-01 00:00:00.000000000', '2015-11-01 00:00:00.000000000', '2015-11-01 00:00:00.000000000', '2015-11-01 00:00:00.000000000', '2015-11-01 00:00:00.000000000', '2015-11-01 00:00:00.000000000', '2015-11-01 00:00:00.000000000', '2015-11-01 00:00:00.000000000'
$ release_speed                   <f32> 96.0999984741211, 93.0999984741211, 97.0, 93.5999984741211, 97.0999984741211, 96.5, 96.5999984741211, 97.5999984741211, 92.0, 86.69999694824219
$ release_pos_x                   <f32> -2.0199999809265137, -1.659999966621399, -1.6399999856948853, -1.5800000429153442, -1.7000000476837158, -1.6200000047683716, -1.3899999856948853, -1.5099999904632568, -1.8899999856948853, -1.6200000047683716
$ release_pos_z                   <f32> 6.25, 6.239999771118164, 6.3000001

In [3]:
statcast_era_pitches.columns

['pitch_type',
 'game_date',
 'release_speed',
 'release_pos_x',
 'release_pos_z',
 'player_name',
 'batter',
 'pitcher',
 'events',
 'description',
 'spin_dir',
 'spin_rate_deprecated',
 'break_angle_deprecated',
 'break_length_deprecated',
 'zone',
 'des',
 'game_type',
 'stand',
 'p_throws',
 'home_team',
 'away_team',
 'type',
 'hit_location',
 'bb_type',
 'balls',
 'strikes',
 'game_year',
 'pfx_x',
 'pfx_z',
 'plate_x',
 'plate_z',
 'on_3b',
 'on_2b',
 'on_1b',
 'outs_when_up',
 'inning',
 'inning_topbot',
 'hc_x',
 'hc_y',
 'tfs_deprecated',
 'tfs_zulu_deprecated',
 'fielder_2',
 'umpire',
 'sv_id',
 'vx0',
 'vy0',
 'vz0',
 'ax',
 'ay',
 'az',
 'sz_top',
 'sz_bot',
 'hit_distance_sc',
 'launch_speed',
 'launch_angle',
 'effective_speed',
 'release_spin_rate',
 'release_extension',
 'game_pk',
 'pitcher.1',
 'fielder_2.1',
 'fielder_3',
 'fielder_4',
 'fielder_5',
 'fielder_6',
 'fielder_7',
 'fielder_8',
 'fielder_9',
 'release_pos_y',
 'estimated_ba_using_speedangle',
 'estimat

# Goal

The main goal of this project is to be able to predict how well current MLB pitchers are going to be next week, next month and next year (these will probably all be different models). In order to do this, the data will have to be structured a certain way.

I think that I am going to try and build a model that predicts performance on the game level, then to project the next week, we put all predictions together for the teams that they will be facing. If we do this for all pitchers on a team for every game in an upcoming week, we can see what pitchers should be throwing more against certain teams. 


##### Targets(s)
- Pitcher expected run value (delta_run_exp / some pitch amount)

##### Features
(might do feature selection, or PCA to shrink this very highly dimensional dataset)
- Statcast metrics of players on teams that the pitchers team will be facing
- Split statcast metrics against hitters of similar hitting profiles
- The pitches that the pitcher throws

In [4]:
# calculate each pitchers total delta_run_exp per 50 pitches in each of their outings
# we want to pair this with the aggregate statcast metrics of the batters they faced against
# similar pitching to the current pitcher.

PER_PITCHES: int = 100
PITCH_MINIMUM: int = 30

pitchers_outings = (
    statcast_era_pitches.group_by("game_pk", "home_team", "away_team", "pitcher").agg(
        # aggregate the delta_run_exp and count the number of pitches in each outing
        pl.sum("delta_run_exp").alias("total_delta_run_exp"),
        pl.count("delta_run_exp").alias("total_pitches"),
        # aggregate the hitters metrics in games before this outing
        # ...
    )
    # remove outings with less than the minimum
    .filter(pl.col("total_pitches") > PITCH_MINIMUM)
    # calculate the delta_run_exp per PER_PITCHES
    .with_columns(
        [
            # delta run_exp per PER_PITCHES pitches
            (
                pl.col("total_delta_run_exp") / pl.col("total_pitches") * PER_PITCHES
            ).alias(f"delta_run_exp_per_{PER_PITCHES}_pitches"),
            # statcast metrics that describe the batters that the pitcher is facing
            # ...
        ]
    )
)

pitchers_outings.glimpse()

Rows: 47745
Columns: 7
$ game_pk                       <i32> 491874, 661478, 632780, 529828, 529801, 567159, 447526, 414564, 447465, 531633
$ home_team                     <str> 'SEA', 'ATL', 'CLE', 'BOS', 'PHI', 'ATL', 'CIN', 'PIT', 'LAA', 'ATL'
$ away_team                     <str> 'BAL', 'COL', 'LAA', 'TB', 'ATL', 'CHC', 'SEA', 'PHI', 'LAD', 'WSH'
$ pitcher                       <i32> 452027, 657140, 663474, 519144, 601713, 641438, 502028, 502046, 502211, 476451
$ total_delta_run_exp           <f32> -0.36899977922439575, -3.340001106262207, -3.3109993934631348, -1.0550003051757812, 0.36199986934661865, -1.0810000896453857, -1.0209999084472656, -2.5159993171691895, -0.04199977591633797, -1.2680001258850098
$ total_pitches                 <u32> 77, 96, 95, 116, 82, 31, 37, 95, 69, 60
$ delta_run_exp_per_100_pitches <f64> -0.4792204924992153, -3.4791678190231323, -3.4852625194348787, -0.9094830217032597, 0.4414632553007544, -3.487097063372212, -2.7594592120196366, -2.6484203338623047, 

In [5]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# create visualization of the distribution delta_run_exp for each team in statcast era

TEAMS: list[str] = (
    pitchers_outings.unique("home_team").select("home_team").to_numpy().flatten()
)

num_rows = len(TEAMS) // 5
num_cols = len(TEAMS) // 5

fig = make_subplots(
    rows=(len(TEAMS) + 4) // 5,
    cols=len(TEAMS) // 5,
    subplot_titles=[team for team in TEAMS],
)

for i, team in enumerate(TEAMS):
    team_data = (
        pitchers_outings.filter(pl.col("home_team") == team)
        .select(
            f"delta_run_exp_per_{PER_PITCHES}_pitches",
        )
        .to_numpy()
        .flatten()
    )

    # Create histogram for the team
    histogram = go.Histogram(
        x=team_data,
        name=team,
        histnorm="probability",
    )
    row = (i // num_cols) + 1
    col = (i % num_cols) + 1

    # Add histogram to the subplot
    fig.add_trace(histogram, row=row, col=col)

    # Update subplot title
    fig.update_yaxes(title_text="", row=row, col=col)
    fig.update_xaxes(title_text=f"delta run exp", row=row, col=col)


fig.update_layout(
    title=f"Distribution of delta_run_exp per {PER_PITCHES} pitches for Each Team (2015-2023, minimum 30 pitches thrown)",
    height=1000,  # Adjust height if needed
    showlegend=False,
)

fig.show()