# BSBL-Tomorrow Pre-Processing

Author: Jensen Holm <br>
April 2024

In [1]:
from constants import DATA_DIR, COMPRESSION
import polars as pl
import requests
import os

In [2]:
# get the url for our dataset of all statcast era pitches (2015-2023)
# from the huggingface API
PARQUET_URL = requests.get(
    "https://huggingface.co/api/datasets/Jensen-holm/statcast-era-pitches/parquet/default/train",
).json()[0]

print(PARQUET_URL)

https://huggingface.co/api/datasets/Jensen-holm/statcast-era-pitches/parquet/default/train/0.parquet


In [3]:
# load the dataset into a polars DataFrame
statcast_era_pitches: pl.DataFrame = pl.read_parquet(PARQUET_URL)

# print columns and their types so we can see what we're working with
statcast_era_pitches.sample(5).glimpse()

Rows: 5
Columns: 92
$ pitch_type                      <str> 'FF', 'CH', 'FF', 'FC', 'SL'
$ game_date                       <str> '2015-08-28 00:00:00.000000000', '2021-10-01 00:00:00.000000000', '2022-08-31 00:00:00.000000000', '2022-08-17 00:00:00.000000000', '2016-07-24 00:00:00.000000000'
$ release_speed                   <f32> 93.9000015258789, 86.0999984741211, 93.5, 97.19999694824219, 85.0
$ release_pos_x                   <f32> -2.369999885559082, -1.6100000143051147, -1.899999976158142, -0.7900000214576721, 2.7799999713897705
$ release_pos_z                   <f32> 6.230000019073486, 5.789999961853027, 6.5, 6.150000095367432, 5.840000152587891
$ player_name                     <str> 'Feliz, Neftalí', 'Lorenzen, Michael', 'McKenzie, Triston', 'Clase, Emmanuel', 'Alvarez, Jose'
$ batter                          <i32> 572365, 660829, 623993, 570731, 621043
$ pitcher                         <i32> 491703, 547179, 663474, 661403, 501625
$ events                          <str> None, '

In [4]:
statcast_era_pitches.columns

['pitch_type',
 'game_date',
 'release_speed',
 'release_pos_x',
 'release_pos_z',
 'player_name',
 'batter',
 'pitcher',
 'events',
 'description',
 'spin_dir',
 'spin_rate_deprecated',
 'break_angle_deprecated',
 'break_length_deprecated',
 'zone',
 'des',
 'game_type',
 'stand',
 'p_throws',
 'home_team',
 'away_team',
 'type',
 'hit_location',
 'bb_type',
 'balls',
 'strikes',
 'game_year',
 'pfx_x',
 'pfx_z',
 'plate_x',
 'plate_z',
 'on_3b',
 'on_2b',
 'on_1b',
 'outs_when_up',
 'inning',
 'inning_topbot',
 'hc_x',
 'hc_y',
 'tfs_deprecated',
 'tfs_zulu_deprecated',
 'fielder_2',
 'umpire',
 'sv_id',
 'vx0',
 'vy0',
 'vz0',
 'ax',
 'ay',
 'az',
 'sz_top',
 'sz_bot',
 'hit_distance_sc',
 'launch_speed',
 'launch_angle',
 'effective_speed',
 'release_spin_rate',
 'release_extension',
 'game_pk',
 'pitcher.1',
 'fielder_2.1',
 'fielder_3',
 'fielder_4',
 'fielder_5',
 'fielder_6',
 'fielder_7',
 'fielder_8',
 'fielder_9',
 'release_pos_y',
 'estimated_ba_using_speedangle',
 'estimat

# Goal

The main goal of this project is to be able to predict how well current MLB pitchers are going to be next week, next month and next year (these will probably all be different models). In order to do this, the data will have to be structured a certain way.

I think that I am going to try and build a model that predicts performance on the game level, then to project the next week, we put all predictions together for the teams that they will be facing. If we do this for all pitchers on a team for every game in an upcoming week, we can see what pitchers should be throwing more against certain teams. 


##### Targets(s)
- Pitcher expected run value (delta_run_exp / some pitch amount)

##### Features
(might do feature selection, or PCA to shrink this very highly dimensional dataset)
- Statcast metrics of players on teams that the pitchers team will be facing
- Split statcast metrics against hitters of similar hitting profiles
- The pitches that the pitcher throws

In [5]:
# calculate each pitchers total delta_run_exp per 50 pitches in each of their outings
# we want to pair this with the aggregate statcast metrics of the batters they faced against
# similar pitching to the current pitcher.

PER_PITCHES: int = 100
PITCH_MINIMUM: int = 30

pitchers_outings = (
    statcast_era_pitches.group_by("game_pk", "home_team", "away_team", "pitcher").agg(
        # aggregate the delta_run_exp and count the number of pitches in each outing
        pl.sum("delta_run_exp").alias("total_delta_run_exp"),
        pl.count("delta_run_exp").alias("total_pitches"),
        # aggregate the hitters metrics in games before this outing
        # ...
    )
    # remove outings with less than the minimum
    .filter(pl.col("total_pitches") > PITCH_MINIMUM)
    # calculate the delta_run_exp per PER_PITCHES
    .with_columns(
        [
            # delta run_exp per PER_PITCHES pitches
            (
                pl.col("total_delta_run_exp") / pl.col("total_pitches") * PER_PITCHES
            ).alias(f"delta_run_exp_per_{PER_PITCHES}_pitches"),
            # statcast metrics that describe the batters that the pitcher is facing
            # ...
        ]
    )
)

pitchers_outings.glimpse()

Rows: 47745
Columns: 7
$ game_pk                       <i32> 447157, 630968, 564807, 566418, 413902, 566512, 663191, 631172, 530353, 531501
$ home_team                     <str> 'DET', 'TB', 'LAA', 'SEA', 'CIN', 'SF', 'BOS', 'OAK', 'CHC', 'OAK'
$ away_team                     <str> 'OAK', 'TOR', 'TB', 'OAK', 'CHC', 'AZ', 'CLE', 'COL', 'PIT', 'NYY'
$ pitcher                       <i32> 519455, 670950, 642545, 605135, 571561, 595881, 543135, 608566, 641771, 570666
$ total_delta_run_exp           <f32> -1.2750000953674316, 1.0260003805160522, -2.0949997901916504, 2.2430005073547363, 3.9850001335144043, 0.0140000581741333, 1.7540000677108765, -1.9970004558563232, -0.18800032138824463, -0.15199995040893555
$ total_pitches                 <u32> 102, 40, 89, 78, 66, 84, 95, 95, 111, 40
$ delta_run_exp_per_100_pitches <f64> -1.250000093497482, 2.5650009512901306, -2.3539323485299444, 2.8756416760958157, 6.03787899017334, 0.016666735921587263, 1.8463158607482912, -2.102105743006656, -0.16936965

In [6]:
# save the DataFrame to a parquet file
pitchers_outings.write_parquet(
    os.path.join(DATA_DIR, "pitchers_outings.parquet"),
    compression=COMPRESSION,
)