In [1]:
# Imports
import statcast_pitches
import polars as pl
import numpy as np
import os
import joblib

from sklearn.model_selection import train_test_split
from sklearn.calibration import CalibratedClassifierCV

from imblearn.over_sampling import SMOTENC
from catboost import CatBoostClassifier


In [2]:
# Helper functions
def load_passed_ball_data(years=None, data_dir="/Users/maxwassarman/baseball_work/api_scrape/passed_ball_data/"):
    """Load per-season passed-ball parquet files and stack them into one frame.

    Parameters
    ----------
    years : iterable of int, optional
        Seasons to load. Each season maps to ``mlb_data_{year}.parquet`` inside
        ``data_dir``; seasons whose file does not exist are skipped.
        ``None`` (the default) loads every ``mlb_data_*.parquet`` file found in
        ``data_dir``, in sorted filename order.
    data_dir : str
        Directory that holds the parquet files.

    Returns
    -------
    polars.DataFrame or None
        The concatenation of every file that was found, or ``None`` when no
        file was found (callers must handle that case).
    """
    if years is None:
        # The default used to crash with TypeError ("NoneType is not iterable");
        # treat it as "every season available on disk" instead.
        if not os.path.isdir(data_dir):
            return None
        file_paths = sorted(
            os.path.join(data_dir, file_name)
            for file_name in os.listdir(data_dir)
            if file_name.startswith("mlb_data_") and file_name.endswith(".parquet")
        )
    else:
        file_paths = [os.path.join(data_dir, f"mlb_data_{year}.parquet") for year in years]

    dfs = [pl.read_parquet(file_path) for file_path in file_paths if os.path.exists(file_path)]

    if dfs:
        return pl.concat(dfs)
    return None
    
def round_join_columns(df: pl.DataFrame, columns: list[str], precision: int = 4) -> pl.DataFrame:
    """Return ``df`` with every column named in ``columns`` rounded to ``precision`` decimals.

    Each rounded column is written back under its original name.
    """
    rounded_exprs = []
    for col_name in columns:
        rounded_exprs.append(pl.col(col_name).round(precision).alias(col_name))
    return df.with_columns(rounded_exprs)

def calculate_landing_points(x0, y0, z0, vx0, vy0, vz0, ax, ay, az):
    """Project a constant-acceleration trajectory to the point where z reaches 0.

    Solves ``z0 + vz0*t + 0.5*az*t**2 = 0`` for ``t``, takes the larger root, and
    evaluates the x and y positions at that time.

    Parameters
    ----------
    x0, y0, z0 : float
        Starting position.
    vx0, vy0, vz0 : float
        Starting velocity components.
    ax, ay, az : float
        Constant acceleration components.

    Returns
    -------
    tuple of (x_landing, y_landing)
        ``(nan, nan)`` when any input is ``None`` or when the trajectory never
        reaches ``z == 0`` (negative discriminant, or no vertical motion at all).
    """
    no_landing = (float("nan"), float("nan"))

    # Missing tracking values cannot be projected; None**2 would raise TypeError.
    if any(value is None for value in (x0, y0, z0, vx0, vy0, vz0, ax, ay, az)):
        return no_landing

    if az == 0:
        # Degenerate (linear) case: the quadratic formula would divide by zero.
        if vz0 == 0:
            return no_landing
        pos_root = -z0 / vz0
    else:
        discriminant = vz0**2 - 2*az*z0
        if discriminant < 0:
            # No real root; bail out before np.sqrt emits a RuntimeWarning.
            return no_landing
        sqrt_discriminant = np.sqrt(discriminant)
        t1 = (-vz0 + sqrt_discriminant) / az
        t2 = (-vz0 - sqrt_discriminant) / az
        pos_root = max(t1, t2)

    x_landing = x0 + vx0*pos_root + 0.5*ax*pos_root**2
    y_landing = y0 + vy0*pos_root + 0.5*ay*pos_root**2

    return x_landing, y_landing

In [3]:
# Load and prepare statcast data (1 min)
years = [2020,2021,2022,2023,2024,2025]

# Keep only the listed seasons and game types, then materialize the lazy query.
# NOTE(review): "R", "F", "D", "L", "W" presumably mean regular season plus the
# postseason rounds -- confirm against the statcast game_type codes.
pitches_df = (statcast_pitches.load()
    .filter(pl.col("game_date").dt.year().is_in(years),
            pl.col("game_type").is_in(["R", "F", "D", "L", "W"]))
    .collect()
    # For some reason some pitches had 2 identical rows.
    # maintain_order=True keeps the row order deterministic; without it the order
    # after unique() is unspecified, which makes the seeded train/test split and
    # SMOTE resampling further down non-reproducible between runs.
    .unique(subset=["game_date", "game_pk", "at_bat_number", "pitch_number"], maintain_order=True)
)

In [4]:
# Load passed ball data and join
passed_ball_df = load_passed_ball_data(years=years)
if passed_ball_df is None:
    # The loader returns None when no season file exists; fail here with a clear
    # message instead of an AttributeError on the .select() below.
    raise FileNotFoundError(f"No passed ball parquet files found for years {years}")

# The two sources are matched on the rounded pitch kinematics.
join_columns = ["ax", "ay", "az", "vx0", "vy0", "vz0"]
passed_ball_slim = passed_ball_df.select([*join_columns, "passed_ball", "wild_pitch"])

pitches_df = round_join_columns(pitches_df, join_columns, precision=4)
passed_ball_slim = (round_join_columns(passed_ball_slim, join_columns, precision=4)
    # Collapse to one row per join key. A repeated key on the right side of a
    # left join would duplicate the matching pitch rows and inflate every count
    # downstream. max() keeps a flag set if any of the duplicates had it set.
    .group_by(join_columns)
    .agg(pl.col("passed_ball").max(), pl.col("wild_pitch").max())
)

joined_df = (pitches_df.join(passed_ball_slim, on=join_columns, how="left")
    .with_columns(
        # Unmatched pitches have null flags and fall through to 0.
        pl.when((pl.col("passed_ball") == True) | (pl.col("wild_pitch") == True))
        .then(1)
        .otherwise(0)
        .alias("is_pb_wp")
    )
)

# Add catcher mapping (one row per player_id so the join cannot duplicate pitches)
catcher_mapping = (pl.read_csv("/Users/maxwassarman/baseball_work/api_scrape/catcher_mapping.csv")
    .unique(subset=["player_id"], keep="first", maintain_order=True)
)
joined_df = joined_df.join(catcher_mapping, left_on="fielder_2", right_on="player_id", how="left")

In [5]:
# Calculate landing points (25 sec)
# Struct fields, listed in the positional order calculate_landing_points expects.
_landing_inputs = ["release_pos_x", "release_pos_y", "release_pos_z",
                   "vx0", "vy0", "vz0", "ax", "ay", "az"]

data = (joined_df
    .with_columns(
        # Row-wise Python call; each row arrives as a dict keyed by field name.
        landing_cords=pl.struct(_landing_inputs).map_elements(
            lambda row: calculate_landing_points(*(row[field] for field in _landing_inputs)),
            return_dtype=pl.List(pl.Float64),
        )
    )
    .with_columns(
        landing_x=pl.col("landing_cords").list.get(0),
        landing_y=pl.col("landing_cords").list.get(1),
    )
    # Drop rows for which no landing point is available.
    .filter(
        pl.col("landing_x").is_not_null(),
        pl.col("landing_y").is_not_null(),
    )
)

  sqrt_discriminant = np.sqrt(discriminant)


In [6]:
# Catcher positioning logic
fastball = ["FF", "SI", "FC", "FT"]
offspeed = ["CH", "FS", "SC", "FO"]
breaking = ["ST", "SL", "GY", "SV", "CU", "KC", "KN", "CS"]

# Reusable predicates for the branches below.
_is_fastball = pl.col("pitch_type").is_in(fastball)
_is_offspeed = pl.col("pitch_type").is_in(offspeed)
_is_breaking = pl.col("pitch_type").is_in(breaking)
_opposite_hand = pl.col("stand") != pl.col("p_throws")
_left_on_left = (pl.col("stand") == "L") & (pl.col("p_throws") == "L")
_right_on_right = (pl.col("stand") == "R") & (pl.col("p_throws") == "R")
_two_strikes = pl.col("strikes") == 2

catcher_location = data.with_columns(
    # Pitch categories
    pitch_category=(
        pl.when(_is_fastball).then(pl.lit("fastball"))
        .when(_is_offspeed).then(pl.lit("offspeed"))
        .when(_is_breaking).then(pl.lit("breaking"))
        .otherwise(pl.lit("other"))
    ),
    # Handedness matchup
    handedness_matchup=(
        pl.when(_opposite_hand).then(pl.lit("opposite"))
        .otherwise(pl.lit("same"))
    ),
    # Catcher location: only a same-handed breaking ball with two strikes moves
    # the assumed setup off the middle of the plate.
    catcher_location=(
        pl.when(_is_fastball | _is_offspeed).then(pl.lit("middle"))
        .when(_is_breaking & _opposite_hand).then(pl.lit("middle"))
        .when(_is_breaking & _left_on_left & _two_strikes).then(pl.lit("left"))
        .when(_is_breaking & _right_on_right & _two_strikes).then(pl.lit("right"))
        .otherwise(pl.lit("middle"))
    ),
)

# Convert catcher positioning to coordinates and calculate distances
depth = -5.0 # Assuming the catcher sits 5 feet behind the tip of the plate
lateral_mvmt = 8.5/12 # Distance to edge of plate from middle

_setup_side = pl.col("catcher_location")
# Offsets of the projected landing point from the assumed catcher position.
_lateral_offset = pl.col("landing_x") - pl.col("catcher_x")
_depth_offset = pl.col("landing_y") - pl.col("catcher_y")

catcher_location_expanded = (catcher_location
    .with_columns(
        catcher_x=(
            pl.when(_setup_side == "left").then(-lateral_mvmt)
            .when(_setup_side == "right").then(lateral_mvmt)
            .otherwise(0.0)
        ),
        catcher_y=pl.lit(depth),
    )
    .with_columns(
        # Straight-line distance plus its two signed components.
        distance_to_catcher=(_lateral_offset ** 2 + _depth_offset ** 2).sqrt(),
        lateral_distance=_lateral_offset,
        depth_distance=_depth_offset,
    )
)

In [7]:
# Keep pitches that (1) have a usable distance, (2) have at least one of
# on_1b / on_2b / on_3b populated (presumably: a runner is on base), and
# (3) are not described as a hit-by-pitch.
_base_occupied = (
    pl.col("on_1b").is_not_null()
    | pl.col("on_2b").is_not_null()
    | pl.col("on_3b").is_not_null()
)

filtered_data = catcher_location_expanded.filter(
    pl.col("distance_to_catcher").is_not_nan(),
    _base_occupied,
    pl.col("description") != 'hit_by_pitch',
)

In [8]:
# Modeling (16 sec)
features = [
    "lateral_distance",
    "depth_distance", 
    "release_speed", 
    "pfx_x",
    "pfx_z",
    "p_throws",
    "stand",
    "pitch_type"
]

# Derive the categorical column positions from the names so they stay correct
# if `features` is ever reordered or extended (previously hard-coded as [5,6,7]).
categorical_features = ["p_throws", "stand", "pitch_type"]
categorical_idx = [features.index(name) for name in categorical_features]

# Training seasons only; rows with any missing feature or label are dropped.
model_data = (filtered_data
    .filter(pl.col("game_date").dt.year().is_in([2020, 2021, 2022, 2023]))
    .select([*features, "is_pb_wp"])
    .drop_nulls()
)

X = model_data.select(features).to_numpy()
y = model_data.select("is_pb_wp").to_numpy().flatten()

# Stratify so both partitions keep the same share of the rare positive class.
X_tr, X_t, y_tr, y_t = train_test_split(
    X, y, train_size=0.75, random_state=804, stratify=y
)

# Oversample the minority class on the training partition only.
smotenc = SMOTENC(categorical_features=categorical_idx, random_state=804, k_neighbors=3)
X_tr_smote, y_tr_smote = smotenc.fit_resample(X_tr, y_tr)

In [9]:
# CatBoost classifier for P(passed ball or wild pitch).
cb_block = CatBoostClassifier(
    # Positions of the categorical columns inside `features` (evaluates to [5, 6, 7]).
    cat_features=[features.index(name) for name in ("p_throws", "stand", "pitch_type")],
    iterations=3000,
    random_seed=804,
    verbose=250
)

# Load the cached model when it exists; otherwise train and cache it, so the
# notebook still runs top to bottom on a machine that has no 'passed.cbm' yet.
cb_model_path = 'passed.cbm'
if os.path.exists(cb_model_path):
    cb_block.load_model(cb_model_path)
else:
    cb_block.fit(X_tr_smote, y_tr_smote, eval_set=(X_t, y_t))
    cb_block.save_model(cb_model_path)

cb_block

<catboost.core.CatBoostClassifier at 0x173f28590>

In [10]:
# Load the cached calibrated model when it exists; otherwise fit and cache it.
# The same path is used for both dump and load (the old commented-out recipe
# wrote 'cb_calibrated.joblib' but the cell loaded 'passed_calibrated.joblib').
calibrated_model_path = 'passed_calibrated.joblib'
if os.path.exists(calibrated_model_path):
    # joblib.load unpickles arbitrary objects -- only load files you produced yourself.
    cb_calibrated = joblib.load(calibrated_model_path)
else:
    # NOTE(review): with cv=5 scikit-learn fits fresh clones of cb_block on folds
    # of (X_t, y_t) rather than calibrating the already-fitted model -- confirm
    # that this is the intended calibration scheme.
    cb_calibrated = CalibratedClassifierCV(cb_block, method='isotonic', cv=5)
    cb_calibrated.fit(X_t, y_t)  # Calibrate on validation set
    joblib.dump(cb_calibrated, calibrated_model_path)

In [11]:
# Score every filtered 2025 pitch with the calibrated model.
_season_25 = filtered_data.filter(pl.col("game_date").dt.year() == 2025)

# Second column of predict_proba is the probability of the positive class.
_season_25_features = _season_25.select(features).to_numpy()
passed_prob = cb_calibrated.predict_proba(_season_25_features)[:, 1]

passed_25 = _season_25.with_columns(
    pl.Series("pb_wp_prob_calibrated", passed_prob)
)

In [12]:
# Pitch-level table: predicted probability minus the actual outcome, for pitches
# whose catcher has a name in the mapping.
_report_columns = [
    "fielder_2",
    "catcher_name",
    "pitcher_name",
    "game_date",
    "pitch_type",
    "release_speed",
    "description",
    "pfx_x",
    "pfx_z",
    "passed_ball",
    "wild_pitch",
    "is_pb_wp",
    "lateral_distance",
    "depth_distance",
    "pb_wp_prob_calibrated",
    "block_runs",
]

raw_blocks_25 = (passed_25
    .filter(pl.col("name").is_not_null())
    .with_columns(
        block_runs=pl.col("pb_wp_prob_calibrated") - pl.col("is_pb_wp"),
        catcher_name=pl.col("name"),
        pitcher_name=pl.col("player_name"),
    )
    # Ascending: the most negative values come first.
    .sort("block_runs")
    .select(_report_columns)
)

In [13]:
# Per-catcher totals: expected vs. actual passed balls + wild pitches.
catcher_rankings = (raw_blocks_25
    # Group on the player id as well as the name so that two different catchers
    # who share a display name are not merged into a single row.
    .group_by(["fielder_2", "catcher_name"])
    .agg([
        pl.col("pb_wp_prob_calibrated").sum().alias("expected_pb_wp"),
        pl.col("is_pb_wp").sum().alias("actual_pb_wp"),
        pl.len().alias("total_pitches")
    ])
    # baa = expected minus actual; positive means fewer PB/WP than the model expected.
    .with_columns((pl.col("expected_pb_wp") - pl.col("actual_pb_wp")).alias("baa"))
    .sort("baa", descending=True)
    # Drop the id again so the column layout consumed via `ranks` is unchanged:
    # catcher_name, expected_pb_wp, actual_pb_wp, total_pitches, baa.
    .drop("fielder_2")
)

ranks = catcher_rankings.to_numpy()

In [14]:
# Display results
print("=== 2025 CATCHER PASSED BALL + WILD PITCH PREVENTION (BAA) ===")

# 'Pitches' is 7 characters wide, so the column must be 7 wide in both the
# header and the rows for the right edges to line up.
header = f"{'Rk':>3} {'Catcher Name':30} | {'BAA':>8} {'Actual':>8} {'Expected':>10} {'Pitches':>7}\n"
separator = '-' * (len(header) - 1)


def _print_ranking_table(title, numbered_rows):
    """Print a titled table section from an iterable of ``(rank, row)`` pairs.

    Each ``row`` is indexed positionally as
    (catcher_name, expected_pb_wp, actual_pb_wp, total_pitches, baa).
    """
    print(title)
    print(header + separator)
    for rank, r in numbered_rows:
        name, expected, actual, pitches, baa = r[0], r[1], int(r[2]), int(r[3]), r[4]
        print(f"{rank:3} {name:30} | {baa:8.2f} {actual:8} {expected:10.1f} {pitches:7}")


# Top 10
_print_ranking_table("TOP 10 (Best Prevention):", enumerate(ranks[:10], start=1))

# Bottom 10, worst first; ranks count down from the total number of catchers.
_n_catchers = len(ranks)
_print_ranking_table(
    "\nBOTTOM 10 (Worst Prevention):",
    zip(range(_n_catchers, _n_catchers - 10, -1), reversed(ranks[-10:])),
)

=== 2025 CATCHER PASSED BALL + WILD PITCH PREVENTION (BAA) ===
TOP 10 (Best Prevention):
 Rk Catcher Name                   |      BAA   Actual   Expected Pitches
-------------------------------------------------------------------------
  1 Danny Jansen                   |    10.00        5       15.0   2946
  2 Alejandro Kirk                 |     9.67       13       22.7   3737
  3 Patrick Bailey                 |     9.28       13       22.3   3699
  4 Dillon Dingler                 |     8.66       14       22.7   3615
  5 Sean Murphy                    |     8.20        9       17.2   2827
  6 William Contreras              |     7.35       21       28.3   4603
  7 Christian Vázquez              |     7.22        7       14.2   2506
  8 Carlos Narváez                 |     7.16       15       22.2   3843
  9 Nick Fortes                    |     6.89        4       10.9   1879
 10 Victor Caratini                |     6.56        4       10.6   1762

BOTTOM 10 (Worst Prevention):
 R