In [1]:
import pandas as pd
import numpy as np
from pybaseball import statcast, playerid_lookup, batting_stats, pitching_stats_range, schedule_and_record
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Season and roster filters shared by the cells below.
YEAR = 2024
MIN_PA = 100  # forwarded to batting_stats as `qual`
TEAM = "BOS"  # compared against the "Team" column of the batting table

# Statcast query window, later handed to statcast_batter as start_dt / end_dt.
START_DATE = f"{YEAR}-03-20"
END_DATE = f"{YEAR}-10-15"

# Batters

In [3]:
def get_qualified_team_batters(year=YEAR, min_pa=MIN_PA, team=TEAM):
    """Return the season slash-line summary for one team's batters.

    Parameters
    ----------
    year
        Season forwarded to ``batting_stats``.
    min_pa
        Forwarded to ``batting_stats`` as ``qual``.
    team
        Value the ``Team`` column must equal for a row to be kept.

    Returns
    -------
    pandas.DataFrame
        Columns ``Name``, ``IDfg``, ``PA``, ``AVG``, ``OBP``, ``SLG`` for the
        matching rows; the row labels of the fetched table are kept as-is.
    """
    print(f"Fetching {team} batters with > {min_pa} PA in {year}...")

    summary_columns = ["Name", "IDfg", "PA", "AVG", "OBP", "SLG"]
    season_batting = batting_stats(year, qual=min_pa)

    # Row mask first, then a single .loc call to pick rows and columns together.
    on_team = season_batting["Team"] == team
    return season_batting.loc[on_team, summary_columns]

In [4]:
# Fetch the batter summary using the function's defaults (year, min_pa, team),
# report how many rows came back, and preview the first few.
qualified_batters = get_qualified_team_batters()
print(f"Found {len(qualified_batters)} qualified {TEAM} batters")
qualified_batters.head()

Fetching BOS batters with > 100 PA in 2024...
Found 15 qualified BOS batters


Unnamed: 0,Name,IDfg,PA,AVG,OBP,SLG
41,Jarren Duran,24617,735,0.285,0.342,0.492
30,Rafael Devers,17350,601,0.272,0.354,0.516
84,Wilyer Abreu,23772,447,0.253,0.322,0.459
35,Tyler O'Neill,15711,473,0.241,0.336,0.511
214,David Hamilton,27531,317,0.248,0.303,0.395


In [5]:
# Load a pre-built player id table from disk. The next cell uses its
# key_fangraphs and key_mlbam columns to translate FanGraphs ids to MLBAM ids.
# NOTE(review): how this CSV was generated is not shown in the notebook —
# presumably via playerid_lookup; confirm and document the provenance.
player_ids = pd.read_csv("../data/qualified_red_sox_id_table.csv")
player_ids.head()

Unnamed: 0,name_last,name_first,key_mlbam,key_retro,key_bbref,key_fangraphs,mlb_played_first,mlb_played_last
0,duran,jarren,680776,duraj001,duranja01,24617,2021.0,2025.0
1,devers,rafael,646240,dever001,deverra01,17350,2017.0,2025.0
2,abreu,wilyer,677800,abrew002,abreuwi02,23772,2023.0,2025.0
3,o'neill,tyler,641933,oneit001,oneilty01,15711,2018.0,2025.0
4,hamilton,david,666152,hamid002,hamilda03,27531,2023.0,2025.0


In [6]:
# Attach each batter's MLBAM id (key_mlbam) by joining the FanGraphs id (IDfg)
# against the lookup table. The left join keeps every batter row; a batter with
# no match in the lookup table ends up with a missing key_mlbam.
qualified_batters_ids = (
    qualified_batters
    .merge(
        player_ids[["key_fangraphs", "key_mlbam"]],
        how="left",
        left_on="IDfg",
        right_on="key_fangraphs",
    )
    # The join key from the right side duplicates IDfg, so it is dropped.
    .drop(columns="key_fangraphs")
)

qualified_batters_ids.head()

Unnamed: 0,Name,IDfg,PA,AVG,OBP,SLG,key_mlbam
0,Jarren Duran,24617,735,0.285,0.342,0.492,680776
1,Rafael Devers,17350,601,0.272,0.354,0.516,646240
2,Wilyer Abreu,23772,447,0.253,0.322,0.459,677800
3,Tyler O'Neill,15711,473,0.241,0.336,0.511,641933
4,David Hamilton,27531,317,0.248,0.303,0.395,666152


In [7]:
from pybaseball import statcast_batter

def get_player_game_data(player_id, player_name, start_date, end_date):
    """Fetch one batter's Statcast rows for a date range and label them.

    Parameters
    ----------
    player_id
        Id passed to ``statcast_batter`` and matched against the ``batter``
        column of the result.
    player_name
        Value written to the ``player_name`` column of every returned row
        (any existing ``player_name`` values are replaced).
    start_date, end_date
        Passed to ``statcast_batter`` as ``start_dt`` and ``end_dt``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the rows whose ``batter`` equals
        ``player_id``; the fetched frame itself is not modified.
    """
    fetched = statcast_batter(start_dt=start_date, end_dt=end_date, player_id=player_id)

    is_requested_batter = fetched["batter"] == player_id

    # .assign returns a fresh frame, so the fetched data is left untouched.
    return fetched[is_requested_batter].assign(player_name=player_name)

def collect_all_player_data(qualified_batters, start_date, end_date):
    """Fetch Statcast rows for every batter and stack them into one frame.

    Parameters
    ----------
    qualified_batters
        DataFrame with at least the columns ``Name``, ``key_mlbam``, ``AVG``,
        ``OBP``, ``SLG`` and ``PA`` (one row per batter).
    start_date, end_date
        Date range forwarded to ``get_player_game_data``.

    Returns
    -------
    pandas.DataFrame
        Concatenation of every batter's rows with the batter's season line
        attached as ``player_AVG``, ``player_OBP``, ``player_SLG`` and
        ``player_PA``. An empty ``DataFrame`` is returned when no batter
        produced any rows.

    Notes
    -----
    Batters whose ``key_mlbam`` is missing are skipped with a message instead
    of being sent to the data source: a left join upstream leaves NaN for
    unmatched players and upcasts the id column to float.
    """
    all_player_data = []

    for _, player in tqdm(qualified_batters.iterrows(), total=len(qualified_batters), desc="Processing players"):
        name = player["Name"]
        mlbam_id = player["key_mlbam"]

        if pd.isna(mlbam_id):
            print(f"No MLBAM id for {name}; skipping")
            continue

        print(f"Fetching data for {name}...")
        # int() normalises numpy integers and float-upcast ids (e.g. 680776.0).
        player_data = get_player_game_data(int(mlbam_id), name, start_date, end_date)

        if len(player_data) == 0:
            print(f"No data found {name}")
            continue

        # Broadcast the batter's season-level line onto every fetched row.
        player_data["player_AVG"] = player["AVG"]
        player_data["player_OBP"] = player["OBP"]
        player_data["player_SLG"] = player["SLG"]
        player_data["player_PA"] = player["PA"]

        all_player_data.append(player_data)

    if not all_player_data:
        return pd.DataFrame()

    return pd.concat(all_player_data, ignore_index=True)

In [8]:
# Download every batter's rows for the configured date window, then persist
# the combined frame as a CSV in the current working directory.
# NOTE(review): the message labels len(all_batters_data) as "at-bats", but the
# count is the number of rows; the sample rows in this notebook include
# individual pitches (description "ball" / "foul"), so it looks like a pitch
# count — confirm and consider rewording.
# NOTE(review): collect_all_player_data can return an empty DataFrame, in which
# case the 'player_name' lookup below would raise KeyError.
all_batters_data = collect_all_player_data(qualified_batters_ids, START_DATE, END_DATE)
print(f"Collected {len(all_batters_data)} at-bats from {all_batters_data['player_name'].nunique()} players")
all_batters_data.to_csv("all_batter_data.csv", index=False)

Processing players:   0%|          | 0/15 [00:00<?, ?it/s]

Fetching data for Jarren Duran...
Gathering Player Data
Fetching data for Rafael Devers...
Gathering Player Data
Fetching data for Wilyer Abreu...
Gathering Player Data
Fetching data for Tyler O'Neill...
Gathering Player Data
Fetching data for David Hamilton...
Gathering Player Data
Fetching data for Rob Refsnyder...
Gathering Player Data
Fetching data for Connor Wong...
Gathering Player Data
Fetching data for Ceddanne Rafaela...
Gathering Player Data
Fetching data for Masataka Yoshida...
Gathering Player Data
Fetching data for Triston Casas...
Gathering Player Data
Fetching data for Trevor Story...
Gathering Player Data
Fetching data for Romy Gonzalez...
Gathering Player Data
Fetching data for Reese McGuire...
Gathering Player Data
Fetching data for Enmanuel Valdez...
Gathering Player Data
Fetching data for Vaughn Grissom...
Gathering Player Data
Collected 21579 at-bats from 15 players


In [9]:
# Preview the first rows of the combined batter frame.
all_batters_data.head()

Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,...,pitcher_days_until_next_game,batter_days_until_next_game,api_break_z_with_gravity,api_break_x_arm,api_break_x_batter_in,arm_angle,player_AVG,player_OBP,player_SLG,player_PA
0,FF,2024-09-29,95.8,-0.88,6.01,Jarren Duran,680776,686752,single,hit_into_play,...,,,0.88,0.83,-0.83,46.4,0.285,0.342,0.492,735
1,SI,2024-09-29,92.8,2.36,5.13,Jarren Duran,680776,663992,field_out,hit_into_play,...,,,2.26,1.68,1.68,16.5,0.285,0.342,0.492,735
2,SI,2024-09-29,93.3,2.42,5.22,Jarren Duran,680776,663992,,ball,...,,,2.07,1.62,1.62,16.9,0.285,0.342,0.492,735
3,SI,2024-09-29,92.7,2.41,5.1,Jarren Duran,680776,663992,,foul,...,,,2.62,1.64,1.64,16.1,0.285,0.342,0.492,735
4,SL,2024-09-29,86.3,2.29,4.92,Jarren Duran,680776,663992,,ball,...,,,2.85,-0.03,-0.03,11.1,0.285,0.342,0.492,735


# Pitchers

In [11]:
# Threshold forwarded to pitching_stats as `qual` by get_qualified_pitchers.
MIN_I = 10

In [27]:
from pybaseball import pitching_stats

def get_qualified_pitchers(year=YEAR, min_i=MIN_I):
    """Return season rate stats for pitchers in the given year.

    No team filter is applied; every row returned by ``pitching_stats`` is kept.

    Parameters
    ----------
    year
        Season forwarded to ``pitching_stats``.
    min_i
        Forwarded to ``pitching_stats`` as ``qual``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Name``, ``IDfg``, ``ERA``, ``WHIP``, ``K/9``, ``BB/9``,
        ``HR/9``, ``K/BB`` and ``AVG``.
    """
    print(f"Fetching pitchers with > {min_i} INNINGS in {year}...")

    rate_columns = ["Name", "IDfg", "ERA", "WHIP", "K/9", "BB/9", "HR/9", "K/BB", "AVG"]
    season_pitching = pitching_stats(year, qual=min_i)

    return season_pitching[rate_columns]

In [28]:
# Fetch the pitcher table using the function's defaults (year, min_i) and
# preview the first few rows.
qualified_pitchers = get_qualified_pitchers()
qualified_pitchers.head()

Fetching pitchers with > 10 INNINGS in 2024...


Unnamed: 0,Name,IDfg,ERA,WHIP,K/9,BB/9,HR/9,K/BB,AVG
65,Chris Sale,10603,2.38,1.01,11.4,1.98,0.46,5.77,0.215
66,Tarik Skubal,22267,2.39,0.92,10.69,1.64,0.7,6.51,0.2
78,Zack Wheeler,10310,2.57,0.96,10.08,2.34,0.9,4.31,0.191
149,Cole Ragans,21846,3.14,1.14,10.77,3.24,0.72,3.33,0.212
208,Dylan Cease,18525,3.47,1.07,10.65,3.09,0.86,3.45,0.197


In [30]:
# Lookup table read from disk; only its key_fangraphs and key_mlbam columns
# are used in this cell.
pitcher_ids = pd.read_csv("../data/pitcher_lookup_table.csv")

# Inner join on the FanGraphs id: pitchers with no row in the lookup table are
# dropped from the result.
qualified_pitcher_ids = (
    qualified_pitchers
    .merge(
        pitcher_ids[["key_fangraphs", "key_mlbam"]],
        how="inner",
        left_on="IDfg",
        right_on="key_fangraphs",
    )
    # The join key from the right side duplicates IDfg, so it is dropped.
    .drop(columns="key_fangraphs")
)

qualified_pitcher_ids.head()

Unnamed: 0,Name,IDfg,ERA,WHIP,K/9,BB/9,HR/9,K/BB,AVG,key_mlbam
0,Chris Sale,10603,2.38,1.01,11.4,1.98,0.46,5.77,0.215,519242
1,Tarik Skubal,22267,2.39,0.92,10.69,1.64,0.7,6.51,0.2,669373
2,Zack Wheeler,10310,2.57,0.96,10.08,2.34,0.9,4.31,0.191,554430
3,Cole Ragans,21846,3.14,1.14,10.77,3.24,0.72,3.33,0.212,666142
4,Dylan Cease,18525,3.47,1.07,10.65,3.09,0.86,3.45,0.197,656302


# Combine Data

In [37]:
def add_pitcher_stats(game_data, all_pitchers, year=YEAR):
    """Attach each pitcher's season stats to every row of ``game_data``.

    Parameters
    ----------
    game_data
        DataFrame with a ``pitcher`` column holding pitcher ids.
    all_pitchers
        DataFrame with a ``key_mlbam`` column plus ``Name``, ``ERA``, ``WHIP``,
        ``K/9``, ``BB/9``, ``HR/9``, ``K/BB`` and ``AVG``.
    year
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    pandas.DataFrame
        A copy of ``game_data`` with the columns ``pitcher_name``,
        ``pitcher_ERA``, ``pitcher_WHIP``, ``pitcher_SO9``, ``pitcher_BB9``,
        ``pitcher_HR9``, ``pitcher_K_BB`` and ``pitcher_AVG`` added (in that
        order). Rows whose ``pitcher`` id is not present in ``all_pitchers``
        get NaN in every added column. ``game_data`` itself is not modified.
    """
    print("Fetching and adding pitcher statistics...")

    # Output column -> source column in all_pitchers. Dict order fixes the
    # order in which the new columns are appended.
    stat_columns = {
        "pitcher_name": "Name",
        "pitcher_ERA": "ERA",
        "pitcher_WHIP": "WHIP",
        "pitcher_SO9": "K/9",
        "pitcher_BB9": "BB/9",
        "pitcher_HR9": "HR/9",
        "pitcher_K_BB": "K/BB",
        "pitcher_AVG": "AVG",
    }

    # Series.map needs a uniquely-indexed lookup: drop rows without an id and
    # keep the first row when the same id appears more than once.
    pitcher_stats = (
        all_pitchers
        .dropna(subset=["key_mlbam"])
        .drop_duplicates(subset="key_mlbam", keep="first")
        .set_index("key_mlbam")
    )

    game_data_with_pitcher = game_data.copy()

    # One vectorised index lookup per stat; ids absent from the lookup map to NaN.
    for out_col, src_col in stat_columns.items():
        game_data_with_pitcher[out_col] = game_data["pitcher"].map(pitcher_stats[src_col])

    return game_data_with_pitcher

In [38]:
# Join pitcher season stats onto the batter rows (matched on the "pitcher" id
# column against key_mlbam) and report the resulting frame's shape.
game_data_with_pitcher = add_pitcher_stats(all_batters_data, qualified_pitcher_ids)
print(f"Added pitcher stats. Dataset shape: {game_data_with_pitcher.shape}")

Fetching and adding pitcher statistics...
Added pitcher stats. Dataset shape: (21579, 125)


In [39]:
# Narrow the wide combined frame down to the columns used from here on.
game_data_with_pitcher_trimmed = game_data_with_pitcher.loc[
    :,
    [
        # matchup ids plus the stand / p_throws columns
        "batter", "pitcher", "stand", "p_throws",
        # batter label and season line added during collection
        "player_name", "player_AVG", "player_OBP", "player_SLG", "player_PA",
        # per-row outcome fields
        "events", "description",
        # pitcher season stats added by add_pitcher_stats
        "pitcher_name", "pitcher_ERA", "pitcher_WHIP", "pitcher_SO9",
        "pitcher_BB9", "pitcher_HR9", "pitcher_K_BB", "pitcher_AVG",
    ],
]

game_data_with_pitcher_trimmed.head()

Unnamed: 0,batter,pitcher,stand,p_throws,player_name,player_AVG,player_OBP,player_SLG,player_PA,events,description,pitcher_name,pitcher_ERA,pitcher_WHIP,pitcher_SO9,pitcher_BB9,pitcher_HR9,pitcher_K_BB,pitcher_AVG
0,680776,686752,L,R,Jarren Duran,0.285,0.342,0.492,735,single,hit_into_play,Ryan Pepiot,3.6,1.15,9.83,3.32,1.18,2.96,0.211
1,680776,663992,L,L,Jarren Duran,0.285,0.342,0.492,735,field_out,hit_into_play,Richard Lovelady,4.46,1.31,6.82,2.88,0.79,2.36,0.254
2,680776,663992,L,L,Jarren Duran,0.285,0.342,0.492,735,,ball,Richard Lovelady,4.46,1.31,6.82,2.88,0.79,2.36,0.254
3,680776,663992,L,L,Jarren Duran,0.285,0.342,0.492,735,,foul,Richard Lovelady,4.46,1.31,6.82,2.88,0.79,2.36,0.254
4,680776,663992,L,L,Jarren Duran,0.285,0.342,0.492,735,,ball,Richard Lovelady,4.46,1.31,6.82,2.88,0.79,2.36,0.254


In [40]:
# Summary statistics for the numeric columns. The pitcher_* columns can show a
# lower count than the batter columns: rows whose pitcher id is not in the
# pitcher table are filled with NaN by add_pitcher_stats, and describe()
# excludes NaN from its counts.
game_data_with_pitcher_trimmed.describe()

Unnamed: 0,batter,pitcher,player_AVG,player_OBP,player_SLG,player_PA,pitcher_ERA,pitcher_WHIP,pitcher_SO9,pitcher_BB9,pitcher_HR9,pitcher_K_BB,pitcher_AVG
count,21579.0,21579.0,21579.0,21579.0,21579.0,21579.0,16874.0,16874.0,16874.0,16874.0,16874.0,16874.0,16874.0
mean,670485.64864,637124.231614,0.258985,0.324208,0.442001,453.136012,4.013271,1.263859,8.738713,3.06203,1.101212,3.198856,0.238805
std,44638.754777,50375.440673,0.023457,0.029793,0.062945,175.388923,1.240797,0.215331,1.747039,1.069844,0.427581,1.3175,0.036051
min,596115.0,434378.0,0.19,0.246,0.219,106.0,0.61,0.52,4.16,0.77,0.0,1.0,0.111
25%,646240.0,607455.0,0.246,0.306,0.395,307.0,3.25,1.13,7.48,2.31,0.84,2.3,0.22
50%,666152.0,657376.0,0.266,0.336,0.459,473.0,3.82,1.25,8.54,2.92,1.07,2.89,0.24
75%,678882.0,669952.0,0.28,0.342,0.492,571.0,4.64,1.42,9.83,3.68,1.35,3.76,0.261
max,807799.0,805673.0,0.285,0.359,0.516,735.0,10.8,2.23,14.72,7.55,3.71,8.5,0.353
