In [1]:
import pandas as pd
import numpy as np
from pybaseball import statcast, playerid_lookup, batting_stats, pitching_stats_range, schedule_and_record
from tqdm.notebook import tqdm
import warnings
# Suppresses every warning category for the rest of the session (no category or
# module filter is given), so pandas / pybaseball warnings will not be shown.
warnings.filterwarnings("ignore")

In [2]:
# Season under analysis; also interpolated into the date window below.
YEAR = 2024
# Plate-appearance threshold; used as the default `min_pa` of get_qualified_team_batters.
MIN_PA = 100
# Date window used as the default range for the pitcher query and passed to the
# per-batter Statcast download.
START_DATE = f"{YEAR}-03-28"
END_DATE = f"{YEAR}-10-15"
# Team code compared against the "Team" column of the batting table.
TEAM = "BOS"

# Batters

In [3]:
def get_qualified_team_batters(year=YEAR, min_pa=MIN_PA, team=TEAM):
    """Return the season batting line of one team's batters.

    Parameters
    ----------
    year : season handed to ``batting_stats``.
    min_pa : forwarded to ``batting_stats`` as ``qual``.
    team : value compared for equality with the ``Team`` column.

    Returns
    -------
    DataFrame restricted to the columns Name, IDfg, PA, AVG, OBP and SLG.
    Rows keep the index they had in the table returned by ``batting_stats``.
    """
    print(f"Fetching {team} batters with > {min_pa} PA in {year}...")

    wanted_columns = ["Name", "IDfg", "PA", "AVG", "OBP", "SLG"]
    season_stats = batting_stats(year, qual=min_pa)
    plays_for_team = season_stats["Team"] == team

    return season_stats[plays_for_team][wanted_columns]

In [4]:
# Build the roster with the notebook-level defaults (YEAR, MIN_PA, TEAM).
qualified_batters = get_qualified_team_batters()
print(f"Found {len(qualified_batters)} qualified {TEAM} batters")
# Preview the first rows of the roster.
qualified_batters.head()

Fetching BOS batters with > 100 PA in 2024...
Found 15 qualified BOS batters


Unnamed: 0,Name,IDfg,PA,AVG,OBP,SLG
41,Jarren Duran,24617,735,0.285,0.342,0.492
30,Rafael Devers,17350,601,0.272,0.354,0.516
84,Wilyer Abreu,23772,447,0.253,0.322,0.459
35,Tyler O'Neill,15711,473,0.241,0.336,0.511
214,David Hamilton,27531,317,0.248,0.303,0.395


In [5]:
# Load a previously saved player-ID table from disk. The preview shows one row per
# player with both a FanGraphs key (key_fangraphs) and an MLBAM key (key_mlbam).
# NOTE(review): the code that produced this CSV is not in this notebook — confirm its origin.
player_ids = pd.read_csv("../data/qualified_red_sox_id_table.csv")
player_ids.head()

Unnamed: 0,name_last,name_first,key_mlbam,key_retro,key_bbref,key_fangraphs,mlb_played_first,mlb_played_last
0,duran,jarren,680776,duraj001,duranja01,24617,2021.0,2025.0
1,devers,rafael,646240,dever001,deverra01,17350,2017.0,2025.0
2,abreu,wilyer,677800,abrew002,abreuwi02,23772,2023.0,2025.0
3,o'neill,tyler,641933,oneit001,oneilty01,15711,2018.0,2025.0
4,hamilton,david,666152,hamid002,hamilda03,27531,2023.0,2025.0


In [6]:
# Attach the MLBAM id to every roster row by matching the FanGraphs id (IDfg) with
# key_fangraphs. A left join keeps every batter even when no id row matches; the
# helper join column is removed afterwards.
qualified_batters_ids = qualified_batters.merge(
    player_ids[["key_fangraphs", "key_mlbam"]],
    how="left",
    left_on="IDfg",
    right_on="key_fangraphs",
).drop(columns="key_fangraphs")

qualified_batters_ids.head()

Unnamed: 0,Name,IDfg,PA,AVG,OBP,SLG,key_mlbam
0,Jarren Duran,24617,735,0.285,0.342,0.492,680776
1,Rafael Devers,17350,601,0.272,0.354,0.516,646240
2,Wilyer Abreu,23772,447,0.253,0.322,0.459,677800
3,Tyler O'Neill,15711,473,0.241,0.336,0.511,641933
4,David Hamilton,27531,317,0.248,0.303,0.395,666152


In [7]:
from pybaseball import statcast_batter

def get_player_game_data(player_id, player_name, start_date, end_date):
    """Download one batter's Statcast rows for a date range.

    Parameters
    ----------
    player_id : id passed to ``statcast_batter`` and matched against the ``batter`` column.
    player_name : label written into the ``player_name`` column of every returned row.
    start_date, end_date : bounds forwarded as ``start_dt`` / ``end_dt``.

    Returns
    -------
    A new DataFrame holding only the rows whose ``batter`` equals ``player_id``,
    with ``player_name`` set to the supplied name. The original row index is kept.
    """
    raw_rows = statcast_batter(start_dt=start_date, end_dt=end_date, player_id=player_id)

    is_this_batter = raw_rows["batter"] == player_id

    # assign() builds a fresh frame, so the downloaded table is never modified.
    return raw_rows[is_this_batter].assign(player_name=player_name)

def collect_all_player_data(qualified_batters, start_date, end_date):
    """Download Statcast rows for every batter and stack them into one frame.

    Parameters
    ----------
    qualified_batters : DataFrame with the columns Name, key_mlbam, AVG, OBP, SLG and PA.
    start_date, end_date : date bounds forwarded to ``get_player_game_data``.

    Returns
    -------
    One DataFrame with all players' rows concatenated (index reset). Each row also
    carries the player's season line in ``player_AVG``, ``player_OBP``,
    ``player_SLG`` and ``player_PA``. An empty DataFrame is returned when no
    player produced any rows.

    Notes
    -----
    Players whose ``key_mlbam`` is missing are skipped with a message. A single
    missing id turns the whole column into floats, so every id is converted back
    to ``int`` before it is used for the download.
    """
    all_player_data = []

    for _, player in tqdm(qualified_batters.iterrows(), total=len(qualified_batters), desc="Processing players"):
        mlbam_id = player["key_mlbam"]
        if pd.isna(mlbam_id):
            print(f"No MLBAM id for {player['Name']}, skipping")
            continue

        print(f"Fetching data for {player['Name']}...")
        player_data = get_player_game_data(int(mlbam_id), player["Name"], start_date, end_date)

        if len(player_data) == 0:
            print(f"No data found {player['Name']}")
            continue

        # Broadcast the season-level batting line onto every downloaded row.
        season_line = {f"player_{stat}": player[stat] for stat in ("AVG", "OBP", "SLG", "PA")}
        all_player_data.append(player_data.assign(**season_line))

    if not all_player_data:
        return pd.DataFrame()

    return pd.concat(all_player_data, ignore_index=True)

In [8]:
all_batters_data = collect_all_player_data(qualified_batters_ids, START_DATE, END_DATE)
# Each row is a single pitch (the preview contains "ball" / "foul" rows with no
# event), so the count is reported as pitches rather than at-bats.
print(f"Collected {len(all_batters_data)} pitches from {all_batters_data['player_name'].nunique()} players")
# NOTE(review): this file lands in the working directory while the other exports
# in this notebook go to ../data/ — confirm the intended location.
all_batters_data.to_csv("all_batter_data.csv", index=False)

Processing players:   0%|          | 0/15 [00:00<?, ?it/s]

Fetching data for Jarren Duran...
Gathering Player Data
Fetching data for Rafael Devers...
Gathering Player Data
Fetching data for Wilyer Abreu...
Gathering Player Data
Fetching data for Tyler O'Neill...
Gathering Player Data
Fetching data for David Hamilton...
Gathering Player Data
Fetching data for Rob Refsnyder...
Gathering Player Data
Fetching data for Connor Wong...
Gathering Player Data
Fetching data for Ceddanne Rafaela...
Gathering Player Data
Fetching data for Masataka Yoshida...
Gathering Player Data
Fetching data for Triston Casas...
Gathering Player Data
Fetching data for Trevor Story...
Gathering Player Data
Fetching data for Romy Gonzalez...
Gathering Player Data
Fetching data for Reese McGuire...
Gathering Player Data
Fetching data for Enmanuel Valdez...
Gathering Player Data
Fetching data for Vaughn Grissom...
Gathering Player Data
Collected 21090 at-bats from 15 players


In [9]:
# Preview the combined table: Statcast columns plus the player_* season columns appended at the end.
all_batters_data.head()

Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,...,pitcher_days_until_next_game,batter_days_until_next_game,api_break_z_with_gravity,api_break_x_arm,api_break_x_batter_in,arm_angle,player_AVG,player_OBP,player_SLG,player_PA
0,FF,2024-09-29,95.8,-0.88,6.01,Jarren Duran,680776,686752,single,hit_into_play,...,,,0.88,0.83,-0.83,46.4,0.285,0.342,0.492,735
1,SI,2024-09-29,92.8,2.36,5.13,Jarren Duran,680776,663992,field_out,hit_into_play,...,,,2.26,1.68,1.68,16.5,0.285,0.342,0.492,735
2,SI,2024-09-29,93.3,2.42,5.22,Jarren Duran,680776,663992,,ball,...,,,2.07,1.62,1.62,16.9,0.285,0.342,0.492,735
3,SI,2024-09-29,92.7,2.41,5.1,Jarren Duran,680776,663992,,foul,...,,,2.62,1.64,1.64,16.1,0.285,0.342,0.492,735
4,SL,2024-09-29,86.3,2.29,4.92,Jarren Duran,680776,663992,,ball,...,,,2.85,-0.03,-0.03,11.1,0.285,0.342,0.492,735


# Pitchers

In [10]:
# NOTE(review): not referenced by any cell visible in this notebook; presumably a
# minimum-innings threshold intended for the pitcher query — confirm or remove.
MIN_I = 1

In [20]:
from pybaseball import pitching_stats_range

def get_qualified_pitchers(start_dt=START_DATE, end_dt=END_DATE):
    """Download the pitching table for a date range.

    Parameters
    ----------
    start_dt, end_dt : bounds passed positionally to ``pitching_stats_range``.

    Returns
    -------
    The DataFrame returned by ``pitching_stats_range``, unchanged. Despite the
    function name, no innings or team filter is applied here.
    """
    print(f"Fetching pitchers from {start_dt} to {end_dt}...")

    return pitching_stats_range(start_dt, end_dt)

In [21]:
# Download the pitcher table for the default START_DATE..END_DATE window and
# persist the unfiltered result before any column selection.
qualified_pitchers = get_qualified_pitchers()
qualified_pitchers.to_csv("../data/all_pitcher_data.csv", index=False)
qualified_pitchers.head()

Fetching pitchers from 2024-03-28 to 2024-10-15...


Unnamed: 0,Name,Age,#days,Lev,Tm,G,GS,W,L,SV,...,StL,StS,GB/FB,LD,PU,WHIP,BAbip,SO9,SO/W,mlbID
1,Andrew Abbott,25,241,Maj-NL,Cincinnati,25,25,10.0,10.0,,...,0.16,0.09,0.35,0.21,0.12,1.297,0.26,7.4,2.19,671096
2,Bryan Abreu,27,196,Maj-AL,Houston,79,0,3.0,3.0,1.0,...,0.18,0.14,0.4,0.25,0.05,1.155,0.284,11.9,3.18,650556
3,Jason Adam,32,187,"Maj-AL,Maj-NL","San Diego,Tampa Bay",79,0,7.0,2.0,4.0,...,0.14,0.18,0.45,0.19,0.09,0.848,0.211,9.9,3.78,592094
4,Austin Adams,33,246,Maj-AL,Oakland,56,0,1.0,2.0,,...,0.18,0.13,0.38,0.27,0.11,1.476,0.351,11.5,2.3,613534
5,Ty Adcock,27,285,Maj-NL,New York,3,0,,,,...,0.13,0.07,0.25,0.19,0.0,2.077,0.25,6.2,1.5,686654


# Clean Data

In [32]:
# Keep only the join key (mlbID) and the pitcher metrics used downstream.
# .copy() detaches the subset from the downloaded table, so the later mlbID cast
# assigns into an independent frame instead of a slice.
# NOTE(review): IP values such as 79.2 / 41.1 look like baseball thirds-of-an-inning
# notation rather than decimals — confirm before doing arithmetic on them.
qualified_pitchers = qualified_pitchers[
    ['mlbID', 'ERA', 'WHIP', 'SO9', 'SO/W', 'HR', 'BB', "H", "IP", "BAbip"]
].copy()
qualified_pitchers.to_csv("../data/cleaned_pitcher_data.csv", index=False)
qualified_pitchers.head()

Unnamed: 0,mlbID,ERA,WHIP,SO9,SO/W,HR,BB,H,IP,BAbip
1,671096,3.72,1.297,7.4,2.19,25,52,127,138.0,0.26
2,650556,3.05,1.155,11.9,3.18,9,33,59,79.2,0.284
3,592094,2.05,0.848,9.9,3.78,6,23,44,79.0,0.211
4,613534,3.92,1.476,11.5,2.3,4,23,38,41.1,0.351
5,686654,14.54,2.077,6.2,1.5,4,2,7,4.1,0.25


In [31]:
# Reduce the pitch-level table to the matchup identifiers, handedness, outcome
# fields and the batter's season line, then persist that subset.
cleaned_batter_data = all_batters_data[
    [
        # who faced whom, and from which side
        "batter", "pitcher", "stand", "p_throws",
        # what happened on the pitch
        "events", "description",
        # batter season line added during collection
        "player_AVG", "player_OBP", "player_SLG", "player_PA",
    ]
]
cleaned_batter_data.to_csv("../data/cleaned_batter_data.csv", index=False)
cleaned_batter_data.head()

Unnamed: 0,batter,pitcher,stand,p_throws,events,description,player_AVG,player_OBP,player_SLG,player_PA
0,680776,686752,L,R,single,hit_into_play,0.285,0.342,0.492,735
1,680776,663992,L,L,field_out,hit_into_play,0.285,0.342,0.492,735
2,680776,663992,L,L,,ball,0.285,0.342,0.492,735
3,680776,663992,L,L,,foul,0.285,0.342,0.492,735
4,680776,663992,L,L,,ball,0.285,0.342,0.492,735


# Combine Data

In [40]:
def add_pitcher_stats(game_data, all_pitchers, year=YEAR):
    """Return a copy of ``game_data`` with the opposing pitcher's stats attached.

    Parameters
    ----------
    game_data : DataFrame with a ``pitcher`` column holding pitcher ids.
    all_pitchers : DataFrame with an ``mlbID`` column and the stat columns
        ERA, WHIP, SO9, SO/W, HR, BB, H and BAbip.
    year : unused; kept so existing callers that pass it keep working.

    Returns
    -------
    A new DataFrame equal to ``game_data`` plus the columns ``pitcher_ERA``,
    ``pitcher_WHIP``, ``pitcher_SO9``, ``pitcher_SO/W``, ``pitcher_HR``,
    ``pitcher_BB``, ``pitcher_H`` and ``pitcher_BAbip``. Rows whose pitcher id
    is absent from ``all_pitchers`` get NaN in those columns. ``game_data``
    itself is not modified.

    Notes
    -----
    The ids in ``game_data['pitcher']`` and ``all_pitchers['mlbID']`` must have
    comparable dtypes; otherwise nothing matches and every stat is NaN.
    """
    print("Fetching and adding pitcher statistics...")

    stat_names = ['ERA', 'WHIP', 'SO9', 'SO/W', 'HR', 'BB', 'H', 'BAbip']

    # Keep one row per pitcher id. With duplicate ids a label lookup yields
    # several values per pitcher, which cannot be broadcast to a single cell.
    pitcher_stats = (
        all_pitchers
        .drop_duplicates(subset="mlbID", keep="first")
        .set_index("mlbID")
    )

    game_data_with_pitcher = game_data.copy()

    # Series.map with a Series argument looks each pitcher id up in the index
    # and returns NaN for ids that are not present.
    for stat in stat_names:
        game_data_with_pitcher[f"pitcher_{stat}"] = game_data["pitcher"].map(pitcher_stats[stat])

    return game_data_with_pitcher

In [45]:
# Both join keys must share a dtype for the merge below to match rows.
print(cleaned_batter_data["pitcher"].dtype)
# assign() returns a new frame instead of writing a column into a frame that was
# produced by column selection, so the cell is safe to re-run.
qualified_pitchers = qualified_pitchers.assign(mlbID=qualified_pitchers["mlbID"].astype(int))
print(qualified_pitchers["mlbID"].dtype)

int64
int64


In [46]:
game_data = cleaned_batter_data.copy()

# Attach the pitcher's season stats to every pitch row. validate="many_to_one"
# makes pandas raise if an mlbID occurs more than once in the pitcher table,
# which would otherwise silently duplicate pitch rows.
game_data_with_pitcher = game_data.merge(
    qualified_pitchers,
    left_on="pitcher",
    right_on="mlbID",
    how="left",
    validate="many_to_one",
)

game_data_with_pitcher.head()

Unnamed: 0,batter,pitcher,stand,p_throws,events,description,player_AVG,player_OBP,player_SLG,player_PA,mlbID,ERA,WHIP,SO9,SO/W,HR,BB,H,IP,BAbip
0,680776,686752,L,R,single,hit_into_play,0.285,0.342,0.492,735,686752,3.6,1.154,9.8,2.96,17,48,102,130.0,0.262
1,680776,663992,L,L,field_out,hit_into_play,0.285,0.342,0.492,735,663992,4.46,1.311,6.8,2.36,3,11,34,34.1,0.298
2,680776,663992,L,L,,ball,0.285,0.342,0.492,735,663992,4.46,1.311,6.8,2.36,3,11,34,34.1,0.298
3,680776,663992,L,L,,foul,0.285,0.342,0.492,735,663992,4.46,1.311,6.8,2.36,3,11,34,34.1,0.298
4,680776,663992,L,L,,ball,0.285,0.342,0.492,735,663992,4.46,1.311,6.8,2.36,3,11,34,34.1,0.298


In [47]:
# Summary statistics of the merged table; a `count` below the row total marks a column with missing values.
game_data_with_pitcher.describe()

Unnamed: 0,batter,pitcher,player_AVG,player_OBP,player_SLG,player_PA,mlbID,ERA,WHIP,SO9,SO/W,HR,BB,H,IP,BAbip
count,21090.0,21090.0,21090.0,21090.0,21090.0,21090.0,21090.0,21090.0,21090.0,21090.0,21048.0,21090.0,21090.0,21090.0,21090.0,21090.0
mean,670610.625367,637343.607681,0.259181,0.324277,0.442117,454.290138,637343.607681,4.097012,1.282775,8.731897,3.124648,12.816264,32.820152,94.312233,102.679526,0.290521
std,44661.52438,50262.926309,0.023455,0.029821,0.063006,175.126987,50262.926309,2.871615,0.391713,1.88294,1.279437,8.502417,17.82074,55.444637,58.667772,0.043932
min,596115.0,434378.0,0.19,0.246,0.219,106.0,434378.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0
25%,646240.0,607455.0,0.246,0.306,0.395,307.0,607455.0,3.33,1.148,7.4,2.23,5.0,19.0,48.0,54.0,0.267
50%,666152.0,657508.0,0.266,0.336,0.459,473.0,657508.0,3.8,1.247,8.6,2.87,11.0,29.0,80.0,87.1,0.29
75%,678882.0,670036.0,0.28,0.342,0.492,571.0,670036.0,4.56,1.417,9.9,3.7,20.0,47.0,149.0,164.0,0.315
max,807799.0,805673.0,0.285,0.359,0.516,735.0,805673.0,81.0,10.5,15.8,8.5,33.0,77.0,208.0,216.0,0.778


In [48]:
# DataFrame.drop returns a new frame, so the result has to be rebound; otherwise
# the redundant mlbID join key is still written to the CSV.
# errors="ignore" keeps this cell re-runnable once the column is already gone.
game_data_with_pitcher = game_data_with_pitcher.drop(columns="mlbID", errors="ignore")
game_data_with_pitcher.to_csv("../data/combine_data_raw.csv", index=False)