In [1]:
import pandas as pd
import numpy as np
from pybaseball import statcast, playerid_lookup, batting_stats, pitching_stats_range, schedule_and_record
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings("ignore")

In [2]:
# --- Query configuration -------------------------------------------------
# Season and team whose batters are analysed.
YEAR = 2024
TEAM = "BOS"

# Plate-appearance threshold handed to `batting_stats(..., qual=...)`.
MIN_PA = 100

# Date window for the per-player Statcast queries, built from YEAR.
START_DATE = f"{YEAR}-03-20"
END_DATE = f"{YEAR}-10-15"

In [3]:
def get_qualified_team_batters(year=YEAR, min_pa=MIN_PA, team=TEAM):
    """Return a slash-line summary for one team's batters in a season.

    Calls ``batting_stats(year, qual=min_pa)`` and keeps only the rows whose
    "Team" value equals ``team``.

    Parameters
    ----------
    year : int
        Season passed to ``batting_stats``.
    min_pa : int
        Value forwarded as the ``qual`` argument of ``batting_stats``.
    team : str
        Value compared against the "Team" column.

    Returns
    -------
    pandas.DataFrame
        Columns "Name", "IDfg", "PA", "AVG", "OBP" and "SLG" for the matching
        rows; the row index of the ``batting_stats`` result is kept as-is.

    Notes
    -----
    The defaults are the values YEAR / MIN_PA / TEAM held when this ``def``
    was executed, not their values at call time.
    """
    print(f"Fetching {team} batters with > {min_pa} PA in {year}...")

    summary_columns = ["Name", "IDfg", "PA", "AVG", "OBP", "SLG"]

    season_stats = batting_stats(year, qual=min_pa)
    on_team = season_stats["Team"] == team

    return season_stats.loc[on_team, summary_columns]

In [4]:
# Fetch the team's batters using the function's default year / PA / team.
qualified_batters = get_qualified_team_batters()

# Report how many batters came back, then preview the first rows.
print("Found {} qualified {} batters".format(len(qualified_batters), TEAM))
qualified_batters.head(5)

Fetching BOS batters with > 100 PA in 2024...
Found 15 qualified BOS batters


Unnamed: 0,Name,IDfg,PA,AVG,OBP,SLG
41,Jarren Duran,24617,735,0.285,0.342,0.492
30,Rafael Devers,17350,601,0.272,0.354,0.516
84,Wilyer Abreu,23772,447,0.253,0.322,0.459
35,Tyler O'Neill,15711,473,0.241,0.336,0.511
214,David Hamilton,27531,317,0.248,0.303,0.395


In [8]:
# Load the pre-built player id table. The merge cell below relies on its
# "key_fangraphs" and "key_mlbam" columns.
# NOTE(review): no cell in this notebook writes this CSV — confirm how it is
# produced so a fresh checkout can regenerate it.
player_ids = pd.read_csv(filepath_or_buffer="../data/qualified_red_sox_id_table.csv")
player_ids.head(5)

Unnamed: 0,name_last,name_first,key_mlbam,key_retro,key_bbref,key_fangraphs,mlb_played_first,mlb_played_last
0,duran,jarren,680776,duraj001,duranja01,24617,2021.0,2025.0
1,devers,rafael,646240,dever001,deverra01,17350,2017.0,2025.0
2,abreu,wilyer,677800,abrew002,abreuwi02,23772,2023.0,2025.0
3,o'neill,tyler,641933,oneit001,oneilty01,15711,2018.0,2025.0
4,hamilton,david,666152,hamid002,hamilda03,27531,2023.0,2025.0


In [9]:
# Attach each batter's MLBAM id by matching the FanGraphs id ("IDfg") against
# the lookup table. A left join keeps every batter even when no id is found,
# in which case "key_mlbam" is missing for that row.
id_lookup = player_ids[["key_fangraphs", "key_mlbam"]]

qualified_batters_ids = (
    qualified_batters
    .merge(id_lookup, how="left", left_on="IDfg", right_on="key_fangraphs")
    # "key_fangraphs" duplicates "IDfg" after the join, so it is dropped.
    .drop(columns="key_fangraphs")
)

qualified_batters_ids.head(5)

Unnamed: 0,Name,IDfg,PA,AVG,OBP,SLG,key_mlbam
0,Jarren Duran,24617,735,0.285,0.342,0.492,680776
1,Rafael Devers,17350,601,0.272,0.354,0.516,646240
2,Wilyer Abreu,23772,447,0.253,0.322,0.459,677800
3,Tyler O'Neill,15711,473,0.241,0.336,0.511,641933
4,David Hamilton,27531,317,0.248,0.303,0.395,666152


In [7]:
from pybaseball import statcast_batter

def get_player_game_data(player_id, player_name, start_date, end_date):
    """Fetch one batter's Statcast rows and tag them with the player's name.

    Parameters
    ----------
    player_id
        Id forwarded to ``statcast_batter`` and compared against the
        "batter" column of its result.
    player_name : str
        Value written to a new "player_name" column on every returned row.
    start_date, end_date
        Forwarded to ``statcast_batter`` as ``start_dt`` / ``end_dt``.

    Returns
    -------
    pandas.DataFrame
        The rows of the ``statcast_batter`` result whose "batter" equals
        ``player_id``, as a new frame with the "player_name" column added.
    """
    statcast_rows = statcast_batter(
        start_dt=start_date,
        end_dt=end_date,
        player_id=player_id,
    )

    # Keep only rows where this player is the batter; `.assign` returns a new
    # frame, so the frame returned by `statcast_batter` is not modified.
    is_this_batter = statcast_rows["batter"] == player_id
    return statcast_rows[is_this_batter].assign(player_name=player_name)

def collect_all_player_data(qualified_batters, start_date, end_date):
    """Fetch and stack the Statcast rows of every batter in ``qualified_batters``.

    Parameters
    ----------
    qualified_batters : pandas.DataFrame
        One row per batter; must provide the columns "Name", "key_mlbam",
        "AVG", "OBP", "SLG" and "PA".
    start_date, end_date
        Date bounds forwarded unchanged to ``get_player_game_data``.

    Returns
    -------
    pandas.DataFrame
        All players' rows concatenated with a fresh integer index. Each row
        carries the batter's season line in "player_AVG", "player_OBP",
        "player_SLG" and "player_PA". An empty, column-less DataFrame is
        returned when no player produced any rows.
    """
    # Destination column -> source column of the per-batter season line.
    season_columns = {
        "player_AVG": "AVG",
        "player_OBP": "OBP",
        "player_SLG": "SLG",
        "player_PA": "PA",
    }

    all_player_data = []

    # The progress-bar label keyword is `desc`; the previous `des=` was an
    # unknown keyword that made tqdm raise before the loop started.
    progress = tqdm(
        qualified_batters.iterrows(),
        total=len(qualified_batters),
        desc="Processing players",
    )

    for _, player in progress:
        name = player["Name"]
        mlbam_id = player["key_mlbam"]

        # A left merge upstream leaves the id missing for unmatched batters;
        # there is nothing to query for them, so skip rather than crash.
        if pd.isna(mlbam_id):
            print(f"No MLBAM id for {name}, skipping")
            continue

        print(f"Fetching data for {name}...")
        # Missing values elsewhere in the column turn ids into floats
        # (e.g. 680776.0); cast back so the query receives a plain integer.
        player_data = get_player_game_data(int(mlbam_id), name, start_date, end_date)

        if len(player_data) == 0:
            print(f"No data found {name}")
            continue

        for target, source in season_columns.items():
            player_data[target] = player[source]

        all_player_data.append(player_data)

    if not all_player_data:
        return pd.DataFrame()

    return pd.concat(all_player_data, ignore_index=True)

In [None]:
# Pull Statcast data for every qualified batter over the configured window.
all_batters_data = collect_all_player_data(qualified_batters_ids, START_DATE, END_DATE)

# The collector returns a column-less DataFrame when nothing was fetched, so
# guard before touching "player_name" to avoid a KeyError.
if all_batters_data.empty:
    print("Collected 0 pitches from 0 players")
else:
    # Rows from `statcast_batter` are pitch-level, so the count is pitches
    # rather than at-bats.
    print(
        f"Collected {len(all_batters_data)} pitches "
        f"from {all_batters_data['player_name'].nunique()} players"
    )