This file was used to scrape data from the 2016-2025 Regular Seasons. The date range can be adjusted. All other explainations are listed in comments. Analysis is in the analyze_data notebook. Thanks!

In [2]:
import statsapi

In [3]:
import pandas as pd
from pprint import pprint
import time

In [4]:
def is_bunt(play):
    res = play.get("result", {}) or {}
    event_type = (res.get("eventType") or "").lower()
    event_text = (res.get("event") or "").lower()
    desc = (res.get("description") or "").lower()
    return ("bunt" in event_type) or ("bunt" in event_text) or ("foul bunt" in desc) or ("bunt" in desc)

In [5]:
def get_games_date(DATE):
    games = statsapi.schedule(date = DATE)
    gamepks = []
    for g in games:
        if g.get("game_type") == "R":
            pk = g.get("game_id") or g.get("game_pk")
            if pk is not None:
                gamepks.append(int(pk))
    return(gamepks)

In [6]:
def get_games_range(start, end):
    games = statsapi.schedule(start_date = start, end_date=end)
    gamepks = []
    for g in games:
        if g.get("game_type") == "R":
            pk = g.get("game_id") or g.get("game_pk")
            if pk is not None:
                gamepks.append(int(pk))
    return(gamepks)

In [7]:
#Specify Date Range
games_pks = get_games_range("2016-03-15", "2016-10-05")

print(type(games_pks), len(games_pks))  



<class 'list'> 2456


In [None]:
#This is the for-loop that I used to log all plays in a pandas dataframe. The data is stored in a parquet file
#2016 Season is used in example
data = []
i = 0

for pk in games_pks:
    pbp = statsapi.get("game_playByPlay", {"gamePk": pk})
    plays = pbp.get('allPlays', [])

    current_half = None
    current_inning = None
    at_bat_index = -1
    bases = {"1B": None, "2B": None, "3B": None}  # start empty per game
    current_outs = 0 #start game with no outs

    for play in plays:
        about = play.get('about', {}) or {}
        result = play.get('result', {}) or {}
        matchup = play.get('matchup',{}) or {}
        play_half = about.get('halfInning')          # "top"/"bottom"
        play_inning = about.get('inning')
        play_index = about.get('atBatIndex', -1)

        # carry bases forward within same half-inning, otherwise reset
        if (current_half == play_half and
            current_inning == play_inning and
            play_index > at_bat_index):
            bases_beginning = bases.copy()
            outs_beginning = current_outs
        else:
            bases_beginning = {"1B": None, "2B": None, "3B": None}
            outs_beginning = 0

        final_bases = bases_beginning.copy()
        runs_scored = 0
        all_runners = play.get('runners', []) or []
        outs_on_play = 0
        #Clear each base that a runner left from
        origins_to_clear = set()
        for runner in all_runners:
            mov = (runner.get('movement') or {})
            origin = mov.get('originBase')
            if mov.get('isOut') == True:
                outs_on_play += 1
            if origin in ("1B", "2B", "3B"):
                origins_to_clear.add(origin)

        for origin in origins_to_clear:
            final_bases[origin] = None
        outs_after = outs_beginning + outs_on_play
        current_outs = outs_after
        # go through each runner and create dictionary of ending location. Replace if duplicate
        final_by_runner = {} 
        for runner in all_runners:
            mov = (runner.get('movement') or {})
            det = (runner.get('details') or {})
            rid = ((det.get('runner') or {}).get('id'))
            end = mov.get('end')            # "1B"/"2B"/"3B"/"score"/None
            is_out = bool(mov.get('isOut'))
            #
            prev = final_by_runner.get(rid, {"end": None, "is_out": False})
            # If they ever score, score is automatically the ending place. Otherwise take the latest non-None end
            if end == "score":
                final_end = "score"
            else:
                final_end = end if end is not None else prev["end"]

            final_by_runner[rid] = {
                "end": final_end,
                "is_out": is_out or prev["is_out"]
            }

        # Go through runner dict and ending places
        for rid, info in final_by_runner.items():
            end = info["end"]
            is_out = info["is_out"]

            if end == "score":
                runs_scored += 1
                continue

            if (not is_out) and end in ("1B", "2B", "3B"):
                final_bases[end] = rid

        # Build summary (0/1 flags)
        summary = {
            'before_1B': 1 if bases_beginning['1B'] else 0,
            'before_2B': 1 if bases_beginning['2B'] else 0,
            'before_3B': 1 if bases_beginning['3B'] else 0,
            'after_1B':  1 if final_bases['1B'] else 0,
            'after_2B':  1 if final_bases['2B'] else 0,
            'after_3B':  1 if final_bases['3B'] else 0,
            'runs_scored': runs_scored,
            'outs_before': outs_beginning,
            'outs_after': outs_after
        }


        row = {
            'game_pk': pk,
            'half_inning': play_half,                
            'inning': play_inning,
            'at_bat_index': about.get('atBatIndex'),
            'start_time': about.get('startTime'),
            'batter': matchup.get('batter').get('fullName'),
            'pitcher': matchup.get('pitcher').get('fullName'),
            'bat_side': matchup.get('batSide').get('code'),
            'pitch_side': matchup.get('pitchHand').get('code'),
            'description': result.get('description'),
            'event': result.get('event'),
            'event_type': result.get('eventType'),
            'is_bunt': is_bunt(play),
            
                                   
        }

        # Merge dictionaries and append both
        data.append({**row, **summary})

        # Carry state forward
        bases = final_bases
        current_half = play_half
        current_inning = play_inning
        at_bat_index = play_index

    i += 1
    #track progress of load
    if i % 100 == 0: 
        print(f"{i} out of {len(games_pks)}")
    time.sleep(0.05)


dfs = pd.DataFrame(data)


dfs['start_time'] = pd.to_datetime(dfs['start_time'], errors='coerce')
dfs['half_inning_int'] = dfs['half_inning'].map({'top': 0, 'bottom': 1})

dfs.to_parquet("2016-reg-allPlays.parquet", index=False)


In [73]:
#check that df was populated correctly
dfs.head()

Unnamed: 0,game_pk,half_inning,inning,at_bat_index,start_time,batter,pitcher,bat_side,pitch_side,description,...,before_1B,before_2B,before_3B,after_1B,after_2B,after_3B,runs_scored,outs_before,outs_after,half_inning_int
0,634642,top,1,0,2021-04-01 17:10:30.233000+00:00,Marcus Semien,Gerrit Cole,R,R,"Marcus Semien grounds out, shortstop Gleyber T...",...,0,0,0,0,0,0,0,0,1,0
1,634642,top,1,1,2021-04-01 17:11:22.056000+00:00,Cavan Biggio,Gerrit Cole,L,R,Cavan Biggio strikes out swinging.,...,0,0,0,0,0,0,0,1,2,0
2,634642,top,1,2,2021-04-01 17:12:36.595000+00:00,Bo Bichette,Gerrit Cole,R,R,"Bo Bichette grounds out sharply, second basema...",...,0,0,0,0,0,0,0,2,3,0
3,634642,bottom,1,3,2021-04-01 17:16:25.939000+00:00,DJ LeMahieu,Hyun Jin Ryu,R,L,DJ LeMahieu grounds out to first baseman Vladi...,...,0,0,0,0,0,0,0,0,1,1
4,634642,bottom,1,4,2021-04-01 17:18:20.009000+00:00,Aaron Judge,Hyun Jin Ryu,R,L,Aaron Judge strikes out swinging.,...,0,0,0,0,0,0,0,1,2,1


In [87]:
#Concatination if needed
frames = []
for year in range(2016, 2026):
    df = pd.read_parquet(f"{year}-reg-allPlays.parquet")
    df['season'] = year
    frames.append(df)


allPlays1625 = pd.concat(frames, ignore_index=True)
allPlays1625.to_parquet("allPlays1625.parquet")