In [1]:
from collections import defaultdict
import os
from datetime import datetime
import pandas as pd
from Helpers import add_runner_states, add_game_state, add_runs_remaining, calculate_zero_run_probabilities  # assumes same helpers as Colab

In [2]:
def calculate_zero_run_probabilities(df):
    """
    Tally, per GameState, the rows of ``df`` whose RunsRemaining equals 0.

    Despite the name, this returns raw counts rather than probabilities.
    GameStates with no zero-run rows are absent from the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'GameState' and 'RunsRemaining'.

    Returns
    -------
    dict
        Mapping of the form { GameState: {'ZeroRunsCount': int} }.
    """
    # Select the GameState label of every row where no further runs were scored.
    scoreless_states = df.loc[df['RunsRemaining'] == 0, 'GameState']
    occurrences = scoreless_states.value_counts().to_dict()

    result = {}
    for game_state, n_zero in occurrences.items():
        result[game_state] = {"ZeroRunsCount": n_zero}
    return result


def build_gamestate_summary_all_years(data_root, save_path):
    """
    Aggregate per-GameState run statistics over every game CSV found under
    ``data_root`` and write the combined table to a timestamped CSV file.

    The directory tree is walked as ``data_root/<year>/<month>/<day>/CSV/``.
    Files are ignored when their name contains "_unverified" or
    "playerpositioning", when they do not end in ".csv", or when they lack
    any of the required columns. A file that raises while being read or
    processed is reported on stdout and skipped; it contributes nothing to
    the totals.

    Parameters
    ----------
    data_root : str
        Root directory containing one sub-directory per year.
    save_path : str
        Directory in which the summary CSV is written (created if missing).

    Returns
    -------
    pandas.DataFrame
        One row per GameState with the columns GameState, Count,
        TotalRunsRemaining, ExpectedRuns, ZeroRunsCount and
        ZeroRunProbability. Empty (but with those columns) when no file
        contributed any rows.
    """
    required = {'Inning', 'Top/Bottom', 'Outs', 'Balls', 'Strikes', 'RunsScored', 'PlayResult'}
    output_columns = [
        "GameState", "Count", "TotalRunsRemaining",
        "ExpectedRuns", "ZeroRunsCount", "ZeroRunProbability",
    ]

    summary = defaultdict(lambda: {
        "Count": 0,
        "TotalRunsRemaining": 0,
        "ZeroRunsCount": 0  # ✅ Tracks zero-run events per GameState
    })
    skipped_files = 0

    # 🔍 Loop through all years
    years = sorted(y for y in os.listdir(data_root) if os.path.isdir(os.path.join(data_root, y)))

    for year in years:
        year_path = os.path.join(data_root, year)
        months = sorted(m for m in os.listdir(year_path) if os.path.isdir(os.path.join(year_path, m)))

        for month in months:
            month_path = os.path.join(year_path, month)
            print(f"\n📅 Processing {year}-{month} from local files ...")

            # Sorted so the processing order (and therefore the row order of
            # the output) does not depend on the filesystem's listing order.
            for day in sorted(os.listdir(month_path)):
                day_csv_path = os.path.join(month_path, day, "CSV")
                # isdir (not exists): a stray file named "CSV" must not reach listdir.
                if not os.path.isdir(day_csv_path):
                    continue

                for file in sorted(os.listdir(day_csv_path)):
                    if "_unverified" in file or "playerpositioning" in file or not file.endswith(".csv"):
                        continue

                    file_path = os.path.join(day_csv_path, file)

                    try:
                        df = pd.read_csv(file_path)
                        if not required.issubset(df.columns):
                            continue

                        # ✅ Keep only rows with Inning < 9
                        # NOTE(review): this drops the 9th inning as well as extras — confirm intended.
                        df = df[df['Inning'] < 9]
                        if df.empty:
                            continue

                        # ✅ Add runner and game state logic
                        df = add_runner_states(df)
                        df = add_game_state(df)
                        df = add_runs_remaining(df)

                        # ✅ Drop rows whose Outs/Balls/Strikes are out of range
                        df = df[(df['Outs'] <= 2) & (df['Balls'] <= 3) & (df['Strikes'] <= 2)]

                        # ✅ Per-file aggregates; merged into the running totals
                        # only after everything for this file has succeeded.
                        agg = df.groupby('GameState')['RunsRemaining'].agg(['count', 'sum'])
                        zero_stats = calculate_zero_run_probabilities(df)

                    except Exception as exc:
                        # Best-effort: one bad file must not abort the run, but
                        # it must not disappear silently either.
                        skipped_files += 1
                        print(f"⚠️ Skipping {file_path}: {type(exc).__name__}: {exc}")
                        continue

                    # itertuples keeps each column's own dtype (iterrows would
                    # upcast the integer count to float when the sum is float).
                    for state, count, total in agg.itertuples(name=None):
                        summary[state]["Count"] += count
                        summary[state]["TotalRunsRemaining"] += total

                    for state, val in zero_stats.items():
                        summary[state]["ZeroRunsCount"] += val["ZeroRunsCount"]

    if skipped_files:
        print(f"\n⚠️ {skipped_files} file(s) skipped because of errors (see messages above)")

    # 💾 Convert dictionary summary → DataFrame
    # Explicit columns keep the schema stable even when nothing was aggregated.
    combined_df = pd.DataFrame(
        [
            {
                "GameState": s,
                "Count": d["Count"],
                "TotalRunsRemaining": d["TotalRunsRemaining"],
                "ExpectedRuns": d["TotalRunsRemaining"] / d["Count"] if d["Count"] else 0,
                "ZeroRunsCount": d["ZeroRunsCount"],
                "ZeroRunProbability": d["ZeroRunsCount"] / d["Count"] if d["Count"] else 0
            }
            for s, d in summary.items()
        ],
        columns=output_columns,
    )

    # 💾 Save summary
    os.makedirs(save_path, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M")
    filename = f"GameState_Summary_ALL_prob_{timestamp}.csv"
    summary_path = os.path.join(save_path, filename)
    combined_df.to_csv(summary_path, index=False)

    print(f"\n✅ All-year summary saved: {summary_path}")
    return combined_df

In [3]:
# Resolve the project folder relative to the current user's home directory
# instead of hard-coding one machine's absolute path.
project_dir = os.path.expanduser("~/Downloads/Baseball_Project")
data_root = os.path.join(project_dir, "v3")         # input tree: <year>/<month>/<day>/CSV/*.csv
save_path = os.path.join(project_dir, "CSV_files")  # directory the summary CSV is written to

summary_df = build_gamestate_summary_all_years(data_root, save_path)
summary_df.head()


📅 Processing 2024-02 from local files ...

📅 Processing 2024-03 from local files ...

📅 Processing 2024-04 from local files ...

📅 Processing 2024-05 from local files ...

📅 Processing 2024-06 from local files ...

📅 Processing 2024-07 from local files ...

📅 Processing 2024-08 from local files ...

📅 Processing 2024-09 from local files ...

📅 Processing 2024-10 from local files ...

📅 Processing 2024-11 from local files ...

📅 Processing 2024-12 from local files ...

📅 Processing 2025-01 from local files ...

📅 Processing 2025-02 from local files ...

📅 Processing 2025-03 from local files ...

📅 Processing 2025-04 from local files ...

📅 Processing 2025-05 from local files ...

📅 Processing 2025-06 from local files ...

📅 Processing 2025-07 from local files ...

📅 Processing 2025-08 from local files ...

📅 Processing 2025-09 from local files ...

📅 Processing 2025-10 from local files ...

✅ All-year summary saved: /User

Unnamed: 0,GameState,Count,TotalRunsRemaining,ExpectedRuns,ZeroRunsCount,ZeroRunProbability
0,000-O0-B0-S0,216461,173712,0.802509,137520,0.635311
1,000-O0-B0-S1,98943,72089,0.728591,65494,0.661937
2,000-O0-B0-S2,48433,30189,0.623315,33994,0.701877
3,000-O0-B1-S0,87406,76687,0.877365,53423,0.611205
4,000-O0-B1-S1,81918,63986,0.781098,52830,0.644913
