In [1]:
import pandas as pd
import pickle

In [2]:
def add_stats(player_dfs) -> dict:
    """
    Adds additional derived statistics to the player dataframes contained in the player_dfs dictionary.
    Computes and adds WoBA, K%, BB%, ISO, and wRC+ to each player's dataframe.
    Equations sourced from https://www.fangraphs.com
    """
    
    for player_id, player_df in player_dfs.items():
        player_df = player_df.copy()
        player_df['wOBA'] = (0.696 * (player_df['BB'] - player_df['IBB']) + 0.726 * player_df['HBP'] + 0.883 * player_df['H'] + 1.244 * player_df['2B'] + 1.569 * player_df['3B'] + 2.004 * player_df['HR']) / (player_df['PA'] - player_df['IBB'])
        player_df['K%'] = player_df['SO'] / player_df['PA']
        player_df['BB%'] = player_df['BB'] / player_df['PA']
        player_df['ISO'] = (player_df['2B'] + 2 * player_df['3B'] + 3 * player_df['HR']) / player_df['AB']
        player_df['wRC+'] = ((player_df['wOBA'] - 0.320) / 1.25 + 0.100) * 100
        player_dfs[player_id] = player_df
    
    return player_dfs

In [3]:
days = 186
year = '2023'
qualifying_pa = 503

# Set start and end dates for the seasons
start_dates = ["2023-03-30", "2022-04-07", "2021-04-01"]
end_dates = ["2023-10-01", "2022-10-05", "2021-10-03"]

In [4]:
# Get stats from the last game of the season to get the qualifying players
season_stats = pd.read_csv(f'./{year} Stats/stats_{year}_{days-1}.csv')

In [5]:
# Get the players who have enough plate appearances to qualify based on fangraphs
season_stats_qual = season_stats[season_stats['PA'] >= qualifying_pa]
qual_ids = season_stats_qual['mlbID']

In [6]:
player_dfs = {}

for day in range(0, days):
    # Load csv of league wide stats through the day
    date_stats = pd.read_csv(f'./2023 Stats/stats_{year}_{day}.csv')
    
    # Filter for players that met the PA threshold (from Fangraphs)
    date_stats_qual = date_stats[date_stats['mlbID'].isin(qual_ids)].copy()

    # Add new column for day
    date_stats_qual['Day'] = day

    # Loop over each player in dataframe
    for player_id in date_stats_qual['mlbID'].unique():
        # Filter for player
        player_stats = date_stats_qual[date_stats_qual['mlbID'] == player_id]

        # If player already has dataframe append the new stats
        if player_id in player_dfs:
            player_dfs[player_id] = pd.concat([player_dfs[player_id], player_stats])
        else:
            player_dfs[player_id] = player_stats


In [7]:
columns_to_remove = ['\xa0', 'Opp']
for player_id, df in player_dfs.items():
    # Remove columns that have the @ symbol and the Opp column b/c these are only defined for the first couple games
    if all(col in df.columns for col in columns_to_remove):
        player_dfs[player_id] = df.drop(columns=columns_to_remove)
    
    # Make the Date column defined as the first day of the season + the day number
    player_dfs[player_id]['Date'] = pd.to_datetime(player_dfs[player_id]['Day'], origin=f'{start_dates[0]}', unit='D')

    # For rows that have the same G as another row, keep only first row with that unique G
    player_dfs[player_id] = player_dfs[player_id].drop_duplicates(subset='G', keep='first')


In [8]:
player_dfs = add_stats(player_dfs)

In [9]:
pickle.dump(player_dfs, open(f'./{year}_player_dfs.pkl', 'wb'))

In [11]:
player_dfs[641355]

Unnamed: 0,Name,Age,#days,Lev,Date,Tm,G,PA,AB,R,...,OBP,SLG,OPS,mlbID,Day,wOBA,K%,BB%,ISO,wRC+
24,Cody Bellinger,27,311,Maj-NL,2023-03-30,Chicago,1,4,3,0,...,0.250,0.000,0.250,641355,0,0.172500,0.250000,0.250000,0.000000,-1.800000
28,Cody Bellinger,27,309,Maj-NL,2023-04-01,Chicago,2,8,7,0,...,0.125,0.000,0.125,641355,2,0.086250,0.375000,0.125000,0.000000,-8.700000
29,Cody Bellinger,27,308,Maj-NL,2023-04-02,Chicago,3,12,11,0,...,0.083,0.000,0.083,641355,3,0.057500,0.333333,0.083333,0.000000,-11.000000
29,Cody Bellinger,27,307,Maj-NL,2023-04-03,Chicago,4,17,15,1,...,0.177,0.267,0.443,641355,4,0.257059,0.235294,0.117647,0.200000,4.964706
30,Cody Bellinger,27,306,Maj-NL,2023-04-04,Chicago,5,23,20,2,...,0.304,0.350,0.654,641355,5,0.336087,0.173913,0.130435,0.150000,11.286957
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55,Cody Bellinger,27,131,Maj-NL,2023-09-26,Chicago,126,538,482,93,...,0.359,0.533,0.892,641355,180,0.472579,0.156134,0.072491,0.224066,22.206355
55,Cody Bellinger,27,130,Maj-NL,2023-09-27,Chicago,127,543,487,94,...,0.359,0.532,0.891,641355,181,0.471500,0.156538,0.071823,0.221766,22.120000
55,Cody Bellinger,27,129,Maj-NL,2023-09-28,Chicago,128,547,491,94,...,0.358,0.530,0.888,641355,182,0.469669,0.157221,0.071298,0.219959,21.973529
55,Cody Bellinger,27,128,Maj-NL,2023-09-29,Chicago,129,551,495,94,...,0.356,0.525,0.881,641355,183,0.466241,0.157895,0.070780,0.218182,21.699270
