In [9]:
import pandas as pd

In [10]:
# Season being analysed; used to build the stats directory and file names
year = '2023'

# Number of daily stats files for the season (files are indexed 0 .. days - 1)
days = 186

# Minimum plate appearances a player needs to be treated as qualified
qualifying_pa = 503

# First and last day of each season, listed newest season first
start_dates = [
    "2023-03-30",
    "2022-04-07",
    "2021-04-01",
]
end_dates = [
    "2023-10-01",
    "2022-10-05",
    "2021-10-03",
]

In [11]:
# Load the stats file for the last day of the season (index days - 1);
# it is the basis for deciding which players qualify
season_stats = pd.read_csv(
    filepath_or_buffer=f'./{year} Stats/stats_{year}_{days-1}.csv',
)

In [12]:
# Keep only the rows whose PA reaches the qualifying threshold (threshold taken from Fangraphs)
season_stats_qual = season_stats.loc[season_stats['PA'].ge(qualifying_pa)]

# mlbID values of the qualified players; shown as the cell output
qual_ids = season_stats_qual['mlbID']
qual_ids

0      682928
1      547989
3      660670
4      642715
12     645277
        ...  
624    664774
627    572233
648    677951
653    592885
655    807799
Name: mlbID, Length: 133, dtype: int64

In [13]:
# Build one DataFrame per qualified player holding that player's row from every
# daily stats file, tagged with the day index the row came from.
#
# Per-day slices are collected in lists and concatenated once per player at the
# end; calling pd.concat inside the loop re-copies the accumulated frame on
# every day and is quadratic in the number of days.
daily_frames_by_player = {}

for day in range(days):
    # Load csv of league wide stats through the day.
    # The directory uses `year` (not a literal season) so it always matches the
    # file name and the file used to pick the qualifying players.
    date_stats = pd.read_csv(f'./{year} Stats/stats_{year}_{day}.csv')

    # Filter for players that met the PA threshold (from Fangraphs)
    date_stats_qual = date_stats[date_stats['mlbID'].isin(qual_ids)].copy()

    # Add new column for day
    date_stats_qual['Day'] = day

    # Split the day's rows by player; sort=False keeps first-appearance order
    for player_id, player_stats in date_stats_qual.groupby('mlbID', sort=False):
        daily_frames_by_player.setdefault(player_id, []).append(player_stats)

# One concat per player; rows stay in ascending day order because the lists
# were filled in day order. Each result is a new frame, not a view of a daily one.
player_dfs = {
    player_id: pd.concat(frames)
    for player_id, frames in daily_frames_by_player.items()
}


In [14]:
# Columns holding the '@' marker and the opponent; these are only defined for
# the first couple of games, so they are removed from every player frame.
columns_to_remove = ['\xa0', 'Opp']

# Pick the season start date that belongs to `year` instead of relying on its
# position in `start_dates`, so changing `year` cannot silently use the wrong origin.
season_start = next((d for d in start_dates if d.startswith(str(year))), None)
if season_start is None:
    raise ValueError(f'No start date configured for season {year}')

for player_id, df in player_dfs.items():
    # Re-assigning existing keys while iterating is safe: the dict size never changes.
    player_dfs[player_id] = (
        df
        # errors='ignore' drops whichever of the columns exist; a frame that
        # has only one of them is still cleaned
        .drop(columns=columns_to_remove, errors='ignore')
        # Date = first day of the season + the day number
        .assign(Date=lambda frame: pd.to_datetime(frame['Day'], origin=season_start, unit='D'))
        # For rows that share the same G, keep only the first row with that G
        .drop_duplicates(subset='G', keep='first')
    )
