**Data Cleaning**
**Horizontally Add FIFA Data year by year**

In [28]:
import pandas as pd
import openpyxl
# Directory holding the raw CSV tables. Every read_csv call in this notebook and
# the final to_csv build their paths from it. The path is relative, so it is
# resolved against the working directory the notebook is run from.
RAW_DATA_DIR = '../data/raw'

In [29]:
# Load the standard and shooting tables. dtype=str keeps every column as text,
# so numeric columns have to be cast explicitly wherever they are used.
standard_csv = f'{RAW_DATA_DIR}/standard_stats.csv'
shooting_csv = f'{RAW_DATA_DIR}/shooting_stats.csv'
df_standard = pd.read_csv(standard_csv, dtype=str)
df_shooting = pd.read_csv(shooting_csv, dtype=str)

In [30]:

# Rename four shooting columns and discard the shooting table's expected-goal
# columns. NOTE(review): the new names presumably mirror the standard table so
# that these columns are shared by both frames in the merge - confirm against
# standard_stats.csv.
shooting_renames = {
    '90s': 'Playing Time_90s',
    'Standard_Gls': 'Performance_Gls',
    'Standard_PK': 'Performance_PK',
    'Standard_PKatt': 'Performance_PKatt',
}
shooting_expected_cols = ['Expected_xG', 'Expected_npxG', 'Expected_G-xG', 'Expected_np:G-xG']
df_shooting = (
    df_shooting
    .rename(columns=shooting_renames)
    .drop(columns=shooting_expected_cols)
)

In [31]:
# Left-merge the standard table with the shooting table (standard rows are kept).
# No `on=` is passed, so pandas joins on every column name the two frames share.
df_merged = pd.merge(df_standard, df_shooting, how='left')

# Drop the totals / expected-goal columns that are not carried forward.
df_merged = df_merged.drop(columns=['Performance_Gls', 'Performance_Ast', 'Performance_G+A', 'Performance_G-PK', 'Performance_PK',  'Standard_FK', 'Expected_xG', 'Expected_npxG', 'Expected_xAG', 'Expected_npxG+xAG', 'Standard_SoT', 'Standard_Sh'])

# Season totals that are converted to per-90 rates, mapped to their new names.
per90_standard = {
    'Performance_PKatt': 'Per 90 Minutes_PKatt',
    'Performance_CrdY': 'Per 90 Minutes_CrdY',
    'Performance_CrdR': 'Per 90 Minutes_CrdR',
    'Progression_PrgC': 'Per 90 Minutes_PrgC',
    'Progression_PrgP': 'Per 90 Minutes_PrgP',
    'Progression_PrgR': 'Per 90 Minutes_PrgR',
}

# The columns were read as text, so cast before dividing. A denominator of 0
# would turn the rate into inf (or NaN for 0/0); mask zeros to NaN so the rate
# is consistently "missing" instead of infinite.
nineties_played = df_merged['Playing Time_90s'].astype(float)
nineties_played = nineties_played.where(nineties_played != 0)
for total_col in per90_standard:
    df_merged[total_col] = df_merged[total_col].astype(float) / nineties_played

# Rename in one step and rebind instead of mutating in place.
df_merged = df_merged.rename(columns=per90_standard)
#df_merged.tail(10)

In [32]:
# Function for removing unwanted seasons (by default 1415, 1516, 1617)
def remove_early_seasons(df, seasons_to_remove=('1415', '1516', '1617')):
    """Return the rows of ``df`` whose ``season`` is not in ``seasons_to_remove``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``season`` column. Values are matched with ``isin``, so
        they must have the same type as the entries of ``seasons_to_remove``
        (strings by default).
    seasons_to_remove : iterable, optional
        Season codes to drop. Defaults to ``'1415'``, ``'1516'`` and ``'1617'``.

    Returns
    -------
    pandas.DataFrame
        The remaining rows, with their original index labels, as an
        independent copy. Callers add columns to the result, and an explicit
        copy keeps those writes unambiguous instead of targeting a slice of
        ``df``. ``df`` itself is never modified.

    Raises
    ------
    KeyError
        If ``df`` has no ``season`` column.
    """
    keep_mask = ~df['season'].isin(list(seasons_to_remove))
    return df.loc[keep_mask].copy()
# Apply the season filter to the merged standard + shooting table.
df_merged = df_merged.pipe(remove_early_seasons)
#df_merged.head(10)

In [33]:
# Load the passing table, apply the season filter and drop the columns that are
# not carried forward.
df_passing = pd.read_csv(f'{RAW_DATA_DIR}/passing_stats.csv', dtype=str)
df_passing = remove_early_seasons(df_passing)
df_passing = df_passing.drop(columns=['Ast', 'xAG', '90s', 'PrgP'])

# Passing totals that are converted to per-90 rates.
per90 = ['Total_Cmp', 'Total_Att', 'Total_TotDist', 'Total_PrgDist', 'Short_Cmp', 'Short_Att', 'Medium_Cmp', 'Medium_Att', 'Long_Cmp', 'Long_Att', 'Expected_xA', 'Expected_A-xAG', 'KP', '1/3', 'PPA', 'CrsPA']

# Fetch each row's minutes from df_merged by key, not by index label. A plain
# `df_passing[...] = df_merged[...]` aligns on the index, which is only correct
# if both frames happen to list the same rows in the same order. The key is the
# set of columns both frames share - the same implicit key the later
# `pd.merge(df_merged, df_passing, how='left')` joins on.
passing_keys = [c for c in df_passing.columns
                if c in df_merged.columns and c not in per90 and c != 'Playing Time_Min']
if not passing_keys:
    raise ValueError('df_passing shares no identifying columns with df_merged')
# One minutes value per key, so the lookup can never multiply passing rows.
passing_minutes = (
    df_merged[passing_keys + ['Playing Time_Min']]
    .drop_duplicates(subset=passing_keys)
)
df_passing = (
    df_passing
    .drop(columns=['Playing Time_Min'], errors='ignore')
    .merge(passing_minutes, on=passing_keys, how='left')
)
df_passing['Playing Time_Min'] = df_passing['Playing Time_Min'].astype(float)

# total / minutes * 90 -> per-90 rate; rows without a minutes match become NaN.
for col in per90:
    df_passing[col] = df_passing[col].astype(float) / df_passing['Playing Time_Min'] * 90
df_passing = df_passing.rename(columns={name: f'Per 90 Minutes_{name}' for name in per90})
#df_passing.head(10)

In [34]:
# Remove the helper minutes column before joining. df_merged already has its own
# 'Playing Time_Min', so leaving it here would make it one more shared column
# for the merge to join on.
df_passing = df_passing.drop(columns='Playing Time_Min')
# Left-join the passing columns onto the running table; with no `on=` the join
# uses every column name the two frames share.
df_merged = df_merged.merge(df_passing, how='left')
#df_merged.head(10)

In [35]:
# Merged general, shooting, passing, creation, and defensive
#TODO: merge defensive DONE
#TODO: merge keeper, adv keeper; implement keeper features in modeling

In [36]:
# Load the shot/goal-creation table, apply the season filter and drop the
# columns that are not carried forward.
df_creation = pd.read_csv(f'{RAW_DATA_DIR}/creation_stats.csv', dtype=str)
df_creation = remove_early_seasons(df_creation)
df_creation = df_creation.drop(columns=['SCA_SCA', 'GCA_GCA', '90s'])

# Creation totals that are converted to per-90 rates.
per90_creation = ['SCA Types_PassLive', 'SCA Types_PassDead', 'SCA Types_TO', 'SCA Types_Sh', 'SCA Types_Fld', 'SCA Types_Def', 'GCA Types_PassLive', 'GCA Types_PassDead', 'GCA Types_TO', 'GCA Types_Sh', 'GCA Types_Fld', 'GCA Types_Def']

# Fetch each row's minutes from df_merged by key, not by index label. df_merged
# comes out of a merge and so carries a fresh 0..n-1 index, while df_creation
# keeps the row labels of its filtered CSV; an index-aligned assignment would
# pair rows with another row's minutes. The key is the set of columns both
# frames share - the same implicit key the later
# `pd.merge(df_merged, df_creation, how='left')` joins on.
creation_keys = [c for c in df_creation.columns
                 if c in df_merged.columns and c not in per90_creation and c != 'Playing Time_Min']
if not creation_keys:
    raise ValueError('df_creation shares no identifying columns with df_merged')
# One minutes value per key, so the lookup can never multiply creation rows.
creation_minutes = (
    df_merged[creation_keys + ['Playing Time_Min']]
    .drop_duplicates(subset=creation_keys)
)
df_creation = (
    df_creation
    .drop(columns=['Playing Time_Min'], errors='ignore')
    .merge(creation_minutes, on=creation_keys, how='left')
)
df_creation['Playing Time_Min'] = df_creation['Playing Time_Min'].astype(float)

# total / minutes * 90 -> per-90 rate; rows without a minutes match become NaN.
for col in per90_creation:
    df_creation[col] = df_creation[col].astype(float) / df_creation['Playing Time_Min'] * 90
df_creation = df_creation.rename(columns={name: f'Per 90 Minutes_{name}' for name in per90_creation})
#df_creation.head(10)

In [37]:
# Remove the helper minutes column, then left-join the creation columns onto the
# running table (no `on=`: the join uses every column name both frames share).
df_creation = df_creation.drop(columns='Playing Time_Min')
df_merged = df_merged.merge(df_creation, how='left')
#df_merged.head(10)

In [38]:
# Load the defensive table, apply the season filter and drop the 90s column.
df_defensive = pd.read_csv(f'{RAW_DATA_DIR}/defensive_stats.csv', dtype=str)
df_defensive = remove_early_seasons(df_defensive)
df_defensive = df_defensive.drop(columns=['90s'])

# Defensive totals that are converted to per-90 rates.
per90_defensive = ['Tackles_Tkl', 'Tackles_TklW', 'Tackles_Def 3rd', 'Tackles_Mid 3rd', 'Tackles_Att 3rd', 'Challenges_Tkl', 'Challenges_Att', 'Challenges_Lost', 'Blocks_Blocks', 'Blocks_Sh', 'Blocks_Pass', 'Int', 'Tkl+Int', 'Clr', 'Err']

# Fetch each row's minutes from df_merged by key, not by index label. df_merged
# comes out of a merge and so carries a fresh 0..n-1 index, while df_defensive
# keeps the row labels of its filtered CSV; an index-aligned assignment would
# pair rows with another row's minutes. The key is the set of columns both
# frames share - the same implicit key the later
# `pd.merge(df_merged, df_defensive, how='left')` joins on.
defensive_keys = [c for c in df_defensive.columns
                  if c in df_merged.columns and c not in per90_defensive and c != 'Playing Time_Min']
if not defensive_keys:
    raise ValueError('df_defensive shares no identifying columns with df_merged')
# One minutes value per key, so the lookup can never multiply defensive rows.
defensive_minutes = (
    df_merged[defensive_keys + ['Playing Time_Min']]
    .drop_duplicates(subset=defensive_keys)
)
df_defensive = (
    df_defensive
    .drop(columns=['Playing Time_Min'], errors='ignore')
    .merge(defensive_minutes, on=defensive_keys, how='left')
)
df_defensive['Playing Time_Min'] = df_defensive['Playing Time_Min'].astype(float)

# total / minutes * 90 -> per-90 rate; rows without a minutes match become NaN.
for col in per90_defensive:
    df_defensive[col] = df_defensive[col].astype(float) / df_defensive['Playing Time_Min'] * 90
df_defensive = df_defensive.rename(columns={name: f'Per 90 Minutes_{name}' for name in per90_defensive})

# The minutes column was only a denominator; df_merged already has its own.
df_defensive = df_defensive.drop(columns=['Playing Time_Min'])
#df_defensive.head(10)

In [39]:
# Left-join the defensive columns onto the running table (no `on=`: the join
# uses every column name both frames share).
df_merged = df_merged.merge(df_defensive, how='left')
#df_merged.head(10)

In [40]:
# Load both goalkeeper tables as text and apply the season filter to each.
# NOTE(review): df_adv_keeper is loaded and filtered here but is not used again
# in the cells visible in this notebook - see the open TODO about merging the
# keeper tables.
keeper_csv = f'{RAW_DATA_DIR}/keeper_stats.csv'
adv_keeper_csv = f'{RAW_DATA_DIR}/keeper__adv_stats.csv'
df_keeper = pd.read_csv(keeper_csv, dtype=str)
df_adv_keeper = pd.read_csv(adv_keeper_csv, dtype=str)
df_adv_keeper = remove_early_seasons(df_adv_keeper)
df_keeper = remove_early_seasons(df_keeper)

# Drop the keeper columns that are not carried forward.
keeper_unused_cols = ['Performance_GA', 'Playing Time_MP', 'Playing Time_90s']
df_keeper = df_keeper.drop(columns=keeper_unused_cols)

# Keeper totals that are converted to per-90 rates using the keeper table's own
# minutes column (the column itself stays as text in the frame).
per90_keeper = ['Performance_Saves', 'Performance_SoTA', 'Performance_W', 'Performance_D', 'Performance_L', 'Performance_CS', 'Penalty Kicks_PKatt', 'Penalty Kicks_PKA', 'Penalty Kicks_PKsv', 'Penalty Kicks_PKm']
keeper_minutes = df_keeper['Playing Time_Min'].astype(float)
for col in per90_keeper:
    df_keeper[col] = df_keeper[col].astype(float) / keeper_minutes * 90
df_keeper = df_keeper.rename(columns={name: f'Per 90 Minutes_{name}' for name in per90_keeper})
#df_keeper.head(10)

In [41]:
# Left-join the keeper columns onto the running table (no `on=`: the join uses
# every column name both frames share).
df_merged = df_merged.merge(df_keeper, how='left')
#df_merged.head(10)

In [42]:
# df_merged is now the final merged table of real-world ("IRL") stats; write it
# out as a single CSV without the index column.
merged_csv_path = f'{RAW_DATA_DIR}/fbref_merged_stats.csv'
df_merged.to_csv(merged_csv_path, index=False)