In [51]:
import pandas as pd
import os

In [52]:
# Root data folder. It defaults to the original absolute location but can be
# overridden with the LOSC_DATA_DIR environment variable, so the notebook can
# run on another machine without editing this cell.
path_folder = os.environ.get(
    "LOSC_DATA_DIR",
    "/Users/matteolemesre/Desktop/Data LOSC/csv/csv24_25/",
)
# Input CSV read by the loading cell.
path_start  = os.path.join(path_folder, "clean/data_goals.csv")
# Output CSVs, one per processing stage (aggregated, per-90 adjusted, centiles).
path_end_aggregated_data = os.path.join(path_folder, "centiles/data_goals_aggregated.csv")
path_end_adjusted_data   = os.path.join(path_folder, "centiles/data_goals_adjusted.csv")
path_end_centiles        = os.path.join(path_folder, "centiles/data_goals_centiles.csv")

In [53]:
# Load the input CSV located at path_start; its first column is used as the
# DataFrame index rather than as a regular column.
data = pd.read_csv(path_start, index_col=0)

In [54]:
def aggregated_data_goals(data):
    """Aggregate the rows of ``data`` into a single row per player.

    Rows are grouped by "Player". For each player:

    * "Team" and "League" become the sorted distinct values joined with " / "
      (missing values are ignored; a player with no value gets "").
    * "Nationality" keeps the first value of the group.
    * "Minutes" and every statistic column are summed.
    * "Matches" (number of non-null "Game Week" entries) is inserted right
      before "Minutes".
    * The three "(Total)" columns are turned into per-attempt averages
      ("Pass Length", "Goal Kick Length", "Distance of Defensive Actions")
      and then dropped. A zero attempt count gives NaN instead of +/-inf.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain "Player", "Game Week" and every column aggregated below.

    Returns
    -------
    pandas.DataFrame
        One row per player (sorted by player, as produced by ``groupby``),
        with all float64/int64 columns rounded to 2 decimals.
    """
    def _join_unique(values):
        # Missing entries are dropped and the rest stringified so that a NaN
        # mixed with strings cannot break sorting or joining.
        return " / ".join(sorted(set(values.dropna().astype(str))))

    def _per_attempt(total, attempts):
        # Mask zero denominators so the average is NaN rather than +/-inf.
        return total / attempts.where(attempts != 0)

    # Order matters: it defines the column order of the aggregated frame.
    summed_columns = [
        "Minutes",

        "Shots on Target Against",
        "Goals Against",
        "Saves",
        "Post-Shot Expected Goals (PSxG)",

        "Save Efficiency",
        "Clean Sheets",

        "Completed Long Passes",
        "Attempted Long Passes",
        "Attempted Passes (excluding GK)",
        "Attempted Throws",
        "Pass Length (Total)",

        "Attempted Goal Kicks",
        "Goal Kick Length (Total)",

        "Crosses Faced",
        "Crosses Stopped",
        "Defensive Actions Outside Penalty Area",
        "Distance of Defensive Actions (Total)",
    ]
    aggregations = {
        "Team": _join_unique,
        "League": _join_unique,
        "Nationality": "first",
    }
    aggregations.update(dict.fromkeys(summed_columns, "sum"))

    by_player = data.groupby("Player")
    df_grouped = by_player.agg(aggregations).reset_index()

    # Attach the match count by player label instead of relying on the two
    # group-by results happening to share the same row order.
    matches = df_grouped["Player"].map(by_player["Game Week"].count())
    df_grouped.insert(df_grouped.columns.get_loc("Minutes"), "Matches", matches)

    df_grouped["Pass Length"] = _per_attempt(
        df_grouped["Pass Length (Total)"], df_grouped["Attempted Passes (excluding GK)"]
    )
    df_grouped["Goal Kick Length"] = _per_attempt(
        df_grouped["Goal Kick Length (Total)"], df_grouped["Attempted Goal Kicks"]
    )
    df_grouped["Distance of Defensive Actions"] = _per_attempt(
        df_grouped["Distance of Defensive Actions (Total)"],
        df_grouped["Defensive Actions Outside Penalty Area"],
    )

    # The totals were only needed to compute the averages above.
    columns_to_drop = ["Pass Length (Total)", "Goal Kick Length (Total)", "Distance of Defensive Actions (Total)"]
    df_grouped = df_grouped.drop(columns=columns_to_drop)

    for col in df_grouped.select_dtypes(include=['float64', 'int64']).columns:
        df_grouped[col] = df_grouped[col].round(2)

    return df_grouped


In [55]:
def adjusted_data_goals(data):
    """Convert the counting statistics of ``data`` to per-90-minute rates.

    Each column listed in ``stats_columns`` is replaced by
    ``value * 90 / Minutes``. All other columns (including "Minutes") are
    left as they are. Rows whose "Minutes" is 0 get NaN for the rate columns
    instead of +/-inf, so they cannot distort later rankings.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain "Minutes" and every column in ``stats_columns``.
        The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with the rate columns replaced and every
        float64/int64 column rounded to 2 decimals.
    """
    stats_per_90 = data.copy()
    stats_columns = [
        "Shots on Target Against", "Goals Against", "Saves",
        "Post-Shot Expected Goals (PSxG)", "Save Efficiency", "Clean Sheets",
        "Completed Long Passes", "Attempted Long Passes",
        "Attempted Passes (excluding GK)", "Attempted Throws",
        "Attempted Goal Kicks", "Crosses Faced", "Crosses Stopped",
        "Defensive Actions Outside Penalty Area"
    ]

    # Mask zero minutes (-> NaN) so the division below never produces +/-inf.
    minutes = stats_per_90["Minutes"].where(stats_per_90["Minutes"] != 0)

    for column in stats_columns:
        stats_per_90[column] = stats_per_90[column] * 90 / minutes

    for col in stats_per_90.select_dtypes(include=['float64', 'int64']).columns:
        stats_per_90[col] = stats_per_90[col].round(2)

    return stats_per_90

In [56]:
def centiles_data_goals(data):
    """Replace the statistics of ``data`` with percentile ranks (0-100).

    "Shots on Target Against" and "Post-Shot Expected Goals (PSxG)" are
    removed. Every column in ``ranked_columns`` is replaced by its percentile
    rank within the frame (``rank(pct=True) * 100``). "Goals Against" is
    negated before ranking, so conceding fewer goals gives a higher
    percentile. Finally every float64/int64 column is rounded to 2 decimals.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the two removed columns and every ranked column.
        The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the percentile columns.
    """
    removed_columns = ["Shots on Target Against", "Post-Shot Expected Goals (PSxG)"]
    ranked_columns = [
        "Goals Against", "Saves",
        "Save Efficiency", "Clean Sheets",
        "Completed Long Passes", "Attempted Long Passes",
        "Attempted Passes (excluding GK)", "Attempted Throws",
        "Attempted Goal Kicks", "Crosses Faced", "Crosses Stopped",
        "Defensive Actions Outside Penalty Area", "Pass Length",
        "Goal Kick Length", "Distance of Defensive Actions"
    ]
    # Columns where a smaller raw value must receive a higher percentile.
    lower_is_better = {"Goals Against"}

    result = data.copy().drop(columns=removed_columns)

    for name in ranked_columns:
        values = -result[name] if name in lower_is_better else result[name]
        result[name] = values.rank(pct=True) * 100

    numeric_columns = result.select_dtypes(include=['float64', 'int64']).columns
    for name in numeric_columns:
        result[name] = result[name].round(2)

    return result

In [57]:
# Run the three stages in order: per-player aggregation, per-90 adjustment,
# then percentile ranking. Each stage consumes the previous stage's output.
df_aggregated = aggregated_data_goals(data)
df_adjusted = adjusted_data_goals(df_aggregated)
df_centiles = centiles_data_goals(df_adjusted)

# Write each stage to its own CSV (without the index). to_csv does not create
# missing parent folders, so make sure each destination folder exists first.
for _frame, _destination in (
    (df_aggregated, path_end_aggregated_data),
    (df_adjusted, path_end_adjusted_data),
    (df_centiles, path_end_centiles),
):
    _folder = os.path.dirname(_destination)
    if _folder:
        os.makedirs(_folder, exist_ok=True)
    _frame.to_csv(_destination, index=False)