In [73]:
import pandas as pd
import numpy as np
import os

In [74]:
# Root folder of the season's CSV exports; every path below is built from it.
path_folder = "/Users/matteolemesre/Desktop/Data LOSC/csv/csv24_25/"

# Input file read by the loading cell.
path_start = os.path.join(path_folder, "clean/data_teams.csv")

# Output files: aggregated totals, per-match values and centiles.
(
    path_end_aggregated_data,
    path_end_adjusted_data,
    path_end_centiles,
) = (
    os.path.join(path_folder, relative_path)
    for relative_path in (
        "centiles/data_teams_aggregated.csv",
        "centiles/data_teams_adjusted.csv",
        "centiles/data_teams_centiles.csv",
    )
)

In [75]:
# Read the cleaned team data, then discard every column whose name
# contains 'Opponent'.
raw_teams = pd.read_csv(path_start)
opponent_columns = [name for name in raw_teams.columns if 'Opponent' in name]
df = raw_teams.drop(columns=opponent_columns, errors='ignore')

In [76]:
def aggregate_column(col):
    """Collapse one column of a group of rows into a single value.

    Intended as the callable passed to ``groupby(...).agg``: it receives one
    Series per (group, column) pair.

    Parameters
    ----------
    col : pandas.Series
        Values of a single column for one group; ``col.name`` is the column
        name.

    Returns
    -------
    str or scalar
        * column named 'League': the distinct values (cast to ``str``),
          sorted and joined with ``', '``;
        * numeric column (integers, floats, including pandas nullable
          dtypes): the sum of the values;
        * any other column (text, booleans, categories, ...): ``''``.
    """
    if col.name == 'League':
        return ', '.join(sorted(set(col.astype(str))))

    # np.issubdtype() raises TypeError on pandas extension dtypes (nullable
    # Int64/Float64, string, category), so rely on the pandas dtype predicates.
    # Booleans are excluded explicitly: is_numeric_dtype() counts them as
    # numeric, but this function has always blanked them.
    is_summable = (
        pd.api.types.is_numeric_dtype(col.dtype)
        and not pd.api.types.is_bool_dtype(col.dtype)
    )
    if is_summable:
        return col.sum()
    return ''

# One row per team: every column is collapsed by aggregate_column.
per_team = df.groupby(['Team'])
aggregated_df = per_team.agg(aggregate_column).reset_index()

# Number of rows each team contributed. Both group-bys sort by 'Team',
# so the counts line up positionally with the aggregated rows.
matches_per_team = df.groupby('Team').size()
aggregated_df['Matchs Played'] = matches_per_team.values

# Identification columns first, everything else in its existing order.
leading_columns = ['Team', 'Matchs Played', 'League']
remaining_columns = [
    name for name in aggregated_df.columns if name not in leading_columns
]
aggregated_df = aggregated_df[leading_columns + remaining_columns]

# Remove these columns when they are present.
aggregated_df = aggregated_df.drop(
    columns=['Shirt Number', 'Home', 'Away'], errors='ignore'
)

# Round every numeric column to 2 decimals.
numeric_names = aggregated_df.select_dtypes(include=[np.number]).columns
for numeric_name in numeric_names:
    aggregated_df[numeric_name] = aggregated_df[numeric_name].round(2)

In [77]:
# Per-match table: start from the aggregated totals and divide every numeric
# statistic (all numeric columns except 'Matchs Played') by the match count.
indices_df = aggregated_df.copy()
numeric_names = indices_df.select_dtypes(include=[np.number]).columns
stats_cols = numeric_names.drop(['Matchs Played'])

matches_played = indices_df['Matchs Played']
for stat_name in stats_cols:
    indices_df[stat_name] = indices_df[stat_name] / matches_played

# Round every numeric column to 2 decimals.
for numeric_name in indices_df.select_dtypes(include=[np.number]).columns:
    indices_df[numeric_name] = indices_df[numeric_name].round(2)

In [78]:
# Columns handled as "lower is better": the centile step reverses the
# percentile for every name listed here.
inverse_metrics = [
    "Errors Leading to Shot", "Miscontrols", "Dispossessed", "Ball Losses",
    "Passes Blocked", "Passes Offside", "Challenges Lost",
    "Yellow Cards", "Red Cards", "Fouls Committed", "Offsides",
    "Penalty Kicks Conceded", "Own Goals", "Aerials Lost",
    "Goals Against", "Post-Shot Expected Goals (PSxG)", "Crosses Faced",
]


In [79]:
# Centile table: each per-match statistic is replaced by its percentile rank
# (0-100) among all teams.
centiles_df = indices_df.copy()

# NOTE(review): the values ranked here were already rounded to 2 decimals when
# indices_df was built, so teams that differ only beyond the second decimal
# are ranked as ties.
for col in stats_cols:
    # "Lower is better" metrics are ranked in descending order so that the
    # smallest value gets the highest centile. Computing 100 - ascending
    # centile instead would put those metrics on a 0..(100 - 100/n) scale
    # while every other metric runs 100/n..100; ranking in the opposite
    # direction keeps one common scale (best team = 100 for every column).
    lower_is_better = col in inverse_metrics
    centiles_df[col] = (
        centiles_df[col].rank(pct=True, ascending=not lower_is_better) * 100
    )

# Round every numeric column to 2 decimals.
for col in centiles_df.select_dtypes(include=[np.number]).columns:
    centiles_df[col] = centiles_df[col].round(2)


In [80]:
# Remove the 'Game Week' column (when present) from all three tables before
# they are exported.
aggregated_df = aggregated_df.drop(columns=['Game Week'], errors='ignore')
indices_df = indices_df.drop(columns=['Game Week'], errors='ignore')
centiles_df = centiles_df.drop(columns=['Game Week'], errors='ignore')

# Write each table to its CSV file, without the index column.
for frame, destination in (
    (aggregated_df, path_end_aggregated_data),
    (indices_df, path_end_adjusted_data),
    (centiles_df, path_end_centiles),
):
    # DataFrame.to_csv does not create missing parent folders and fails when
    # the target directory is absent, so make sure it exists first.
    parent_dir = os.path.dirname(destination)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    frame.to_csv(destination, index=False)