In [5]:
import pandas as pd
import numpy as np
import os

# --- CONFIGURATION ---
# Component weights for the final score, tuned so that shot-stopping
# efficiency carries most of the weight.
WEIGHTS = dict(
    Shot_Stopping=0.70,
    Distribution=0.15,
    Aeriel_Control=0.15,  # spelling kept as-is: the scoring code looks up this exact key
)

# Clubs that appear under two spellings, as (one spelling, other spelling).
_TEAM_NAME_PAIRS = (
    ('Brighton', 'Brighton & Hove Albion'),
    ('Manchester Utd', 'Manchester United'),
    ('Newcastle Utd', 'Newcastle United'),
    ("Nott'ham Forest", 'Nottingham Forest'),
    ('Tottenham', 'Tottenham Hotspur'),
    ('West Ham', 'West Ham United'),
    ('Wolves', 'Wolverhampton Wanderers'),
)

# Two-way lookup: each spelling maps to its counterpart.
NAME_MAPPING = {
    name: counterpart
    for pair in _TEAM_NAME_PAIRS
    for name, counterpart in (pair, pair[::-1])
}

def load_and_clean_data(path="data/processed/pl_goalkeepers_2526_matches.csv"):
    """Load the per-match goalkeeper CSV and normalise its column names.

    Trailing underscores are stripped from every column name, then the known
    lower-case names are renamed to the capitalised names used by the rest of
    the script (e.g. ``psxg_net`` -> ``PSxG_Net``, ``min`` -> ``Minutes``).
    When the file has no ``Date`` column but does have ``Game``, ``Date`` is
    derived by parsing the first space-separated token of ``Game``.

    Args:
        path: CSV file to read. Defaults to the processed match file, so
            existing zero-argument calls behave as before.

    Returns:
        pandas.DataFrame with the cleaned column names.

    Raises:
        FileNotFoundError: if ``path`` does not exist.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"Arquivo {path} n√£o existe.")

    df = pd.read_csv(path)
    df.columns = [c.rstrip('_') for c in df.columns]

    rename_map = {
        'team': 'Team',
        'player': 'Player',
        'game': 'Game',
        'date': 'Date',
        'min': 'Minutes',
        'psxg': 'PSxG',
        'psxg_net': 'PSxG_Net',
        'opa': 'OPA',
    }
    # DataFrame.rename ignores keys that are not present, so no pre-filtering
    # of the mapping is needed.
    df = df.rename(columns=rename_map)

    if 'Date' not in df.columns and 'Game' in df.columns:
        # The leading token of "Game" is parsed as the match date.
        df['Date'] = pd.to_datetime(df['Game'].str.split(' ').str[0])

    return df

def get_opponent(row):
    """Return the opposing team's name for one match row.

    ``row['Game']`` is split once on the first space and the remainder is
    split on ``'-'`` to obtain the team names. The first name that is neither
    ``row['Team']`` nor its ``NAME_MAPPING`` counterpart is returned, passed
    through ``NAME_MAPPING`` when it has an entry there.

    Args:
        row: mapping-like object (e.g. a DataFrame row) with ``'Game'`` and
            ``'Team'`` entries.

    Returns:
        The opponent's name, or ``"Unknown"`` when the row is malformed or no
        opposing team can be identified.
    """
    try:
        fixture = row['Game'].split(' ', 1)[1]
        my_team = row['Team']
    except (KeyError, IndexError, AttributeError, TypeError):
        # Missing column, non-string "Game" (e.g. NaN) or no space separator:
        # keep the best-effort fallback, but let unrelated errors propagate
        # instead of hiding them behind a bare ``except``.
        return "Unknown"

    for name in fixture.split('-'):
        name = name.strip()
        if name == my_team or NAME_MAPPING.get(name) == my_team:
            continue  # this side of the fixture is the row's own team
        return NAME_MAPPING.get(name, name)
    return "Unknown"

def normalize_series(series):
    """Return each value's percentile rank within ``series``, scaled by 100."""
    pct_rank = series.rank(pct=True)
    return pct_rank.mul(100)

def _per_90(totals, minutes):
    """Scale season totals to a per-90-minutes rate; 0 where no minutes were played."""
    # Guarding the division keeps a zero-minute player from getting an
    # infinite rate, which would otherwise rank at the top.
    return np.where(minutes > 0, (totals / minutes) * 90, 0.0)


def aggregate_and_rank(df):
    """Aggregate match rows per player, score each player and rank them.

    Steps:
      1. Add an ``Opponent`` column to ``df`` (in place) via ``get_opponent``.
      2. Group by ``Player`` only, summing the season totals, counting games
         and taking the team of the player's most recent match.
      3. Keep players with at least 5 games.
      4. Compute per-90 and percentage metrics, convert each to a percentile
         score with ``normalize_series`` and combine them using ``WEIGHTS``.
      5. Add a bonus of up to 5 points proportional to games played.
      6. Sort by ``Final_Score`` descending, assign ``Rank`` and print the
         top 10.

    Args:
        df: match-level DataFrame with columns ``Player``, ``Team``, ``Game``,
            ``PSxG_Net``, ``PSxG``, ``Crosses_Stopped``, ``Crosses_Faced``,
            ``OPA`` and ``Minutes``; ``Date`` is used for ordering when
            present. The frame gains an ``Opponent`` column.

    Returns:
        pandas.DataFrame with one row per qualifying player, sorted by
        ``Final_Score`` descending, including a 1-based ``Rank`` column.
    """
    print("‚öôÔ∏è Calculando Ranking (Temporada 25/26 Consolidada)...")

    # Not used below; kept because the caller's frame is expected to gain it.
    df['Opponent'] = df.apply(get_opponent, axis=1)

    # 'last' below only means "current team" if rows are in chronological
    # order, so sort a copy by date first (stable, so same-day rows keep their
    # file order; unparseable dates go first so they never win as "latest").
    if 'Date' in df.columns:
        ordered = df.sort_values(
            'Date',
            key=lambda s: pd.to_datetime(s, errors='coerce'),
            kind='mergesort',
            na_position='first',
        )
    else:
        ordered = df

    # Group by player only: if a player changed teams, all matches are summed
    # together and the most recent team is reported.
    grouped = ordered.groupby(['Player']).agg({
        'Team': 'last',
        'PSxG_Net': 'sum',
        'PSxG': 'sum',
        'Crosses_Stopped': 'sum',
        'Crosses_Faced': 'sum',
        'OPA': 'sum',
        'Minutes': 'sum',
        'Game': 'count',
    }).reset_index()
    grouped = grouped.rename(columns={'Game': 'Games_Played'})

    # Filter: at least 5 games over the whole season.
    ranked = grouped[grouped['Games_Played'] >= 5].copy()

    # --- Efficiency metrics ---
    ranked['PSxG_Net_p90'] = _per_90(ranked['PSxG_Net'], ranked['Minutes'])
    ranked['Cross_Stop_Pct'] = np.where(
        ranked['Crosses_Faced'] > 0,
        ranked['Crosses_Stopped'] / ranked['Crosses_Faced'],
        0,
    )
    ranked['OPA_p90'] = _per_90(ranked['OPA'], ranked['Minutes'])

    # --- Percentile scores ---
    ranked['Score_Shot_Stopping'] = normalize_series(ranked['PSxG_Net_p90'])
    ranked['Score_Aerial'] = normalize_series(ranked['Cross_Stop_Pct'])
    ranked['Score_Distribution'] = normalize_series(ranked['OPA_p90'])

    # Weighted final score.
    ranked['Final_Score'] = (
        (ranked['Score_Shot_Stopping'] * WEIGHTS['Shot_Stopping']) +
        (ranked['Score_Aerial'] * WEIGHTS['Aeriel_Control']) +
        (ranked['Score_Distribution'] * WEIGHTS['Distribution'])
    )

    # Small consistency bonus: up to 5 points for the player with most games.
    games_bonus = (ranked['Games_Played'] / ranked['Games_Played'].max()) * 5
    ranked['Final_Score'] += games_bonus

    # Final ordering and 1-based rank.
    ranked = ranked.sort_values('Final_Score', ascending=False)
    ranked['Rank'] = range(1, len(ranked) + 1)

    print("\nüèÜ TOP 10 RANKING (Efici√™ncia Consolidada):")
    cols_show = ['Rank', 'Player', 'Team', 'Final_Score', 'PSxG_Net', 'PSxG_Net_p90', 'Games_Played']
    print(ranked[cols_show].head(10).to_string(index=False))

    return ranked

if __name__ == "__main__":
    # Script entry point: load the match-level data, build the ranking and
    # save it as CSV.
    df_matches = load_and_clean_data()
    # Aggregates per player, scores and ranks them (also prints the top 10).
    df_ranking = aggregate_and_rank(df_matches)
    # index=False keeps the DataFrame's row index out of the output file.
    df_ranking.to_csv("data/processed/final_ranking_table.csv", index=False)

‚öôÔ∏è Calculando Ranking (Temporada 25/26 Consolidada)...

üèÜ TOP 10 RANKING (Efici√™ncia Consolidada):
 Rank               Player                   Team  Final_Score  PSxG_Net  PSxG_Net_p90  Games_Played
    1          Robin Roefs             Sunderland    92.962963       3.1      0.147619            21
    2      Jordan Pickford                Everton    84.814815       2.6      0.123810            21
    3       Robert S√°nchez                Chelsea    83.465608       1.0      0.052509            20
    4       Dean Henderson         Crystal Palace    83.333333       3.8      0.180952            21
    5    Emiliano Mart√≠nez            Aston Villa    81.825397       2.0      0.121212            17
    6    Guglielmo Vicario      Tottenham Hotspur    75.370370       0.4      0.019048            21
    7            Nick Pope       Newcastle United    74.497354      -1.2     -0.080838            15
    8      Bart Verbruggen Brighton & Hove Albion    73.333333       0.9      0.042