In [1]:
import numpy as np
import pandas as pd
import warnings
import os
from statsbombpy import sb
pd.set_option('display.max_columns', None)

import joblib  # Para guardar el modelo
import shap

warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Lookup table: human-readable match label -> numeric match id.
# The ids are the values later passed as `match_id` when events are fetched.
dict_matches = {
    'arg_col_final': 3943077,
    'can_uru_3rd_place_final': 3943076,
    'uru_col_semi-finals': 3942852,
    'arg_can_semi-finals': 3942785,
    'col_pan_quarter-finals': 3942416,
    'uru_bra_quarter-finals': 3942415,
    'ven_can_quarter-finals': 3942229,
    'arg_ecu_quarter-finals': 3942228,
}

In [3]:
def _shot_stats_by_player(shot_events):
    """Aggregate shot events into one row of shooting metrics per (player, team).

    Parameters
    ----------
    shot_events : pandas.DataFrame
        Rows whose ``type`` is ``'Shot'``. Must contain the columns
        ``player``, ``team``, ``shot_statsbomb_xg`` and ``shot_outcome``.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``player``, ``team``, ``Shot``, ``xG``, ``Goals``,
        ``xG Overperformance``, ``xG per Shot``, ``Shots on Target``,
        ``Shot Accuracy %``, ``Big Chances Missed``.
    """
    keys = ['player', 'team']

    def count_rows(frame, name):
        # Number of rows per (player, team), stored in a column called `name`.
        return frame.groupby(keys).size().reset_index(name=name)

    def add_count(stats, frame, name):
        # Left-join a per-player count; players without matching rows get 0.
        return (
            stats
            .merge(count_rows(frame, name), on=keys, how='left')
            .fillna({name: 0})
        )

    outcome = shot_events['shot_outcome']
    shot_xg = shot_events['shot_statsbomb_xg']

    # Total shots and summed xG per player.
    stats = count_rows(shot_events, 'Shot')
    xg_total = (
        shot_events
        .groupby(keys)['shot_statsbomb_xg']
        .sum()
        .reset_index(name='xG')
    )
    stats = stats.merge(xg_total, on=keys, how='left').fillna({'xG': 0})

    # Goals are shots whose outcome is exactly 'Goal'.
    stats = add_count(stats, shot_events[outcome == 'Goal'], 'Goals')

    # Goals scored above (positive) or below (negative) the accumulated xG.
    stats['xG Overperformance'] = stats['Goals'] - stats['xG']
    stats['xG per Shot'] = (stats['xG'] / stats['Shot']).fillna(0)

    # A shot counts as on target when its outcome is 'Goal' or 'Saved'.
    on_target = shot_events[outcome.isin(['Goal', 'Saved'])]
    stats = add_count(stats, on_target, 'Shots on Target')
    stats['Shot Accuracy %'] = (
        stats['Shots on Target'] / stats['Shot'] * 100
    ).fillna(0)

    # Big chance missed: xG of at least 0.3 that did not end in a goal.
    big_misses = shot_events[(shot_xg >= 0.3) & (outcome != 'Goal')]
    stats = add_count(stats, big_misses, 'Big Chances Missed')

    return stats


def prepare_model_data(match_id, team_name=None):
    """Build a per-player table of event counts and shooting metrics for a match.

    Events are fetched with ``sb.events(match_id=match_id)``. For every
    (player, team) pair that has at least one event of the model feature
    types, the result holds the count of each feature type plus the shooting
    metrics computed from that player's ``'Shot'`` events.

    Parameters
    ----------
    match_id : int
        Match identifier forwarded to ``sb.events``.
    team_name : str, optional
        When given, only events of this team are used.

    Returns
    -------
    pandas.DataFrame or str
        The per-player table, rounded to 4 decimals. Every feature column
        ('Ball Receipt*', 'Ball Recovery', 'Block', 'Carry', 'Dribble',
        'Duel', 'Pass') is always present; a type that never occurs is filled
        with 0. Players without shots get 0 in all shooting columns.
        If ``team_name`` is given but did not take part in the match, a
        message string is returned instead of a DataFrame (kept for backward
        compatibility, so callers must check the return type).
    """
    # Fetch every event of the match.
    match_data = sb.events(match_id=match_id)

    # A requested team must be one of the teams appearing in the match.
    if team_name and team_name not in match_data['team'].unique():
        return f"El equipo '{team_name}' no jugó este partido."

    # Event types used as model features.
    model_cols = ['Ball Receipt*', 'Ball Recovery', 'Block', 'Carry', 'Dribble', 'Duel', 'Pass']
    keys = ['player', 'team']

    # Restrict to the requested team once, before splitting by event type.
    if team_name:
        match_data = match_data[match_data['team'] == team_name]

    feature_events = match_data[match_data['type'].isin(model_cols)]
    shot_events = match_data[match_data['type'] == 'Shot']

    # Count feature events per player. The pivot only creates columns for
    # types that actually occur, so reindex to guarantee the full feature set
    # (otherwise selecting `model_cols` downstream raises a KeyError).
    events_summary_df = (
        feature_events
        .pivot_table(index=keys, columns='type', aggfunc='size', fill_value=0)
        .reindex(columns=model_cols, fill_value=0)
        .reset_index()
    )

    # Attach shooting metrics; players without shots get zeros everywhere.
    return (
        events_summary_df
        .merge(_shot_stats_by_player(shot_events), on=keys, how='left')
        .fillna(0)
        .round(4)
    )


In [4]:
def output_shaps(id_partido_1, team_1, id_partido_2, team_2):
    """Score the players of two teams with the saved model and its SHAP values.

    The per-player tables of ``team_1`` in match ``id_partido_1`` and of
    ``team_2`` in match ``id_partido_2`` are stacked, scaled with the saved
    scaler, and passed to the saved model and to a ``shap.TreeExplainer``
    built on it.

    Parameters
    ----------
    id_partido_1, id_partido_2 : int
        Match ids forwarded to ``prepare_model_data``.
    team_1, team_2 : str
        Team whose players are taken from the corresponding match.

    Returns
    -------
    pandas.DataFrame
        One row per player with, in order: ``player``, ``team``, the model
        feature counts, one ``sh_<feature>`` SHAP column per feature,
        ``shapley`` (row-wise mean of the SHAP columns), ``y_pred`` (model
        prediction), ``xG``, and three boolean flags telling whether the
        player's ``shapley`` is at least the mean ``shapley`` of team 1
        (``plus_<first 3 letters of team_1>``), of team 2
        (``plus_<first 3 letters of team_2>``) and of all rows (``plus_glb``).
        If both team names share their first three letters, the second flag
        overwrites the first.

    Raises
    ------
    ValueError
        If one of the teams did not take part in its match.
    """
    model_cols = ['Ball Receipt*', 'Ball Recovery', 'Block', 'Carry',
                  'Dribble', 'Duel', 'Pass']
    target = ['xG']

    # NOTE: joblib.load unpickles the files; only load artifacts you trust.
    loaded_scaler = joblib.load('scaler_model.pkl')
    model = joblib.load('best_gradient_boosting_model.pkl')
    explainer = shap.TreeExplainer(model)

    team_frames = []
    for match_id, team in ((id_partido_1, team_1), (id_partido_2, team_2)):
        team_data = prepare_model_data(match_id, team)
        # prepare_model_data reports an unknown team by returning a message
        # string; fail here with that message instead of inside pd.concat.
        if isinstance(team_data, str):
            raise ValueError(team_data)
        team_frames.append(team_data)

    events_df = pd.concat(team_frames, ignore_index=True)

    # A feature that never occurred for a team may be absent from its frame,
    # which would surface as a missing column or NaN after stacking.
    # Zero occurrences is the correct value in both cases.
    for col in model_cols:
        if col not in events_df.columns:
            events_df[col] = 0
    events_df[model_cols] = events_df[model_cols].fillna(0)

    X_scaled = loaded_scaler.transform(events_df[model_cols])

    # Per-feature SHAP values, one column per model feature.
    shap_values = explainer.shap_values(X_scaled)
    shap_df = pd.DataFrame(shap_values, columns=[f'sh_{col}' for col in model_cols])

    # Row-wise mean of the SHAP values (computed before y_pred is appended so
    # the prediction is not part of the average).
    shap_df['shapley'] = shap_df.mean(axis=1)

    # Model prediction for every player row.
    shap_df['y_pred'] = model.predict(X_scaled)

    # Both frames share a 0..n-1 index, so they align row by row.
    final_df = pd.concat([events_df, shap_df], axis=1)

    # Keep identifiers, features, SHAP columns, prediction and target.
    # Copy explicitly so the flag columns below are written to an independent
    # frame rather than to a slice of `final_df`.
    columns_to_keep = ['player', 'team'] + model_cols + list(shap_df.columns) + target
    final_df_filtered = final_df[columns_to_keep].copy()

    shapley = final_df_filtered['shapley']
    teams = final_df_filtered['team']
    sh_team_1 = shapley[teams == team_1].mean()
    sh_team_2 = shapley[teams == team_2].mean()
    sh_glb = shapley.mean()

    # Flag players at or above each reference average.
    final_df_filtered['plus_' + team_1[:3]] = shapley >= sh_team_1
    final_df_filtered['plus_' + team_2[:3]] = shapley >= sh_team_2
    final_df_filtered['plus_glb'] = shapley >= sh_glb

    return final_df_filtered

In [5]:
# Display the label -> match id lookup so the keys used below can be checked.
dict_matches

{'arg_col_final': 3943077,
 'can_uru_3rd_place_final': 3943076,
 'uru_col_semi-finals': 3942852,
 'arg_can_semi-finals': 3942785,
 'col_pan_quarter-finals': 3942416,
 'uru_bra_quarter-finals': 3942415,
 'ven_can_quarter-finals': 3942229,
 'arg_ecu_quarter-finals': 3942228}

In [6]:
# Score Argentina's players from the 'arg_ecu_quarter-finals' match together
# with Canada's players from the 'ven_can_quarter-finals' match.
arg_can = output_shaps(
    id_partido_1=dict_matches['arg_ecu_quarter-finals'],
    team_1='Argentina',
    id_partido_2=dict_matches['ven_can_quarter-finals'],
    team_2='Canada',
)

In [7]:
# Display the combined per-player table returned by output_shaps.
arg_can

Unnamed: 0,player,team,Ball Receipt*,Ball Recovery,Block,Carry,Dribble,Duel,Pass,sh_Ball Receipt*,sh_Ball Recovery,sh_Block,sh_Carry,sh_Dribble,sh_Duel,sh_Pass,shapley,y_pred,xG,plus_Arg,plus_Can,plus_glb
0,Alexis Mac Allister,Argentina,34,6,2,40,2,7,45,-0.031724,-0.002291,0.000519,-0.001142,-0.002493,-0.012579,-0.055245,-0.014994,0.053493,0.7835,False,False,False
1,Cristian Gabriel Romero,Argentina,51,5,3,51,0,4,62,0.059732,-0.002629,-0.001793,-0.024285,-0.027067,-0.015119,-0.063655,-0.010688,0.083631,0.0,False,False,False
2,Damián Emiliano Martínez,Argentina,13,6,0,17,0,0,31,-0.01889,-0.012036,-0.001376,0.009247,-0.02413,0.00281,-0.055869,-0.014321,0.058202,0.0,False,False,False
3,Enzo Fernandez,Argentina,28,5,2,30,1,3,28,0.01765,-0.002534,0.000519,-0.000832,0.010638,-0.00419,-0.057866,-0.005231,0.121832,0.1909,True,False,False
4,Giovani Lo Celso,Argentina,6,2,0,7,0,0,7,-0.040214,-0.012114,-0.000933,0.00455,-0.029236,0.008554,-0.003847,-0.010463,0.085206,0.0,False,False,False
5,Gonzalo Ariel Montiel,Argentina,1,1,0,1,0,0,1,-0.135911,-0.003461,-0.009635,0.025815,-0.0256,-0.009267,0.038053,-0.017144,0.038442,0.7835,False,False,False
6,Julián Álvarez,Argentina,8,1,1,4,0,2,5,-0.031521,-0.002758,0.000583,0.007503,-0.032635,-0.003554,0.052551,-0.001404,0.148616,0.7835,True,True,True
7,Lautaro Javier Martínez,Argentina,24,2,2,13,0,3,14,0.022964,-0.002293,0.000816,0.004488,-0.022596,-0.002298,-0.028748,-0.003953,0.130779,0.0499,True,False,True
8,Lionel Andrés Messi Cuccittini,Argentina,36,1,0,24,0,0,25,0.022377,-0.004089,-0.000933,0.004619,-0.026527,0.00725,-0.024876,-0.003168,0.136268,0.8999,True,False,True
9,Lisandro Martínez,Argentina,41,4,1,39,0,2,43,0.021985,-0.00253,0.00014,-0.000907,-0.017183,-0.000705,-0.062212,-0.008773,0.097036,0.5021,False,False,False
