# Codigo Limpio:

In [11]:
import numpy as np
import pandas as pd
import warnings
import os
from statsbombpy import sb
pd.set_option('display.max_columns', None)

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.exceptions import DataConversionWarning
from sklearn.preprocessing import QuantileTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import joblib  # Para guardar el modelo
import shap
from sklearn.preprocessing import StandardScaler


warnings.filterwarnings("ignore")
warnings.filterwarnings(action='ignore', category = DataConversionWarning)

## Comando para saber qué competiciones están disponibles

In [12]:
sb.competitions().columns

Index(['competition_id', 'season_id', 'country_name', 'competition_name',
       'competition_gender', 'competition_youth', 'competition_international',
       'season_name', 'match_updated', 'match_updated_360',
       'match_available_360', 'match_available'],
      dtype='object')

## Identificación de competiciones internacionales

In [13]:
open_data = sb.competitions()
open_data[(open_data['competition_international'] == True) & (open_data['competition_gender'] == 'male')]


Unnamed: 0,competition_id,season_id,country_name,competition_name,competition_gender,competition_youth,competition_international,season_name,match_updated,match_updated_360,match_available_360,match_available
2,1267,107,Africa,African Cup of Nations,male,False,True,2023,2024-06-13T07:51:02.452825,,,2024-06-13T07:51:02.452825
21,223,282,South America,Copa America,male,False,True,2024,2024-07-15T18:00:33.653673,,,2024-07-15T18:00:33.653673
29,43,106,International,FIFA World Cup,male,False,True,2022,2024-05-15T10:23:32.854925,2024-06-11T11:30:50.462448,2024-06-11T11:30:50.462448,2024-05-15T10:23:32.854925
30,43,3,International,FIFA World Cup,male,False,True,2018,2024-06-12T07:38:19.345758,2021-06-13T16:17:31.694,,2024-06-12T07:38:19.345758
31,43,55,International,FIFA World Cup,male,False,True,1990,2023-06-28T10:58:20.137929,2021-06-12T16:17:31.694,,2023-06-28T10:58:20.137929
32,43,54,International,FIFA World Cup,male,False,True,1986,2023-12-26T22:34:04.263530,2021-06-13T16:17:31.694,,2023-12-26T22:34:04.263530
33,43,51,International,FIFA World Cup,male,False,True,1974,2024-02-13T02:52:29.582599,2021-06-13T16:17:31.694,,2024-02-13T02:52:29.582599
34,43,272,International,FIFA World Cup,male,False,True,1970,2024-02-13T14:23:06.735299,,,2024-02-13T14:23:06.735299
35,43,270,International,FIFA World Cup,male,False,True,1962,2023-06-26T10:38:00.323984,,,2023-06-26T10:38:00.323984
36,43,269,International,FIFA World Cup,male,False,True,1958,2024-02-13T14:22:08.222297,,,2024-02-13T14:22:08.222297


## Identificación de partidos

In [14]:
sb.matches(competition_id=223, season_id= 282).head(10)

Unnamed: 0,match_id,match_date,kick_off,competition,season,home_team,away_team,home_score,away_score,match_status,match_status_360,last_updated,last_updated_360,match_week,competition_stage,stadium,referee,home_managers,away_managers,data_version,shot_fidelity_version,xy_fidelity_version
0,3943077,2024-07-15,04:15:00.000,South America - Copa America,2024,Argentina,Colombia,1,0,available,unscheduled,2024-07-15T15:50:08.671355,,6,Final,Hard Rock Stadium,Raphael Claus,Lionel Sebastián Scaloni,Néstor Gabriel Lorenzo,1.1.0,2,2
1,3943076,2024-07-14,03:00:00.000,South America - Copa America,2024,Canada,Uruguay,2,2,available,unscheduled,2024-07-15T07:57:02.660641,,6,3rd Place Final,Bank of America Stadium,Alexis Herrera,Jesse Marsch,Marcelo Alberto Bielsa Caldera,1.1.0,2,2
2,3942852,2024-07-11,03:00:00.000,South America - Copa America,2024,Uruguay,Colombia,0,1,available,unscheduled,2024-07-15T18:00:33.653673,,5,Semi-finals,Bank of America Stadium,César Arturo Ramos Palazuelos,Marcelo Alberto Bielsa Caldera,Néstor Gabriel Lorenzo,1.1.0,2,2
3,3942785,2024-07-10,03:00:00.000,South America - Copa America,2024,Argentina,Canada,2,0,available,unscheduled,2024-07-14T15:55:49.351182,,5,Semi-finals,MetLife Stadium,Piero Maza Gomez,Lionel Sebastián Scaloni,Jesse Marsch,1.1.0,2,2
4,3942416,2024-07-07,01:00:00.000,South America - Copa America,2024,Colombia,Panama,5,0,available,unscheduled,2024-07-10T06:49:40.099252,,4,Quarter-finals,State Farm Stadium,Maurizio Mariani,Néstor Gabriel Lorenzo,Thomas Christiansen Tarín,1.1.0,2,2
5,3942415,2024-07-07,04:00:00.000,South America - Copa America,2024,Uruguay,Brazil,0,0,available,unscheduled,2024-07-14T15:32:06.707155,,4,Quarter-finals,Allegiant Stadium,Darío Herrera,Marcelo Alberto Bielsa Caldera,Dorival Silvestre Júnior,1.1.0,2,2
6,3939986,2024-06-30,03:00:00.000,South America - Copa America,2024,Argentina,Peru,2,0,available,unscheduled,2024-07-10T08:52:05.304046,,3,Group Stage,Hard Rock Stadium,César Arturo Ramos Palazuelos,Lionel Sebastián Scaloni,Jorge Daniel Fossati Lurachi,1.1.0,2,2
7,3939973,2024-06-24,04:00:00.000,South America - Copa America,2024,Uruguay,Panama,3,1,available,unscheduled,2024-07-08T22:09:32.590631,,1,Group Stage,Hard Rock Stadium,Piero Maza Gomez,Marcelo Alberto Bielsa Caldera,Thomas Christiansen Tarín,1.1.0,2,2
8,3942229,2024-07-06,04:00:00.000,South America - Copa America,2024,Venezuela,Canada,1,1,available,unscheduled,2024-07-13T23:56:59.669800,,4,Quarter-finals,AT&T Stadium,Wilton Pereira Sampaio,Fernando Ariel Batista,Jesse Marsch,1.1.0,2,2
9,3942228,2024-07-05,04:00:00.000,South America - Copa America,2024,Argentina,Ecuador,1,1,available,unscheduled,2024-07-11T14:18:57.536262,,4,Quarter-finals,NRG Stadium,Andrés Matonte,Lionel Sebastián Scaloni,Félix Sánchez Bas,1.1.0,2,2


## Exploración de datos:

In [15]:
# Ejemplo usando partido de Argentina Colombia:

print("Columns in the dataset for Argentina - Colombia:\n")
print(sb.events(match_id=3943077).columns)


print("\nShape of the dataset for matct Argentina - Colombia (rows, columns):")
print(sb.events(match_id=3943077).shape)

Columns in the dataset for Argentina - Colombia:

Index(['50_50', 'bad_behaviour_card', 'ball_receipt_outcome',
       'ball_recovery_offensive', 'ball_recovery_recovery_failure',
       'block_deflection', 'block_offensive', 'carry_end_location',
       'clearance_aerial_won', 'clearance_body_part', 'clearance_head',
       'clearance_left_foot', 'clearance_right_foot', 'counterpress',
       'dribble_nutmeg', 'dribble_outcome', 'dribble_overrun', 'duel_outcome',
       'duel_type', 'duration', 'foul_committed_advantage',
       'foul_committed_card', 'foul_committed_type', 'foul_won_advantage',
       'foul_won_defensive', 'goalkeeper_body_part', 'goalkeeper_end_location',
       'goalkeeper_outcome', 'goalkeeper_position', 'goalkeeper_technique',
       'goalkeeper_type', 'id', 'index', 'injury_stoppage_in_chain',
       'interception_outcome', 'location', 'match_id', 'minute', 'off_camera',
       'out', 'pass_aerial_won', 'pass_angle', 'pass_assisted_shot_id',
       'pass_body_pa

# Tabla de eventos:

In [16]:
arg_col_f_3943077 = sb.events(match_id=3943077)
arg_col_f_3943077.head(10)

Unnamed: 0,50_50,bad_behaviour_card,ball_receipt_outcome,ball_recovery_offensive,ball_recovery_recovery_failure,block_deflection,block_offensive,carry_end_location,clearance_aerial_won,clearance_body_part,clearance_head,clearance_left_foot,clearance_right_foot,counterpress,dribble_nutmeg,dribble_outcome,dribble_overrun,duel_outcome,duel_type,duration,foul_committed_advantage,foul_committed_card,foul_committed_type,foul_won_advantage,foul_won_defensive,goalkeeper_body_part,goalkeeper_end_location,goalkeeper_outcome,goalkeeper_position,goalkeeper_technique,goalkeeper_type,id,index,injury_stoppage_in_chain,interception_outcome,location,match_id,minute,off_camera,out,pass_aerial_won,pass_angle,pass_assisted_shot_id,pass_body_part,pass_cross,pass_cut_back,pass_end_location,pass_goal_assist,pass_height,pass_inswinging,pass_length,pass_miscommunication,pass_outcome,pass_outswinging,pass_recipient,pass_recipient_id,pass_shot_assist,pass_switch,pass_technique,pass_through_ball,pass_type,period,play_pattern,player,player_id,position,possession,possession_team,possession_team_id,related_events,second,shot_aerial_won,shot_body_part,shot_end_location,shot_first_time,shot_freeze_frame,shot_key_pass_id,shot_one_on_one,shot_outcome,shot_statsbomb_xg,shot_technique,shot_type,substitution_outcome,substitution_outcome_id,substitution_replacement,substitution_replacement_id,tactics,team,team_id,timestamp,type,under_pressure
0,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,279b7d66-92b5-4daa-8ff6-cba8fce271d9,1,,,,3943077,0,,,,,,,,,,,,,,,,,,,,,,,,1,Regular Play,,,,1,Argentina,779,,0,,,,,,,,,,,,,,,,"{'formation': 442, 'lineup': [{'player': {'id'...",Argentina,779,00:00:00.000,Starting XI,
1,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,06195288-8ea1-489f-9496-3060ac479c36,2,,,,3943077,0,,,,,,,,,,,,,,,,,,,,,,,,1,Regular Play,,,,1,Argentina,779,,0,,,,,,,,,,,,,,,,"{'formation': 41212, 'lineup': [{'player': {'i...",Colombia,769,00:00:00.000,Starting XI,
2,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,de92dcc1-0dd8-4f5d-af24-f853cd9de51a,3,,,,3943077,0,,,,,,,,,,,,,,,,,,,,,,,,1,Regular Play,,,,1,Argentina,779,[90bb1cd7-1134-4d4c-8a12-6b8310da9362],0,,,,,,,,,,,,,,,,,Colombia,769,00:00:00.000,Half Start,
3,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,90bb1cd7-1134-4d4c-8a12-6b8310da9362,4,,,,3943077,0,,,,,,,,,,,,,,,,,,,,,,,,1,Regular Play,,,,1,Argentina,779,[de92dcc1-0dd8-4f5d-af24-f853cd9de51a],0,,,,,,,,,,,,,,,,,Argentina,779,00:00:00.000,Half Start,
4,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,b7d094ef-702b-402c-a27a-048b4d88e674,1781,,,,3943077,45,,,,,,,,,,,,,,,,,,,,,,,,2,Regular Play,,,,70,Argentina,779,[27bf5985-e7b5-4595-8f68-097de0b2e678],0,,,,,,,,,,,,,,,,,Colombia,769,00:00:00.000,Half Start,
5,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,27bf5985-e7b5-4595-8f68-097de0b2e678,1782,,,,3943077,45,,,,,,,,,,,,,,,,,,,,,,,,2,Regular Play,,,,70,Argentina,779,[b7d094ef-702b-402c-a27a-048b4d88e674],0,,,,,,,,,,,,,,,,,Argentina,779,00:00:00.000,Half Start,
6,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,40ed0026-436b-45a7-a22b-2c3d10b31d96,3118,,,,3943077,90,,,,,,,,,,,,,,,,,,,,,,,,3,From Goal Kick,,,,142,Argentina,779,[6d962001-56df-40e7-9fef-2ffb58324768],0,,,,,,,,,,,,,,,,,Colombia,769,00:00:00.000,Half Start,
7,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,6d962001-56df-40e7-9fef-2ffb58324768,3119,,,,3943077,90,,,,,,,,,,,,,,,,,,,,,,,,3,From Goal Kick,,,,142,Argentina,779,[40ed0026-436b-45a7-a22b-2c3d10b31d96],0,,,,,,,,,,,,,,,,,Argentina,779,00:00:00.000,Half Start,
8,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,49d35108-aa53-49cb-9a89-08048db30eab,3680,,,,3943077,105,,,,,,,,,,,,,,,,,,,,,,,,4,Regular Play,,,,180,Colombia,769,[f532105c-9d86-44b4-a02e-7203394a9e4b],0,,,,,,,,,,,,,,,,,Colombia,769,00:00:00.000,Half Start,
9,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,f532105c-9d86-44b4-a02e-7203394a9e4b,3681,,,,3943077,105,,,,,,,,,,,,,,,,,,,,,,,,4,Regular Play,,,,180,Colombia,769,[49d35108-aa53-49cb-9a89-08048db30eab],0,,,,,,,,,,,,,,,,,Argentina,779,00:00:00.000,Half Start,


# Identificación de eventos: 

In [17]:
arg_col_f_3943077['type'].unique()

array(['Starting XI', 'Half Start', 'Pass', 'Ball Receipt*', 'Carry',
       'Pressure', 'Shot', 'Goal Keeper', 'Clearance', 'Block',
       'Dribbled Past', 'Dribble', 'Duel', 'Ball Recovery', 'Miscontrol',
       'Interception', 'Foul Committed', 'Foul Won', 'Dispossessed',
       'Tactical Shift', 'Injury Stoppage', 'Offside', 'Player Off',
       'Player On', 'Shield', 'Half End', '50/50', 'Referee Ball-Drop',
       'Substitution', 'Bad Behaviour'], dtype=object)

## Creación de tabla (Ejemplo)

In [18]:
# Create a list of event types to include
event_types = [
    'Shot', 'Pass', 'Carry', 'Dribble', 'Interception', 'Foul Committed', 'Block',
    'Clearance', 'Ball Receipt*', 'Foul Won', 'Ball Recovery','Duel'
]

# Create a DataFrame with all unique players and teams
players_df = arg_col_f_3943077[['player', 'team']].drop_duplicates().reset_index(drop=True)

# Filter the data to include only the events of interest
filtered_events = arg_col_f_3943077[arg_col_f_3943077['type'].isin(event_types)]

# Create a pivot table to count occurrences of each event type per player and team
events_df = (
    filtered_events.pivot_table(
        index=['player', 'team'],
        columns='type',
        aggfunc='size',
        fill_value=0
    )
    .reset_index()
)


# Create shot_events DataFrame containing only 'Shot' events
shot_events = arg_col_f_3943077[arg_col_f_3943077['type'] == 'Shot']

# Calculate total xG for 'Shot' events
xg_df = (
    shot_events
    .groupby(['player', 'team'])['shot_statsbomb_xg']
    .sum()
    .reset_index(name='xG')
)

# Calculate number of goals per player (Shot events where 'shot_outcome' == 'Goal')
goals_df = (
    shot_events[shot_events['shot_outcome'] == 'Goal']
    .groupby(['player', 'team'])
    .size()
    .reset_index(name='Goals')
)

# Calculate total shots per player
total_shots = (
    shot_events
    .groupby(['player', 'team'])
    .size()
    .reset_index(name='Shot')
)

# Create a DataFrame to hold shot statistics
shot_stats_df = total_shots.merge(xg_df, on=['player', 'team'], how='left').fillna({'xG': 0})
shot_stats_df = shot_stats_df.merge(goals_df, on=['player', 'team'], how='left').fillna({'Goals': 0})

# Calculate xG Overperformance
shot_stats_df['xG Overperformance'] = shot_stats_df['Goals'] - shot_stats_df['xG']

# Calculate xG per Shot
shot_stats_df['xG per Shot'] = (shot_stats_df['xG'] / shot_stats_df['Shot']).fillna(0)

# Calculate shots on target
shots_on_target = shot_events[shot_events['shot_outcome'].isin(['Goal', 'Saved'])]
shots_on_target_count = shots_on_target.groupby(['player', 'team']).size().reset_index(name='Shots on Target')

# Merge shots on target into shot_stats_df
shot_stats_df = shot_stats_df.merge(shots_on_target_count, on=['player', 'team'], how='left').fillna({'Shots on Target': 0})

# Calculate Shot Accuracy %
shot_stats_df['Shot Accuracy %'] = (shot_stats_df['Shots on Target'] / shot_stats_df['Shot']) * 100
shot_stats_df['Shot Accuracy %'] = shot_stats_df['Shot Accuracy %'].fillna(0)

# Calculate Big Chances Missed (assuming xG threshold of 0.3)
big_chances_missed = shot_events[
    (shot_events['shot_statsbomb_xg'] >= 0.3) &
    (shot_events['shot_outcome'] != 'Goal')
]
big_chances_missed_count = big_chances_missed.groupby(['player', 'team']).size().reset_index(name='Big Chances Missed')

# Merge Big Chances Missed into shot_stats_df
shot_stats_df = shot_stats_df.merge(big_chances_missed_count, on=['player', 'team'], how='left').fillna({'Big Chances Missed': 0})

# At the end, merge shot_stats_df into events_df
events_df = events_df.merge(shot_stats_df, on=['player', 'team'], how='left').fillna(0)

# Print the final DataFrame
events_df.head(10)

Unnamed: 0,player,team,Ball Receipt*,Ball Recovery,Block,Carry,Clearance,Dribble,Duel,Foul Committed,Foul Won,Interception,Pass,Shot_x,Shot_y,xG,Goals,xG Overperformance,xG per Shot,Shots on Target,Shot Accuracy %,Big Chances Missed
0,Alexis Mac Allister,Argentina,62,4,3,44,1,1,4,2,2,0,59,1,1.0,0.040733,0.0,-0.040733,0.040733,0.0,0.0,0.0
1,Andrés Mateus Uribe Villa,Colombia,7,1,0,9,0,0,1,0,1,1,10,1,1.0,0.018107,0.0,-0.018107,0.018107,0.0,0.0,0.0
2,Camilo Andrés Vargas Gil,Colombia,19,7,1,23,0,0,0,0,0,0,44,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Carlos Eccehomo Cuesta Figueroa,Colombia,50,3,1,48,0,0,5,0,0,2,57,3,3.0,0.320038,0.0,-0.320038,0.106679,1.0,33.333333,0.0
4,Cristian Gabriel Romero,Argentina,30,0,1,27,2,0,3,0,1,2,41,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Damián Emiliano Martínez,Argentina,15,6,0,23,0,0,0,0,0,0,46,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Davinson Sánchez Mina,Colombia,76,5,0,71,5,1,5,1,0,1,86,1,1.0,0.141692,0.0,-0.141692,0.141692,0.0,0.0,0.0
7,Enzo Fernandez,Argentina,57,4,1,54,2,0,2,3,1,1,64,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Giovani Lo Celso,Argentina,10,2,0,8,0,0,0,1,1,0,7,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Gonzalo Ariel Montiel,Argentina,25,1,1,18,1,1,1,1,1,0,32,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
events_df.describe()

Unnamed: 0,Ball Receipt*,Ball Recovery,Block,Carry,Clearance,Dribble,Duel,Foul Committed,Foul Won,Interception,Pass,Shot_x,Shot_y,xG,Goals,xG Overperformance,xG per Shot,Shots on Target,Shot Accuracy %,Big Chances Missed
count,34.0,34.0,34.0,34.0,34.0,34.0,34.0,34.0,34.0,34.0,34.0,34.0,34.0,34.0,34.0,34.0,34.0,34.0,34.0,34.0
mean,32.705882,2.617647,1.323529,27.647059,0.823529,1.058824,2.352941,0.882353,0.852941,0.470588,35.176471,0.882353,0.882353,0.060908,0.029412,-0.031496,0.039293,0.264706,17.156863,0.0
std,20.304544,2.510761,1.6278,19.003799,1.54666,1.686689,1.998217,1.249599,1.158169,0.861123,23.258066,0.945955,0.945955,0.10161,0.171499,0.17714,0.059133,0.511019,34.202136,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.448171,0.0,0.0,0.0,0.0
25%,15.25,1.0,0.0,10.0,0.0,0.0,1.0,0.0,0.0,0.0,14.25,0.0,0.0,0.0,0.0,-0.06893,0.0,0.0,0.0,0.0
50%,28.5,2.0,1.0,25.0,0.0,0.5,2.0,0.0,0.5,0.0,34.0,1.0,1.0,0.02006,0.0,-0.01229,0.017739,0.0,0.0,0.0
75%,51.5,4.0,2.0,47.0,1.0,1.0,3.75,1.0,1.0,1.0,55.0,1.0,1.0,0.073379,0.0,0.0,0.045543,0.0,0.0,0.0
max,76.0,11.0,6.0,71.0,6.0,7.0,7.0,5.0,5.0,3.0,86.0,3.0,3.0,0.448171,1.0,0.801189,0.224085,2.0,100.0,0.0


## Modelado (Preliminar)

## Incorporación Shapley:

# Creación de funciones: 

## 1. Preparación de df con datos: 

In [20]:
def prepare_model_data(match_id, team_name=None):
    # Obtener los eventos del partido usando match_id
    match_data = sb.events(match_id=match_id)
    
    # Obtener la lista de equipos participantes en el partido
    teams_in_match = match_data['team'].unique()
    
    # Si se proporciona un equipo, verificar si está en el partido
    if team_name:
        if team_name not in teams_in_match:
            return f"El equipo '{team_name}' no jugó este partido."
    
    # Especificar los tipos de eventos a incluir (solo los indicados en model_cols)
    model_cols = ['Ball Receipt*', 'Ball Recovery', 'Block', 'Carry', 'Dribble', 'Duel', 'Pass']
    
    # Filtrar los eventos para incluir solo los tipos especificados
    events_df = match_data[match_data['type'].isin(model_cols)].copy()
    
    # Si se proporciona un equipo, filtrar los eventos por equipo
    if team_name:
        events_df = events_df[events_df['team'] == team_name].copy()

    # Obtener los eventos de tiros para calcular las métricas de xG
    shot_events = match_data[match_data['type'] == 'Shot'].copy()
    if team_name:
        shot_events = shot_events[shot_events['team'] == team_name].copy()
    
    # Calcular total de xG por jugador y equipo
    xg_df = (
        shot_events
        .groupby(['player', 'team'])['shot_statsbomb_xg']
        .sum()
        .reset_index(name='xG')
    )
    
    # Calcular número de goles por jugador (donde 'shot_outcome' == 'Goal')
    goals_df = (
        shot_events[shot_events['shot_outcome'] == 'Goal']
        .groupby(['player', 'team'])
        .size()
        .reset_index(name='Goals')
    )
    
    # Calcular total de tiros por jugador
    total_shots = (
        shot_events
        .groupby(['player', 'team'])
        .size()
        .reset_index(name='Shot')
    )
    
    # Crear un DataFrame para contener las estadísticas de tiro
    shot_stats_df = total_shots.merge(xg_df, on=['player', 'team'], how='left').fillna({'xG': 0})
    shot_stats_df = shot_stats_df.merge(goals_df, on=['player', 'team'], how='left').fillna({'Goals': 0})
    
    # Calcular la sobreperformance de xG
    shot_stats_df['xG Overperformance'] = shot_stats_df['Goals'] - shot_stats_df['xG']
    
    # Calcular xG por tiro
    shot_stats_df['xG per Shot'] = (shot_stats_df['xG'] / shot_stats_df['Shot']).fillna(0)
    
    # Calcular tiros a puerta (Shots on Target)
    shots_on_target = shot_events[shot_events['shot_outcome'].isin(['Goal', 'Saved'])]
    shots_on_target_count = shots_on_target.groupby(['player', 'team']).size().reset_index(name='Shots on Target')
    
    # Unir tiros a puerta al DataFrame de estadísticas de tiro
    shot_stats_df = shot_stats_df.merge(shots_on_target_count, on=['player', 'team'], how='left').fillna({'Shots on Target': 0})
    
    # Calcular precisión de tiro (Shot Accuracy %)
    shot_stats_df['Shot Accuracy %'] = (shot_stats_df['Shots on Target'] / shot_stats_df['Shot']) * 100
    shot_stats_df['Shot Accuracy %'] = shot_stats_df['Shot Accuracy %'].fillna(0)
    
    # Calcular grandes ocasiones falladas (xG >= 0.3)
    big_chances_missed = shot_events[
        (shot_events['shot_statsbomb_xg'] >= 0.3) &
        (shot_events['shot_outcome'] != 'Goal')
    ]
    big_chances_missed_count = big_chances_missed.groupby(['player', 'team']).size().reset_index(name='Big Chances Missed')
    
    # Unir grandes ocasiones falladas al DataFrame de estadísticas de tiro
    shot_stats_df = shot_stats_df.merge(big_chances_missed_count, on=['player', 'team'], how='left').fillna({'Big Chances Missed': 0})
    
    # Crear una tabla pivote para contar los eventos de interés por jugador y equipo (solo model_cols)
    events_summary_df = (
        events_df.pivot_table(
            index=['player', 'team'],
            columns='type',
            aggfunc='size',
            fill_value=0
        )
        .reset_index()
    )
    
    # Unir las estadísticas de tiro con el resumen de eventos
    events_df = events_summary_df.merge(shot_stats_df, on=['player', 'team'], how='left').fillna(0)
    
    # Redondear todas las columnas numéricas a 4 decimales
    events_df = events_df.round(4)
    
    # Devolver el DataFrame final con solo los eventos de model_cols y las estadísticas de xG
    return events_df


## 2. Obtener partido de fase de grupos: 

In [21]:
def obtener_partidos_fase_grupos(competition_id, season_id):

    # Obtener todos los partidos de la competición y temporada proporcionadas
    matches = sb.matches(competition_id=competition_id, season_id=season_id)    

    # Filtrar solo los partidos de la fase de grupos
    group_stage_matches = matches[
        matches['competition_stage'].apply(lambda x: x['name'] if isinstance(x, dict) else x) == 'Group Stage'
    ]

    return  list(group_stage_matches['match_id'])
    

In [22]:
fase_grupos = obtener_partidos_fase_grupos(223, 282)

df_groups = []

for partido in fase_grupos:
    df_groups.append(prepare_model_data(partido))

all_groups = pd.concat(df_groups)

all_groups = all_groups[all_groups['xG'] != 0] 

## 3. Creación modelo: 

In [24]:
model_cols = ['Ball Receipt*', 'Ball Recovery', 'Block', 'Carry',
              'Dribble', 'Duel', 'Pass', 
              ]
target = ['xG']

In [26]:
# Definir las variables predictoras (features) y la variable target
X = all_groups[model_cols]
y = all_groups[target].values.ravel()

# Dividir los datos en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Escalar las variables predictoras en el conjunto de entrenamiento
scaler = QuantileTransformer()
X_train_scaled = scaler.fit_transform(X_train)

# Guardar el escalador de entrenamiento
scaler_filename = 'scaler_model.pkl'
joblib.dump(scaler, scaler_filename)
print(f"Scaler guardado como: {scaler_filename}")

# Inicializar el modelo Gradient Boosting
gbr = GradientBoostingRegressor(random_state=42)

# Definir el diccionario de hiperparámetros para el GridSearchCV
param_grid = {
    'n_estimators': [100, 300, 500],  # Número de estimadores (árboles)
    'learning_rate': [0.01, 0.05, 0.1],  # Tasa de aprendizaje
    'max_depth': [3, 5, 7],  # Profundidad máxima de los árboles
    'min_samples_split': [2, 5, 10],  # Mínimo número de muestras para dividir un nodo
    'min_samples_leaf': [1, 2, 4],  # Mínimo número de muestras en una hoja
    'subsample': [0.3, 0.8, 1.0],  # Fracción de muestras utilizadas para cada árbol
}

# Configurar el GridSearch con validación cruzada de 5 pliegues y ejecución paralela
grid_search = GridSearchCV(estimator=gbr, 
                           param_grid=param_grid, 
                           cv=5, 
                           n_jobs=-1, 
                           verbose=1, 
                           scoring='neg_mean_squared_error')

# Ajustar el modelo a los datos de entrenamiento escalados
grid_search.fit(X_train_scaled, y_train)

# Imprimir los mejores hiperparámetros encontrados
print("Mejores hiperparámetros encontrados:")
print(grid_search.best_params_)

# Evaluar el modelo con los datos de prueba

# Cargar el escalador guardado antes de transformar los datos de prueba
loaded_scaler = joblib.load(scaler_filename)

# Escalar los datos de prueba utilizando el escalador cargado
X_test_scaled = loaded_scaler.transform(X_test)

# Predecir los valores sobre el conjunto de prueba
y_pred = grid_search.best_estimator_.predict(X_test_scaled)

# Calcular el Error Cuadrático Medio (MSE) y el coeficiente de determinación R^2
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse}")
print(f"R² (Coeficiente de Determinación): {r2}")

# Guardar el mejor modelo entrenado
model_filename = 'best_random_forest_model.pkl'
joblib.dump(grid_search.best_estimator_, model_filename)
print(f"Modelo guardado como: {model_filename}")

Scaler guardado como: scaler_model.pkl
Fitting 5 folds for each of 729 candidates, totalling 3645 fits
Mejores hiperparámetros encontrados:
{'learning_rate': 0.01, 'max_depth': 3, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 100, 'subsample': 1.0}
Mean Squared Error (MSE): 0.08042246663419299
R² (Coeficiente de Determinación): -0.03777456300628357
Modelo guardado como: best_random_forest_model.pkl


In [27]:
loaded_scaler = joblib.load('scaler_model.pkl')
best_rf = joblib.load('best_random_forest_model.pkl')

# Crear un objeto explainer de SHAP para el modelo entrenado

### procesos de join de los dos equipos con las variables de model cols + target

explainer = shap.TreeExplainer(best_rf)
X_scaled = loaded_scaler.transform(events_df[model_cols])

# Calcular los valores SHAP
shap_values = explainer.shap_values(X_scaled)

# Crear un DataFrame con los valores SHAP
shap_df = pd.DataFrame(shap_values, columns=[f'sh_{col}' for col in model_cols])

# Calcular la media de los valores SHAP para cada fila y agregarla como una nueva columna
shap_df['shapley'] = shap_df.mean(axis=1)

# Hacer predicciones con el modelo cargado y agregar la columna y_pred al DataFrame original
shap_df['y_pred'] = best_rf.predict(X_scaled)

# Unir el DataFrame original con el DataFrame de SHAP
final_df = pd.concat([events_df.reset_index(drop=True), shap_df], axis=1)

# Filtrar el DataFrame para incluir sólo las columnas usadas en el modelo, SHAP values, el target y y_pred
columns_to_keep = ['player'] + model_cols + list(shap_df.columns) + target
final_df_filtered = final_df[columns_to_keep]

final_df_filtered.head(10)

Unnamed: 0,player,Ball Receipt*,Ball Recovery,Block,Carry,Dribble,Duel,Pass,sh_Ball Receipt*,sh_Ball Recovery,sh_Block,sh_Carry,sh_Dribble,sh_Duel,sh_Pass,shapley,y_pred,xG
0,Alexis Mac Allister,62,4,3,44,1,4,59,0.036143,0.000167,0.000986,0.000826,-0.004448,-0.006035,-0.041812,-0.002025,0.149473,0.040733
1,Andrés Mateus Uribe Villa,7,1,0,9,0,1,10,-0.017936,0.000589,0.00118,0.000692,-0.005952,0.00293,-0.0042,-0.003242,0.140949,0.018107
2,Camilo Andrés Vargas Gil,19,7,1,23,0,0,44,-0.0185,-0.004827,0.001059,-0.001542,-0.005947,0.008962,-0.014714,-0.005073,0.128137,0.0
3,Carlos Eccehomo Cuesta Figueroa,50,3,1,48,0,5,57,0.019944,0.000259,0.000986,-0.001392,-0.004492,-0.003561,-0.03347,-0.003104,0.14192,0.320038
4,Cristian Gabriel Romero,30,0,1,27,0,3,41,0.011395,0.000614,0.001099,-0.00023,-0.004425,-0.003561,-0.020048,-0.002165,0.14849,0.0
5,Damián Emiliano Martínez,15,6,0,23,0,0,46,-0.01842,0.000589,0.001023,-0.001542,-0.005983,0.008962,-0.014144,-0.004216,0.134132,0.0
6,Davinson Sánchez Mina,76,5,0,71,1,5,86,0.035629,0.000167,0.000986,-0.001392,-0.004448,-0.006035,-0.04342,-0.002644,0.145134,0.141692
7,Enzo Fernandez,57,4,1,54,0,2,64,0.033469,0.000167,0.000986,-0.001392,-0.004492,-0.006035,-0.044429,-0.003104,0.14192,0.0
8,Giovani Lo Celso,10,2,0,8,0,0,7,-0.017967,0.000589,0.00118,0.000692,-0.005952,0.00293,-0.004169,-0.003242,0.140949,0.0
9,Gonzalo Ariel Montiel,25,1,1,18,1,1,32,-0.017967,0.000589,0.001059,0.000692,-0.005944,0.00293,-0.004056,-0.003242,0.140949,0.0
