In [1]:
import pandas as pd
from matplotlib.colors import LinearSegmentedColormap
from mplsoccer import VerticalPitch, Pitch
from highlight_text import ax_text, fig_text
import matplotlib.pyplot as plt

from joblib import Parallel, delayed
from tqdm import tqdm

In [2]:
m1 = pd.read_csv("WhoScored_ENG-Premier League.csv")
# m2 = pd.read_csv("WhoScored_ESP-La Liga.csv")
# m3 = pd.read_csv("WhoScored_FRA-Ligue 1.csv")
# m4 = pd.read_csv("WhoScored_GER-Bundesliga.csv")
# m5 = pd.read_csv("WhoScored_ITA-Serie A.csv")


In [3]:
matches = pd.concat([m1], axis = 0)

In [4]:
matches = matches[["game","game_id","type","outcome_type","team","fecha","player", "x", "y","minute"]]

In [5]:
matches.shape

(7583712, 10)

In [6]:
example_match = matches[matches["game_id"] == 1729483] # 1729340  1729483 615168 410988

In [7]:
print(example_match["type"].unique())
print(example_match["outcome_type"].unique())

['FormationSet' 'Start' 'Pass' 'End' 'BallTouch' 'Foul' 'Aerial'
 'Interception' 'Dispossessed' 'Tackle' 'TakeOn' 'Clearance'
 'CornerAwarded' 'Claim' 'BlockedPass' 'Challenge' 'BallRecovery'
 'SavedShot' 'Save' 'KeeperPickup' 'OffsidePass' 'OffsideProvoked'
 'OffsideGiven' 'MissedShots' 'KeeperSweeper' 'ShieldBallOpp'
 'SubstitutionOff' 'SubstitutionOn' 'Goal' 'Punch' 'FormationChange'
 'CrossNotClaimed' 'Card']
['Successful' 'Unsuccessful']


In [8]:
def get_features_individuales_temporal(df_matches, match, bins_x, bins_y, time_division): # time_division : divisor de 90 (usar por ejemplo 3,5,15,30,45)
    
    divisiones = list(range(0,90+1,time_division))
    # Agregar tiempo extra
    divisiones[-1] += 15
    ## Features elegidos
    features = ['Pass', 'TakeOn', 'Tackle', 'BallRecovery'
                'Interception', 'BlockedPass', 'Clearance', 'CornerAwarded', 'Aerial', 'Foul',
                'Dispossessed', 'BallTouch', 'ShieldBallOpp', 'Challenge', 'MissedShots',
                'SavedShot', 'Save', 'KeeperPickup', 'Goal', 'KeeperSweeper', 'Card', 'Smother',
                'OffsideProvoked', 'OffsidePass', 'OffsideGiven', 'Punch', 'Error']
    ##
    date = match["fecha"].unique()[0]

    df_matches_limited = df_matches[df_matches["fecha"] < date]
    str_teams = match["team"].unique()
    str_team1 = str_teams[0]
    str_team2 = str_teams[1]
    
    #### SELECCIONAR LOS 10 CON MAYOR PARTICIPACION + PORTERO
    team1 = []
    team2 = []
    ## Seleccionar portero
    keeperPickup = match[match["type"]== "KeeperPickup"]
    keeper1 = keeperPickup[keeperPickup["team"] == str_team1].player.unique()
    keeper2 = keeperPickup[keeperPickup["team"] == str_team2].player.unique()
    if len(keeper1) > 0:
        team1.append(keeper1[0])
    if len(keeper2) > 0:
        team2.append(keeper2[0])
    ### Rellenar hasta llegar a 11:
    
    no_keeper_team1 = match[match["team"] == str_team1]["player"].value_counts().drop(keeper1[0]).index[:11-len(keeper1)].to_list()
    no_keeper_team2 = match[match["team"] == str_team2]["player"].value_counts().drop(keeper2[0]).index[:11-len(keeper2)].to_list()

    team1.extend(no_keeper_team1)
    team2.extend(no_keeper_team2)
    ####

    # Features team1
    all_players = []
    for player in team1:
        dict_player = {"Player": player}
        player_events = df_matches_limited[df_matches_limited["player"] == player]

        pitch = Pitch(pitch_type='opta', pitch_color='white',
                      line_color='black', line_zorder=2)
        for fea in features:
            for div in divisiones[:-1]:
                player_events_time = player_events[(player_events["minute"]>div) & (player_events["minute"]<=div+time_division)]

                event_succesful = player_events_time[(player_events_time["type"] == fea) & (
                    player_events_time["outcome_type"] == 'Successful')]
                event_unsuccessful = player_events_time[(player_events_time["type"] == fea) & (
                    player_events_time["outcome_type"] == 'Unsuccessful')]

                bin_statistic_succesful = pitch.bin_statistic(
                    event_succesful.x, event_succesful.y, statistic="count", bins=(bins_x, bins_y), normalize=False)

                bin_statistic_unsuccesful = pitch.bin_statistic(
                    event_unsuccessful.x, event_unsuccessful.y, statistic="count", bins=(bins_x, bins_y), normalize=False)

                bins_successful = bin_statistic_succesful["statistic"].flatten()
                bins_unsuccessful = bin_statistic_unsuccesful["statistic"].flatten()

                for i in range(len(bins_successful)):
                    dict_player[f"{fea}_Successful_bin_{i}_time_{div}-{div+time_division}"] = bins_successful[i]
                    dict_player[f"{fea}_Unsuccessful_bin_{i}_time_{div}-{div+time_division}"] = bins_unsuccessful[i]

        # Agregar la cantidad de juegos 
        dict_player["Number_matches"] = len(player_events["game"].unique())

        all_players.append(dict_player)

    df_all_players_team1 = pd.DataFrame(all_players)

    # Features team2
    all_players = []
    for player in team2:
        dict_player = {"Player": player}
        player_events = df_matches_limited[df_matches_limited["player"] == player]

        pitch = Pitch(pitch_type='opta', pitch_color='white',
                      line_color='black', line_zorder=2)
        for fea in features:
            for div in divisiones[:-1]:
                player_events_time = player_events[(player_events["minute"]>div) & (player_events["minute"]<=div+time_division)]

                event_succesful = player_events_time[(player_events_time["type"] == fea) & (
                    player_events_time["outcome_type"] == 'Successful')]
                event_unsuccessful = player_events_time[(player_events_time["type"] == fea) & (
                    player_events_time["outcome_type"] == 'Unsuccessful')]

                bin_statistic_succesful = pitch.bin_statistic(
                    event_succesful.x, event_succesful.y, statistic="count", bins=(bins_x, bins_y), normalize=False)

                bin_statistic_unsuccesful = pitch.bin_statistic(
                    event_unsuccessful.x, event_unsuccessful.y, statistic="count", bins=(bins_x, bins_y), normalize=False)

                bins_successful = bin_statistic_succesful["statistic"].flatten()
                bins_unsuccessful = bin_statistic_unsuccesful["statistic"].flatten()

                for i in range(len(bins_successful)):
                    dict_player[f"{fea}_Successful_bin_{i}_time_{div}-{div+time_division}"] = bins_successful[i]
                    dict_player[f"{fea}_Unsuccessful_bin_{i}_time_{div}-{div+time_division}"] = bins_unsuccessful[i]

        # Agregar la cantidad de juegos 
        dict_player["Number_matches"] = len(player_events["game"].unique())

        all_players.append(dict_player)

    df_all_players_team2 = pd.DataFrame(all_players)

    # Agrgar el nombre del equipo
    df_all_players_team1["Team"] = str_team1 
    df_all_players_team2["Team"] = str_team2

    # Agregar fecha
    df_all_players_team1["Date"] = date
    df_all_players_team2["Date"] = date
    
    # Agregar id

    df_all_players_team1["game_id"] = match.game_id.unique()[0]
    df_all_players_team2["game_id"] = match.game_id.unique()[0]

    # Agregar resultado goles del equipo final
    df_all_players_team1["Goals"] = len(match[(match["type"] == "Goal") & (match["team"] == str_team1)])
    df_all_players_team2["Goals"] = len(match[(match["type"] == "Goal") & (match["team"] == str_team2)])

    columnas_a_mover = ["Player","Team","Date","Goals","Number_matches","game_id"]

    # Obtén una lista de todas las columnas en el DataFrame
    columnas = df_all_players_team1.columns.tolist()

    # Mueve las columnas deseadas al principio
    for columna in reversed(columnas_a_mover):
        columnas.insert(0, columnas.pop(columnas.index(columna)))
    # Reordena las columnas del DataFrame
    df_all_players_team1 = df_all_players_team1[columnas]
    df_all_players_team2 = df_all_players_team2[columnas]

    return df_all_players_team1, df_all_players_team2

df_all_players_team1, df_all_players_team2 = get_features_individuales_temporal(matches, example_match, bins_x = 2, bins_y = 2, time_division=15) #usar divisor de 90

In [9]:
df_all_players_team1

Unnamed: 0,Player,Team,Date,Goals,Number_matches,game_id,Pass_Successful_bin_0_time_0-15,Pass_Unsuccessful_bin_0_time_0-15,Pass_Successful_bin_1_time_0-15,Pass_Unsuccessful_bin_1_time_0-15,...,Error_Successful_bin_3_time_60-75,Error_Unsuccessful_bin_3_time_60-75,Error_Successful_bin_0_time_75-90,Error_Unsuccessful_bin_0_time_75-90,Error_Successful_bin_1_time_75-90,Error_Unsuccessful_bin_1_time_75-90,Error_Successful_bin_2_time_75-90,Error_Unsuccessful_bin_2_time_75-90,Error_Successful_bin_3_time_75-90,Error_Unsuccessful_bin_3_time_75-90
0,Jordan Pickford,Everton,2024-02-19,1,237,1729483,506.0,310.0,0.0,0.0,...,1.0,0.0,4.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0
1,James Tarkowski,Everton,2024-02-19,1,221,1729483,381.0,93.0,62.0,21.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Ben Godfrey,Everton,2024-02-19,1,79,1729483,260.0,33.0,57.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,Vitalii Mykolenko,Everton,2024-02-19,1,53,1729483,141.0,40.0,136.0,34.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,James Garner,Everton,2024-02-19,1,41,1729483,15.0,7.0,24.0,16.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Jarrad Branthwaite,Everton,2024-02-19,1,25,1729483,88.0,22.0,10.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Dominic Calvert-Lewin,Everton,2024-02-19,1,184,1729483,32.0,22.0,105.0,57.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Idrissa Gueye,Everton,2024-02-19,1,183,1729483,326.0,39.0,278.0,50.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Dwight McNeil,Everton,2024-02-19,1,153,1729483,166.0,70.0,288.0,164.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Ashley Young,Everton,2024-02-19,1,273,1729483,347.0,86.0,607.0,237.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
df_all_players_team2

Unnamed: 0,Player,Team,Date,Goals,Number_matches,game_id,Pass_Successful_bin_0_time_0-15,Pass_Unsuccessful_bin_0_time_0-15,Pass_Successful_bin_1_time_0-15,Pass_Unsuccessful_bin_1_time_0-15,...,Error_Successful_bin_3_time_60-75,Error_Unsuccessful_bin_3_time_60-75,Error_Successful_bin_0_time_75-90,Error_Unsuccessful_bin_0_time_75-90,Error_Successful_bin_1_time_75-90,Error_Unsuccessful_bin_1_time_75-90,Error_Successful_bin_2_time_75-90,Error_Unsuccessful_bin_2_time_75-90,Error_Successful_bin_3_time_75-90,Error_Unsuccessful_bin_3_time_75-90
0,Sam Johnstone,Crystal Palace,2024-02-19,1,62,1729483,89.0,56.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,Tyrick Mitchell,Crystal Palace,2024-02-19,1,82,1729483,257.0,122.0,197.0,65.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Jordan Ayew,Crystal Palace,2024-02-19,1,230,1729483,106.0,28.0,255.0,68.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Joachim Andersen,Crystal Palace,2024-02-19,1,87,1729483,132.0,22.0,19.0,8.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Adam Wharton,Crystal Palace,2024-02-19,1,2,1729483,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Chris Richards,Crystal Palace,2024-02-19,1,24,1729483,25.0,4.0,9.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Daniel Muñoz,Crystal Palace,2024-02-19,1,2,1729483,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Jean-Philippe Mateta,Crystal Palace,2024-02-19,1,57,1729483,7.0,2.0,17.0,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Joel Ward,Crystal Palace,2024-02-19,1,266,1729483,130.0,64.0,140.0,67.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
9,Odsonne Édouard,Crystal Palace,2024-02-19,1,53,1729483,18.0,5.0,32.0,20.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# bins_x = 12
# bins_y = 8
# time_division = 15
# n_workers = 12
# matches_id = matches["game_id"].unique()
# resultado = []

# #En serie 
# for id in tqdm(matches_id):
#     data = get_features_individuales_temporal(matches, matches[matches["game_id"] == id], bins_x, bins_y, time_division)
#     resultado.append(data)

# #En paralelo
# resultado = Parallel(n_jobs=n_workers, verbose=10)(
#     delayed(get_features_individuales_temporal)(matches[matches["fecha"]<=matches[matches["game_id"] == id]["fecha"].unique()[0]], matches[matches["game_id"] == id], bins_x, bins_y, time_division) for id in tqdm(matches_id))

In [12]:
# dfs_list = []
# for df in resultado:
#     dfs_list.append(df[0])
#     dfs_list.append(df[1])
# df = pd.concat(dfs_list, axis = 0)
# # df = df.drop(["Player","Team","Date","Goals"],axis = 1)

In [13]:
# df.to_csv("data_espacial_whoscored.csv")

In [14]:
# df.head()
