In [1]:
import pandas as pd
from matplotlib.colors import LinearSegmentedColormap
from mplsoccer import VerticalPitch, Pitch
from highlight_text import ax_text, fig_text
import matplotlib.pyplot as plt

from joblib import Parallel, delayed
from tqdm import tqdm

In [2]:
m1 = pd.read_csv("WhoScored_ENG-Premier League.csv")
m2 = pd.read_csv("WhoScored_ESP-La Liga.csv")
m3 = pd.read_csv("WhoScored_FRA-Ligue 1.csv")
m4 = pd.read_csv("WhoScored_GER-Bundesliga.csv")
m5 = pd.read_csv("WhoScored_ITA-Serie A.csv")


In [3]:
matches = pd.concat([m1,m2,m3,m4,m5], axis = 0)

In [4]:
len(matches["game_id"].unique()[:])

22968

In [5]:
matches = matches[["game","game_id","type","outcome_type","team","fecha","player", "x", "y","minute"]]
matches = matches.sort_values(by=["fecha","game_id","minute"],ignore_index=True)

In [6]:
matches.shape

(35832810, 10)

In [7]:
example_match = matches[matches["game_id"] == 1729340] # 1729340  1729483 615168 410988 433797

In [8]:
example_match

Unnamed: 0,game,game_id,type,outcome_type,team,fecha,player,x,y,minute
35813304,2024-02-21 Liverpool-Luton,1729340,FormationSet,Successful,Liverpool,2024-02-21,,0.0,0.0,0
35813305,2024-02-21 Liverpool-Luton,1729340,FormationSet,Successful,Luton,2024-02-21,,0.0,0.0,0
35813306,2024-02-21 Liverpool-Luton,1729340,Start,Successful,Liverpool,2024-02-21,,0.0,0.0,0
35813307,2024-02-21 Liverpool-Luton,1729340,Start,Successful,Luton,2024-02-21,,0.0,0.0,0
35813308,2024-02-21 Liverpool-Luton,1729340,Pass,Successful,Luton,2024-02-21,Albert Sambi Lokonga,50.2,50.1,0
...,...,...,...,...,...,...,...,...,...,...
35814893,2024-02-21 Liverpool-Luton,1729340,BallTouch,Unsuccessful,Liverpool,2024-02-21,James McConnell,79.5,32.5,97
35814894,2024-02-21 Liverpool-Luton,1729340,Pass,Successful,Luton,2024-02-21,Amari'i Bell,29.3,51.2,97
35814895,2024-02-21 Liverpool-Luton,1729340,Pass,Successful,Luton,2024-02-21,Issa Kaboré,42.5,7.7,98
35814896,2024-02-21 Liverpool-Luton,1729340,End,Successful,Liverpool,2024-02-21,,0.0,0.0,98


In [9]:
print(example_match["type"].unique())
print(example_match["outcome_type"].unique())

['FormationSet' 'Start' 'Pass' 'End' 'TakeOn' 'Tackle' 'BallRecovery'
 'Interception' 'BlockedPass' 'Clearance' 'CornerAwarded' 'Aerial' 'Foul'
 'Dispossessed' 'BallTouch' 'ShieldBallOpp' 'Challenge' 'MissedShots'
 'SavedShot' 'Save' 'KeeperPickup' 'Goal' 'KeeperSweeper' 'Card' 'Smother'
 'OffsideProvoked' 'OffsidePass' 'OffsideGiven' 'SubstitutionOff'
 'SubstitutionOn' 'FormationChange' 'Punch' 'Error']
['Successful' 'Unsuccessful']


In [10]:
def features_per_match(match, bins_x, bins_y, time_division): # time_division : divisor de 90 (usar por ejemplo 3,5,15,30,45)
    
    divisiones = list(range(0,90+1,time_division))
    # Agregar tiempo extra
    divisiones[-1] += 15
    ## Features elegidos
    features = ['Pass', 'TakeOn', 'Tackle', 'BallRecovery'
                'Interception', 'BlockedPass', 'Clearance', 'CornerAwarded', 'Aerial', 'Foul',
                'Dispossessed', 'BallTouch', 'ShieldBallOpp', 'Challenge', 'MissedShots',
                'SavedShot', 'Save', 'KeeperPickup', 'Goal', 'KeeperSweeper', 'Card', 'Smother',
                'OffsideProvoked', 'OffsidePass', 'OffsideGiven', 'Punch', 'Error']
    ##
    date = match["fecha"].unique()[0]

    str_teams = match["team"].unique()
    str_team1 = str_teams[0]
    str_team2 = str_teams[1]

    team1_match_ids = match[match["team"] == str_team1].game_id.unique()
    team1_events = match[(match["team"] == str_team1) & (match["game_id"].isin(team1_match_ids))]

    team2_match_ids = match[match["team"] == str_team2].game_id.unique()
    team2_events = match[(match["team"] == str_team2) & (match["game_id"].isin(team2_match_ids))]


    dict_team = {"Team" : str_team1}
    for fea in features:
        for div in divisiones[:-1]:
            pitch = Pitch(pitch_type='opta', pitch_color='white',
                            line_color='black', line_zorder=2)
            k = 0
            if div+time_division>=90:
                k = 20 # Tiempo extra
            player_events_time = team1_events[(team1_events["minute"]>div) & (team1_events["minute"]<=div+time_division+k)]

            event_succesful = player_events_time[(player_events_time["type"] == fea) & (
                player_events_time["outcome_type"] == 'Successful')]
            event_unsuccessful = player_events_time[(player_events_time["type"] == fea) & (
                player_events_time["outcome_type"] == 'Unsuccessful')]

            bin_statistic_succesful = pitch.bin_statistic(
                event_succesful.x, event_succesful.y, statistic="count", bins=(bins_x, bins_y), normalize=False)

            bin_statistic_unsuccesful = pitch.bin_statistic(
                event_unsuccessful.x, event_unsuccessful.y, statistic="count", bins=(bins_x, bins_y), normalize=False)

            bins_successful = bin_statistic_succesful["statistic"].flatten()
            bins_unsuccessful = bin_statistic_unsuccesful["statistic"].flatten()

            for i in range(len(bins_successful)):
                dict_team[f"{fea}_Successful_bin_{i}_time_{div}-{div+time_division}"] = bins_successful[i]
                dict_team[f"{fea}_Unsuccessful_bin_{i}_time_{div}-{div+time_division}"] = bins_unsuccessful[i]

    df_team1 = pd.DataFrame([dict_team])

    dict_team = {"Team" : str_team2}
    for fea in features:
        for div in divisiones[:-1]:
            pitch = Pitch(pitch_type='opta', pitch_color='white',
                            line_color='black', line_zorder=2)
            k = 0
            if div+time_division>=90:
                k = 20 # Tiempo extra, con 20 minutos nos aseguramos.
            player_events_time = team2_events[(team2_events["minute"]>div) & (team2_events["minute"]<=div+time_division + k)]

            event_succesful = player_events_time[(player_events_time["type"] == fea) & (
                player_events_time["outcome_type"] == 'Successful')]
            event_unsuccessful = player_events_time[(player_events_time["type"] == fea) & (
                player_events_time["outcome_type"] == 'Unsuccessful')]

            bin_statistic_succesful = pitch.bin_statistic(
                event_succesful.x, event_succesful.y, statistic="count", bins=(bins_x, bins_y), normalize=False)

            bin_statistic_unsuccesful = pitch.bin_statistic(
                event_unsuccessful.x, event_unsuccessful.y, statistic="count", bins=(bins_x, bins_y), normalize=False)

            bins_successful = bin_statistic_succesful["statistic"].flatten()
            bins_unsuccessful = bin_statistic_unsuccesful["statistic"].flatten()

            for i in range(len(bins_successful)):
                dict_team[f"{fea}_Successful_bin_{i}_time_{div}-{div+time_division}"] = bins_successful[i]
                dict_team[f"{fea}_Unsuccessful_bin_{i}_time_{div}-{div+time_division}"] = bins_unsuccessful[i]

    df_team2 = pd.DataFrame([dict_team])

    # Agregar fecha
    df_team1["Date"] = date
    df_team2["Date"] = date
    
    # Agregar id

    df_team1["game_id"] = match.game_id.unique()[0]
    df_team2["game_id"] = match.game_id.unique()[0]

    # Agregar resultado goles del equipo final
    df_team1["Goals"] = len(match[(match["type"] == "Goal") & (match["team"] == str_team1)])
    df_team2["Goals"] = len(match[(match["type"] == "Goal") & (match["team"] == str_team2)])

    columnas_a_mover = ["Team","Date","Goals","game_id"]

    # Obtén una lista de todas las columnas en el DataFrame
    columnas = df_team1.columns.tolist()

    # Mueve las columnas deseadas al principio
    for columna in reversed(columnas_a_mover):
        columnas.insert(0, columnas.pop(columnas.index(columna)))
    # Reordena las columnas del DataFrame
    df_team1 = df_team1[columnas]
    df_team2 = df_team2[columnas]

    return df_team1, df_team2

df_all_players_team1, df_all_players_team2 = features_per_match(example_match, bins_x = 8, bins_y = 6, time_division=90) #usar divisor de 90

In [11]:
df_all_players_team1

Unnamed: 0,Team,Date,Goals,game_id,Pass_Successful_bin_0_time_0-90,Pass_Unsuccessful_bin_0_time_0-90,Pass_Successful_bin_1_time_0-90,Pass_Unsuccessful_bin_1_time_0-90,Pass_Successful_bin_2_time_0-90,Pass_Unsuccessful_bin_2_time_0-90,...,Error_Successful_bin_43_time_0-90,Error_Unsuccessful_bin_43_time_0-90,Error_Successful_bin_44_time_0-90,Error_Unsuccessful_bin_44_time_0-90,Error_Successful_bin_45_time_0-90,Error_Unsuccessful_bin_45_time_0-90,Error_Successful_bin_46_time_0-90,Error_Unsuccessful_bin_46_time_0-90,Error_Successful_bin_47_time_0-90,Error_Unsuccessful_bin_47_time_0-90
0,Liverpool,2024-02-21,4,1729340,1.0,0.0,2.0,0.0,11.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
df_all_players_team2

Unnamed: 0,Team,Date,Goals,game_id,Pass_Successful_bin_0_time_0-90,Pass_Unsuccessful_bin_0_time_0-90,Pass_Successful_bin_1_time_0-90,Pass_Unsuccessful_bin_1_time_0-90,Pass_Successful_bin_2_time_0-90,Pass_Unsuccessful_bin_2_time_0-90,...,Error_Successful_bin_43_time_0-90,Error_Unsuccessful_bin_43_time_0-90,Error_Successful_bin_44_time_0-90,Error_Unsuccessful_bin_44_time_0-90,Error_Successful_bin_45_time_0-90,Error_Unsuccessful_bin_45_time_0-90,Error_Successful_bin_46_time_0-90,Error_Unsuccessful_bin_46_time_0-90,Error_Successful_bin_47_time_0-90,Error_Unsuccessful_bin_47_time_0-90
0,Luton,2024-02-21,1,1729340,3.0,3.0,2.0,2.0,8.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
bins_x = 4
bins_y = 3
time_division = [5]

n_workers = 12
matches_id = matches["game_id"].unique()[:]
resultado = []


# #En serie 
# for id in tqdm(matches_id):
#     data = features_per_match(matches[matches["game_id"] == id], bins_x, bins_y, time_division)
#     resultado.append(data)

for td in time_division:
    #En paralelo
    resultado = Parallel(n_jobs=n_workers, verbose=10)(
        delayed(features_per_match)(matches[matches["game_id"] == id], bins_x, bins_y, td) for id in tqdm(matches_id))

    dfs_list = []
    for df in resultado:
        dfs_list.append(df[0])
        dfs_list.append(df[1])
    df = pd.concat(dfs_list, axis = 0)

    df.to_csv(f"WhoScoredTeamPerMatchSpatial{bins_x}x{bins_y}TimeDiv{td}.csv")

  0%|          | 0/22968 [00:00<?, ?it/s][Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
  0%|          | 22/22968 [00:00<13:08, 29.10it/s][Parallel(n_jobs=12)]: Done   1 tasks      | elapsed:   16.4s
  0%|          | 33/22968 [00:16<4:22:29,  1.46it/s][Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:   17.4s
  0%|          | 45/22968 [00:29<3:57:38,  1.61it/s] [Parallel(n_jobs=12)]: Done  17 tasks      | elapsed:   30.4s
  0%|          | 57/22968 [00:43<4:03:21,  1.57it/s] [Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:   43.6s
[Parallel(n_jobs=12)]: Done  37 tasks      | elapsed:   56.1s
  0%|          | 69/22968 [00:56<3:52:09,  1.64it/s] [Parallel(n_jobs=12)]: Done  48 tasks      | elapsed:  1.0min
  0%|          | 81/22968 [01:10<4:01:47,  1.58it/s] [Parallel(n_jobs=12)]: Done  61 tasks      | elapsed:  1.4min
  0%|          | 105/22968 [01:38<4:16:15,  1.49it/s][Parallel(n_jobs=12)]: Done  74 tasks      | elapsed:  1.6min
  1%|          | 11

In [15]:
df

Unnamed: 0,Team,Date,Goals,game_id,Pass_Successful_bin_0_time_0-5,Pass_Unsuccessful_bin_0_time_0-5,Pass_Successful_bin_1_time_0-5,Pass_Unsuccessful_bin_1_time_0-5,Pass_Successful_bin_2_time_0-5,Pass_Unsuccessful_bin_2_time_0-5,...,Error_Successful_bin_7_time_85-90,Error_Unsuccessful_bin_7_time_85-90,Error_Successful_bin_8_time_85-90,Error_Unsuccessful_bin_8_time_85-90,Error_Successful_bin_9_time_85-90,Error_Unsuccessful_bin_9_time_85-90,Error_Successful_bin_10_time_85-90,Error_Unsuccessful_bin_10_time_85-90,Error_Successful_bin_11_time_85-90,Error_Unsuccessful_bin_11_time_85-90
0,Lorient,2010-08-07,2,406097,3.0,0.0,11.0,0.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,Auxerre,2010-08-07,2,406097,1.0,0.0,2.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,Lens,2010-08-07,1,406098,0.0,0.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,Nancy,2010-08-07,2,406098,0.0,0.0,2.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,Monaco,2010-08-07,0,406099,0.0,0.0,1.0,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,Monaco,2024-02-25,2,1741135,0.0,0.0,2.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,Le Havre,2024-02-25,1,1741137,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,Reims,2024-02-25,2,1741137,3.0,0.0,1.0,0.0,4.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,Rennes,2024-02-25,1,1741159,3.0,0.0,5.0,1.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
