In [1]:
import pandas as pd
from matplotlib.colors import LinearSegmentedColormap
from mplsoccer import VerticalPitch, Pitch
from highlight_text import ax_text, fig_text
import matplotlib.pyplot as plt

from joblib import Parallel, delayed
from tqdm import tqdm

In [2]:
m1 = pd.read_csv("WhoScored_ENG-Premier League.csv")
# m2 = pd.read_csv("WhoScored_ESP-La Liga.csv")
# m3 = pd.read_csv("WhoScored_FRA-Ligue 1.csv")
# m4 = pd.read_csv("WhoScored_GER-Bundesliga.csv")
# m5 = pd.read_csv("WhoScored_ITA-Serie A.csv")


In [3]:
matches = pd.concat([m1], axis = 0)

In [4]:
len(matches["game_id"].unique()[:])

4810

In [5]:
matches = matches[["game","game_id","type","outcome_type","team","fecha","player", "x", "y","minute"]]
matches = matches.sort_values(by=["fecha","game_id","minute"],ignore_index=True)

In [6]:
matches.shape

(7583712, 10)

In [7]:
example_match = matches[matches["game_id"] == 1729340] # 1729340  1729483 615168 410988 433797

In [8]:
example_match

Unnamed: 0,game,game_id,type,outcome_type,team,fecha,player,x,y,minute
7582118,2024-02-21 Liverpool-Luton,1729340,FormationSet,Successful,Liverpool,2024-02-21,,0.0,0.0,0
7582119,2024-02-21 Liverpool-Luton,1729340,FormationSet,Successful,Luton,2024-02-21,,0.0,0.0,0
7582120,2024-02-21 Liverpool-Luton,1729340,Start,Successful,Liverpool,2024-02-21,,0.0,0.0,0
7582121,2024-02-21 Liverpool-Luton,1729340,Start,Successful,Luton,2024-02-21,,0.0,0.0,0
7582122,2024-02-21 Liverpool-Luton,1729340,Pass,Successful,Luton,2024-02-21,Albert Sambi Lokonga,50.2,50.1,0
...,...,...,...,...,...,...,...,...,...,...
7583707,2024-02-21 Liverpool-Luton,1729340,BallTouch,Unsuccessful,Liverpool,2024-02-21,James McConnell,79.5,32.5,97
7583708,2024-02-21 Liverpool-Luton,1729340,Pass,Successful,Luton,2024-02-21,Amari'i Bell,29.3,51.2,97
7583709,2024-02-21 Liverpool-Luton,1729340,Pass,Successful,Luton,2024-02-21,Issa Kaboré,42.5,7.7,98
7583710,2024-02-21 Liverpool-Luton,1729340,End,Successful,Liverpool,2024-02-21,,0.0,0.0,98


In [9]:
print(example_match["type"].unique())
print(example_match["outcome_type"].unique())

['FormationSet' 'Start' 'Pass' 'End' 'TakeOn' 'Tackle' 'BallRecovery'
 'Interception' 'BlockedPass' 'Clearance' 'CornerAwarded' 'Aerial' 'Foul'
 'Dispossessed' 'BallTouch' 'ShieldBallOpp' 'Challenge' 'MissedShots'
 'SavedShot' 'Save' 'KeeperPickup' 'Goal' 'KeeperSweeper' 'Card' 'Smother'
 'OffsideProvoked' 'OffsidePass' 'OffsideGiven' 'SubstitutionOff'
 'SubstitutionOn' 'FormationChange' 'Punch' 'Error']
['Successful' 'Unsuccessful']


In [11]:
def get_features_team_temporal(df_matches, match, n_matches, bins_x, bins_y, time_division): # time_division : divisor de 90 (usar por ejemplo 3,5,15,30,45)
    
    divisiones = list(range(0,90+1,time_division))
    # Agregar tiempo extra
    divisiones[-1] += 15
    ## Features elegidos
    features = ['Pass', 'TakeOn', 'Tackle', 'BallRecovery'
                'Interception', 'BlockedPass', 'Clearance', 'CornerAwarded', 'Aerial', 'Foul',
                'Dispossessed', 'BallTouch', 'ShieldBallOpp', 'Challenge', 'MissedShots',
                'SavedShot', 'Save', 'KeeperPickup', 'Goal', 'KeeperSweeper', 'Card', 'Smother',
                'OffsideProvoked', 'OffsidePass', 'OffsideGiven', 'Punch', 'Error']
    ##
    date = match["fecha"].unique()[0]

    df_matches_limited = df_matches[df_matches["fecha"] < date]
    str_teams = match["team"].unique()
    str_team1 = str_teams[0]
    str_team2 = str_teams[1]

    team1_match_ids = df_matches_limited[df_matches_limited["team"] == str_team1].game_id.unique()[-n_matches:]
    team1_events = df_matches_limited[(df_matches_limited["team"] == str_team1) & (df_matches_limited["game_id"].isin(team1_match_ids))]

    team2_match_ids = df_matches_limited[df_matches_limited["team"] == str_team2].game_id.unique()[-n_matches:]
    team2_events = df_matches_limited[(df_matches_limited["team"] == str_team2) & (df_matches_limited["game_id"].isin(team2_match_ids))]


    dict_team = {"Team" : str_team1}
    for fea in features:
        for div in divisiones[:-1]:
            pitch = Pitch(pitch_type='opta', pitch_color='white',
                            line_color='black', line_zorder=2)
            player_events_time = team1_events[(team1_events["minute"]>div) & (team1_events["minute"]<=div+time_division)]

            event_succesful = player_events_time[(player_events_time["type"] == fea) & (
                player_events_time["outcome_type"] == 'Successful')]
            event_unsuccessful = player_events_time[(player_events_time["type"] == fea) & (
                player_events_time["outcome_type"] == 'Unsuccessful')]

            bin_statistic_succesful = pitch.bin_statistic(
                event_succesful.x, event_succesful.y, statistic="count", bins=(bins_x, bins_y), normalize=False)

            bin_statistic_unsuccesful = pitch.bin_statistic(
                event_unsuccessful.x, event_unsuccessful.y, statistic="count", bins=(bins_x, bins_y), normalize=False)

            bins_successful = bin_statistic_succesful["statistic"].flatten()
            bins_unsuccessful = bin_statistic_unsuccesful["statistic"].flatten()

            for i in range(len(bins_successful)):
                dict_team[f"{fea}_Successful_bin_{i}_time_{div}-{div+time_division}"] = bins_successful[i]
                dict_team[f"{fea}_Unsuccessful_bin_{i}_time_{div}-{div+time_division}"] = bins_unsuccessful[i]

        # Agregar la cantidad de juegos 
        dict_team["Number_matches"] = len(player_events_time["game"].unique())
    
    df_team1 = pd.DataFrame([dict_team])

    dict_team = {"Team" : str_team2}
    for fea in features:
        for div in divisiones[:-1]:
            pitch = Pitch(pitch_type='opta', pitch_color='white',
                            line_color='black', line_zorder=2)
            player_events_time = team2_events[(team2_events["minute"]>div) & (team2_events["minute"]<=div+time_division)]

            event_succesful = player_events_time[(player_events_time["type"] == fea) & (
                player_events_time["outcome_type"] == 'Successful')]
            event_unsuccessful = player_events_time[(player_events_time["type"] == fea) & (
                player_events_time["outcome_type"] == 'Unsuccessful')]

            bin_statistic_succesful = pitch.bin_statistic(
                event_succesful.x, event_succesful.y, statistic="count", bins=(bins_x, bins_y), normalize=False)

            bin_statistic_unsuccesful = pitch.bin_statistic(
                event_unsuccessful.x, event_unsuccessful.y, statistic="count", bins=(bins_x, bins_y), normalize=False)

            bins_successful = bin_statistic_succesful["statistic"].flatten()
            bins_unsuccessful = bin_statistic_unsuccesful["statistic"].flatten()

            for i in range(len(bins_successful)):
                dict_team[f"{fea}_Successful_bin_{i}_time_{div}-{div+time_division}"] = bins_successful[i]
                dict_team[f"{fea}_Unsuccessful_bin_{i}_time_{div}-{div+time_division}"] = bins_unsuccessful[i]

        # Agregar la cantidad de juegos 
        dict_team["Number_matches"] = len(player_events_time["game"].unique())
    df_team2 = pd.DataFrame([dict_team])

    # Agregar fecha
    df_team1["Date"] = date
    df_team2["Date"] = date
    
    # Agregar id

    df_team1["game_id"] = match.game_id.unique()[0]
    df_team2["game_id"] = match.game_id.unique()[0]

    # Agregar resultado goles del equipo final
    df_team1["Goals"] = len(match[(match["type"] == "Goal") & (match["team"] == str_team1)])
    df_team2["Goals"] = len(match[(match["type"] == "Goal") & (match["team"] == str_team2)])

    columnas_a_mover = ["Team","Date","Goals","Number_matches","game_id"]

    # Obtén una lista de todas las columnas en el DataFrame
    columnas = df_team1.columns.tolist()

    # Mueve las columnas deseadas al principio
    for columna in reversed(columnas_a_mover):
        columnas.insert(0, columnas.pop(columnas.index(columna)))
    # Reordena las columnas del DataFrame
    df_team1 = df_team1[columnas]
    df_team2 = df_team2[columnas]

    return df_team1, df_team2

df_all_players_team1, df_all_players_team2 = get_features_team_temporal(matches, example_match, n_matches = 15, bins_x = 8, bins_y = 6, time_division=90) #usar divisor de 90

In [12]:
df_all_players_team1

Unnamed: 0,Team,Date,Goals,Number_matches,game_id,Pass_Successful_bin_0_time_0-90,Pass_Unsuccessful_bin_0_time_0-90,Pass_Successful_bin_1_time_0-90,Pass_Unsuccessful_bin_1_time_0-90,Pass_Successful_bin_2_time_0-90,...,Error_Successful_bin_43_time_0-90,Error_Unsuccessful_bin_43_time_0-90,Error_Successful_bin_44_time_0-90,Error_Unsuccessful_bin_44_time_0-90,Error_Successful_bin_45_time_0-90,Error_Unsuccessful_bin_45_time_0-90,Error_Successful_bin_46_time_0-90,Error_Unsuccessful_bin_46_time_0-90,Error_Successful_bin_47_time_0-90,Error_Unsuccessful_bin_47_time_0-90
0,Liverpool,2024-02-21,4,15,1729340,15.0,11.0,57.0,21.0,109.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
df_all_players_team2

Unnamed: 0,Team,Date,Goals,Number_matches,game_id,Pass_Successful_bin_0_time_0-90,Pass_Unsuccessful_bin_0_time_0-90,Pass_Successful_bin_1_time_0-90,Pass_Unsuccessful_bin_1_time_0-90,Pass_Successful_bin_2_time_0-90,...,Error_Successful_bin_43_time_0-90,Error_Unsuccessful_bin_43_time_0-90,Error_Successful_bin_44_time_0-90,Error_Unsuccessful_bin_44_time_0-90,Error_Successful_bin_45_time_0-90,Error_Unsuccessful_bin_45_time_0-90,Error_Successful_bin_46_time_0-90,Error_Unsuccessful_bin_46_time_0-90,Error_Successful_bin_47_time_0-90,Error_Unsuccessful_bin_47_time_0-90
0,Luton,2024-02-21,1,15,1729340,17.0,19.0,94.0,34.0,157.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
n_matches = 15
bins_x = 8
bins_y = 6
time_division = 90

n_workers = 12
matches_id = matches["game_id"].unique()[:]
resultado = []


# #En serie 
for id in tqdm(matches_id):
    data = get_features_team_temporal(matches, matches[matches["game_id"] == id], n_matches, bins_x, bins_y, time_division)
    resultado.append(data)

#En paralelo
# resultado = Parallel(n_jobs=n_workers, verbose=10)(
#     delayed(get_features_team_temporal)(matches[matches["fecha"]<=matches[matches["game_id"] == id]["fecha"].unique()[0]], matches[matches["game_id"] == id], n_matches, bins_x, bins_y, time_division) for id in tqdm(matches_id))

dfs_list = []
for df in resultado:
    dfs_list.append(df[0])
    dfs_list.append(df[1])
df = pd.concat(dfs_list, axis = 0)

df.to_csv(f"ENG-Premier LeagueWhoScoredTeamNumMatches{n_matches}Spatial{bins_x}x{bins_y}TimeDiv{time_division}.csv")

 18%|█▊        | 863/4810 [13:47<1:03:04,  1.04it/s]


KeyboardInterrupt: 

In [15]:
########### RESUMEN 
LIGAS = ["FRA-Ligue 1", "GER-Bundesliga", "ITA-Serie A"]

n_matches = 15
bins_x = 6
bins_y = 4
time_division = 15

n_workers = 12

for liga in LIGAS:
    matches = pd.read_csv(f"WhoScored_{liga}.csv")
    matches = matches[["game","game_id","type","outcome_type","team","fecha","player", "x", "y","minute"]]
    matches = matches.sort_values(by=["fecha","game_id","minute"],ignore_index=True)

    matches_id = matches["game_id"].unique()[:]
    resultado = []


    # #En serie 
    for id in tqdm(matches_id):
        data = get_features_team_temporal(matches, matches[matches["game_id"] == id], n_matches, bins_x, bins_y, time_division)
        resultado.append(data)

    #En paralelo
    # resultado = Parallel(n_jobs=n_workers, verbose=10)(
    #     delayed(get_features_team_temporal)(matches[matches["fecha"]<=matches[matches["game_id"] == id]["fecha"].unique()[0]], matches[matches["game_id"] == id], n_matches, bins_x, bins_y, time_division) for id in tqdm(matches_id))

    dfs_list = []
    for df in resultado:
        dfs_list.append(df[0])
        dfs_list.append(df[1])
    df = pd.concat(dfs_list, axis = 0)

    df.to_csv(f"{liga}WhoScoredTeamNumMatches{n_matches}Spatial{bins_x}x{bins_y}TimeDiv{time_division}.csv")

KeyboardInterrupt: 

In [None]:
# n1 = pd.read_csv(f"ENG-Premier LeagueWhoScoredTeamNumMatches{n_matches}Spatial{bins_x}x{bins_y}TimeDiv{time_division}.csv")
# n2 = pd.read_csv(f"ESP-La LigaWhoScoredTeamNumMatches{n_matches}Spatial{bins_x}x{bins_y}TimeDiv{time_division}.csv")
# n3 = pd.read_csv(f"FRA-Ligue 1WhoScoredTeamNumMatches{n_matches}Spatial{bins_x}x{bins_y}TimeDiv{time_division}.csv")
# n4 = pd.read_csv(f"GER-BundesligaWhoScoredTeamNumMatches{n_matches}Spatial{bins_x}x{bins_y}TimeDiv{time_division}.csv")
# n5 = pd.read_csv(f"ITA-Serie AWhoScoredTeamNumMatches{n_matches}Spatial{bins_x}x{bins_y}TimeDiv{time_division}.csv")

In [None]:
# df_final = pd.concat([n1, n2, n3, n4, n5], axis = 0)

In [None]:
# df_final.to_csv(f"WhoScoredTeamNumMatches{n_matches}Spatial{bins_x}x{bins_y}TimeDiv{time_division}.csv")

In [None]:
# df_final