In [1]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
import os
import sys
from multiprocessing import Pool, cpu_count
from tqdm.auto import tqdm
import pandas as pd

In [3]:
import warnings
warnings.filterwarnings('ignore', category=pd.errors.SettingWithCopyWarning)

In [4]:
project_root = os.path.abspath(
    os.path.join("C:/Users/jllgo/OneDrive/Documentos/UFMG/MSc & MSI 2/MatchSegmentation")
)
if project_root not in sys.path:
    sys.path.append(project_root)

In [5]:
from src.pass_networks.process_games import process_game
from src.pass_networks.process_games import load_game
from src.pass_networks.process_games import save_game
from src.pass_networks.process_match_info import get_match_info
from src.pass_networks.process_match_info import process_players
from src.pass_networks.process_intervals import get_interval_graphs
from src.pass_networks.process_intervals import load_graphs
from src.pass_networks.process_intervals import save_graphs

In [6]:
RAW_DATA_PATH = '../data/raw/PL-22-23'
INTERMEDIATE_DATA_PATH = '../data/intermediate/PL-22-23'
PROCESSED_DATA_PATH = "../data/processed/PL-22-23"
CSV_DATA_PATH = "../data/csv"

# 1. Get Data

In [7]:
games = os.listdir(RAW_DATA_PATH)
raw_game_ids = [game.split('.')[0] for game in games]
print(f"Games to process: {len(raw_game_ids)}")

Games to process: 190


In [26]:
pre_processed_games = os.listdir(INTERMEDIATE_DATA_PATH)
pre_processed_game_ids = [game.split('.')[0] for game in pre_processed_games]
print(f"Games already pre-processed: {len(pre_processed_game_ids)}")

Games already pre-processed: 9


['4436', '4438', '4439', '4440', '4441', '4442', '4443', '4444', '4446']

In [24]:
processed = os.listdir(PROCESSED_DATA_PATH)
processed_game_ids = [game.split('.')[0] for game in processed]
print(f"Games already processed: {len(processed_game_ids)}")

Games already processed: 0


In [34]:
failed_ids = pd.read_csv(f"{CSV_DATA_PATH}/failed_matches.csv", header=None).values.astype(str).flatten().tolist()
failed_ids

['4437', '4445']

In [None]:
matches_to_process = [game_id for game_id in raw_game_ids if game_id not in pre_processed_game_ids and game_id not in failed_ids][:20]
print(f"Games to process: {len(matches_to_process)}")

Games to process: 3


In [None]:
matches = []

failed_matches = []

# #Check if the game has already been pre-processed
# if len(pre_processed_game_ids) > 0:
#     tasks = [(INTERMEDIATE_DATA_PATH,game_id) for game_id in pre_processed_game_ids]

#     for task in tqdm(tasks, desc="Loading Pre-Processed Games", total=len(tasks)):
#         metadata_df, players_df, events_df = load_game(task)
#         matches.append((task[1],metadata_df, players_df, events_df))

if len(matches_to_process) > 0:

    for game_id in tqdm(matches_to_process, desc="Processing Games", total=len(matches_to_process)):
        metadata_df, players_df, events_df = process_game((RAW_DATA_PATH, game_id))
        if metadata_df.empty or players_df.empty or events_df.empty:
            failed_matches.append(game_id)
            continue
        else:
            matches.append((game_id,metadata_df, players_df, events_df))
            save_game(metadata_df,players_df,events_df,INTERMEDIATE_DATA_PATH,game_id)



Loading Pre-Processed Games:   0%|          | 0/8 [00:00<?, ?it/s]

Processing Games:   0%|          | 0/3 [00:00<?, ?it/s]

Loading frames from match 4437: 0 frames [00:00, ? frames/s]

Validating frames:   0%|          | 0/146439 [00:00<?, ? frames/s]

Error processing match_id 4437: Error getting match events for match_id=4437


Loading frames from match 4445: 0 frames [00:00, ? frames/s]

Validating frames:   0%|          | 0/175531 [00:00<?, ? frames/s]

Error processing match_id 4445: Error getting match events for match_id=4445


Loading frames from match 4446: 0 frames [00:00, ? frames/s]

Validating frames:   0%|          | 0/181548 [00:00<?, ? frames/s]

In [14]:
failed_matches = pd.DataFrame(failed_matches, columns=["game_id"])
failed_matches.to_csv(f"{CSV_DATA_PATH}/failed_matches.csv", index=False, header=False, mode='a')

In [37]:
players_info, teams_info, games_info = get_match_info(CSV_DATA_PATH)

# 2. Data Treatment

In [38]:
players_dfs = []
events_dfs = []
metadata_dfs = []
match_ids = []

for match_dfs in tqdm(matches, desc="Data Processing"):
    match_id, metadata_df, players_df, events_df = match_dfs
    players_match_info = players_info[players_info['match_id'] == int(match_id)].reset_index(drop=True)
    match_info = games_info[games_info['match_id'] == int(match_id)].reset_index(drop=True)

    players_df = process_players(players_df, match_info, players_match_info, metadata_df)

    players_dfs.append(players_df)
    events_dfs.append(events_df)
    metadata_dfs.append(metadata_df)
    match_ids.append(match_id)

players_df = pd.concat(players_dfs)
events_df = pd.concat(events_dfs)
metadata_df = pd.concat(metadata_dfs)

Data Processing:   0%|          | 0/9 [00:00<?, ?it/s]

In [39]:
players_interval_df = players_df.groupby(['match_id','interval_id','team_id','player_id','nickname','shirt']).agg(
    x=('x', 'mean'),
    y=('y', 'mean')
).reset_index()

players_interval_df['player_id'] = players_interval_df['player_id'].astype(int)
players_interval_df['match_id'] = players_interval_df['match_id'].astype(int)
players_interval_df['interval_id'] = players_interval_df['interval_id'].astype(int)
players_interval_df['team_id'] = players_interval_df['team_id'].astype(int)
players_interval_df['shirt'] = players_interval_df['shirt'].astype(int)

players_interval_df[players_interval_df['interval_id']==1]

Unnamed: 0,match_id,interval_id,team_id,player_id,nickname,shirt,x,y
0,4436,1,2,15,Oleksandr Zinchenko,35,13.573355,-7.494776
1,4436,1,2,30,Gabriel Jesus,9,-3.277079,14.615077
2,4436,1,2,157,Granit Xhaka,34,7.887703,0.839211
3,4436,1,2,162,Gabriel Martinelli,11,-0.625160,-2.459962
4,4436,1,2,163,Bukayo Saka,7,5.372688,26.414956
...,...,...,...,...,...,...,...,...
8669,4446,1,9,220,James Maddison,10,1.202907,-0.480332
8670,4446,1,9,225,Jamie Vardy,9,-7.655601,-3.839095
8671,4446,1,9,1960,Timothy Castagne,27,8.776463,18.250230
8672,4446,1,9,2069,Wesley Fofana,3,20.699112,9.525530


In [41]:
match_intervals = players_df[['match_id','interval_id','team_id','team']].drop_duplicates().reset_index(drop=True)
match_intervals = match_intervals[match_intervals['interval_id'].notna()].reset_index(drop=True)
match_intervals

Unnamed: 0,match_id,interval_id,team_id,team
0,4436,1.0,7,home
1,4436,1.0,2,away
2,4436,2.0,7,home
3,4436,2.0,2,away
4,4436,3.0,7,home
...,...,...,...,...
877,4446,49.0,9,away
878,4446,50.0,2,home
879,4446,50.0,9,away
880,4446,51.0,2,home


In [42]:
data = [{
    'match_id':row['match_id'],
    'interval_id':row['interval_id'],
    'team_id': row['team_id'],
    'player_id': -1,
    'nickname': 'Goal',
    'shirt': -1,
    'x': 52 if row['team']=='home' else -52,
    'y': 0
} for _,row in match_intervals.iterrows()]

data = pd.DataFrame(data)

players_interval_df = pd.concat([players_interval_df, data], ignore_index=True).sort_values(['match_id','interval_id','team_id','player_id']).reset_index(drop=True)
players_interval_df

Unnamed: 0,match_id,interval_id,team_id,player_id,nickname,shirt,x,y
0,4436,1.0,2,-1,Goal,-1,-52.000000,0.000000
1,4436,1.0,2,15,Oleksandr Zinchenko,35,13.573355,-7.494776
2,4436,1.0,2,30,Gabriel Jesus,9,-3.277079,14.615077
3,4436,1.0,2,157,Granit Xhaka,34,7.887703,0.839211
4,4436,1.0,2,162,Gabriel Martinelli,11,-0.625160,-2.459962
...,...,...,...,...,...,...,...,...
10610,4446,51.0,9,226,Kelechi Iheanacho,14,-25.892583,6.703000
10611,4446,51.0,9,1960,Timothy Castagne,27,3.158194,13.366986
10612,4446,51.0,9,2069,Wesley Fofana,3,9.033496,0.745741
10613,4446,51.0,9,5096,Patson Daka,20,-17.482180,4.933504


In [43]:
players_interval_df = players_interval_df[players_interval_df['interval_id'].notna()].reset_index(drop=True)
players_interval_df

Unnamed: 0,match_id,interval_id,team_id,player_id,nickname,shirt,x,y
0,4436,1.0,2,-1,Goal,-1,-52.000000,0.000000
1,4436,1.0,2,15,Oleksandr Zinchenko,35,13.573355,-7.494776
2,4436,1.0,2,30,Gabriel Jesus,9,-3.277079,14.615077
3,4436,1.0,2,157,Granit Xhaka,34,7.887703,0.839211
4,4436,1.0,2,162,Gabriel Martinelli,11,-0.625160,-2.459962
...,...,...,...,...,...,...,...,...
10610,4446,51.0,9,226,Kelechi Iheanacho,14,-25.892583,6.703000
10611,4446,51.0,9,1960,Timothy Castagne,27,3.158194,13.366986
10612,4446,51.0,9,2069,Wesley Fofana,3,9.033496,0.745741
10613,4446,51.0,9,5096,Patson Daka,20,-17.482180,4.933504


### Events

In [44]:
pass_df = events_df[(events_df['possession_type']=='PASS') & (events_df['outcome']=='C')][['match_id','player_id','receiver_id','event_id','possession_id','team_id']]
pass_df.head()

Unnamed: 0,match_id,player_id,receiver_id,event_id,possession_id,team_id
0,4436,1940,2073.0,4427560,4245025,7
1,4436,2073,5090.0,4427697,4245173,7
2,4436,5090,4213.0,4427699,4245175,7
3,4436,4213,2073.0,4427701,4245177,7
4,4436,2073,5090.0,4427705,4245181,7


In [45]:
carry_df = events_df[(events_df['possession_type']=='CARRY') & ((events_df['carry_type'].isin(['T','C'])) | ((events_df['carry_type']=='D')&(events_df['outcome'].isin(['K','B']))))][['match_id','player_id','receiver_id','event_id','possession_id','team_id']]
carry_df.head()

Unnamed: 0,match_id,player_id,receiver_id,event_id,possession_id,team_id
22,4436,1688,1688.0,4427757,4521767,2
24,4436,30,30.0,4427766,4250669,2
45,4436,6016,6016.0,4427834,4250178,7
91,4436,15,15.0,4427963,4245440,2
132,4436,15,15.0,4428086,4245561,2


In [46]:
shots_df = events_df[(events_df['possession_type']=='SHOT')][['match_id','player_id','receiver_id','event_id','possession_id','team_id']]
shots_df.head()

Unnamed: 0,match_id,player_id,receiver_id,event_id,possession_id,team_id
62,4436,162,-1.0,4427873,4245343,2
63,4436,30,-1.0,4427881,4245353,2
114,4436,15,-1.0,4428024,4245498,2
118,4436,1380,-1.0,4428045,4245519,2
253,4436,162,-1.0,4428502,4245951,2


In [47]:
network_events_df = pd.concat([pass_df,carry_df, shots_df]).reset_index(drop=True)
network_events_df.head()

Unnamed: 0,match_id,player_id,receiver_id,event_id,possession_id,team_id
0,4436,1940,2073.0,4427560,4245025,7
1,4436,2073,5090.0,4427697,4245173,7
2,4436,5090,4213.0,4427699,4245175,7
3,4436,4213,2073.0,4427701,4245177,7
4,4436,2073,5090.0,4427705,4245181,7


### Networks

In [48]:
network_frame_df = network_events_df.merge(metadata_df[['match_id','interval_id','frame_id','event_id','possession_id']], on=['match_id','event_id','possession_id'], how='left')
network_frame_df.head()

Unnamed: 0,match_id,player_id,receiver_id,event_id,possession_id,team_id,interval_id,frame_id
0,4436,1940,2073.0,4427560,4245025,7,1.0,1594.0
1,4436,2073,5090.0,4427697,4245173,7,1.0,1669.0
2,4436,5090,4213.0,4427699,4245175,7,1.0,1733.0
3,4436,4213,2073.0,4427701,4245177,7,1.0,1759.0
4,4436,2073,5090.0,4427705,4245181,7,1.0,1810.0


In [49]:
# Group by match_id, interval_id, team_id, player_id, receiver_id to get the number of interactions between players

grouped_df = network_frame_df.groupby(['match_id','interval_id','team_id','player_id','receiver_id']).size().reset_index(name='count')
grouped_df['match_id']=grouped_df['match_id'].astype(int)
grouped_df['interval_id']=grouped_df['interval_id'].astype(int)
grouped_df['team_id']=grouped_df['team_id'].astype(int)
grouped_df['player_id']=grouped_df['player_id'].astype(int)
grouped_df['receiver_id']=grouped_df['receiver_id'].astype(int)

grouped_df.head()

Unnamed: 0,match_id,interval_id,team_id,player_id,receiver_id,count
0,4436,1,2,15,1894,1
1,4436,1,2,30,30,1
2,4436,1,2,157,162,1
3,4436,1,2,162,163,1
4,4436,1,2,163,30,1


In [50]:
grouped_df = grouped_df.merge(players_info[['match_id','player_id','nickname','shirt_number']], how='left', left_on=['match_id','player_id'], right_on=['match_id','player_id']).drop_duplicates().reset_index(drop=True)
grouped_df = grouped_df.merge(players_info[['match_id','player_id','nickname','shirt_number']], how='left', left_on=['match_id','receiver_id'], right_on=['match_id','player_id']).drop_duplicates().reset_index(drop=True)

grouped_df = grouped_df.rename(columns={'player_id_x':'player_id','nickname_x':'player_nickname','shirt_number_x':'player_shirt','nickname_y':'receiver_nickname','shirt_number_y':'receiver_shirt'})
grouped_df = grouped_df.drop(['player_id_y'], axis=1)

grouped_df['receiver_nickname'] = grouped_df['receiver_nickname'].fillna('Goal')
grouped_df['receiver_shirt'] = grouped_df['receiver_shirt'].fillna(-1)
grouped_df['receiver_shirt'] = grouped_df['receiver_shirt'].astype(int)

grouped_df.head()

Unnamed: 0,match_id,interval_id,team_id,player_id,receiver_id,count,player_nickname,player_shirt,receiver_nickname,receiver_shirt
0,4436,1,2,15,1894,1,Oleksandr Zinchenko,35,Gabriel,6
1,4436,1,2,30,30,1,Gabriel Jesus,9,Gabriel Jesus,9
2,4436,1,2,157,162,1,Granit Xhaka,34,Gabriel Martinelli,11
3,4436,1,2,162,163,1,Gabriel Martinelli,11,Bukayo Saka,7
4,4436,1,2,163,30,1,Bukayo Saka,7,Gabriel Jesus,9


# Graphs

In [64]:
graph_games = os.listdir(PROCESSED_DATA_PATH)
graph_game_ids = [game.split('.')[0] for game in graph_games]
print(f"Games to process: {len(graph_game_ids)}")

Games to process: 9


In [65]:
pre_processed_ids = [game_id for game_id in match_ids if game_id not in graph_game_ids]
print(f"Games to process graphs: {len(pre_processed_ids)}")

Games to process graphs: 0


In [69]:
#graphs = []

# if len(graph_game_ids) > 0:

#     for game_id in tqdm(graph_game_ids, desc="Loading Graphs", total=len(graph_game_ids)):
#         match_graphs = load_graphs(game_id, PROCESSED_DATA_PATH)
#         graphs.append((task[1],match_graphs))

if len(pre_processed_ids) > 0:

    for game_id in tqdm(pre_processed_ids, desc="Processing Graphs", total=len(pre_processed_ids)):
        print(game_id)

        match_grouped_df = grouped_df[grouped_df['match_id']==int(game_id)].reset_index(drop=True)
        match_players_interval_df = players_interval_df[players_interval_df['match_id']==int(game_id)].reset_index(drop=True)

        match_graphs = get_interval_graphs(match_grouped_df, match_players_interval_df)
        save_graphs(game_id, PROCESSED_DATA_PATH, match_graphs)
        #graphs.append((game_id,match_graphs))

Loading Graphs:   0%|          | 0/9 [00:00<?, ?it/s]