In [1]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
import os
import sys
from multiprocessing import Pool, cpu_count
from tqdm.auto import tqdm
import pandas as pd

In [3]:
import warnings
warnings.filterwarnings('ignore', category=pd.errors.SettingWithCopyWarning)

In [4]:
project_root = os.path.abspath(
    os.path.join("C:/Users/jllgo/OneDrive/Documentos/UFMG/MSc & MSI 2/MatchSegmentation")
)
if project_root not in sys.path:
    sys.path.append(project_root)

In [5]:
from src.pass_networks.process_games import process_game
from src.pass_networks.process_games import load_game
from src.pass_networks.process_games import save_game
from src.pass_networks.process_match_info import get_match_info
from src.pass_networks.process_match_info import process_players
from src.pass_networks.process_intervals import get_interval_graphs
from src.pass_networks.process_intervals import load_graphs
from src.pass_networks.process_intervals import save_graphs

In [6]:
RAW_DATA_PATH = '../data/raw/PL-22-23'
INTERMEDIATE_DATA_PATH = '../data/intermediate/PL-22-23'
PROCESSED_DATA_PATH = "../data/processed/PL-22-23"
CSV_DATA_PATH = "../data/csv"

# 1. Get Data

In [7]:
games = os.listdir(RAW_DATA_PATH)
game_ids = [game.split('.')[0] for game in games]
print(f"Games to process: {len(game_ids)}")

Games to process: 190


In [8]:
pre_processed_games = os.listdir(INTERMEDIATE_DATA_PATH)
pre_processed_game_ids = [game.split('.')[0] for game in pre_processed_games]
print(f"Games already pre-processed: {len(pre_processed_game_ids)}")

Games already pre-processed: 190


In [9]:
processed = os.listdir(PROCESSED_DATA_PATH)
processed_game_ids = [game.split('.')[0] for game in processed]
print(f"Games already processed: {len(processed_game_ids)}")

Games already processed: 80


In [17]:
matches_to_process = [game_id for game_id in game_ids if game_id not in processed_game_ids]
print(f"Games to process: {len(matches_to_process)}")

Games to process: 110


In [18]:
all_games = pre_processed_game_ids + game_ids

In [19]:
matches = []

#Check if the game has already been pre-processed
if len(matches_to_process) > 0:
    tasks = [(INTERMEDIATE_DATA_PATH,game_id) for game_id in matches_to_process]

    for task in tqdm(tasks, desc="Loading Pre-Processed Games", total=len(tasks)):
        metadata_df, players_df, events_df = load_game(task)
        matches.append((task[1],metadata_df, players_df, events_df))

# if len(raw_game_ids) > 0:

#     for game_id in tqdm(raw_game_ids, desc="Processing Games", total=len(raw_game_ids)):
#         metadata_df, players_df, events_df = process_game((RAW_DATA_PATH, game_id))
#         matches.append((game_id,metadata_df, players_df, events_df))
#         save_game(metadata_df,players_df,events_df,INTERMEDIATE_DATA_PATH,game_id)

Loading Pre-Processed Games:   0%|          | 0/110 [00:00<?, ?it/s]

In [20]:
players_info, teams_info, games_info = get_match_info(CSV_DATA_PATH)

# 2. Data Treatment

In [21]:
len(matches)

110

In [83]:
players_dfs = []
events_dfs = []
metadata_dfs = []
match_ids = []

for match_dfs in tqdm(matches[100:], desc="Data Processing"):
    match_id, metadata_df, players_df, events_df = match_dfs
    players_match_info = players_info[players_info['match_id'] == int(match_id)].reset_index(drop=True)
    match_info = games_info[games_info['match_id'] == int(match_id)].reset_index(drop=True)

    players_df = process_players(players_df, match_info, players_match_info, metadata_df)

    players_dfs.append(players_df)
    events_dfs.append(events_df)
    metadata_dfs.append(metadata_df)
    match_ids.append(match_id)

players_df = pd.concat(players_dfs)
events_df = pd.concat(events_dfs)
metadata_df = pd.concat(metadata_dfs)

Data Processing:   0%|          | 0/10 [00:00<?, ?it/s]

In [84]:
players_df = players_df[players_df['interval_id'].notna()].reset_index(drop=True)
metadata_df = metadata_df[metadata_df['interval_id'].notna()].reset_index(drop=True)

In [85]:
players_interval_df = players_df.groupby(['match_id','interval_id','team_id','player_id','nickname','shirt']).agg(
    x=('x', 'mean'),
    y=('y', 'mean')
).reset_index()

players_interval_df['player_id'] = players_interval_df['player_id'].astype(int)
players_interval_df['match_id'] = players_interval_df['match_id'].astype(int)
players_interval_df['interval_id'] = players_interval_df['interval_id'].astype(int)
players_interval_df['team_id'] = players_interval_df['team_id'].astype(int)
players_interval_df['shirt'] = players_interval_df['shirt'].astype(int)

players_interval_df.head()

Unnamed: 0,match_id,interval_id,team_id,player_id,nickname,shirt,x,y
0,4616,1,2,15,Oleksandr Zinchenko,35,-20.241368,8.668105
1,4616,1,2,157,Granit Xhaka,34,-13.845737,12.999579
2,4616,1,2,162,Gabriel Martinelli,11,-1.596737,22.053105
3,4616,1,2,163,Bukayo Saka,7,-0.802632,-24.583737
4,4616,1,2,168,Eddie Nketiah,14,-2.476211,4.055158


In [86]:
match_intervals = players_df[['match_id','interval_id','team_id','team']].drop_duplicates().reset_index(drop=True)
match_intervals

Unnamed: 0,match_id,interval_id,team_id,team
0,4616,1.0,2,home
1,4616,1.0,13,away
2,4616,2.0,2,home
3,4616,2.0,13,away
4,4616,3.0,2,home
...,...,...,...,...
973,4625,48.0,221,away
974,4625,49.0,16,home
975,4625,49.0,221,away
976,4625,50.0,16,home


In [87]:
data = [{
    'match_id':row['match_id'],
    'interval_id':row['interval_id'],
    'team_id': row['team_id'],
    'player_id': -1,
    'nickname': 'Goal',
    'shirt': -1,
    'x': 52 if row['team']=='home' else -52,
    'y': 0
} for _,row in match_intervals.iterrows()]

data = pd.DataFrame(data)

players_interval_df = pd.concat([players_interval_df, data], ignore_index=True).sort_values(['match_id','interval_id','team_id','player_id']).reset_index(drop=True)
players_interval_df

Unnamed: 0,match_id,interval_id,team_id,player_id,nickname,shirt,x,y
0,4616,1.0,2,-1,Goal,-1,52.000000,0.000000
1,4616,1.0,2,15,Oleksandr Zinchenko,35,-20.241368,8.668105
2,4616,1.0,2,157,Granit Xhaka,34,-13.845737,12.999579
3,4616,1.0,2,162,Gabriel Martinelli,11,-1.596737,22.053105
4,4616,1.0,2,163,Bukayo Saka,7,-0.802632,-24.583737
...,...,...,...,...,...,...,...,...
11762,4625,50.0,221,7001,Ryan Yates,22,35.124000,-6.359000
11763,4625,50.0,221,7003,Joe Worrall,4,42.979000,-1.358000
11764,4625,50.0,221,7174,Harry Toffolo,15,41.727000,-11.497000
11765,4625,50.0,221,8328,Remo Freuler,23,40.901000,-6.860000


### Events

In [88]:
pass_df = events_df[(events_df['possession_type']=='PASS') & (events_df['outcome']=='C')][['match_id','player_id','receiver_id','event_id','possession_id','team_id']]
pass_df.head()

Unnamed: 0,match_id,player_id,receiver_id,event_id,possession_id,team_id
0,4616.0,335.0,320.0,6900393.0,6789845.0,13.0
1,4616.0,320.0,3942.0,6900400.0,6789850.0,13.0
2,4616.0,3942.0,320.0,6900417.0,6789869.0,13.0
3,4616.0,320.0,3942.0,6900421.0,6789873.0,13.0
4,4616.0,3942.0,441.0,6900425.0,6789877.0,13.0


In [89]:
carry_df = events_df[(events_df['possession_type']=='CARRY') & ((events_df['carry_type'].isin(['T','C'])) | ((events_df['carry_type']=='D')&(events_df['outcome'].isin(['K','B']))))][['match_id','player_id','receiver_id','event_id','possession_id','team_id']]
carry_df.head()

Unnamed: 0,match_id,player_id,receiver_id,event_id,possession_id,team_id
34,4616.0,162.0,162.0,6900652.0,6790101.0,2.0
164,4616.0,157.0,157.0,6901256.0,6797380.0,2.0
168,4616.0,162.0,162.0,6901269.0,6797712.0,2.0
236,4616.0,311.0,311.0,6901694.0,6791142.0,13.0
237,4616.0,311.0,311.0,6901694.0,6791142.0,13.0


In [90]:
shots_df = events_df[(events_df['possession_type']=='SHOT')][['match_id','player_id','receiver_id','event_id','possession_id','team_id']]
shots_df.head()

Unnamed: 0,match_id,player_id,receiver_id,event_id,possession_id,team_id
30,4616.0,162.0,-1.0,6900628.0,6790071.0,2.0
31,4616.0,162.0,-1.0,6900628.0,6790071.0,2.0
32,4616.0,162.0,-1.0,6900628.0,6790071.0,2.0
33,4616.0,162.0,-1.0,6900628.0,6790071.0,2.0
38,4616.0,1688.0,-1.0,6900681.0,6790123.0,2.0


In [91]:
network_events_df = pd.concat([pass_df,carry_df, shots_df]).reset_index(drop=True)
network_events_df.head()

Unnamed: 0,match_id,player_id,receiver_id,event_id,possession_id,team_id
0,4616.0,335.0,320.0,6900393.0,6789845.0,13.0
1,4616.0,320.0,3942.0,6900400.0,6789850.0,13.0
2,4616.0,3942.0,320.0,6900417.0,6789869.0,13.0
3,4616.0,320.0,3942.0,6900421.0,6789873.0,13.0
4,4616.0,3942.0,441.0,6900425.0,6789877.0,13.0


### Networks

In [92]:
network_frame_df = network_events_df.merge(metadata_df[['match_id','interval_id','frame_id','event_id','possession_id']], on=['match_id','event_id','possession_id'], how='left')
network_frame_df.head()

Unnamed: 0,match_id,player_id,receiver_id,event_id,possession_id,team_id,interval_id,frame_id
0,4616.0,335.0,320.0,6900393.0,6789845.0,13.0,,
1,4616.0,320.0,3942.0,6900400.0,6789850.0,13.0,1.0,3534.0
2,4616.0,3942.0,320.0,6900417.0,6789869.0,13.0,1.0,3630.0
3,4616.0,320.0,3942.0,6900421.0,6789873.0,13.0,1.0,3710.0
4,4616.0,3942.0,441.0,6900425.0,6789877.0,13.0,1.0,3806.0


In [93]:
# Group by match_id, interval_id, team_id, player_id, receiver_id to get the number of interactions between players

grouped_df = network_frame_df.groupby(['match_id','interval_id','team_id','player_id','receiver_id']).size().reset_index(name='count')
grouped_df['match_id']=grouped_df['match_id'].astype(int)
grouped_df['interval_id']=grouped_df['interval_id'].astype(int)
grouped_df['team_id']=grouped_df['team_id'].astype(int)
grouped_df['player_id']=grouped_df['player_id'].astype(int)
grouped_df['receiver_id']=grouped_df['receiver_id'].astype(int)

grouped_df.head()

Unnamed: 0,match_id,interval_id,team_id,player_id,receiver_id,count
0,4616,1,2,15,1380,1
1,4616,1,2,157,168,2
2,4616,1,2,168,15,1
3,4616,1,2,281,168,1
4,4616,1,2,281,1380,1


In [94]:
# Get player info
grouped_df = grouped_df.merge(players_info[['match_id','team_id','player_id','nickname','shirt_number']], how='left', left_on=['match_id','player_id'], right_on=['match_id','player_id']).drop_duplicates().reset_index(drop=True)

# Get receiver info
grouped_df = grouped_df.merge(players_info[['match_id','team_id','player_id','nickname','shirt_number']], how='left', left_on=['match_id','receiver_id'], right_on=['match_id','player_id'], suffixes=('_player','_receiver')).drop_duplicates().reset_index(drop=True)

grouped_df.head()

Unnamed: 0,match_id,interval_id,team_id_x,player_id_player,receiver_id,count,team_id_y,nickname_player,shirt_number_player,team_id,player_id_receiver,nickname_receiver,shirt_number_receiver
0,4616,1,2,15,1380,1,2,Oleksandr Zinchenko,35,2.0,1380.0,Thomas Partey,5.0
1,4616,1,2,157,168,2,2,Granit Xhaka,34,2.0,168.0,Eddie Nketiah,14.0
2,4616,1,2,168,15,1,2,Eddie Nketiah,14,2.0,15.0,Oleksandr Zinchenko,35.0
3,4616,1,2,281,168,1,2,Aaron Ramsdale,1,2.0,168.0,Eddie Nketiah,14.0
4,4616,1,2,281,1380,1,2,Aaron Ramsdale,1,2.0,1380.0,Thomas Partey,5.0


# Graphs

In [95]:
graph_games = os.listdir(PROCESSED_DATA_PATH)
graph_game_ids = [game.split('.')[0] for game in graph_games]
print(f"Games to process: {len(graph_game_ids)}")

Games to process: 180


In [96]:
pre_processed_ids = [game_id for game_id in match_ids if game_id not in graph_game_ids]
print(f"Games to process graphs: {len(pre_processed_ids)}")

Games to process graphs: 10


In [97]:
graphs = []

# if len(graph_game_ids) > 0:
#     tasks = [(PROCESSED_DATA_PATH,game_id) for game_id in graph_game_ids]

#     for task in tqdm(tasks, desc="Loading Pre-Processed Graphs", total=len(tasks)):
#         match_graphs = load_graphs(task)
#         graphs.append((task[1],match_graphs))

if len(pre_processed_ids) > 0:

    for game_id in tqdm(pre_processed_ids, desc="Processing Graphs", total=len(pre_processed_ids)):

        match_grouped_df = grouped_df[grouped_df['match_id']==int(game_id)]
        match_players_interval_df = players_interval_df[players_interval_df['match_id']==int(game_id)]

        match_graphs = get_interval_graphs(match_grouped_df, match_players_interval_df)

        save_graphs(game_id, PROCESSED_DATA_PATH, match_graphs)
        graphs.append((game_id,match_graphs))

Processing Graphs:   0%|          | 0/10 [00:00<?, ?it/s]

Processing intervals:   0%|          | 0/50 [00:00<?, ?it/s]

Processing intervals:   0%|          | 0/48 [00:00<?, ?it/s]

Processing intervals: 0it [00:00, ?it/s]

Processing intervals: 0it [00:00, ?it/s]

Processing intervals:   0%|          | 0/47 [00:00<?, ?it/s]

Processing intervals:   0%|          | 0/47 [00:00<?, ?it/s]

Processing intervals:   0%|          | 0/48 [00:00<?, ?it/s]

Processing intervals:   0%|          | 0/49 [00:00<?, ?it/s]

Processing intervals:   0%|          | 0/48 [00:00<?, ?it/s]

Processing intervals:   0%|          | 0/49 [00:00<?, ?it/s]