In [1]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
import os
import sys
from multiprocessing import Pool, cpu_count
from tqdm.auto import tqdm
import pandas as pd

In [3]:
import warnings
warnings.filterwarnings('ignore', category=pd.errors.SettingWithCopyWarning)

In [4]:
project_root = os.path.abspath(
    os.path.join("C:/Users/jllgo/OneDrive/Documentos/UFMG/MSc & MSI 2/MatchSegmentation")
)
if project_root not in sys.path:
    sys.path.append(project_root)

In [5]:
from src.pass_networks.process_games import process_game
from src.pass_networks.process_games import load_game
from src.pass_networks.process_games import save_game
from src.pass_networks.process_match_info import get_match_info
from src.pass_networks.process_match_info import process_players
from src.pass_networks.process_intervals import get_interval_graphs
from src.pass_networks.process_intervals import load_graphs
from src.pass_networks.process_intervals import save_graphs

In [6]:
RAW_DATA_PATH = '../data/raw/PL-22-23'
INTERMEDIATE_DATA_PATH = '../data/intermediate/PL-22-23'
PROCESSED_DATA_PATH = "../data/processed/PL-22-23"
CSV_DATA_PATH = "../data/csv"

# 1. Get Data

In [7]:
games = os.listdir(RAW_DATA_PATH)
game_ids = [game.split('.')[0] for game in games]
print(f"Games to process: {len(game_ids)}")

Games to process: 190


In [8]:
pre_processed_games = os.listdir(INTERMEDIATE_DATA_PATH)
pre_processed_game_ids = [game.split('.')[0] for game in pre_processed_games][:80]
print(f"Games already pre-processed: {len(pre_processed_game_ids)}")

Games already pre-processed: 80


In [9]:
raw_game_ids = [game_id for game_id in game_ids if game_id not in pre_processed_game_ids]
print(f"Games to process: {len(raw_game_ids)}")

Games to process: 110


In [10]:
all_games = pre_processed_game_ids + game_ids

In [11]:
matches = []

# Check if the game has already been pre-processed
if len(pre_processed_game_ids) > 0:
    tasks = [(INTERMEDIATE_DATA_PATH,game_id) for game_id in pre_processed_game_ids]

    for task in tqdm(tasks, desc="Loading Pre-Processed Games", total=len(tasks)):
        metadata_df, players_df, events_df = load_game(task)
        matches.append((task[1],metadata_df, players_df, events_df))

# if len(raw_game_ids) > 0:

#     for game_id in tqdm(raw_game_ids, desc="Processing Games", total=len(raw_game_ids)):
#         metadata_df, players_df, events_df = process_game((RAW_DATA_PATH, game_id))
#         matches.append((game_id,metadata_df, players_df, events_df))
#         save_game(metadata_df,players_df,events_df,INTERMEDIATE_DATA_PATH,game_id)

Loading Pre-Processed Games:   0%|          | 0/80 [00:00<?, ?it/s]

In [12]:
players_info, teams_info, games_info = get_match_info(CSV_DATA_PATH)

# 2. Data Treatment

In [13]:
len(matches)

80

In [14]:
players_dfs = []
events_dfs = []
metadata_dfs = []
match_ids = []

for match_dfs in tqdm(matches[50:], desc="Data Processing"):
    match_id, metadata_df, players_df, events_df = match_dfs
    players_match_info = players_info[players_info['match_id'] == int(match_id)].reset_index(drop=True)
    match_info = games_info[games_info['match_id'] == int(match_id)].reset_index(drop=True)

    players_df = process_players(players_df, match_info, players_match_info, metadata_df)

    players_dfs.append(players_df)
    events_dfs.append(events_df)
    metadata_dfs.append(metadata_df)
    match_ids.append(match_id)

players_df = pd.concat(players_dfs)
events_df = pd.concat(events_dfs)
metadata_df = pd.concat(metadata_dfs)

Data Processing:   0%|          | 0/30 [00:00<?, ?it/s]

In [15]:
players_interval_df = players_df.groupby(['match_id','interval_id','team_id','player_id','nickname','shirt']).agg(
    x=('x', 'mean'),
    y=('y', 'mean')
).reset_index()

players_interval_df['player_id'] = players_interval_df['player_id'].astype(int)
players_interval_df['match_id'] = players_interval_df['match_id'].astype(int)
players_interval_df['interval_id'] = players_interval_df['interval_id'].astype(int)
players_interval_df['team_id'] = players_interval_df['team_id'].astype(int)
players_interval_df['shirt'] = players_interval_df['shirt'].astype(int)

players_interval_df.head()

Unnamed: 0,match_id,interval_id,team_id,player_id,nickname,shirt,x,y
0,4486,1,3,144,Emiliano Martínez,1,-38.007435,-0.833326
1,4486,1,3,176,Lucas Digne,27,-6.749065,16.700413
2,4486,1,3,355,Tyrone Mings,5,-14.922913,2.708
3,4486,1,3,357,Ezri Konsa,4,-13.971152,-7.011413
4,4486,1,3,366,Douglas Luiz,6,-1.808326,-10.5105


In [16]:
goal_position = pd.DataFrame(columns=['match_id','interval_id','team_id','player_id','x','y'])

data = [{
    'match_id':4625,
    'interval_id':row['interval_id'],
    'team_id': row['team_id'],
    'player_id': -1,
    'nickname': 'Goal',
    'shirt': -1,
    'x': 52 if row['team']=='home' else -52,
    'y': 0 if row['team']=='home' else 0
} for _,row in players_df[['interval_id','team_id', 'team']].drop_duplicates().iterrows()]

players_interval_df = pd.concat([players_interval_df, pd.DataFrame(data)], ignore_index=True).sort_values(['interval_id','team_id','player_id']).reset_index(drop=True)
players_interval_df

Unnamed: 0,match_id,interval_id,team_id,player_id,nickname,shirt,x,y
0,4625,1.0,1,-1,Goal,-1,-52.000000,0.000000
1,4625,1.0,1,-1,Goal,-1,52.000000,0.000000
2,4493,1.0,1,286,Chris Mepham,6,17.304357,13.146179
3,4512,1.0,1,286,Chris Mepham,6,21.547568,10.243324
4,4493,1.0,1,289,Lloyd Kelly,5,16.153464,-3.479179
...,...,...,...,...,...,...,...,...
34412,4625,,55,-1,Goal,-1,52.000000,0.000000
34413,4625,,119,-1,Goal,-1,52.000000,0.000000
34414,4625,,119,-1,Goal,-1,-52.000000,0.000000
34415,4625,,221,-1,Goal,-1,52.000000,0.000000


In [17]:
players_interval_df['match_id'].nunique()

31

### Events

In [18]:
pass_df = events_df[(events_df['possession_type']=='PASS') & (events_df['outcome']=='C')][['match_id','player_id','receiver_id','event_id','possession_id','team_id']]
pass_df.head()

Unnamed: 0,match_id,player_id,receiver_id,event_id,possession_id,team_id
0,4486.0,4687.0,18.0,5059430.0,4892648.0,11.0
1,4486.0,18.0,27.0,5059538.0,4892752.0,11.0
2,4486.0,27.0,16.0,5059546.0,4892760.0,11.0
3,4486.0,16.0,10.0,5059558.0,4892772.0,11.0
4,4486.0,10.0,2066.0,5059564.0,4892779.0,11.0


In [19]:
carry_df = events_df[(events_df['possession_type']=='CARRY') & ((events_df['carry_type'].isin(['T','C'])) | ((events_df['carry_type']=='D')&(events_df['outcome'].isin(['K','B']))))][['match_id','player_id','receiver_id','event_id','possession_id','team_id']]
carry_df.head()

Unnamed: 0,match_id,player_id,receiver_id,event_id,possession_id,team_id
12,4486.0,21.0,21.0,5059686.0,4892897.0,11.0
13,4486.0,21.0,21.0,5059686.0,4892897.0,11.0
14,4486.0,21.0,21.0,5059686.0,4892897.0,11.0
234,4486.0,21.0,21.0,5062007.0,4895188.0,11.0
235,4486.0,21.0,21.0,5062007.0,4895188.0,11.0


In [20]:
shots_df = events_df[(events_df['possession_type']=='SHOT')][['match_id','player_id','receiver_id','event_id','possession_id','team_id']]
shots_df.head()

Unnamed: 0,match_id,player_id,receiver_id,event_id,possession_id,team_id
133,4486.0,16.0,-1.0,5060959.0,4894165.0,11.0
134,4486.0,16.0,-1.0,5060959.0,4894165.0,11.0
208,4486.0,22.0,-1.0,5061746.0,4894928.0,11.0
209,4486.0,22.0,-1.0,5061746.0,4894928.0,11.0
231,4486.0,4687.0,-1.0,5061971.0,4895151.0,11.0


In [21]:
network_events_df = pd.concat([pass_df,carry_df, shots_df]).reset_index(drop=True)
network_events_df.head()

Unnamed: 0,match_id,player_id,receiver_id,event_id,possession_id,team_id
0,4486.0,4687.0,18.0,5059430.0,4892648.0,11.0
1,4486.0,18.0,27.0,5059538.0,4892752.0,11.0
2,4486.0,27.0,16.0,5059546.0,4892760.0,11.0
3,4486.0,16.0,10.0,5059558.0,4892772.0,11.0
4,4486.0,10.0,2066.0,5059564.0,4892779.0,11.0


### Networks

In [22]:
network_frame_df = network_events_df.merge(metadata_df[['match_id','interval_id','frame_id','event_id','possession_id']], on=['match_id','event_id','possession_id'], how='left')
network_frame_df.head()

Unnamed: 0,match_id,player_id,receiver_id,event_id,possession_id,team_id,interval_id,frame_id
0,4486.0,4687.0,18.0,5059430.0,4892648.0,11.0,,
1,4486.0,18.0,27.0,5059538.0,4892752.0,11.0,24.0,86355.0
2,4486.0,27.0,16.0,5059546.0,4892760.0,11.0,24.0,86468.0
3,4486.0,16.0,10.0,5059558.0,4892772.0,11.0,24.0,86533.0
4,4486.0,10.0,2066.0,5059564.0,4892779.0,11.0,24.0,86655.0


In [23]:
# Group by match_id, interval_id, team_id, player_id, receiver_id to get the number of interactions between players

grouped_df = network_frame_df.groupby(['match_id','interval_id','team_id','player_id','receiver_id']).size().reset_index(name='count')
grouped_df['match_id']=grouped_df['match_id'].astype(int)
grouped_df['interval_id']=grouped_df['interval_id'].astype(int)
grouped_df['team_id']=grouped_df['team_id'].astype(int)
grouped_df['player_id']=grouped_df['player_id'].astype(int)
grouped_df['receiver_id']=grouped_df['receiver_id'].astype(int)

grouped_df.head()

Unnamed: 0,match_id,interval_id,team_id,player_id,receiver_id,count
0,4486,1,3,144,1900,1
1,4486,1,3,176,2062,1
2,4486,1,3,1896,144,1
3,4486,1,3,2062,176,3
4,4486,1,3,3977,1896,1


In [24]:
# Get player info
grouped_df = grouped_df.merge(players_info[['match_id','team_id','player_id','nickname','shirt_number']], how='left', left_on=['match_id','player_id'], right_on=['match_id','player_id']).drop_duplicates().reset_index(drop=True)

# Get receiver info
grouped_df = grouped_df.merge(players_info[['match_id','team_id','player_id','nickname','shirt_number']], how='left', left_on=['match_id','receiver_id'], right_on=['match_id','player_id'], suffixes=('_player','_receiver')).drop_duplicates().reset_index(drop=True)

grouped_df.head()

Unnamed: 0,match_id,interval_id,team_id_x,player_id_player,receiver_id,count,team_id_y,nickname_player,shirt_number_player,team_id,player_id_receiver,nickname_receiver,shirt_number_receiver
0,4486,1,3,144,1900,1,3,Emiliano Martínez,1,3.0,1900.0,Ollie Watkins,11.0
1,4486,1,3,176,2062,1,3,Lucas Digne,27,3.0,2062.0,Jacob Ramsey,41.0
2,4486,1,3,1896,144,1,3,Matty Cash,2,3.0,144.0,Emiliano Martínez,1.0
3,4486,1,3,2062,176,3,3,Jacob Ramsey,41,3.0,176.0,Lucas Digne,27.0
4,4486,1,3,3977,1896,1,3,Boubacar Kamara,44,3.0,1896.0,Matty Cash,2.0


# Graphs

In [25]:
graph_games = os.listdir(PROCESSED_DATA_PATH)
graph_game_ids = [game.split('.')[0] for game in graph_games]
print(f"Games to process: {len(graph_game_ids)}")

Games to process: 50


In [26]:
pre_processed_ids = [game_id for game_id in match_ids if game_id not in graph_game_ids]
print(f"Games to process graphs: {len(pre_processed_ids)}")

Games to process graphs: 30


In [27]:
graphs = []

# if len(graph_game_ids) > 0:
#     tasks = [(PROCESSED_DATA_PATH,game_id) for game_id in graph_game_ids]

#     for task in tqdm(tasks, desc="Loading Pre-Processed Graphs", total=len(tasks)):
#         match_graphs = load_graphs(task)
#         graphs.append((task[1],match_graphs))

if len(pre_processed_ids) > 0:

    for game_id in tqdm(pre_processed_ids, desc="Processing Graphs", total=len(pre_processed_ids)):

        match_grouped_df = grouped_df[grouped_df['match_id']==int(game_id)]
        match_players_interval_df = players_interval_df[players_interval_df['match_id']==int(game_id)]

        match_graphs = get_interval_graphs(match_grouped_df, match_players_interval_df)

        save_graphs(game_id, PROCESSED_DATA_PATH, match_graphs)
        graphs.append((game_id,match_graphs))

Processing Graphs:   0%|          | 0/30 [00:00<?, ?it/s]

Processing intervals:   0%|          | 0/48 [00:00<?, ?it/s]

Processing intervals:   0%|          | 0/45 [00:00<?, ?it/s]

Processing intervals:   0%|          | 0/46 [00:00<?, ?it/s]

Processing intervals:   0%|          | 0/47 [00:00<?, ?it/s]

Processing intervals:   0%|          | 0/49 [00:00<?, ?it/s]

Processing intervals:   0%|          | 0/48 [00:00<?, ?it/s]

Processing intervals:   0%|          | 0/48 [00:00<?, ?it/s]

Processing intervals: 0it [00:00, ?it/s]

Processing intervals:   0%|          | 0/52 [00:00<?, ?it/s]

Processing intervals:   0%|          | 0/49 [00:00<?, ?it/s]

Processing intervals:   0%|          | 0/48 [00:00<?, ?it/s]

Processing intervals:   0%|          | 0/48 [00:00<?, ?it/s]

Processing intervals:   0%|          | 0/49 [00:00<?, ?it/s]

Processing intervals: 0it [00:00, ?it/s]

Processing intervals:   0%|          | 0/48 [00:00<?, ?it/s]

Processing intervals:   0%|          | 0/49 [00:00<?, ?it/s]

Processing intervals: 0it [00:00, ?it/s]

Processing intervals:   0%|          | 0/49 [00:00<?, ?it/s]

Processing intervals:   0%|          | 0/49 [00:00<?, ?it/s]

Processing intervals:   0%|          | 0/49 [00:00<?, ?it/s]

Processing intervals:   0%|          | 0/48 [00:00<?, ?it/s]

Processing intervals:   0%|          | 0/47 [00:00<?, ?it/s]

Processing intervals:   0%|          | 0/50 [00:00<?, ?it/s]

Processing intervals: 0it [00:00, ?it/s]

Processing intervals: 0it [00:00, ?it/s]

Processing intervals:   0%|          | 0/51 [00:00<?, ?it/s]

Processing intervals:   0%|          | 0/50 [00:00<?, ?it/s]

Processing intervals:   0%|          | 0/47 [00:00<?, ?it/s]

Processing intervals:   0%|          | 0/47 [00:00<?, ?it/s]

Processing intervals:   0%|          | 0/47 [00:00<?, ?it/s]

In [28]:
import networkx as nx
nx.draw(graphs[0][1][0]['2'])

KeyError: '2'