## Metrica Data Processing

- 연구에 활용할 수 있도록 Metrica 데이터(metrica-data)를 전처리하는 코드
- Metrica 샘플 1, 2, 3경기를 loading & preprocessing하는 코드

In [1]:
import os
import sys

# Put the parent of the current working directory on the import path.
# Presumably this is the repository root that holds the project-local `datatools`
# package imported below — confirm against the repo layout.
sys.path.append(os.path.abspath(os.path.dirname(os.getcwd())))

In [2]:
%load_ext autoreload
%autoreload 2

import json
import xml.etree.ElementTree as ET

import numpy as np
import pandas as pd
import torch
from matplotlib import animation
from tqdm import tqdm

from datatools.metrica_helper import MetricaHelper
from datatools.trace_helper import TraceHelper

In [3]:
# Load the raw Home/Away tracking CSVs of a single sample game for a first look.
match_id = 1
game_prefix = f"../data/sample-data/Sample_Game_{match_id}/Sample_Game_{match_id}_RawTrackingData"
team1_file = f"{game_prefix}_Home_Team.csv"
team2_file = f"{game_prefix}_Away_Team.csv"

# Each file is read with its first three rows as a three-level column header.
team1_traces, team2_traces = (
    pd.read_csv(csv_path, header=[0, 1, 2]) for csv_path in (team1_file, team2_file)
)

In [4]:
# Display the Away-team tracking frame (read with a three-level header) for inspection.
team2_traces

Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Away,Unnamed: 4_level_0,Away,Unnamed: 6_level_0,Away,Unnamed: 8_level_0,Away,Away,Away,Unnamed: 24_level_0,Away,Unnamed: 26_level_0,Away,Unnamed: 28_level_0,Away,Unnamed: 30_level_0,Away,Unnamed: 32_level_0
Unnamed: 0_level_1,Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,25,Unnamed: 4_level_1,15,Unnamed: 6_level_1,16,Unnamed: 8_level_1,17,...,24,Unnamed: 24_level_1,26,Unnamed: 26_level_1,27,Unnamed: 28_level_1,28,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1
Unnamed: 0_level_2,Period,Frame,Time [s],Player25,Unnamed: 4_level_2,Player15,Unnamed: 6_level_2,Player16,Unnamed: 8_level_2,Player17,...,Player24,Unnamed: 24_level_2,Player26,Unnamed: 26_level_2,Player27,Unnamed: 28_level_2,Player28,Unnamed: 30_level_2,Ball,Unnamed: 32_level_2
0,1,1,0.04,0.90509,0.47462,0.58393,0.20794,0.67658,0.46710,0.67310,...,0.37833,0.27383,,,,,,,0.45472,0.38709
1,1,2,0.08,0.90494,0.47462,0.58393,0.20794,0.67658,0.46710,0.67310,...,0.37833,0.27383,,,,,,,0.49645,0.40656
2,1,3,0.12,0.90434,0.47463,0.58393,0.20794,0.67658,0.46710,0.67310,...,0.37833,0.27383,,,,,,,0.53716,0.42556
3,1,4,0.16,0.90377,0.47463,0.58351,0.20868,0.67640,0.46762,0.67279,...,0.37756,0.27473,,,,,,,0.55346,0.42231
4,1,5,0.20,0.90324,0.47464,0.58291,0.21039,0.67599,0.46769,0.67253,...,0.37663,0.27543,,,,,,,0.55512,0.40570
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145001,2,145002,5800.08,0.12564,0.55386,0.17876,0.56826,0.25818,0.59952,0.09880,...,,,0.35600,0.55371,0.19836,0.45137,0.21798,0.81079,,
145002,2,145003,5800.12,0.12564,0.55386,0.17838,0.56727,0.25799,0.59999,0.09880,...,,,0.35583,0.55283,0.19831,0.45182,0.21798,0.81079,,
145003,2,145004,5800.16,0.12564,0.55386,0.17792,0.56682,0.25757,0.60019,0.09880,...,,,0.35561,0.55254,0.19805,0.45200,0.21798,0.81079,,
145004,2,145005,5800.20,0.12564,0.55386,0.17730,0.56621,0.25721,0.60089,0.09880,...,,,0.35532,0.55243,0.19766,0.45237,0.21798,0.81079,,


### Parsing Metrica Sample Game 3 Data

In [5]:
# Parse the Sample Game 3 metadata XML and show the tags of its first two top-level sections.
metadata_xml_path = "../data/sample-data/Sample_Game_3/Sample_Game_3_metadata.xml"
tree = ET.parse(metadata_xml_path)
root = tree.getroot()

first_section, second_section = root[0], root[1]
(first_section.tag, second_section.tag)

('Metadata', 'DataFormatSpecifications')

In [6]:
# Build a lookup table (shirt number -> player code, position) from every <Player>
# element of the metadata XML.
player_rows = []

for player in root.iter("Player"):
    # The last character of teamId is used as the team letter of the player code
    # (the displayed codes look like "A11").
    team_code = player.get("teamId")[-1]
    squad_num = int(player.findtext("ShirtNumber"))
    player_code = f"{team_code}{squad_num:02d}"

    # Reset for every player: without this, a player lacking a "position_type"
    # parameter would silently inherit the previous player's position (or raise
    # NameError if it happened to be the first player).
    position = None
    for param in player.iter("ProviderParameter"):
        if param.findtext("Name") == "position_type":
            # No early exit: if the parameter appears more than once, the last value wins.
            position = param.findtext("Value")

    player_rows.append([squad_num, player_code, position])

player_records = pd.DataFrame(player_rows, columns=["squad_num", "code", "position"]).set_index("squad_num")
player_records

Unnamed: 0_level_0,code,position
squad_num,Unnamed: 1_level_1,Unnamed: 2_level_1
11,A11,Goalkeeper
1,A01,Right Back
2,A02,Right Center Back (4)
3,A03,Left Center Back (4)
4,A04,Left Back
5,A05,"Attacking Right Midfielder (4,5)"
6,A06,Defensive Right Center Midfielder (5)
7,A07,Attacking Center Midfielder
8,A08,Defensive Left Center Midfielder (5)
9,A09,"Attacking Left Midfielder (4,5)"


In [7]:
# Build one record per data-format specification (a "phase") found under root[1]:
# its frame range, session, ordered player codes and goalkeeper codes.
phase_rows = []

for spec_idx, data_spec in enumerate(root[1]):
    start_frame = int(data_spec.get("startFrame"))
    end_frame = int(data_spec.get("endFrame"))
    # Only the first specification is labelled session 1; every later one is session 2.
    session = 2 if spec_idx > 0 else 1

    # The shirt number is what remains of playerChannelId after dropping its first 6
    # and last 2 characters (presumably IDs look like "player<N>_x" — confirm in the XML).
    squad_nums = [int(player_xy[0].get("playerChannelId")[6:-2]) for player_xy in data_spec[1]]
    codes_in_file_order = [player_records.at[num, "code"] for num in squad_nums]
    gk_codes = [
        player_records.at[num, "code"]
        for num in squad_nums
        if player_records.at[num, "position"] == "Goalkeeper"
    ]

    # Reorder: entry 10 first, then entries 0-9, then the last entry, then entries 11..n-2.
    # NOTE(review): looks like this moves each team's goalkeeper to the front of its
    # team's block (the displayed output starts with goalkeeper A11) — confirm.
    player_codes = (
        codes_in_file_order[10:11]
        + codes_in_file_order[:10]
        + codes_in_file_order[-1:]
        + codes_in_file_order[11:-1]
    )
    phase_rows.append([spec_idx + 1, session, start_frame, end_frame, player_codes, gk_codes])

header = ["phase", "session", "start_frame", "end_frame", "player_codes", "gk_codes"]
phase_records = pd.DataFrame(phase_rows, columns=header).set_index("phase")
phase_records

Unnamed: 0_level_0,session,start_frame,end_frame,player_codes,gk_codes
phase,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1,1,69661,"[A11, A01, A02, A03, A04, A05, A06, A07, A08, ...","[A11, B28]"
2,2,69662,89697,"[A11, A01, A02, A12, A04, A05, A06, A07, A08, ...","[A11, B28]"
3,2,89698,93452,"[A11, A01, A02, A12, A04, A05, A06, A07, A08, ...","[A11, B28]"
4,2,93453,93835,"[A11, A01, A02, A12, A04, A05, A06, A07, A08, ...","[A11, B28]"
5,2,93836,94657,"[A11, A01, A02, A12, A04, A05, A06, A07, A08, ...","[A11, B28]"
6,2,94658,98472,"[A11, A01, A02, A12, A04, A05, A06, A07, A08, ...","[A11, B28]"
7,2,98473,102811,"[A11, A01, A02, A12, A04, A15, A06, A07, A08, ...","[A11, B28]"
8,2,102812,110298,"[A11, A01, A02, A12, A04, A15, A06, A07, A16, ...","[A11, B28]"
9,2,110299,120212,"[A11, A01, A02, A12, A04, A15, A06, A07, A16, ...",[A11]
10,2,120213,129831,"[A11, A01, A02, A12, A04, A15, A06, A07, A16, ...",[A11]


### Metrica 3경기 데이터는 추가적인 전처리 작업 수행

- Metrica 3경기의 event 데이터는 JSON 형태로 되어 있으므로 JSON -> CSV로 변환
- Metrica 3경기의 tracking 데이터는 txt 형태로 되어 있으므로 txt -> CSV로 변환

In [8]:
# Convert the Sample Game 3 event feed from JSON into a flat CSV whose column names
# follow the `rename_columns` mapping below.
event3_json_file = "../data/sample-data/Sample_Game_3/Sample_Game_3_events.json"
# JSON is UTF-8 by specification; be explicit instead of relying on the platform default.
with open(event3_json_file, encoding="utf-8") as f:
    js = json.load(f)

# Flatten the "data" records into one row per event.
events3 = pd.json_normalize(js, record_path="data")

# The "metadata" part may need separate handling; it is flattened here but not used
# further in this cell.
metadata_df = pd.json_normalize(js, record_path="metadata")

rename_columns = {'team.name':'team', 'type.name':'type', 'subtypes.name':'subtype', 'period':'session', 'start.frame':'start_frame', 'start.time':'start_time',
                  'end.frame':'end_frame', 'end.time':'end_time', 'from.name':'from', 'to.name':'to',
                  'start.x':'start_x', 'start.y':'start_y','end.x':'end_x', 'end.y':'end_y'}

# Select the needed columns (in mapping order) and rename them in one chained step.
# This returns a fresh frame instead of mutating a column subset in place.
events3 = events3[list(rename_columns)].rename(columns=rename_columns)
# Team names other than "Team A"/"Team B" would become NaN here.
events3['team'] = events3['team'].map({'Team A':'Away', 'Team B':'Home'})

events3.to_csv("../data/sample-data/Sample_Game_3/Sample_Game_3_RawEventsData.csv", index=False)
events3

Unnamed: 0,team,type,subtype,session,start_frame,start_time,end_frame,end_time,from,to,start_x,start_y,end_x,end_y
0,Away,SET PIECE,KICK OFF,1,361,14.44,361,14.44,Player 10,,,,,
1,Away,PASS,,1,361,14.44,377,15.08,Player 10,Player 7,0.50125,0.48725,0.49864,0.48705
2,Away,CARRY,,1,377,15.08,384,15.36,Player 7,,0.49864,0.48705,0.49700,0.48500
3,Away,PASS,,1,384,15.36,426,17.04,Player 7,Player 8,0.49700,0.48500,0.63373,0.63449
4,Away,CARRY,,1,426,17.04,465,18.60,Player 8,,0.63373,0.63449,0.66986,0.59707
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3615,Home,PASS,,2,143406,5736.24,143469,5738.76,Player 33,Player 20,0.73416,0.40874,0.71353,0.85950
3616,Home,PASS,,2,143469,5738.76,143532,5741.28,Player 20,Player 28,0.71353,0.85950,0.88776,0.51189
3617,Home,CARRY,,2,143469,5738.76,143470,5738.80,Player 20,,0.71353,0.85950,0.71353,0.85950
3618,Home,CARRY,,2,143532,5741.28,143553,5742.12,Player 28,,0.88776,0.51189,0.89225,0.50456


In [9]:
# Convert the Sample Game 3 tracking text file into one wide CSV with
# frame/session/time columns followed by an x/y column pair per player and the ball.
time_cols = ["frame", "session", "time"]
entity_codes = player_records["code"].tolist() + ["ball"]
xy_cols = [f"{code}_{axis}" for code in entity_codes for axis in ("x", "y")]

traces_txt = pd.read_csv("../data/sample-data/Sample_Game_3/Sample_Game_3_tracking.txt", sep=";", header=None)
traces = pd.DataFrame(index=traces_txt.index, columns=time_cols + xy_cols)

for phase in tqdm(phase_records.index):
    # Frame numbers are shifted by one to obtain the row labels of the text file.
    first_row = phase_records.at[phase, "start_frame"] - 1
    last_row = phase_records.at[phase, "end_frame"] - 1
    player_codes = phase_records.at[phase, "player_codes"]
    head_code, tail_code = player_codes[0], player_codes[-1]

    # Label the ';'-separated fields of this phase with the phase's player order.
    raw_block = traces_txt.loc[first_row:last_row]
    raw_block.columns = player_codes

    # The first field is "<frame>:<x,y>" and the last field is "<x,y>:<ball x,y>".
    frame_and_head = raw_block[head_code].str.split(":", expand=True)
    frame_and_head.columns = ["frame", head_code]
    tail_and_ball = raw_block[tail_code].str.split(":", expand=True)
    tail_and_ball.columns = [tail_code, "ball"]
    phase_traces = pd.concat([frame_and_head, raw_block[player_codes[1:-1]], tail_and_ball], axis=1)

    phase_rows = phase_traces.index
    traces.loc[phase_rows, "frame"] = phase_traces["frame"].astype(int)
    traces.loc[phase_rows, "session"] = phase_records.at[phase, "session"]

    # Every remaining column holds "x,y" strings; split them into two float columns.
    for code in phase_traces.columns[1:]:
        xy_values = phase_traces[code].str.split(",", expand=True).astype(float).values
        traces.loc[phase_rows, [f"{code}_x", f"{code}_y"]] = xy_values

# 0.04 s per frame; the first rows of the displayed output show frame 1 at time 0.04.
traces["time"] = (traces["frame"] * 0.04).astype(float).round(2)
traces.to_csv(f"../data/sample-data/Sample_Game_3/Sample_Game_3_RawTrackingData.csv", index=False)
traces

100%|██████████| 11/11 [00:07<00:00,  1.56it/s]


Unnamed: 0,frame,session,time,A11_x,A11_y,A01_x,A01_y,A02_x,A02_y,A03_x,...,B32_x,B32_y,B33_x,B33_y,B34_x,B34_y,B35_x,B35_y,ball_x,ball_y
0,1,1,0.04,0.84722,0.52855,0.65268,0.24792,0.66525,0.46562,0.68103,...,,,,,,,,,,
1,2,1,0.08,0.84722,0.52855,0.65231,0.24513,0.66482,0.46548,0.68095,...,,,,,,,,,,
2,3,1,0.12,0.84722,0.52855,0.65197,0.24387,0.66467,0.46537,0.68078,...,,,,,,,,,,
3,4,1,0.16,0.84722,0.52855,0.65166,0.24288,0.6646,0.46488,0.68063,...,,,,,,,,,,
4,5,1,0.20,0.84722,0.52855,0.65141,0.24251,0.66452,0.46469,0.68052,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143756,143757,2,5750.28,0.11993,0.51783,0.47808,0.45408,,,,...,0.80792,0.27106,0.73396,0.8533,0.90315,0.5375,0.50199,0.55081,,
143757,143758,2,5750.32,0.11993,0.51783,0.47786,0.45521,,,,...,0.80712,0.27184,0.73251,0.85289,0.90301,0.53788,0.50164,0.55178,,
143758,143759,2,5750.36,0.11993,0.51783,0.47743,0.45709,,,,...,0.80582,0.27242,0.73086,0.85218,0.90264,0.53799,0.50099,0.55329,,
143759,143760,2,5750.40,0.11993,0.51783,0.47669,0.45947,,,,...,0.80444,0.2726,0.72892,0.85192,0.90204,0.53782,0.50003,0.55502,,


### Processing Metrica Data

- Metrica 데이터에 대한 여러 전처리 작업을 수행 (MetricaHelper 클래스)

1. tracking-data & event-data의 경기장 규격 맞추기
2. phase / episode 구분, 속도·가속도 계산, 소유권(possession) 라벨링

In [10]:
import warnings

# NOTE: this silences every warning for the rest of the kernel session, not only this cell.
warnings.filterwarnings('ignore')

match_ids = [1, 2, 3]

# Create the output folders up front. DataFrame.to_csv does not create missing parent
# directories, so on a fresh checkout the run would otherwise fail only after a whole
# match had already been processed.
tracking_out_dir = "../data/preprocess-data/tracking-data"
event_out_dir = "../data/preprocess-data/event-data"
os.makedirs(tracking_out_dir, exist_ok=True)
os.makedirs(event_out_dir, exist_ok=True)

# Run the same preprocessing pipeline on each sample game and save the results.
for match_id in match_ids:

    event_file = f"../data/sample-data/Sample_Game_{match_id}/Sample_Game_{match_id}_RawEventsData.csv"
    events = pd.read_csv(event_file)

    if match_id <= 2:
        # Games 1 and 2 ship separate Home/Away tracking CSVs with a three-row header.
        team1_file = f"../data/sample-data/Sample_Game_{match_id}/Sample_Game_{match_id}_RawTrackingData_Home_Team.csv"
        team2_file = f"../data/sample-data/Sample_Game_{match_id}/Sample_Game_{match_id}_RawTrackingData_Away_Team.csv"
        team1_traces = pd.read_csv(team1_file, header=[0, 1, 2])
        team2_traces = pd.read_csv(team2_file, header=[0, 1, 2])
        helper = MetricaHelper(team1_traces, team2_traces, events=events)
    else:  # match_id == 3
        # Game 3 uses the single tracking CSV converted from the text file.
        # NOTE(review): that CSV is written with index=False, so index_col=0 turns its
        # first column (`frame`) into the index — confirm MetricaHelper expects this.
        trace_file = f"../data/sample-data/Sample_Game_{match_id}/Sample_Game_{match_id}_RawTrackingData.csv"
        traces = pd.read_csv(trace_file, index_col=0)
        helper = MetricaHelper(traces_from_txt=traces, events=events)

    # Phase definition: a phase changes when a substitution or sending-off occurs or
    # when the second half starts.
    # A negative phase marks periods in which a team has fewer than 10 tracked outfield
    # players; such periods are not used in the analysis.
    helper.generate_phase_records()
    helper.generate_phase()

    # Downsampling is deliberately NOT applied (helper.downsample_to_10fps() is not called)
    # because it makes the initial positions of passes look unnatural.
    # Details: the tracking data has many unmeasured positions, so interpolation is required.
    # The original code downsampled from 25 fps to 10 fps and then interpolated.
    # In this study the pass position over 0.4 s matters, and downsampling produced
    # scenes with unnatural pass directions.
    # helper.downsample_to_10fps()

    # Episode definition: episodes are delimited by stoppages of play caused by
    # situations such as CARD or OFFSIDE.
    helper.split_into_episodes()

    # Compute SPEED and ACCEL from the position coordinates of the ball and the players.
    helper.calc_running_features(remove_outliers=True, smoothing=True)

    # Define possession labels on the tracking data.
    helper.find_gt_player_poss()
    helper.find_gt_team_poss()

    # Handles the mismatch between the player-possession information of the tracking
    # data and the event data (applied to match 3 only).
    if match_id == 3:
        helper.correct_event_player_ids()

    helper.traces.to_csv(f"{tracking_out_dir}/match{match_id}.csv", index=False)
    helper.events.to_csv(f"{event_out_dir}/match{match_id}.csv", index=False)

Combining tracking and event data: 100%|██████████| 1745/1745 [00:00<00:00, 8376.58it/s]
Calculating running features: 100%|██████████| 28/28 [00:01<00:00, 24.12it/s]
Combining tracking and event data: 100%|██████████| 1935/1935 [00:00<00:00, 8342.73it/s]
Calculating running features: 100%|██████████| 26/26 [00:01<00:00, 23.56it/s]
Combining tracking and event data: 100%|██████████| 3620/3620 [00:00<00:00, 7039.49it/s]
Calculating running features: 100%|██████████| 35/35 [00:01<00:00, 29.14it/s]



Correcting event player IDs:


Phase 1: 100%|██████████| 1764/1764 [00:01<00:00, 1279.07it/s]
Phase 2: 100%|██████████| 550/550 [00:00<00:00, 1298.07it/s]
Phase 3: 100%|██████████| 66/66 [00:00<00:00, 1410.09it/s]
Phase 6: 100%|██████████| 113/113 [00:00<00:00, 1230.82it/s]
Phase 7: 100%|██████████| 96/96 [00:00<00:00, 1277.10it/s]
Phase 8: 100%|██████████| 196/196 [00:00<00:00, 1468.74it/s]
Phase 9: 100%|██████████| 259/259 [00:00<00:00, 1207.29it/s]
Phase 10: 100%|██████████| 213/213 [00:00<00:00, 1407.04it/s]
Phase 11: 100%|██████████| 363/363 [00:00<00:00, 1438.06it/s]


In [11]:
# `helper` is the instance left over from the last iteration of the processing loop
# (match_ids ends with 3), so this displays the processed events of match 3.
helper.events

Unnamed: 0,team,type,subtype,session,start_frame,start_time,end_frame,end_time,from,to,start_x,start_y,end_x,end_y,phase
0,Away,SET PIECE,KICK OFF,1,361,14.44,361,14.44,A10,,,,,,1
1,Away,PASS,PASS,1,361,14.44,377,15.08,A10,A07,54.13500,35.08200,53.85312,35.06760,1
2,Away,CARRY,CARRY,1,377,15.08,384,15.36,A07,,53.85312,35.06760,53.67600,34.92000,1
3,Away,PASS,PASS,1,384,15.36,426,17.04,A07,A08,53.67600,34.92000,68.44284,45.68328,1
4,Away,CARRY,CARRY,1,426,17.04,465,18.60,A08,,68.44284,45.68328,72.34488,42.98904,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3615,Home,PASS,PASS,2,143406,5736.24,143469,5738.76,B29,B33,79.28928,29.42928,77.06124,61.88400,11
3616,Home,PASS,PASS,2,143469,5738.76,143532,5741.28,B33,B34,77.06124,61.88400,95.87808,36.85608,11
3617,Home,CARRY,CARRY,2,143469,5738.76,143470,5738.80,B33,,77.06124,61.88400,77.06124,61.88400,11
3618,Home,CARRY,CARRY,2,143532,5741.28,143553,5742.12,B34,,95.87808,36.85608,96.36300,36.32832,11


In [12]:
# `helper` is the instance left over from the last iteration of the processing loop
# (match_ids ends with 3), so this displays the processed tracking data of match 3.
helper.traces

Unnamed: 0,frame,session,time,phase,episode,team_poss,player_poss,event_player,event_type,A11_x,...,B35_vx,B35_vy,B35_speed,B35_accel,ball_x,ball_y,ball_vx,ball_vy,ball_speed,ball_accel
0,1,1,0.04,1,0,A,,,,91.49976,...,,,,,54.13500,35.0820,0.0,0.0,0.0,0.0
1,2,1,0.08,1,0,A,,,,91.49976,...,,,,,54.13500,35.0820,0.0,0.0,0.0,0.0
2,3,1,0.12,1,0,A,,,,91.49976,...,,,,,54.13500,35.0820,0.0,0.0,0.0,0.0
3,4,1,0.16,1,0,A,,,,91.49976,...,,,,,54.13500,35.0820,0.0,0.0,0.0,0.0
4,5,1,0.20,1,0,A,,,,91.49976,...,,,,,54.13500,35.0820,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143756,143757,2,5750.28,11,0,B,,,,12.95244,...,-0.014148,0.356543,0.356823,1.131416,96.34356,36.3492,0.0,0.0,0.0,0.0
143757,143758,2,5750.32,11,0,B,,,,12.95244,...,-0.006269,0.414931,0.414979,1.044616,96.34356,36.3492,0.0,0.0,0.0,0.0
143758,143759,2,5750.36,11,0,B,,,,12.95244,...,-0.017081,0.451083,0.451406,0.795448,96.34356,36.3492,0.0,0.0,0.0,0.0
143759,143760,2,5750.40,11,0,B,,,,12.95244,...,-0.046586,0.464996,0.467324,0.383913,96.34356,36.3492,0.0,0.0,0.0,0.0


In [13]:
# Sanity check: print the overall maximum and minimum of every "*_x" and "*_y" column
# of the processed tracking data.
trace_columns = helper.traces.columns
col_x = [name for name in trace_columns if name.endswith("_x")]
col_y = [name for name in trace_columns if name.endswith("_y")]

x_values = helper.traces[col_x]
y_values = helper.traces[col_y]

# Each line prints the maximum first, then the minimum.
print("x좌표 통계 : ", x_values.max().max(), x_values.min().min())
print("y좌표 통계 : ", y_values.max().max(), y_values.min().min())

x좌표 통계 :  111.76272 -3.93336
y좌표 통계 :  74.97792 -5.60664
