# Statsbomb 데이터의 Feature, Label 정의하기

In [1]:
import os
import warnings
import tqdm
import pandas as pd
# Ignore every pandas PerformanceWarning raised after this point.
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

In [2]:
%load_ext autoreload
%autoreload 2
import socceraction.spadl as spadl
import socceraction.vaep.features as fs
import socceraction.vaep.labels as lab

### Train, Valid, Test 데이터의 SPADL, Feature, Label 정의하기

In [3]:
# Root folder holding the HDF5 stores of every data split.
datafolder = "./data-fifa/La Liga"

# Naming convention for the variables below: suffix 0 = train, 1 = valid, 2 = test.
# Each split has three stores: SPADL actions, computed features, computed labels.
spadl0_h5, features0_h5, labels0_h5 = (
    os.path.join(datafolder, filename)
    for filename in (
        "spadl-statsbomb_train_competitions.h5",
        "features_train.h5",
        "labels_train.h5",
    )
)

spadl1_h5, features1_h5, labels1_h5 = (
    os.path.join(datafolder, filename)
    for filename in (
        "spadl-statsbomb_valid_competitions.h5",
        "features_valid.h5",
        "labels_valid.h5",
    )
)

spadl2_h5, features2_h5, labels2_h5 = (
    os.path.join(datafolder, filename)
    for filename in (
        "spadl-statsbomb_test_competitions.h5",
        "features_test.h5",
        "labels_test.h5",
    )
)

In [4]:
# Load the "games" table of each split's SPADL store and report its row count.
train_games = pd.read_hdf(spadl0_h5, key="games")
print("nb of games:", train_games.shape[0])

valid_games = pd.read_hdf(spadl1_h5, key="games")
print("nb of games:", valid_games.shape[0])

test_games = pd.read_hdf(spadl2_h5, key="games")
print("nb of games:", test_games.shape[0])

nb of games: 452
nb of games: 33
nb of games: 35


In [5]:
# Display the (rows, columns) shape of the train, valid and test games tables.
tuple(split.shape for split in (train_games, valid_games, test_games))

((452, 12), (33, 12), (35, 12))

In [8]:
# Select the feature set X.
# Each entry is a socceraction feature function; the list order is the order in
# which the per-function outputs are concatenated column-wise later on.
# The per-feature notes below are translated from the original author's comments.

xfns = [
    fs.actiontype,
    fs.actiontype_onehot,
    
    # Body part categories: foot, head, head/other, other.
    # bodypart does not distinguish the left foot from the right foot.
    fs.bodypart,
    fs.bodypart_onehot,
    
    # Disabled: variant that distinguishes left/right foot for each action.
    # fs.bodypart_detailed,
    # fs.bodypart_detailed_onehot,
    
    fs.result,
    fs.result_onehot,
    
    # Yields three features: goalscore_team, goalscore_opponent, goalscore_diff.
    fs.goalscore,
    
    # Disabled: one-hot encoding of action type x result combinations.
    # fs.actiontype_result_onehot,
    
    
    # Start and end location of the action.
    fs.startlocation,
    fs.endlocation,
    
    # movement: change in the x coordinate and change in the y coordinate.
    fs.movement,
    
    # space_delta: amount of change of the (x, y) position.
    fs.space_delta,
    
    # Distance and angle to the goal from the start location / end location.
    fs.startpolar,
    fs.endpolar,
    
    # Whether the acting team is home or away.
    fs.team,
    
    # Disabled: time, which yields three features:
    #   1. period_id = 1 for the first half / 2 for the second half
    #   2. time_seconds = time since the start of the current half
    #   3. time_seconds_overall = seconds since the start of the game
    # fs.time,
    
    # time_delta1 = time_seconds of current action - time_seconds of previous action
    # time_delta2 = time_seconds of current action - time_seconds of the action before that
    fs.time_delta,
    
]

### train 데이터 feature, label 처리하기

In [31]:
# Compute the feature matrix X of every training game and store it under the
# key "game_<game_id>" in features0_h5.
# `total=` gives tqdm the length directly, so the itertuples() iterator does
# not have to be materialised into a list first.
for game in tqdm.tqdm(
    train_games.itertuples(),
    total=len(train_games),
    desc=f"Generating and storing features in {features0_h5}",
):
    actions = pd.read_hdf(spadl0_h5, key=f"actions/game_{game.game_id}")
    # 3 is passed positionally to fs.gamestates; presumably the number of
    # actions bundled into one game state -- confirm against socceraction docs.
    gamestates = fs.gamestates(spadl.add_names(actions), 3)
    gamestates = fs.play_left_to_right(gamestates, game.home_team_id)

    # One column block per feature function, in the order given by xfns.
    X = pd.concat([fn(gamestates) for fn in xfns], axis=1)

    # `key` must be passed by keyword: positional use is deprecated since
    # pandas 2.1 and rejected by pandas 3.0.
    X.to_hdf(features0_h5, key=f"game_{game.game_id}")

# Label functions whose outputs are concatenated column-wise into Y.
yfns = [lab.scores, lab.concedes, lab.goal_from_shot]

# Compute the label matrix Y of every training game and store it under the
# key "game_<game_id>" in labels0_h5.
for game in tqdm.tqdm(
    train_games.itertuples(),
    total=len(train_games),
    desc=f"Computing and storing labels in {labels0_h5}",
):
    actions = pd.read_hdf(spadl0_h5, key=f"actions/game_{game.game_id}")
    # Add the name columns once instead of once per label function.
    named_actions = spadl.add_names(actions)
    Y = pd.concat([fn(named_actions) for fn in yfns], axis=1)
    Y.to_hdf(labels0_h5, key=f"game_{game.game_id}")

Generating and storing features in ./data-fifa/La Liga\features_train.h5: 100%|██████████| 452/452 [01:01<00:00,  7.40it/s]
Computing and storing labels in ./data-fifa/La Liga\labels_train.h5: 100%|██████████| 452/452 [00:51<00:00,  8.70it/s]


### valid 데이터 feature, label 처리하기

In [32]:
# Compute the feature matrix X of every validation game and store it under the
# key "game_<game_id>" in features1_h5.
# `total=` gives tqdm the length directly, so the itertuples() iterator does
# not have to be materialised into a list first.
for game in tqdm.tqdm(
    valid_games.itertuples(),
    total=len(valid_games),
    desc=f"Generating and storing features in {features1_h5}",
):
    actions = pd.read_hdf(spadl1_h5, key=f"actions/game_{game.game_id}")
    # 3 is passed positionally to fs.gamestates; presumably the number of
    # actions bundled into one game state -- confirm against socceraction docs.
    gamestates = fs.gamestates(spadl.add_names(actions), 3)
    gamestates = fs.play_left_to_right(gamestates, game.home_team_id)

    # One column block per feature function, in the order given by xfns.
    X = pd.concat([fn(gamestates) for fn in xfns], axis=1)

    # `key` must be passed by keyword: positional use is deprecated since
    # pandas 2.1 and rejected by pandas 3.0.
    X.to_hdf(features1_h5, key=f"game_{game.game_id}")

# Label functions whose outputs are concatenated column-wise into Y.
yfns = [lab.scores, lab.concedes, lab.goal_from_shot]

# Compute the label matrix Y of every validation game and store it under the
# key "game_<game_id>" in labels1_h5.
for game in tqdm.tqdm(
    valid_games.itertuples(),
    total=len(valid_games),
    desc=f"Computing and storing labels in {labels1_h5}",
):
    actions = pd.read_hdf(spadl1_h5, key=f"actions/game_{game.game_id}")
    # Add the name columns once instead of once per label function.
    named_actions = spadl.add_names(actions)
    Y = pd.concat([fn(named_actions) for fn in yfns], axis=1)
    Y.to_hdf(labels1_h5, key=f"game_{game.game_id}")

Generating and storing features in ./data-fifa/La Liga\features_valid.h5: 100%|██████████| 33/33 [00:03<00:00, 10.65it/s]
Computing and storing labels in ./data-fifa/La Liga\labels_valid.h5: 100%|██████████| 33/33 [00:03<00:00, 10.50it/s]


### test 데이터 feature, label 처리하기

In [14]:
# Compute the feature matrix X of every test game and store it under the
# key "game_<game_id>" in features2_h5.
# `total=` gives tqdm the length directly, so the itertuples() iterator does
# not have to be materialised into a list first.
for game in tqdm.tqdm(
    test_games.itertuples(),
    total=len(test_games),
    desc=f"Generating and storing features in {features2_h5}",
):
    actions = pd.read_hdf(spadl2_h5, key=f"actions/game_{game.game_id}")
    # 3 is passed positionally to fs.gamestates; presumably the number of
    # actions bundled into one game state -- confirm against socceraction docs.
    gamestates = fs.gamestates(spadl.add_names(actions), 3)
    gamestates = fs.play_left_to_right(gamestates, game.home_team_id)

    # One column block per feature function, in the order given by xfns.
    X = pd.concat([fn(gamestates) for fn in xfns], axis=1)

    # `key` must be passed by keyword: positional use is deprecated since
    # pandas 2.1 and rejected by pandas 3.0.
    X.to_hdf(features2_h5, key=f"game_{game.game_id}")

# Label functions whose outputs are concatenated column-wise into Y.
yfns = [lab.scores, lab.concedes, lab.goal_from_shot]

# Compute the label matrix Y of every test game and store it under the
# key "game_<game_id>" in labels2_h5.
for game in tqdm.tqdm(
    test_games.itertuples(),
    total=len(test_games),
    desc=f"Computing and storing labels in {labels2_h5}",
):
    actions = pd.read_hdf(spadl2_h5, key=f"actions/game_{game.game_id}")
    # Add the name columns once instead of once per label function.
    named_actions = spadl.add_names(actions)
    Y = pd.concat([fn(named_actions) for fn in yfns], axis=1)
    Y.to_hdf(labels2_h5, key=f"game_{game.game_id}")

Generating and storing features in ./data-fifa/La Liga\features_test.h5: 100%|██████████| 35/35 [00:03<00:00, 11.00it/s]
Computing and storing labels in ./data-fifa/La Liga\labels_test.h5: 100%|██████████| 35/35 [00:02<00:00, 11.92it/s]

3773689





In [20]:
# Check whether the test split contains the game that has to be left out.
# Original author's note: when the test data is actually used, this game's
# game_id is removed first.
excluded_game_id = 3773689

# The description names what this loop really does: it only reads the SPADL
# store and neither computes nor stores features.
for game in tqdm.tqdm(
    test_games.itertuples(),
    total=len(test_games),
    desc=f"Looking for game {excluded_game_id} in {spadl2_h5}",
):
    actions = pd.read_hdf(spadl2_h5, key=f"actions/game_{game.game_id}")
    if actions.empty:
        # No rows means there is no game_id to inspect for this game.
        continue
    # .iloc[0] takes the first row by position; game_id[0] would be a label
    # lookup that fails whenever the index has no label 0.
    first_game_id = actions.game_id.iloc[0]
    if first_game_id == excluded_game_id:
        print(first_game_id)

Generating and storing features in ./data-fifa/La Liga\features_test.h5: 100%|██████████| 35/35 [00:00<00:00, 115.29it/s]

3773689



