# 2. StatsBomb 데이터의 Feature와 Label 정의하기

- game_id = 3773689 경기는 scores와 concedes 레이블이 둘 다 True인 데이터가 존재하므로 해당 경기 데이터를 제거함

In [1]:
import sys
import os

# Resolve the folder one level above the current working directory.
# matplotsoccer can be installed with pip, but this study uses a slightly
# modified version of the library to make the visual explanations clearer
# (presumably the modified copy lives in that parent folder — confirm).
parent_dir = os.path.split(os.getcwd())[0]

# Prepend the parent folder to sys.path so imports search it first.
sys.path[:0] = [parent_dir]

In [2]:
import os
import warnings
import tqdm
import pandas as pd
# Ignore pandas PerformanceWarning so it does not clutter the cell outputs.
warnings.simplefilter("ignore", category=pd.errors.PerformanceWarning)

In [3]:
%load_ext autoreload
%autoreload 2
import socceraction.spadl as spadl
import socceraction.vaep.features as fs
import socceraction.vaep.labels as lab



### Train, Valid, Test 데이터의 SPADL, Feature, Label 정의하기

In [4]:
# Folders for the La Liga data: the SPADL stores read by the cells below, and
# the feature / label stores those cells write.
spadl_datafolder = "../data-fifa/La Liga/spadl-statsbomb/"
feature_datafolder = "../data-fifa/La Liga/feature/"
# NOTE: the spelling `label_datafoler` (sic) is kept because other cells may
# reference this name.
label_datafoler = "../data-fifa/La Liga/label/"

# Make sure the output folders exist before any HDF5 store is opened in them.
# exist_ok=True keeps the cell re-runnable: a plain os.makedirs raises
# FileExistsError once the folders are already there.
os.makedirs(feature_datafolder, exist_ok=True)
os.makedirs(label_datafoler, exist_ok=True)

# Training split.
train_spadl_h5 = os.path.join(spadl_datafolder, "train_competitions.h5")
train_features_h5 = os.path.join(feature_datafolder, "train_features.h5")
train_labels_h5 = os.path.join(label_datafoler, "train_labels.h5")

# Validation split.
valid_spadl_h5 = os.path.join(spadl_datafolder, "valid_competitions.h5")
valid_features_h5 = os.path.join(feature_datafolder, "valid_features.h5")
valid_labels_h5 = os.path.join(label_datafoler, "valid_labels.h5")

# Test split.
test_spadl_h5 = os.path.join(spadl_datafolder, "test_competitions.h5")
test_features_h5 = os.path.join(feature_datafolder, "test_features.h5")
test_labels_h5 = os.path.join(label_datafoler, "test_labels.h5")

In [5]:
# Load the "games" table of each split's SPADL store and report its row count.
train_games = pd.read_hdf(train_spadl_h5, key="games")
print("nb of games:", train_games.shape[0])

valid_games = pd.read_hdf(valid_spadl_h5, key="games")
print("nb of games:", valid_games.shape[0])

test_games = pd.read_hdf(test_spadl_h5, key="games")
print("nb of games:", test_games.shape[0])

nb of games: 800
nb of games: 33
nb of games: 35


In [6]:
# Show the (rows, columns) shape of the train, valid and test game tables.
tuple(games.shape for games in (train_games, valid_games, test_games))

((800, 12), (33, 12), (35, 12))

In [7]:
# Select feature set X: the feature functions applied to every game state.
# The list order determines the column order of the stored feature tables.

xfns = [
    fs.actiontype,
    fs.actiontype_onehot,
    
    # Body part: foot, head, head/other, other.
    # This variant does not distinguish the left foot from the right foot.
    fs.bodypart,
    fs.bodypart_onehot,
    
    # Alternative that also distinguishes the left/right foot used for the action.
    # fs.bodypart_detailed,
    # fs.bodypart_detailed_onehot,
    
    fs.result,
    fs.result_onehot,
    
    # Yields three features: goalscore_team, goalscore_opponent, goalscore_diff.
    fs.goalscore,
    
    # One-hot encoding of action type x result combinations (disabled).
    # fs.actiontype_result_onehot,
    
    # Start and end location of the action.
    fs.startlocation,
    fs.endlocation,
    
    # movement: change in the x coordinate and change in the y coordinate.
    fs.movement,
    
    # space_delta: change in the (x, y) position
    # (presumably relative to earlier actions in the game state — confirm).
    fs.space_delta,
    
    # Distance and angle to the goal from the start location and the end location.
    fs.startpolar,
    fs.endpolar,
    
    # Original note: whether the team is home or away.
    # NOTE(review): socceraction documents fs.team as flagging whether earlier
    # actions in the game state were performed by the same team as the current
    # action (i.e. possession change), not home/away — confirm.
    fs.team,
    
    # time (disabled) yields three features:
    # 1. period_id = 1 for the first half / 2 for the second half
    # 2. time_seconds = time since the start of the current half
    # 3. time_seconds_overall = seconds since the start of the game
    # fs.time,
    
    # time_delta1 = time_seconds of the current action - time_seconds of the previous action
    # time_delta2 = time_seconds of the current action - time_seconds of the action before that
    fs.time_delta,
    
]

### train 데이터 feature, label 처리하기

In [8]:
# Features: for every training game, turn its SPADL actions into game states,
# apply each feature function in `xfns`, and store one table per game.
# The SPADL store is only read here, so it is opened with mode="r"; the default
# append mode would silently create an empty file if the path were wrong.
with pd.HDFStore(train_spadl_h5, mode="r") as spadlstore, pd.HDFStore(train_features_h5) as featurestore:
    for game in tqdm.tqdm(
        train_games.itertuples(),
        total=len(train_games),  # itertuples() has no len(); give tqdm the total explicitly
        desc=f"Generating and storing features in {train_features_h5}",
    ):
        actions = pd.read_hdf(spadlstore, f"actions/game_{game.game_id}")
        # 3 = number of consecutive actions that make up one game state.
        gamestates = fs.gamestates(spadl.add_names(actions), 3)
        gamestates = fs.play_left_to_right(gamestates, game.home_team_id)

        X = pd.concat([fn(gamestates) for fn in xfns], axis=1)

        featurestore.put(f"game_{game.game_id}", X, format="table")

# Label functions; each one produces label column(s) from a game's actions.
yfns = [lab.scores, lab.concedes, lab.goal_from_shot]

# Labels: same per-game loop, storing one label table per training game.
with pd.HDFStore(train_spadl_h5, mode="r") as spadlstore, pd.HDFStore(train_labels_h5) as labelstore:
    for game in tqdm.tqdm(
        train_games.itertuples(),
        total=len(train_games),
        desc=f"Computing and storing labels in {train_labels_h5}",
    ):
        actions = pd.read_hdf(spadlstore, f"actions/game_{game.game_id}")
        # add_names() does not depend on the label function, so run it once per
        # game instead of once per function (label functions are expected to
        # treat their input as read-only — confirm for locally modified code).
        named_actions = spadl.add_names(actions)
        Y = pd.concat([fn(named_actions) for fn in yfns], axis=1)
        labelstore.put(f"game_{game.game_id}", Y, format="table")

Generating and storing features in ../data-fifa/La Liga/feature/train_features.h5:   0%|          | 2/800 [00:00<01:01, 12.94it/s]

Generating and storing features in ../data-fifa/La Liga/feature/train_features.h5: 100%|██████████| 800/800 [01:01<00:00, 13.04it/s]
Computing and storing labels in ../data-fifa/La Liga/label/train_labels.h5: 100%|██████████| 800/800 [20:48<00:00,  1.56s/it]


### valid 데이터 feature, label 처리하기

In [9]:
# Features: for every validation game, turn its SPADL actions into game states,
# apply each feature function in `xfns`, and store one table per game.
# The SPADL store is only read here, so it is opened with mode="r"; the default
# append mode would silently create an empty file if the path were wrong.
with pd.HDFStore(valid_spadl_h5, mode="r") as spadlstore, pd.HDFStore(valid_features_h5) as featurestore:
    for game in tqdm.tqdm(
        valid_games.itertuples(),
        total=len(valid_games),  # itertuples() has no len(); give tqdm the total explicitly
        desc=f"Generating and storing features in {valid_features_h5}",
    ):
        actions = pd.read_hdf(spadlstore, f"actions/game_{game.game_id}")
        # 3 = number of consecutive actions that make up one game state.
        gamestates = fs.gamestates(spadl.add_names(actions), 3)
        gamestates = fs.play_left_to_right(gamestates, game.home_team_id)

        X = pd.concat([fn(gamestates) for fn in xfns], axis=1)

        featurestore.put(f"game_{game.game_id}", X, format="table")

# Label functions; each one produces label column(s) from a game's actions.
yfns = [lab.scores, lab.concedes, lab.goal_from_shot]

# Labels: same per-game loop, storing one label table per validation game.
with pd.HDFStore(valid_spadl_h5, mode="r") as spadlstore, pd.HDFStore(valid_labels_h5) as labelstore:
    for game in tqdm.tqdm(
        valid_games.itertuples(),
        total=len(valid_games),
        desc=f"Computing and storing labels in {valid_labels_h5}",
    ):
        actions = pd.read_hdf(spadlstore, f"actions/game_{game.game_id}")
        # add_names() does not depend on the label function, so run it once per
        # game instead of once per function (label functions are expected to
        # treat their input as read-only — confirm for locally modified code).
        named_actions = spadl.add_names(actions)
        Y = pd.concat([fn(named_actions) for fn in yfns], axis=1)
        labelstore.put(f"game_{game.game_id}", Y, format="table")

Generating and storing features in ../data-fifa/La Liga/feature/valid_features.h5:  12%|█▏        | 4/33 [00:00<00:02, 14.24it/s]

Generating and storing features in ../data-fifa/La Liga/feature/valid_features.h5: 100%|██████████| 33/33 [00:02<00:00, 13.84it/s]
Computing and storing labels in ../data-fifa/La Liga/label/valid_labels.h5: 100%|██████████| 33/33 [00:55<00:00,  1.68s/it]


### test 데이터 feature, label 처리하기

In [10]:
# Features: for every test game, turn its SPADL actions into game states,
# apply each feature function in `xfns`, and store one table per game.
# The SPADL store is only read here, so it is opened with mode="r"; the default
# append mode would silently create an empty file if the path were wrong.
with pd.HDFStore(test_spadl_h5, mode="r") as spadlstore, pd.HDFStore(test_features_h5) as featurestore:
    for game in tqdm.tqdm(
        test_games.itertuples(),
        total=len(test_games),  # itertuples() has no len(); give tqdm the total explicitly
        desc=f"Generating and storing features in {test_features_h5}",
    ):
        actions = pd.read_hdf(spadlstore, f"actions/game_{game.game_id}")
        # 3 = number of consecutive actions that make up one game state.
        gamestates = fs.gamestates(spadl.add_names(actions), 3)
        gamestates = fs.play_left_to_right(gamestates, game.home_team_id)

        X = pd.concat([fn(gamestates) for fn in xfns], axis=1)

        featurestore.put(f"game_{game.game_id}", X, format="table")

# Label functions; each one produces label column(s) from a game's actions.
yfns = [lab.scores, lab.concedes, lab.goal_from_shot]

# Labels: same per-game loop, storing one label table per test game.
with pd.HDFStore(test_spadl_h5, mode="r") as spadlstore, pd.HDFStore(test_labels_h5) as labelstore:
    for game in tqdm.tqdm(
        test_games.itertuples(),
        total=len(test_games),
        desc=f"Computing and storing labels in {test_labels_h5}",
    ):
        actions = pd.read_hdf(spadlstore, f"actions/game_{game.game_id}")
        # add_names() does not depend on the label function, so run it once per
        # game instead of once per function (label functions are expected to
        # treat their input as read-only — confirm for locally modified code).
        named_actions = spadl.add_names(actions)
        Y = pd.concat([fn(named_actions) for fn in yfns], axis=1)
        labelstore.put(f"game_{game.game_id}", Y, format="table")

Generating and storing features in ../data-fifa/La Liga/feature/test_features.h5: 100%|██████████| 35/35 [00:02<00:00, 13.87it/s]
Computing and storing labels in ../data-fifa/La Liga/label/test_labels.h5: 100%|██████████| 35/35 [01:00<00:00,  1.73s/it]
