# StatsBomb 데이터의 feature와 label 정의하기

In [2]:
import os
import warnings
import tqdm
import pandas as pd
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

In [3]:
# Tally how often each (scores, concedes) label combination occurs in the training split.
(
    pd.read_csv('./soccer_binary_data/train/Y_train', index_col=0)
    .value_counts()
)

scores  concedes
False   False       983349
True    False        13272
False   True          2159
Name: count, dtype: int64

In [4]:
# Tally how often each (scores, concedes) label combination occurs in the validation split.
(
    pd.read_csv('./soccer_binary_data/valid/Y_valid', index_col=0)
    .value_counts()
)

scores  concedes
False   False       74776
True    False         858
False   True          152
Name: count, dtype: int64

In [6]:
# Report the (rows, columns) dimensions of the test feature table.
(
    pd.read_csv('./soccer_binary_data/test/X_test', index_col=0)
    .shape
)

(79812, 154)

In [5]:
# Tally how often each (scores, concedes) label combination occurs in the test split.
(
    pd.read_csv('./soccer_binary_data/test/Y_test', index_col=0)
    .value_counts()
)

scores  concedes
False   False       78735
True    False         940
False   True          137
Name: count, dtype: int64

In [38]:
%load_ext autoreload
%autoreload 2
import socceraction.spadl as spadl
import socceraction.vaep.features as fs
import socceraction.vaep.labels as lab

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### train, valid, test 데이터의 spadl, feature, label 정의하기

In [39]:
# Root folder that holds every HDF5 store used by this notebook.
datafolder = "./data-fifa/La Liga"

# Naming convention for the path variables below:
# suffix 0 = train split, 1 = valid split, 2 = test split.
# Each split has three stores: SPADL actions, computed features, computed labels.
spadl0_h5, features0_h5, labels0_h5 = (
    os.path.join(datafolder, filename)
    for filename in (
        "spadl-statsbomb_train_competitions.h5",
        "features_train.h5",
        "labels_train.h5",
    )
)

spadl1_h5, features1_h5, labels1_h5 = (
    os.path.join(datafolder, filename)
    for filename in (
        "spadl-statsbomb_valid_competitions.h5",
        "features_valid.h5",
        "labels_valid.h5",
    )
)

spadl2_h5, features2_h5, labels2_h5 = (
    os.path.join(datafolder, filename)
    for filename in (
        "spadl-statsbomb_test_competitions.h5",
        "features_test.h5",
        "labels_test.h5",
    )
)

In [40]:
# Read the "games" table of each split's SPADL store and print its row count,
# in the order train, valid, test.
train_games = pd.read_hdf(spadl0_h5, key="games")
print("nb of games:", train_games.shape[0])

valid_games = pd.read_hdf(spadl1_h5, key="games")
print("nb of games:", valid_games.shape[0])

test_games = pd.read_hdf(spadl2_h5, key="games")
print("nb of games:", test_games.shape[0])

nb of games: 452
nb of games: 33
nb of games: 35


In [28]:
# (rows, columns) of the games table for the train, valid and test splits, in that order.
tuple(split_games.shape for split_games in (train_games, valid_games, test_games))

((452, 12), (33, 12), (35, 12))

In [29]:
# Reference only (not executed): an earlier candidate feature-function list.
# Unlike the active `xfns` defined in the following cell, this one includes fs.time.
# xfns = [
#     fs.actiontype,
#     fs.actiontype_onehot,
#     fs.bodypart,
#     fs.bodypart_onehot,
#     fs.result,
#     fs.result_onehot,
#     fs.goalscore,
#     fs.startlocation,
#     fs.endlocation,
#     fs.movement,
#     fs.space_delta,
#     fs.startpolar,
#     fs.endpolar,
#     fs.team,
#     fs.time,
#     fs.time_delta
# ]

In [30]:
# Select the feature set X.
# `xfns` is the ordered list of feature transformers; each one is applied to the
# game states of a game and the results are concatenated column-wise.

xfns = [
    fs.actiontype,
    fs.actiontype_onehot,
    
    # Body part categories: foot, head, head/other, other.
    # `bodypart` does not distinguish between left and right foot.
    fs.bodypart,
    fs.bodypart_onehot,
    
    # Variant that separates left foot / right foot for each action (disabled).
    # fs.bodypart_detailed,
    # fs.bodypart_detailed_onehot,
    
    fs.result,
    fs.result_onehot,
    
    # Yields three features: goalscore_team, goalscore_opponent, goalscore_diff.
    fs.goalscore,
    
    # Presumably a one-hot encoding of action type x result combinations
    # (the original author was unsure as well) -- disabled.
    # fs.actiontype_result_onehot,
    
    
    # Start and end location of the action.
    fs.startlocation,
    fs.endlocation,
    
    # movement: change in the x coordinate and change in the y coordinate.
    fs.movement,
    
    # space_delta: change in the (x, y) position.
    fs.space_delta,
    
    # Distance and angle to the goal from the start location / end location.
    fs.startpolar,
    fs.endpolar,
    
    # Whether the acting team is home or away.
    fs.team,
    
    # time yields three features (disabled):
    # 1. period_id = 1 for the first half / 2 for the second half
    # 2. time_seconds = time elapsed since the start of the current half
    # 3. time_seconds_overall = seconds elapsed since the start of the game
    # fs.time,
    
    # time_delta1 = time_seconds of the current action - time_seconds of the previous action
    # time_delta2 = time_seconds of the current action - time_seconds of the action before that
    fs.time_delta,
    
]

### train 데이터 feature, label 처리하기

In [31]:
# --- Training split: features -------------------------------------------------
# For every training game: read its SPADL actions from `spadl0_h5`, build game
# states (second argument 3 = number of actions per game state), normalise them
# with fs.play_left_to_right using the home team id, apply every transformer in
# `xfns`, and store the resulting feature frame in `features0_h5` under the key
# "game_<game_id>".
for game in tqdm.tqdm(list(train_games.itertuples()), desc=f"Generating and storing features in {features0_h5}"):
    actions = pd.read_hdf(spadl0_h5, key=f"actions/game_{game.game_id}")
    gamestates = fs.gamestates(spadl.add_names(actions), 3)
    gamestates = fs.play_left_to_right(gamestates, game.home_team_id)

    X = pd.concat([fn(gamestates) for fn in xfns], axis=1)

    # `key` is passed by keyword: positional use is deprecated in pandas 2.x
    # and the argument becomes keyword-only in pandas 3.0.
    X.to_hdf(features0_h5, key=f"game_{game.game_id}")

# --- Training split: labels ---------------------------------------------------
# Label functions evaluated for every action of a game.
yfns = [lab.scores, lab.concedes, lab.goal_from_shot]

for game in tqdm.tqdm(list(train_games.itertuples()), desc=f"Computing and storing labels in {labels0_h5}"):
    actions = pd.read_hdf(spadl0_h5, key=f"actions/game_{game.game_id}")
    # Add the name columns once per game instead of once per label function.
    named_actions = spadl.add_names(actions)
    Y = pd.concat([fn(named_actions) for fn in yfns], axis=1)
    Y.to_hdf(labels0_h5, key=f"game_{game.game_id}")

Generating and storing features in ./data-fifa/La Liga\features_train.h5: 100%|██████████| 452/452 [01:01<00:00,  7.40it/s]
Computing and storing labels in ./data-fifa/La Liga\labels_train.h5: 100%|██████████| 452/452 [00:51<00:00,  8.70it/s]


### valid 데이터 feature, label 처리하기

In [32]:
# --- Validation split: features -----------------------------------------------
# For every validation game: read its SPADL actions from `spadl1_h5`, build game
# states (second argument 3 = number of actions per game state), normalise them
# with fs.play_left_to_right using the home team id, apply every transformer in
# `xfns`, and store the resulting feature frame in `features1_h5` under the key
# "game_<game_id>".
for game in tqdm.tqdm(list(valid_games.itertuples()), desc=f"Generating and storing features in {features1_h5}"):
    actions = pd.read_hdf(spadl1_h5, key=f"actions/game_{game.game_id}")
    gamestates = fs.gamestates(spadl.add_names(actions), 3)
    gamestates = fs.play_left_to_right(gamestates, game.home_team_id)

    X = pd.concat([fn(gamestates) for fn in xfns], axis=1)
    # `key` is passed by keyword: positional use is deprecated in pandas 2.x
    # and the argument becomes keyword-only in pandas 3.0.
    X.to_hdf(features1_h5, key=f"game_{game.game_id}")

# --- Validation split: labels -------------------------------------------------
# Label functions evaluated for every action of a game.
yfns = [lab.scores, lab.concedes, lab.goal_from_shot]

for game in tqdm.tqdm(list(valid_games.itertuples()), desc=f"Computing and storing labels in {labels1_h5}"):
    actions = pd.read_hdf(spadl1_h5, key=f"actions/game_{game.game_id}")
    # Add the name columns once per game instead of once per label function.
    named_actions = spadl.add_names(actions)
    Y = pd.concat([fn(named_actions) for fn in yfns], axis=1)
    Y.to_hdf(labels1_h5, key=f"game_{game.game_id}")

Generating and storing features in ./data-fifa/La Liga\features_valid.h5: 100%|██████████| 33/33 [00:03<00:00, 10.65it/s]
Computing and storing labels in ./data-fifa/La Liga\labels_valid.h5: 100%|██████████| 33/33 [00:03<00:00, 10.50it/s]


### test 데이터 feature, label 처리하기

In [33]:
# Debug scan: walk the test split's SPADL store and print the id of one specific
# game when its action table is found. Nothing is computed or written to
# `features2_h5` here, even though the progress label says so.
TARGET_GAME_ID = 3773689

for game in tqdm.tqdm(list(test_games.itertuples()), desc=f"Generating and storing features in {features2_h5}"):
    actions = pd.read_hdf(spadl2_h5, key=f"actions/game_{game.game_id}")
    if actions.empty:
        # No rows means there is no first game_id to inspect.
        continue
    # .iloc[0] reads the first row by position, so it does not depend on the
    # frame having a row labelled 0.
    first_game_id = actions.game_id.iloc[0]
    if first_game_id == TARGET_GAME_ID:
        print(first_game_id)

Generating and storing features in ./data-fifa/La Liga\features_test.h5: 100%|██████████| 35/35 [00:00<00:00, 94.45it/s]

3773689





In [35]:
# --- Test split: features -----------------------------------------------------
# For every test game: read its SPADL actions from `spadl2_h5`, build game
# states (second argument 3 = number of actions per game state), normalise them
# with fs.play_left_to_right using the home team id, apply every transformer in
# `xfns`, and store the resulting feature frame in `features2_h5` under the key
# "game_<game_id>".
for game in tqdm.tqdm(list(test_games.itertuples()), desc=f"Generating and storing features in {features2_h5}"):
    actions = pd.read_hdf(spadl2_h5, key=f"actions/game_{game.game_id}")
    gamestates = fs.gamestates(spadl.add_names(actions), 3)
    gamestates = fs.play_left_to_right(gamestates, game.home_team_id)

    X = pd.concat([fn(gamestates) for fn in xfns], axis=1)
    # `key` is passed by keyword: positional use is deprecated in pandas 2.x
    # and the argument becomes keyword-only in pandas 3.0.
    X.to_hdf(features2_h5, key=f"game_{game.game_id}")

# --- Test split: labels -------------------------------------------------------
# Label functions evaluated for every action of a game.
yfns = [lab.scores, lab.concedes, lab.goal_from_shot]

for game in tqdm.tqdm(list(test_games.itertuples()), desc=f"Computing and storing labels in {labels2_h5}"):
    actions = pd.read_hdf(spadl2_h5, key=f"actions/game_{game.game_id}")
    # Add the name columns once per game instead of once per label function.
    named_actions = spadl.add_names(actions)
    Y = pd.concat([fn(named_actions) for fn in yfns], axis=1)
    Y.to_hdf(labels2_h5, key=f"game_{game.game_id}")

Generating and storing features in ./data-fifa/La Liga\features_test.h5: 100%|██████████| 35/35 [00:03<00:00,  9.57it/s]
Computing and storing labels in ./data-fifa/La Liga\labels_test.h5: 100%|██████████| 35/35 [00:03<00:00, 10.61it/s]


In [36]:
# Display the label frame currently bound to `Y`: the labels of the single game
# handled by the last iteration of whichever label loop ran most recently
# (columns: scores, concedes, goal_from_shot).
Y

Unnamed: 0,scores,concedes,goal_from_shot
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False
...,...,...,...
2235,False,False,False
2236,False,False,False
2237,False,False,False
2238,False,False,False


In [11]:
# Display the raw SPADL action table currently bound to `actions`: the actions of
# the single game read by the last iteration of whichever loop ran most recently.
actions

Unnamed: 0,game_id,original_event_id,period_id,time_seconds,team_id,player_id,start_x,start_y,end_x,end_y,type_id,result_id,bodypart_id,action_id
0,3773477,a39b9c13-9a01-4452-b6fa-d5058ead67bf,1,0.0,218,6880.0,52.058824,33.655696,56.382353,35.549367,0,1,5,0
1,3773477,cc81f927-dda5-4ead-9a0f-38072ea0c563,1,1.0,218,6673.0,56.382353,35.549367,49.676471,37.959494,21,1,0,1
2,3773477,571103bd-adb0-4f9a-b591-e9fbd6ee80f6,1,3.0,218,6673.0,49.676471,37.959494,36.088235,64.470886,0,0,4,2
3,3773477,e62ef626-4bbd-4efa-a923-6bc939de5ab0,1,5.0,217,5211.0,35.294118,65.245570,35.294118,65.245570,10,1,0,3
4,3773477,301b4005-4f81-4d3c-ac21-b9a59ddad5cc,1,5.0,217,5211.0,35.294118,65.245570,45.352941,64.298734,21,1,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2235,3773477,4a798266-43d7-4c9a-91d5-276cc5228b69,2,2972.0,218,6851.0,68.823529,64.556962,37.676471,57.068354,0,1,5,2235
2236,3773477,edd38889-aaed-45b8-8ad4-16718a477d66,2,2974.0,217,6826.0,42.000000,58.101266,42.000000,58.101266,8,1,0,2236
2237,3773477,6a5e2ead-5a23-4cc7-8f17-e2cc14e98a53,2,3002.0,218,41083.0,48.970588,62.577215,6.529412,34.688608,3,0,4,2237
2238,3773477,09fcb776-16cd-41fc-82f5-387115a4f97e,2,3006.0,217,20055.0,5.735294,35.463291,6.970588,31.503797,21,1,0,2238
