In [35]:
import os
import warnings
import tqdm
import pandas as pd
import numpy as np
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
# warnings.filterwarnings("ignore", category=FutureWarning)
# warnings.filterwarnings("ignore", category=UserWarning)

In [36]:
%load_ext autoreload
%autoreload 2
import socceraction.vaep.features as fs
import socceraction.vaep.labels as lab

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [37]:
datafolder = "./data-fifa/La Liga"
#0은 train, 1은 valid, 2은 test

spadl0_h5 = os.path.join(datafolder, "spadl-statsbomb_train_competitions.h5")
features0_h5 = os.path.join(datafolder, "features_train.h5")
labels0_h5 = os.path.join(datafolder, "labels_train.h5")

spadl1_h5 = os.path.join(datafolder, "spadl-statsbomb_valid_competitions.h5")
features1_h5 = os.path.join(datafolder, "features_valid.h5")
labels1_h5 = os.path.join(datafolder, "labels_valid.h5")

spadl2_h5 = os.path.join(datafolder, "spadl-statsbomb_test_competitions.h5")
features2_h5 = os.path.join(datafolder, "features_test.h5")
labels2_h5 = os.path.join(datafolder, "labels_test.h5")

In [38]:
train_games = pd.read_hdf(spadl0_h5, "games")
print("nb of games:", len(train_games))

valid_games = pd.read_hdf(spadl1_h5, "games")
print("nb of games:", len(valid_games))

test_games = pd.read_hdf(spadl2_h5, "games")
print("nb of games:", len(test_games))

nb of games: 452
nb of games: 33
nb of games: 35


##### statsbomb soccer data에서 사용할 feature

In [39]:
#Select feature set X

xfns = [
    fs.actiontype,
    fs.actiontype_onehot,
    
    #foot, head, head/other, other
    #foot중 왼/오른발 구분은 bodypart에서 하지 않음
    fs.bodypart,
    fs.bodypart_onehot,
    
    #각 수행한 액션의 왼/ 오른발로 구분함
    # fs.bodypart_detailed,
    # fs.bodypart_detailed_onehot,
    
    fs.result,
    fs.result_onehot,
    
    #goalscore_team, goalscore_opponent, goalscore_diff 3개의 feature를 불러옴
    fs.goalscore,
    
    #action type and result사이의 원핫인코딩값?
    # fs.actiontype_result_onehot,
    
    
    #action의 시작, 끝 위치
    fs.startlocation,
    fs.endlocation,
    
    #movement는 x좌표변화량, y좌표변화량
    fs.movement,
    
    #space_delta는 (x,y)좌표의 변화량
    fs.space_delta,
    
    #startlocation, endloaction에서 goal까지 거리랑 각도
    fs.startpolar,
    fs.endpolar,
    
    #home인지 away인지
    fs.team,
    
    #time : 3가지 feature가 나옴
    #1.period_id = 전반1 / 후반2
    #2.time_seconds = 전/후반의 시작 후에 time
    #3.time_seconds_overall = 게임시작후 초
    # fs.time,
    
    #time_delta1 = 현 action time_seconds - 전 action time_seconds
    #time_delta2 = 현 action time_seconds - 전전 action time_seconds
    fs.time_delta,
    
]

In [40]:
nb_prev_actions = 3

Xcols = fs.feature_column_names(xfns, nb_prev_actions)
Xcols

['type_id_a0',
 'type_id_a1',
 'type_id_a2',
 'type_pass_a0',
 'type_cross_a0',
 'type_throw_in_a0',
 'type_freekick_crossed_a0',
 'type_freekick_short_a0',
 'type_corner_crossed_a0',
 'type_corner_short_a0',
 'type_take_on_a0',
 'type_foul_a0',
 'type_tackle_a0',
 'type_interception_a0',
 'type_shot_a0',
 'type_shot_penalty_a0',
 'type_shot_freekick_a0',
 'type_keeper_save_a0',
 'type_keeper_claim_a0',
 'type_keeper_punch_a0',
 'type_keeper_pick_up_a0',
 'type_clearance_a0',
 'type_bad_touch_a0',
 'type_non_action_a0',
 'type_dribble_a0',
 'type_goalkick_a0',
 'type_pass_a1',
 'type_cross_a1',
 'type_throw_in_a1',
 'type_freekick_crossed_a1',
 'type_freekick_short_a1',
 'type_corner_crossed_a1',
 'type_corner_short_a1',
 'type_take_on_a1',
 'type_foul_a1',
 'type_tackle_a1',
 'type_interception_a1',
 'type_shot_a1',
 'type_shot_penalty_a1',
 'type_shot_freekick_a1',
 'type_keeper_save_a1',
 'type_keeper_claim_a1',
 'type_keeper_punch_a1',
 'type_keeper_pick_up_a1',
 'type_clearance_a1

### train, valid, test 데이터를 X, Y데이터로 변환하기

### game_id = 3773689는 score와 concede가 둘 다 true인 데이터가 존재해서 그냥 지워버림

In [45]:
def getXY(games,Xcols, features_h5, labels_h5):
    # generate the columns of the selected feature
    # X에 들어가는것이 독립변수이니까 중요한 feature가 들어가는 것은 맞지만
    # 남성 vs 여성 actions 차이
    # 시즌별 actions 차이
    # 리그별 actions 차이
    # 다양한 변수를 비교해서 actions를 비교해본다
    X = []
    for game_id in tqdm.tqdm(games.game_id, desc="Selecting features"):
        if game_id == 3773689:
            print("game_id = ",3773689,"인  game은 제거합니다")
            continue
        Xi = pd.read_hdf(features_h5, f"game_{game_id}")
        X.append(Xi[Xcols])
    X = pd.concat(X).reset_index(drop=True)

    # 2. Select label Y
    Ycols = ["scores","concedes"]
    Y = []
    for game_id in tqdm.tqdm(games.game_id, desc="Selecting label"):
        if game_id == 3773689:
            print("game_id = ",3773689,"인  game은 제거합니다")
            continue
        Yi = pd.read_hdf(labels_h5, f"game_{game_id}")
        Y.append(Yi[Ycols])
    Y = pd.concat(Y).reset_index(drop=True)
    
    return X, Y

In [46]:
X_train, Y_train = getXY(train_games,Xcols, features0_h5, labels0_h5)
X_train.shape, Y_train.shape

Selecting features: 100%|██████████| 452/452 [00:05<00:00, 80.59it/s]
Selecting label: 100%|██████████| 452/452 [00:02<00:00, 182.73it/s]


((998780, 154), (998780, 2))

In [47]:
X_valid, Y_valid = getXY(valid_games,Xcols, features1_h5, labels1_h5)
X_valid.shape, Y_valid.shape

Selecting features: 100%|██████████| 33/33 [00:00<00:00, 81.10it/s]
Selecting label: 100%|██████████| 33/33 [00:00<00:00, 190.08it/s]


((75786, 154), (75786, 2))

In [48]:
X_test, Y_test = getXY(test_games,Xcols, features2_h5, labels2_h5)
X_test.shape, Y_test.shape

Selecting features: 100%|██████████| 35/35 [00:00<00:00, 86.11it/s]


game_id =  3773689 인  game은 제거합니다


Selecting label: 100%|██████████| 35/35 [00:00<00:00, 182.12it/s]

game_id =  3773689 인  game은 제거합니다





((79812, 154), (79812, 2))

In [50]:
X_test

Unnamed: 0,type_id_a0,type_id_a1,type_id_a2,type_pass_a0,type_cross_a0,type_throw_in_a0,type_freekick_crossed_a0,type_freekick_short_a0,type_corner_crossed_a0,type_corner_short_a0,...,end_dist_to_goal_a0,end_angle_to_goal_a0,end_dist_to_goal_a1,end_angle_to_goal_a1,end_dist_to_goal_a2,end_angle_to_goal_a2,team_1,team_2,time_delta_1,time_delta_2
0,0,0,0,True,False,False,False,False,False,False,...,62.344111,0.040050,62.344111,0.040050,62.344111,0.040050,True,True,0.0,0.0
1,21,0,0,False,False,False,False,False,False,False,...,59.963744,0.041641,62.344111,0.040050,62.344111,0.040050,True,True,1.0,1.0
2,0,21,0,True,False,False,False,False,False,False,...,63.970052,0.229408,59.963744,0.041641,62.344111,0.040050,True,True,1.0,2.0
3,0,0,21,True,False,False,False,False,False,False,...,77.353726,0.247320,63.970052,0.229408,59.963744,0.041641,True,True,3.0,4.0
4,21,0,0,False,False,False,False,False,False,False,...,77.140580,0.245717,77.353726,0.247320,63.970052,0.229408,True,True,1.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79807,0,21,0,True,False,False,False,False,False,False,...,44.177657,0.549396,75.302099,0.417845,75.302099,0.417845,True,True,2.0,3.0
79808,8,0,21,False,False,False,False,False,False,False,...,67.452732,0.365382,71.166049,0.330111,47.354672,0.701388,False,False,2.0,4.0
79809,3,8,0,False,False,False,True,False,False,False,...,6.565622,0.105074,48.423868,0.520962,44.177657,0.549396,False,True,28.0,30.0
79810,21,3,8,False,False,False,False,False,False,False,...,98.061188,0.025458,98.472996,0.006993,67.452732,0.365382,False,True,4.0,32.0


In [51]:
Y_test

Unnamed: 0,scores,concedes
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False
...,...,...
79807,False,False
79808,False,False
79809,False,False
79810,False,False


In [52]:
Y_test[Y_test['scores'] & Y_test['concedes']]

Unnamed: 0,scores,concedes


## -Data preprocessing

1. time_delta 오류값 처리

X_train, X_valid, X_test에는 오류값이 들어있는데, 바로 time_delta값이다<br>
time_delta는 전 action과의 시간차이로 계산되는데, <br>
위 방식대로 각 경기를 연결하면, 현 경기 첫 time_delta = 0초 - 전 경기 마지막action time =음수값이 들어옴<br>

In [53]:
count1 = 0
count2 = 0
count3 = 0

for i,value in enumerate(X_train['time_delta_1']):
    if value<0:
        count1+=1
        
for i,value in enumerate(X_valid['time_delta_1']):
    if value<0:
        count2+=1
        
for i,value in enumerate(X_test['time_delta_1']):
    if value<0:
        count3+=1

count1,count2,count3

(452, 33, 34)

time_prprocessing함수는 time_delta_1, time_delta_2의 feature를 전처리하는 함수로<br>
음수로 표현된 잘못 된 값은 이전action이 없어서 나오는 값이므로 0으로 처리한다

In [55]:
def time_preprocessing(data):
    for i in tqdm.tqdm(range(len(data))):
        if data.loc[i,'time_delta_1'] < 0.0:
            data.loc[i,'time_delta_1'] = 0.0
            
        if data.loc[i,'time_delta_2'] < 0.0:
            data.loc[i,'time_delta_2'] = 0.0    

    return data

In [56]:
# X_train = time_preprocessing(X_train)
# X_valid = time_preprocessing(X_valid)
X_test = time_preprocessing(X_test)

100%|██████████| 79812/79812 [00:01<00:00, 58843.75it/s]


In [57]:
X_train.shape, X_valid.shape,X_test.shape

((998780, 154), (75786, 154), (79812, 154))

In [58]:
Y_train.shape, Y_valid.shape, Y_test.shape

((998780, 2), (75786, 2), (79812, 2))

2. test_data에는 score/concede가 모두 true인 경우가 있는데, 너무 예외 경우이므로 제외

In [59]:
print("Y_train ",Y_train.value_counts(),'\n')
print("Y_valid ",Y_valid.value_counts(),'\n')
print("Y_test ",Y_test.value_counts())

Y_train  scores  concedes
False   False       983349
True    False        13272
False   True          2159
Name: count, dtype: int64 

Y_valid  scores  concedes
False   False       74776
True    False         858
False   True          152
Name: count, dtype: int64 

Y_test  scores  concedes
False   False       78735
True    False         940
False   True          137
Name: count, dtype: int64


In [33]:
all_true_index = []
for i in range(len(Y_test)):
    if (Y_test.loc[i,'scores']==True) & (Y_test.loc[i,'concedes']==True):
        all_true_index.append(i)
        
print("remove index : ",all_true_index)

X_test.drop(all_true_index,axis=0,inplace=True,errors='ignore')
Y_test.drop(all_true_index,axis=0,inplace=True,errors='ignore')

remove index :  [78256, 78257, 78258, 78259]


In [34]:
X_test = X_test.reset_index().drop(columns=['index','level_0'],errors='ignore')
Y_test = Y_test.reset_index().drop(columns=['index','level_0'],errors='ignore')

In [60]:
Y_train.value_counts(), Y_valid.value_counts(),Y_test.value_counts()

(scores  concedes
 False   False       983349
 True    False        13272
 False   True          2159
 Name: count, dtype: int64,
 scores  concedes
 False   False       74776
 True    False         858
 False   True          152
 Name: count, dtype: int64,
 scores  concedes
 False   False       78735
 True    False         940
 False   True          137
 Name: count, dtype: int64)

In [36]:
X_train.to_csv('./soccer_binary_data/train/X_train')
Y_train.to_csv('./soccer_binary_data/train/Y_train')

In [37]:
X_valid.to_csv('./soccer_binary_data/valid/X_valid')
Y_valid.to_csv('./soccer_binary_data/valid/Y_valid')

In [61]:
X_test.to_csv('./soccer_binary_data/test/X_test')
Y_test.to_csv('./soccer_binary_data/test/Y_test')

3. goalscore를  승/패/무의 categorical data를 추가

In [116]:
def categorical_goal(X):
    except_feature = ['goalscore_team', 'goalscore_opponent', 'goalscore_diff']
    for except_col in except_feature:
    #카테고리로 넣었을 때는, 패무승 중인 경우 3가지로 구분해서 feature를 추가
        if except_col=='goalscore_diff':
            cate = "cate_" + except_col
            for i, value in tqdm.tqdm(enumerate(X[except_col]),desc='lose:0 equal:1 win:2'):
                if value<0:
                    X.loc[i,cate] = 0
                elif value==0:
                    X.loc[i,cate] = 1
                else:
                    X.loc[i,cate] = 2
                    
        X[except_col] = X[except_col].astype('float64')
                    
    return X

In [117]:
#패/무/승인 상황을 categorical호 집어넣는 데이터 -> new_c_train & new_c_test
#'cate_goalscore_diff' feature가 추가로 사용됨
# new_c_train = categorical_goal(c_train)
# new_c_test = categorical_goal(c_test)

lose:0 equal:1 win:2: 998777it [01:04, 15551.66it/s]
lose:0 equal:1 win:2: 157895it [00:09, 16107.85it/s]


4. zone categorical data추가

In [121]:
def get_zone_index(x, y):
    # zone_width = 104/12  # zone의 너비
    # zone_height = 68/8  # zone의 높이
    zone_width = 108/12  # zone의 너비
    zone_height = 72/8  # zone의 높이

    # x좌표가 속하는 zone의 인덱스 계산
    zone_col = int(x // zone_width)
    if zone_col == 12:  # 경계 체크
        print("sadas")
        zone_col -= 1

    # y좌표가 속하는 zone의 인덱스 계산
    zone_row = int(y // zone_height)
    if zone_row == 8:  # 경계 체크
        print("sadas")
        zone_row -= 1

    # zone의 인덱스를 반환
    return zone_row * 12 + zone_col

In [122]:
def location_zone(X):
    position_feature = [['start_x_a0','start_y_a0'],['start_x_a1','start_y_a1'],
                        ['start_x_a2','start_y_a2'],['end_x_a0','end_y_a0'],
                        ['end_x_a1','end_y_a1'],['end_x_a2','end_y_a2']]
    
    for x_col,y_col in position_feature:
        col_name = 'zone_' + x_col[-2:]
        for i,(x_val, y_val) in tqdm.tqdm(enumerate(zip(X[x_col],X[y_col])),desc='location discrete by zone'):
            X.loc[i,col_name] = get_zone_index(x_val,y_val)
            
    return X

## soccer binary data뿐 아니라 multiclass사용을 위해 multi_data를 만들 예정

In [62]:
# multi_X_train = pd.read_csv('./soccer_binary_data/train/X_train',index_col=0)
# multi_Y_train = pd.read_csv("./soccer_binary_data/train/Y_train",index_col=0)           

# multi_X_valid = pd.read_csv('./soccer_binary_data/valid/X_valid',index_col=0)
# multi_Y_valid = pd.read_csv("./soccer_binary_data/valid/Y_valid",index_col=0)           

multi_X_test = pd.read_csv('./soccer_binary_data/test/X_test',index_col=0)
multi_Y_test = pd.read_csv("./soccer_binary_data/test/Y_test",index_col=0)  

In [63]:
# multi_X_train.shape,multi_Y_train.shape,multi_X_valid.shape,multi_Y_valid.shape,
multi_X_test.shape,multi_Y_test.shape,

((79812, 154), (79812, 2))

In [64]:
def multi_class(Y):
    multi_label = pd.DataFrame(columns=['label'])
    
    for i in tqdm.tqdm(range(len(Y))):
        if (Y.loc[i,'scores']==False) & (Y.loc[i,'concedes']==False):
            multi_label.loc[i,'label'] = 0
        elif (Y.loc[i,'scores']==True) & (Y.loc[i,'concedes']==False):
            multi_label.loc[i,'label'] = 1
        elif (Y.loc[i,'scores']==False) & (Y.loc[i,'concedes']==True):
            multi_label.loc[i,'label'] = 2
        else:
            print("error : 'score=True, concede=True' is impossible")
            exit()
    return multi_label

In [66]:
# multi_Y_train = multi_class(multi_Y_train)
# multi_Y_valid = multi_class(multi_Y_valid)
multi_Y_test = multi_class(multi_Y_test)

100%|██████████| 79812/79812 [01:50<00:00, 723.26it/s] 


In [67]:
len(multi_Y_test)

79812

In [68]:
# multi_X_train.shape,multi_Y_train.shape,multi_X_valid.shape,multi_Y_valid.shape,
multi_X_test.shape,multi_Y_test.shape,

((79812, 154), (79812, 1))

In [69]:
# multi_Y_train.value_counts(),multi_Y_valid.value_counts(),
multi_Y_test.value_counts()

label
0        78735
1          940
2          137
Name: count, dtype: int64

In [70]:
# multi_X_train.to_csv('./soccer_multiclass_data/X_train')
# multi_Y_train.to_csv('./soccer_multiclass_data/Y_train')

# multi_X_valid.to_csv('./soccer_multiclass_data/X_valid')
# multi_Y_valid.to_csv('./soccer_multiclass_data/Y_valid')

multi_X_test.to_csv('./soccer_multiclass_data/X_test')
multi_Y_test.to_csv('./soccer_multiclass_data/Y_test')

In [71]:
pd.read_csv('./soccer_multiclass_data/X_test',index_col=0)

Unnamed: 0,type_id_a0,type_id_a1,type_id_a2,type_pass_a0,type_cross_a0,type_throw_in_a0,type_freekick_crossed_a0,type_freekick_short_a0,type_corner_crossed_a0,type_corner_short_a0,...,end_dist_to_goal_a0,end_angle_to_goal_a0,end_dist_to_goal_a1,end_angle_to_goal_a1,end_dist_to_goal_a2,end_angle_to_goal_a2,team_1,team_2,time_delta_1,time_delta_2
0,0,0,0,True,False,False,False,False,False,False,...,62.344111,0.040050,62.344111,0.040050,62.344111,0.040050,True,True,0.0,0.0
1,21,0,0,False,False,False,False,False,False,False,...,59.963744,0.041641,62.344111,0.040050,62.344111,0.040050,True,True,1.0,1.0
2,0,21,0,True,False,False,False,False,False,False,...,63.970052,0.229408,59.963744,0.041641,62.344111,0.040050,True,True,1.0,2.0
3,0,0,21,True,False,False,False,False,False,False,...,77.353726,0.247320,63.970052,0.229408,59.963744,0.041641,True,True,3.0,4.0
4,21,0,0,False,False,False,False,False,False,False,...,77.140580,0.245717,77.353726,0.247320,63.970052,0.229408,True,True,1.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79807,0,21,0,True,False,False,False,False,False,False,...,44.177657,0.549396,75.302099,0.417845,75.302099,0.417845,True,True,2.0,3.0
79808,8,0,21,False,False,False,False,False,False,False,...,67.452732,0.365382,71.166049,0.330111,47.354672,0.701388,False,False,2.0,4.0
79809,3,8,0,False,False,False,True,False,False,False,...,6.565622,0.105074,48.423868,0.520962,44.177657,0.549396,False,True,28.0,30.0
79810,21,3,8,False,False,False,False,False,False,False,...,98.061188,0.025458,98.472996,0.006993,67.452732,0.365382,False,True,4.0,32.0


In [72]:
pd.read_csv('./soccer_multiclass_data/Y_test',index_col=0)

Unnamed: 0,label
0,0
1,0
2,0
3,0
4,0
...,...
79807,0
79808,0
79809,0
79810,0
