In [1]:
import sys
# Prepend the project directory to the module search path. Presumably this is
# where the `aux_functions` package imported in the next cell lives — TODO confirm.
sys.path.insert(0,'/home/ubuntu/soccer_videos/conv_3D/')

In [2]:
import numpy as np
import pandas as pd
import hashlib
import pickle
import glob
from sklearn.model_selection import train_test_split
from aux_functions.soccer_functions import transform_data
from aux_functions.soccer_functions import filter_clips
from aux_functions.soccer_functions import split_video
from aux_functions.soccer_functions import dataframe_from_splits
from aux_functions.soccer_functions import save_clips
# Setting this option to None switches off pandas' chained-assignment
# (SettingWithCopy) warnings for the rest of the session.
pd.options.mode.chained_assignment = None

Using TensorFlow backend.


In [3]:
# Load the dataset annotations; transform_data's result is unpacked into four
# objects, of which only events_df is used for the split below.
events_df, story_df, shot_bound_df, shot_type_df = transform_data(
    path='/home/ubuntu/soccer_videos/data/comprehensive_soccer_dataset/'
)

# Two-stage split, stratified on the event 'type' column:
#   1) 60% train / 40% hold-out,
#   2) the hold-out is halved into validation and test.
# The same fixed seed is used in both stages.
X_train, X_val = train_test_split(
    events_df,
    test_size=0.4,
    random_state=10,
    stratify=events_df['type'],
)
X_val, X_test = train_test_split(
    X_val,
    test_size=0.5,
    random_state=10,
    stratify=X_val['type'],
)

In [5]:
# Display the events table produced by transform_data.
events_df

Unnamed: 0,type,start,end,replay,video,league,duration
0,foul,5110,5141,0,0000,event,1.24
1,shot,7934,7971,0,0000,event,1.48
2,foul,10184,10209,0,0000,event,1.00
3,foul,10470,10861,1,0000,event,15.64
4,free-kick,11450,11476,0,0000,event,1.04
5,foul,17146,17191,0,0000,event,1.80
6,foul,17380,17501,1,0000,event,4.84
7,foul,17667,17731,0,0000,event,2.56
8,shot,25212,25235,0,0000,event,0.92
9,shot,25462,25740,1,0000,event,11.12


In [4]:
# Display the training portion of the stratified split.
X_train

Unnamed: 0,type,start,end,replay,video,league,duration
2739,shot,33813,34206,1,1014,event,15.72
10222,shot,14694,14729,0,1058,event,1.40
4179,foul,47630,47777,1,1011,event,5.88
3351,foul,11035,11238,1,1030,event,8.12
6681,foul,57687,57770,0,1261,event,3.32
6750,free-kick,61011,61045,0,1262,event,1.36
1546,shot,25652,26093,1,0020,event,17.64
9421,shot,3510,3552,0,0073,event,1.68
5497,foul,40879,40942,0,1043,event,2.52
5701,goal,26163,26972,1,1047,event,32.36


In [None]:
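# Disabled on purpose (kept for reference): uncomment the call below to run save_clips on events_df with these paths.
# save_clips(events_df, videos_path='/home/ubuntu/soccer_videos/data/comprehensive_soccer_dataset/videos/', save_path='/home/ubuntu/soccer_videos/data/comprehensive_soccer_dataset/train_clips/')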

In [None]:
# Replace the shuffled original row labels with a fresh 0..n-1 index on each
# partition (the old index is discarded, not kept as a column).
X_train, X_val, X_test = (
    partition.reset_index(drop=True)
    for partition in (X_train, X_val, X_test)
)

In [None]:
# Per-class row counts for each partition. np.unique returns (values, counts);
# only the counts array (ordered by sorted unique 'type' value) is kept.
train_counts, val_counts, test_counts = (
    np.unique(partition['type'], return_counts=True)[1]
    for partition in (X_train, X_val, X_test)
)

In [None]:
# Reproducibility check: the SHA-256 of each partition's per-row pandas hashes
# (index included) must match the digest recorded here.
for frame, expected_digest in (
    (X_train, 'c5ff5ef6e9d86d69eb2d1002c1108cc49f5a2148f0e9d27721fa7039ff12c1f2'),
    (X_val, '03cdeff4338920ce56d16a18793992cfcb5f03d3f026399acc657c17fa618e31'),
    (X_test, 'abbbbb79a6a975d595fc9c2134545cb5d10e47fe90acf743e8a62a583daef404'),
):
    row_hashes = pd.util.hash_pandas_object(frame, index=True).values
    assert hashlib.sha256(row_hashes).hexdigest() == expected_digest

In [None]:
# Persist the three event partitions as pickles in the current working directory.
for pickle_name, partition in (
    ('X_train.pkl', X_train),
    ('X_val.pkl', X_val),
    ('X_test.pkl', X_test),
):
    partition.to_pickle(pickle_name)

In [None]:
# Keep only the rows whose 'replay' column equals 0 in each partition.
X_train_no_replay = X_train.loc[X_train['replay'].eq(0)]
X_val_no_replay = X_val.loc[X_val['replay'].eq(0)]
X_test_no_replay = X_test.loc[X_test['replay'].eq(0)]

In [None]:
# Persist the replay-free partitions as pickles in the current working directory.
for pickle_name, partition in (
    ('X_train_no_replay.pkl', X_train_no_replay),
    ('X_val_no_replay.pkl', X_val_no_replay),
    ('X_test_no_replay.pkl', X_test_no_replay),
):
    partition.to_pickle(pickle_name)

In [None]:
# Split identifiers: the text after the last '_' of every path matched by the
# glob (the whole path string is split, not just the file name).
# NOTE(review): glob returns matches in arbitrary order, so the row order of
# clips_df presumably depends on the filesystem — confirm before relying on
# the digests asserted further down on another machine.
splits = [filename.split('_')[-1] for filename in glob.glob('/home/ubuntu/soccer_videos/data/comprehensive_soccer_dataset/splits/*')]

# Build every per-split frame first and concatenate once. DataFrame.append was
# removed in pandas 2.0, and appending inside a loop re-copies the accumulated
# frame on every iteration. If the glob matches nothing, pd.concat raises
# ValueError("No objects to concatenate").
clips_df = pd.concat([
    dataframe_from_splits(split, events_df, split_path = '/home/ubuntu/soccer_videos/data/comprehensive_soccer_dataset/splits/')
    for split in splits
])

# Keep only the clips labelled 'none'.
clips_df = clips_df[clips_df['type'] == 'none']

In [None]:
# Same 60/20/20 scheme and seed as the event split, but without stratification:
# first carve off 40% as a hold-out, then halve it into validation and test.
X_train_splits, X_val_splits = train_test_split(
    clips_df,
    test_size=0.4,
    random_state=10,
)
X_val_splits, X_test_splits = train_test_split(
    X_val_splits,
    test_size=0.5,
    random_state=10,
)

In [None]:
# Give each clip partition a fresh 0..n-1 index, discarding the old labels.
X_train_splits, X_val_splits, X_test_splits = (
    partition.reset_index(drop=True)
    for partition in (X_train_splits, X_val_splits, X_test_splits)
)

In [None]:
# Reproducibility check: the SHA-256 of each clip partition's per-row pandas
# hashes (index included) must match the digest recorded here.
for frame, expected_digest in (
    (X_train_splits, 'be6af481469f54a3bc18918f5b2169572d3a31eb29d52a4b037fb398c9031531'),
    (X_val_splits, '75d8d6315be78340dc36a32443ed999603525e4e80bed8f32a95262c91b2fc95'),
    (X_test_splits, '025b1a022acab0e132489c5bb255e6a24df4ca0537141919c7966712a2727c0c'),
):
    row_hashes = pd.util.hash_pandas_object(frame, index=True).values
    assert hashlib.sha256(row_hashes).hexdigest() == expected_digest

In [None]:
# Persist the three clip partitions as pickles in the current working directory.
for pickle_name, partition in (
    ('X_train_splits.pkl', X_train_splits),
    ('X_val_splits.pkl', X_val_splits),
    ('X_test_splits.pkl', X_test_splits),
):
    partition.to_pickle(pickle_name)

In [None]:
# Combine each event partition with the matching clip partition, shuffle the
# rows with a fixed seed (sample(frac=1) returns all rows in random order) and
# renumber the index.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0; with
# default arguments the two produce the same frame.
X_train_full = pd.concat([X_train, X_train_splits]).sample(frac=1, random_state=10).reset_index(drop=True)
X_val_full = pd.concat([X_val, X_val_splits]).sample(frac=1, random_state=10).reset_index(drop=True)
X_test_full = pd.concat([X_test, X_test_splits]).sample(frac=1, random_state=10).reset_index(drop=True)

In [None]:
# Reproducibility check: the SHA-256 of each combined set's per-row pandas
# hashes (index included) must match the digest recorded here.
for frame, expected_digest in (
    (X_train_full, '8b9144a4986ddc1a68a32eee6a53be02d39859a4fdf49661e4798063cdf8f86c'),
    (X_val_full, '808c29630138e6ffcd8ee7306a9f8a03041b7cfcc3fa84ca3984ca74122da307'),
    (X_test_full, '4af729799a42573e8353db28d0c2d5abc9e0ca2982a566bbeb3b99a372fe4d58'),
):
    row_hashes = pd.util.hash_pandas_object(frame, index=True).values
    assert hashlib.sha256(row_hashes).hexdigest() == expected_digest

In [None]:
# Persist the combined (events + clips) sets as pickles in the current working directory.
for pickle_name, partition in (
    ('X_train_full.pkl', X_train_full),
    ('X_val_full.pkl', X_val_full),
    ('X_test_full.pkl', X_test_full),
):
    partition.to_pickle(pickle_name)

In [None]:
# Same construction as the full sets, but starting from the replay-free event
# partitions: concatenate with the clip partition, shuffle with a fixed seed,
# renumber the index.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0; with
# default arguments the two produce the same frame.
X_train_full_no_replay = pd.concat([X_train_no_replay, X_train_splits]).sample(frac=1, random_state=10).reset_index(drop=True)
X_val_full_no_replay = pd.concat([X_val_no_replay, X_val_splits]).sample(frac=1, random_state=10).reset_index(drop=True)
X_test_full_no_replay = pd.concat([X_test_no_replay, X_test_splits]).sample(frac=1, random_state=10).reset_index(drop=True)

In [None]:
# Reproducibility check: the SHA-256 of each replay-free combined set's per-row
# pandas hashes (index included) must match the digest recorded here.
for frame, expected_digest in (
    (X_train_full_no_replay, '265f71d211c03c5203fa0a0f623d34a7e5242447937a712e1575afed75c69a1a'),
    (X_val_full_no_replay, 'f23dec4b1a82799725b77d83e655c7e97aff4c28d27b095a69a0af6045147fc2'),
    (X_test_full_no_replay, '8e10042ffc172e4112d19e7922d47e106892dd0afed24e7925cf54df41296637'),
):
    row_hashes = pd.util.hash_pandas_object(frame, index=True).values
    assert hashlib.sha256(row_hashes).hexdigest() == expected_digest

In [None]:
# Persist the replay-free combined sets as pickles in the current working directory.
for pickle_name, partition in (
    ('X_train_full_no_replay.pkl', X_train_full_no_replay),
    ('X_val_full_no_replay.pkl', X_val_full_no_replay),
    ('X_test_full_no_replay.pkl', X_test_full_no_replay),
):
    partition.to_pickle(pickle_name)

In [None]:
# Bar chart of rows per 'type' in the combined training set.
# `kind` is passed by keyword: Series.plot rejects positional arguments with a
# TypeError on pandas >= 1.0.
X_train_full['type'].value_counts().plot(kind='bar', title='X_train_full: rows per type')

In [None]:
# Bar chart of rows per 'type' in the combined validation set.
# `kind` is passed by keyword: Series.plot rejects positional arguments with a
# TypeError on pandas >= 1.0.
X_val_full['type'].value_counts().plot(kind='bar', title='X_val_full: rows per type')

In [None]:
# Bar chart of rows per 'type' in the combined test set.
# `kind` is passed by keyword: Series.plot rejects positional arguments with a
# TypeError on pandas >= 1.0.
X_test_full['type'].value_counts().plot(kind='bar', title='X_test_full: rows per type')

In [None]:
# Event types to select.
eval_class = ['shot', 'free-kick', 'goal', 'penalty-kick']

# Boolean mask over the rows of X_test_full (note: despite its name,
# `train_rows` is computed on the test set): True where 'type' is one of the
# labels in eval_class.
if isinstance(eval_class, list):
    # Vectorized membership test; equivalent to OR-ing one equality
    # comparison per label.
    train_rows = X_test_full['type'].isin(eval_class)
else:
    # Anything that is not a list selects no rows (unchanged fallback).
    train_rows = np.zeros(len(X_test_full), dtype=bool)

In [None]:
# Number of rows selected by the mask (count of True entries).
int(train_rows.sum())

In [None]:
# Total number of rows covered by the mask.
train_rows.shape[0]