In [1]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split

In [2]:
# List the competition's input directory to see which files are available.
!ls /kaggle/input/nfl-player-contact-detection

sample_submission.csv	   train
test			   train_baseline_helmets.csv
test_baseline_helmets.csv  train_labels.csv
test_player_tracking.csv   train_player_tracking.csv
test_video_metadata.csv    train_video_metadata.csv


In [3]:
# List the working directory before any output file is written.
!ls /kaggle/working/

__notebook__.ipynb


In [4]:
class CFG:
    """Notebook configuration: input locations plus a ``contact_id`` helper."""

    input_dir = '/kaggle/input/nfl-player-contact-detection'
    # --------------------- FIXED --------------------- #
    # NOTE: despite the `_dir` suffix, the two attributes below are CSV file
    # paths, not directories. The names are kept because other cells use them.
    train_labels_dir = os.path.join(input_dir, 'train_labels.csv')
    test_sub_dir = os.path.join(input_dir, 'sample_submission.csv')

    @staticmethod
    def expand_contact_id(df):
        """Add the underscore-separated parts of ``contact_id`` as new columns.

        The first three fields become ``game``, ``play`` and ``step``; the last
        two become ``nfl_player_id_1`` and ``nfl_player_id_2``. All new columns
        hold strings (no numeric conversion is performed).

        Declared as a ``staticmethod`` so that it works both as
        ``CFG.expand_contact_id(df)`` and through an instance; without the
        decorator an instance call would bind the instance to ``df``.

        Args:
            df: DataFrame with a string column named ``contact_id``.

        Returns:
            The same ``df`` object, modified in place with the added columns.

        Raises:
            KeyError: if ``df`` has no ``contact_id`` column.
            AttributeError: if a ``contact_id`` value is not a string.
            IndexError: if a ``contact_id`` has fewer than three fields.
        """
        # Split every id exactly once instead of once per derived column.
        fields = [contact_id.split('_') for contact_id in df['contact_id']]

        # (new column, position of the field inside the split id)
        layout = (
            ('game', 0),
            ('play', 1),
            ('step', 2),
            ('nfl_player_id_1', -2),
            ('nfl_player_id_2', -1),
        )
        for column, position in layout:
            df[column] = [parts[position] for parts in fields]
        return df

In [5]:
# Load the training labels and tag each row with its game id, i.e. the part of
# `game_play` that precedes the first underscore.
df_labels = pd.read_csv(CFG.train_labels_dir)
df_labels['game'] = [game_play.partition('_')[0] for game_play in df_labels['game_play']]

# Number of label rows per game, largest games first.
df_games_count = (
    df_labels
    .groupby('game')
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)

In [6]:
# The split is done over unique game ids rather than over individual label rows.
list_games = df_labels['game'].unique()
train_ids, valid_ids = train_test_split(
    list_games,
    test_size=0.05,   # fraction of games held out for validation
    random_state=11,  # fixed seed so the split is reproducible
)
(len(train_ids), len(valid_ids))

(141, 8)

In [7]:
# Show the game ids of each split, one array per line.
print(train_ids, valid_ids, sep='\n')

['58227' '58316' '58407' '58314' '58387' '58527' '58567' '58261' '58512'
 '58326' '58415' '58573' '58330' '58173' '58217' '58285' '58200' '58198'
 '58571' '58558' '58205' '58399' '58168' '58538' '58541' '58282' '58226'
 '58518' '58516' '58577' '58251' '58574' '58308' '58362' '58180' '58290'
 '58514' '58507' '58302' '58247' '58560' '58274' '58503' '58520' '58311'
 '58582' '58401' '58266' '58215' '58293' '58216' '58221' '58341' '58530'
 '58579' '58225' '58176' '58233' '58568' '58257' '58517' '58403' '58511'
 '58537' '58561' '58508' '58241' '58491' '58543' '58536' '58528' '58553'
 '58279' '58535' '58281' '58211' '58550' '58209' '58510' '58255' '58220'
 '58213' '58366' '58240' '58336' '58329' '58555' '58406' '58188' '58174'
 '58172' '58306' '58509' '58284' '58575' '58291' '58177' '58504' '58565'
 '58260' '58545' '58544' '58295' '58187' '58203' '58270' '58540' '58301'
 '58190' '58519' '58414' '58551' '58224' '58218' '58506' '58368' '58321'
 '58548' '58529' '58267' '58331' '58327' '58581' '5

## Save dataset

In [8]:
# Rows whose second "player" is the literal 'G' go to the p2g frame; every
# other row goes to the p2p frame.
second_is_g_all = df_labels['nfl_player_id_2'] == 'G'
df_train_p2p_all = df_labels[~second_is_g_all]
df_train_p2g_all = df_labels[second_is_g_all]

In [9]:
# Route each label row to train or validation according to its game id.
game_of_row = df_labels['game']
df_train_all = df_labels.loc[game_of_row.isin(train_ids)]
df_valid_all = df_labels.loc[game_of_row.isin(valid_ids)]

In [10]:
# Build the three boolean masks once, then combine them for the four subsets
# (train/valid x p2p/p2g).
in_train_games = df_labels['game'].isin(train_ids)
in_valid_games = df_labels['game'].isin(valid_ids)
second_is_g = df_labels['nfl_player_id_2'] == 'G'

df_train_p2p = df_labels[in_train_games & ~second_is_g]
df_train_p2g = df_labels[in_train_games & second_is_g]
df_valid_p2p = df_labels[in_valid_games & ~second_is_g]
df_valid_p2g = df_labels[in_valid_games & second_is_g]

In [11]:
# Write every split to the current working directory. `to_csv` is called with
# its defaults, so the DataFrame index is written as the first column.
frames_by_csv_name = {
    'train_labels_all.csv': df_train_all,
    'valid_labels_all.csv': df_valid_all,
    'train_labels_p2p.csv': df_train_p2p,
    'train_labels_p2g.csv': df_train_p2g,
    'valid_labels_p2p.csv': df_valid_p2p,
    'valid_labels_p2g.csv': df_valid_p2g,
}
for csv_name, frame in frames_by_csv_name.items():
    frame.to_csv(csv_name)

In [12]:
# List the current directory to confirm the label CSVs were written.
!ls

__notebook__.ipynb    train_labels_p2p.csv  valid_labels_p2p.csv
train_labels_all.csv  valid_labels_all.csv
train_labels_p2g.csv  valid_labels_p2g.csv


## Test set

In [13]:
# Read the sample submission and derive the game/play/step/player columns
# from its `contact_id` field.
df_test_labels = pd.read_csv(CFG.test_sub_dir).pipe(CFG.expand_contact_id)

In [14]:
# Same p2p / p2g partition as for the training labels: 'G' in the second
# player slot selects the p2g rows.
test_second_is_g = df_test_labels['nfl_player_id_2'] == 'G'
df_test_p2p = df_test_labels[~test_second_is_g]
df_test_p2g = df_test_labels[test_second_is_g]

In [15]:
# Row counts of the two test subsets, displayed as (p2p, p2g).
tuple(len(subset) for subset in (df_test_p2p, df_test_p2g))

(45276, 4312)