In [1]:
import os
import numpy as np
import pandas as pd

from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import train_test_split

In [2]:
!ls /kaggle/input/nfl-split

train_labels.csv      train_labels_p2p.csv  valid_labels_all.csv
train_labels_all.csv  train_p2g_all.csv     valid_labels_p2g.csv
train_labels_p2g.csv  train_p2p_all.csv     valid_labels_p2p.csv


In [3]:
!ls /kaggle/input/nfl-player-contact-detection/

sample_submission.csv	   train
test			   train_baseline_helmets.csv
test_baseline_helmets.csv  train_labels.csv
test_player_tracking.csv   train_player_tracking.csv
test_video_metadata.csv    train_video_metadata.csv


In [4]:
class CFG:
    """Notebook configuration: input file locations plus a contact-id helper.

    All attributes are plain class attributes, so they are read as
    ``CFG.<name>`` without instantiating the class.
    """

    input_dir = '/kaggle/input/nfl-player-contact-detection'
    split_dir = '/kaggle/input/nfl-split'
    # --------------------- FIXED --------------------- #
    # NOTE: despite the ``*_dir`` suffix, these four attributes are CSV file paths.
    train_labels_dir = os.path.join(input_dir, 'train_labels.csv')
    train_tracking_dir = os.path.join(input_dir, 'train_player_tracking.csv')
    test_sub_dir = os.path.join(input_dir, 'sample_submission.csv')
    test_tracking_dir = os.path.join(input_dir, 'test_player_tracking.csv')

    # Label CSVs stored in the split dataset.
    train_all = os.path.join(split_dir, 'train_labels.csv')
    train_labels_all = os.path.join(split_dir, 'train_labels_all.csv')
    train_labels_p2p = os.path.join(split_dir, 'train_labels_p2p.csv')
    train_labels_p2g = os.path.join(split_dir, 'train_labels_p2g.csv')
    valid_labels_all = os.path.join(split_dir, 'valid_labels_all.csv')
    valid_labels_p2p = os.path.join(split_dir, 'valid_labels_p2p.csv')
    valid_labels_p2g = os.path.join(split_dir, 'valid_labels_p2g.csv')
    train_p2p_labels = os.path.join(split_dir, 'train_p2p_all.csv')
    train_p2g_labels = os.path.join(split_dir, 'train_p2g_all.csv')

    # Short name -> label CSV path; the keys are also used as output file prefixes.
    labels_dict = {
        'train': train_all,
        'train_all': train_labels_all,
        'train_p2p': train_labels_p2p,
        'train_p2g': train_labels_p2g,
        'valid_all': valid_labels_all,
        'valid_p2p': valid_labels_p2p,
        'valid_p2g': valid_labels_p2g,
        'p2p_all': train_p2p_labels,
        'p2g_all': train_p2g_labels,
    }

    # --------------------- SPECIFY ----------------------- #
    default_labels = labels_dict['valid_all']

    @staticmethod
    def expand_contact_id(df):
        """Split ``contact_id`` into its underscore-separated components.

        ``contact_id`` is expected to look like
        ``<game>_<play>_<step>_<player_1>_<player_2>``.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame with a string ``contact_id`` column. It is modified in place.

        Returns
        -------
        pandas.DataFrame
            The same ``df`` object with ``game_play``, ``game``, ``play``,
            ``step`` (int), ``nfl_player_id_1`` and ``nfl_player_id_2`` added
            or overwritten.

        Notes
        -----
        Declared as a ``staticmethod`` so it can be called on the class or on
        an instance. The id is split once and reused for every derived column.
        """
        parts = df['contact_id'].str.split('_')
        game = parts.str[0]
        play = parts.str[1]
        # Built from the first two fields rather than a fixed 12-character
        # slice, so it does not depend on the width of the game/play ids.
        df['game_play'] = game + '_' + play
        df['game'] = game
        df['play'] = play
        df['step'] = parts.str[2].astype(int)
        # The player ids are taken from the end, as the last two fields.
        df['nfl_player_id_1'] = parts.str[-2]
        df['nfl_player_id_2'] = parts.str[-1]
        return df

In [5]:
# Raw per-player tracking table from the competition data.
train_tracking = pd.read_csv(CFG.train_tracking_dir)
# Label split selected by CFG.default_labels; its first CSV column is used as the index.
train_labels = pd.read_csv(CFG.default_labels, index_col=0)
# train_helmets = pd.read_csv("/kaggle/input/nfl-player-contact-detection/train_baseline_helmets.csv")
# train_video_metadata = pd.read_csv("/kaggle/input/nfl-player-contact-detection/train_video_metadata.csv")

In [6]:
def create_features(df_label, df_tracking, cols, merge_col="step"):
    """Attach both players' tracking values to each label row and add distances.

    Parameters
    ----------
    df_label : pandas.DataFrame
        One row per player pair. Must contain ``game_play``, ``merge_col``,
        ``nfl_player_id_1`` and ``nfl_player_id_2``. Not modified.
    df_tracking : pandas.DataFrame
        Tracking rows containing ``game_play``, ``merge_col``,
        ``nfl_player_id`` and every column named in ``cols``.
    cols : sequence of str
        Tracking columns to attach. Must include ``x_position`` and
        ``y_position`` because the distance features are computed from them.
    merge_col : str, default "step"
        Time-like key used together with ``game_play`` and the player id.

    Returns
    -------
    pandas.DataFrame
        The label columns followed by ``<col>_1`` / ``<col>_2`` for each entry
        of ``cols``, then ``G_flag`` (1 when ``nfl_player_id_2 == 'G'``),
        ``distance_x``, ``distance_y`` and ``distance``. Rows are sorted by
        ``game_play``, ``merge_col`` and both player ids, with a fresh index.
        Pairs with no tracking row on one side (e.g. a second id of ``'G'``
        that is absent from the tracking table) get NaN tracking and distance
        values because both joins are left joins.

    Raises
    ------
    KeyError
        If ``cols`` does not contain ``x_position`` and ``y_position``.
    """
    cols = list(cols)
    missing = [c for c in ("x_position", "y_position") if c not in cols]
    if missing:
        # Fail before the merges instead of after them.
        raise KeyError(f"cols must include {missing} to compute distance features")

    # Slice and cast the tracking table once; it is reused for both players.
    tracking = (
        df_tracking[["game_play", merge_col, "nfl_player_id"] + cols]
        .astype({"nfl_player_id": str})
    )

    df_combo = df_label.astype({"nfl_player_id_1": str, "nfl_player_id_2": str})
    for side in ("1", "2"):
        id_col = f"nfl_player_id_{side}"
        # Name the per-player columns explicitly rather than relying on pandas'
        # overlap suffixes, which silently yield `_x`/`_y` columns when the
        # label frame already has a column named like one of `cols`.
        renames = {c: f"{c}_{side}" for c in cols}
        renames["nfl_player_id"] = id_col
        df_combo = df_combo.merge(
            tracking.rename(columns=renames),
            on=["game_play", merge_col, id_col],
            how="left",
        )

    df_combo = (
        df_combo
        .sort_values(["game_play", merge_col, "nfl_player_id_1", "nfl_player_id_2"])
        .reset_index(drop=True)
    )

    df_combo['G_flag'] = (df_combo['nfl_player_id_2'] == 'G').astype(int)

    df_combo['distance_x'] = np.abs(df_combo['x_position_1'] - df_combo['x_position_2'])
    df_combo['distance_y'] = np.abs(df_combo['y_position_1'] - df_combo['y_position_2'])
    # Euclidean distance between the two players; NaN propagates for missing sides.
    df_combo['distance'] = np.hypot(df_combo['distance_x'], df_combo['distance_y'])

    return df_combo

In [7]:
# Tracking columns attached to each player of a pair.
use_cols = [
    'x_position', 'y_position', 'speed', 'distance',
    'direction', 'orientation', 'acceleration', 'sa',
]

# Columns kept in the exported feature files: the ids/target, the ground flag,
# every tracking column for player 1 then player 2, and the pair distance.
feats = (
    ['contact_id', 'contact', 'G_flag']
    + [f'{col}_1' for col in use_cols]
    + [f'{col}_2' for col in use_cols]
    + ['distance']
)

df_train = create_features(train_labels, train_tracking, use_cols)

In [8]:
# Build the feature table for every label split in CFG.labels_dict and write it
# to '<split name>_feats.csv' in the working directory.
for label in CFG.labels_dict:
    dir_ = CFG.labels_dict[label]
    print(label, dir_)

    # NOTE: `train_labels` and `df_train` are rebound on every iteration, so after
    # the loop they hold the data of the last split in the dict.
    train_labels = pd.read_csv(dir_)
    df_train = create_features(train_labels, train_tracking, use_cols)
    df_train = df_train[feats]
    df_train.to_csv(f'{label}_feats.csv')

train /kaggle/input/nfl-split/train_labels.csv
train_all /kaggle/input/nfl-split/train_labels_all.csv
train_p2p /kaggle/input/nfl-split/train_labels_p2p.csv
train_p2g /kaggle/input/nfl-split/train_labels_p2g.csv
valid_all /kaggle/input/nfl-split/valid_labels_all.csv
valid_p2p /kaggle/input/nfl-split/valid_labels_p2p.csv
valid_p2g /kaggle/input/nfl-split/valid_labels_p2g.csv
p2p_all /kaggle/input/nfl-split/train_p2p_all.csv
p2g_all /kaggle/input/nfl-split/train_p2g_all.csv


In [9]:
# NOTE: `df_train` was rebound inside the export loop, so this shows the features
# of the last split processed there ('p2g_all'), not the frame built from
# CFG.default_labels.
df_train

Unnamed: 0,contact_id,contact,G_flag,x_position_1,y_position_1,speed_1,distance_1,direction_1,orientation_1,acceleration_1,sa_1,x_position_2,y_position_2,speed_2,distance_2,direction_2,orientation_2,acceleration_2,sa_2,distance
0,58168_003392_0_37084_G,0,1,41.90,20.08,0.54,0.06,252.69,262.31,0.92,0.90,,,,,,,,,
1,58168_003392_0_37211_G,0,1,39.59,17.07,0.53,0.05,134.84,84.73,1.43,1.42,,,,,,,,,
2,58168_003392_0_38556_G,0,1,41.93,30.61,0.67,0.05,232.50,227.00,1.82,1.61,,,,,,,,,
3,58168_003392_0_38567_G,0,1,40.37,19.88,0.66,0.07,136.70,88.92,0.90,0.89,,,,,,,,,
4,58168_003392_0_38590_G,0,1,40.33,25.28,0.52,0.06,141.08,100.37,0.59,0.58,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
410628,58582_003121_91_48220_G,0,1,33.18,25.26,2.55,0.26,95.39,152.22,0.43,-0.40,,,,,,,,,
410629,58582_003121_91_52493_G,0,1,65.04,38.68,1.31,0.14,168.40,143.47,0.74,-0.69,,,,,,,,,
410630,58582_003121_91_52500_G,0,1,58.74,40.11,1.34,0.13,204.96,136.56,1.23,-1.20,,,,,,,,,
410631,58582_003121_91_52609_G,0,1,60.32,25.93,1.38,0.15,261.77,269.45,0.35,-0.30,,,,,,,,,


In [10]:
!ls /kaggle/working

__notebook__.ipynb   train_feats.csv	  valid_p2g_feats.csv
p2g_all_feats.csv    train_p2g_feats.csv  valid_p2p_feats.csv
p2p_all_feats.csv    train_p2p_feats.csv
train_all_feats.csv  valid_all_feats.csv


## Test: features for the sample-submission pairs

In [11]:
# Load the sample submission and expand its contact_id into the
# game_play / step / player-id columns that create_features merges on.
test_labels = CFG.expand_contact_id(pd.read_csv(CFG.test_sub_dir))
test_tracking = pd.read_csv(CFG.test_tracking_dir)
# test_helmets = pd.read_csv("/kaggle/input/nfl-player-contact-detection/test_baseline_helmets.csv")
# test_video_metadata = pd.read_csv("/kaggle/input/nfl-player-contact-detection/test_video_metadata.csv")

# `use_cols` (and `feats`) are the lists defined in the training section; they are
# reused here instead of being re-declared so train and test cannot drift apart.
df_test = create_features(test_labels, test_tracking, use_cols)

In [12]:
# Re-derive game_play / game / play / step / player ids from `contact_id` and
# display the test feature frame.
# NOTE(review): `test_labels` was already expanded before create_features and those
# columns are carried through the merges, so this call looks redundant — confirm.
df_test = CFG.expand_contact_id(df_test)
df_test

Unnamed: 0,contact_id,contact,game_play,game,play,step,nfl_player_id_1,nfl_player_id_2,x_position_1,y_position_1,...,speed_2,distance_2,direction_2,orientation_2,acceleration_2,sa_2,G_flag,distance_x,distance_y,distance
0,58168_003392_0_37084_37211,0,58168_003392,58168,003392,0,37084,37211,41.90,20.08,...,0.53,0.05,134.84,84.73,1.43,1.42,0,2.31,3.01,3.794232
1,58168_003392_0_37084_38556,0,58168_003392,58168,003392,0,37084,38556,41.90,20.08,...,0.67,0.05,232.50,227.00,1.82,1.61,0,0.03,10.53,10.530043
2,58168_003392_0_37084_38567,0,58168_003392,58168,003392,0,37084,38567,41.90,20.08,...,0.66,0.07,136.70,88.92,0.90,0.89,0,1.53,0.20,1.543017
3,58168_003392_0_37084_38590,0,58168_003392,58168,003392,0,37084,38590,41.90,20.08,...,0.52,0.06,141.08,100.37,0.59,0.58,0,1.57,5.20,5.431841
4,58168_003392_0_37084_39947,0,58168_003392,58168,003392,0,37084,39947,41.90,20.08,...,0.99,0.09,163.38,90.69,1.68,1.64,0,1.79,6.65,6.886697
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49583,58172_003247_125_52521_52939,0,58172_003247,58172,003247,125,52521,52939,23.44,4.04,...,2.29,0.24,211.73,215.11,2.43,-2.43,0,14.50,1.94,14.629204
49584,58172_003247_125_52521_G,0,58172_003247,58172,003247,125,52521,G,23.44,4.04,...,,,,,,,1,,,
49585,58172_003247_125_52852_52939,0,58172_003247,58172,003247,125,52852,52939,32.67,2.18,...,2.29,0.24,211.73,215.11,2.43,-2.43,0,5.27,0.08,5.270607
49586,58172_003247_125_52852_G,0,58172_003247,58172,003247,125,52852,G,32.67,2.18,...,,,,,,,1,,,
