In [1]:
import os
import numpy as np
import pandas as pd

from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import train_test_split

In [2]:
!ls /kaggle/input/nfl-split

train_labels.csv      train_labels_p2p.csv  valid_labels_p2p.csv
train_labels_all.csv  valid_labels_all.csv
train_labels_p2g.csv  valid_labels_p2g.csv


In [3]:
!ls /kaggle/input/nfl-player-contact-detection/

sample_submission.csv	   train
test			   train_baseline_helmets.csv
test_baseline_helmets.csv  train_labels.csv
test_player_tracking.csv   train_player_tracking.csv
test_video_metadata.csv    train_video_metadata.csv


In [4]:
class CFG:
    """Notebook configuration: CSV locations, label-split lookup and a contact_id parser.

    Note that, despite the ``*_dir`` suffix, ``train_labels_dir``,
    ``train_tracking_dir``, ``test_sub_dir`` and ``test_tracking_dir`` are paths
    to individual CSV files, not directories.
    """

    input_dir = '/kaggle/input/nfl-player-contact-detection'
    split_dir = '/kaggle/input/nfl-split'

    # --------------------- FIXED --------------------- #
    # Files shipped with the competition dataset.
    train_labels_dir = os.path.join(input_dir, 'train_labels.csv')
    train_tracking_dir = os.path.join(input_dir, 'train_player_tracking.csv')
    test_sub_dir = os.path.join(input_dir, 'sample_submission.csv')
    test_tracking_dir = os.path.join(input_dir, 'test_player_tracking.csv')

    # Label files from the split dataset (train/valid x all/p2p/p2g).
    train_all = os.path.join(split_dir, 'train_labels.csv')
    train_labels_all = os.path.join(split_dir, 'train_labels_all.csv')
    train_labels_p2p = os.path.join(split_dir, 'train_labels_p2p.csv')
    train_labels_p2g = os.path.join(split_dir, 'train_labels_p2g.csv')
    valid_labels_all = os.path.join(split_dir, 'valid_labels_all.csv')
    valid_labels_p2p = os.path.join(split_dir, 'valid_labels_p2p.csv')
    valid_labels_p2g = os.path.join(split_dir, 'valid_labels_p2g.csv')

    # Short name -> label file path; iteration order is the insertion order below.
    labels_dict = {
        'train': train_all,
        'train_all': train_labels_all,
        'train_p2p': train_labels_p2p,
        'train_p2g': train_labels_p2g,
        'valid_all': valid_labels_all,
        'valid_p2p': valid_labels_p2p,
        'valid_p2g': valid_labels_p2g
    }

    # --------------------- SPECIFY ----------------------- #
    # Label file used when a cell loads "the default" split.
    default_labels = labels_dict['valid_all']

    @staticmethod
    def expand_contact_id(df):
        """Split ``contact_id`` into its underscore-separated components.

        The id is read as ``<game>_<play>_<step>_<player 1>_<player 2>``.
        The following columns are added to ``df`` **in place** and the same
        frame is returned:

        - ``game_play``: ``<game>_<play>``
        - ``game``, ``play``: first and second component (strings)
        - ``step``: third component, cast to ``int`` so it can be used as a
          merge key against an integer ``step`` column (pandas refuses to
          merge an object key with an int64 key)
        - ``nfl_player_id_1``, ``nfl_player_id_2``: last two components
          (strings; the second one may be a non-numeric token such as ``'G'``)

        Declared as a ``staticmethod`` so it works both as
        ``CFG.expand_contact_id(df)`` and on an instance.

        Args:
            df: DataFrame with a string ``contact_id`` column.

        Returns:
            The same DataFrame object with the columns above added.

        Raises:
            ValueError: if the step component is not an integer literal.
        """
        # Split once and reuse, instead of re-splitting every id per column.
        parts = df['contact_id'].str.split('_')
        game = parts.str[0]
        play = parts.str[1]

        # Built from the parsed parts rather than a fixed 12-character slice,
        # so it stays consistent with the 'game' and 'play' columns.
        df['game_play'] = game + '_' + play
        df['game'] = game
        df['play'] = play
        df['step'] = parts.str[2].astype(int)
        df['nfl_player_id_1'] = parts.str[-2]
        df['nfl_player_id_2'] = parts.str[-1]
        return df

In [5]:
# Load the label split selected by CFG.default_labels. index_col=0 turns the
# file's first column into the DataFrame index instead of a data column.
# NOTE(review): CFG.default_labels currently points at the 'valid_all' split,
# so despite its name `train_labels` holds validation labels here.
train_labels = pd.read_csv(CFG.default_labels, index_col=0)
# Tracking table read from train_player_tracking.csv.
train_tracking = pd.read_csv(CFG.train_tracking_dir)
# train_helmets = pd.read_csv("/kaggle/input/nfl-player-contact-detection/train_baseline_helmets.csv")
# train_video_metadata = pd.read_csv("/kaggle/input/nfl-player-contact-detection/train_video_metadata.csv")

In [6]:
def create_features(df_label, df_tracking, cols, merge_col="step"):
    """Join both players' tracking values onto each label row and add distances.

    Every row of ``df_label`` describes a pair (``nfl_player_id_1``,
    ``nfl_player_id_2``) at a given ``game_play`` / ``merge_col``. The tracking
    columns listed in ``cols`` are looked up once per player with left joins,
    so a pair whose player has no matching tracking row keeps NaN in that
    player's columns. Columns of the first player end in ``_1``, those of the
    second in ``_2``.

    Three columns are then appended: ``distance_x`` and ``distance_y`` (absolute
    coordinate differences) and ``distance`` (their Euclidean norm).

    Args:
        df_label: label frame with ``game_play``, ``merge_col``,
            ``nfl_player_id_1`` and ``nfl_player_id_2`` columns. Not modified.
        df_tracking: tracking frame with ``game_play``, ``merge_col``,
            ``nfl_player_id`` and every column named in ``cols``.
        cols: tracking columns to attach; must include ``x_position`` and
            ``y_position`` because the distance columns are derived from them.
        merge_col: name of the time-step key shared by both frames.

    Returns:
        A new DataFrame sorted by ``game_play``, ``merge_col`` and the two
        player ids, with a fresh 0..n-1 index.
    """
    join_keys = ["game_play", merge_col]
    pair_order = join_keys + ["nfl_player_id_1", "nfl_player_id_2"]

    # Player ids are compared as strings on both sides of the join.
    pairs = df_label.astype({"nfl_player_id_1": str, "nfl_player_id_2": str})
    player_rows = df_tracking[join_keys + ["nfl_player_id"] + cols].astype(
        {"nfl_player_id": "str"}
    )

    # First player: tracking columns arrive unsuffixed at this stage.
    with_first = pairs.merge(
        player_rows,
        left_on=join_keys + ["nfl_player_id_1"],
        right_on=join_keys + ["nfl_player_id"],
        how="left",
    ).drop(columns=["nfl_player_id"])

    # Second player: the name clash with the columns added above is what
    # triggers the _1 / _2 suffixes.
    with_both = with_first.merge(
        player_rows,
        left_on=join_keys + ["nfl_player_id_2"],
        right_on=join_keys + ["nfl_player_id"],
        how="left",
        suffixes=["_1", "_2"],
    ).drop(columns=["nfl_player_id"])

    df_combo = with_both.sort_values(pair_order).reset_index(drop=True)

    gap_x = (df_combo["x_position_1"] - df_combo["x_position_2"]).abs()
    gap_y = (df_combo["y_position_1"] - df_combo["y_position_2"]).abs()
    df_combo["distance_x"] = gap_x
    df_combo["distance_y"] = gap_y
    df_combo["distance"] = np.sqrt(gap_x ** 2 + gap_y ** 2)

    return df_combo

In [7]:
# Tracking columns attached to each player of a pair by create_features.
# 'distance' here is the tracking table's own column; after the joins it shows
# up as distance_1 / distance_2.
use_cols = [
    'x_position', 'y_position', 'speed', 'distance',
    'direction', 'orientation', 'acceleration', 'sa'
]

# Columns kept in the exported feature files: the id, the target, both players'
# tracking values and the unsuffixed pairwise 'distance' computed by
# create_features (distance_x / distance_y are not kept).
feats = ['contact_id', 'contact', 'x_position_1',
       'y_position_1', 'speed_1', 'distance_1', 'direction_1', 'orientation_1',
       'acceleration_1', 'sa_1', 'x_position_2', 'y_position_2', 'speed_2',
       'distance_2', 'direction_2', 'orientation_2', 'acceleration_2', 'sa_2',
       'distance']

# Features for the split currently held in train_labels; all columns are kept
# here (no [feats] selection).
df_train = create_features(train_labels, train_tracking, use_cols)

In [8]:
# Build and write one feature CSV per label split listed in CFG.labels_dict.
# NOTE(review): the loop rebinds the notebook-level names `train_labels` and
# `df_train`; once it finishes both refer to the last split in the dict.
for label, dir_ in CFG.labels_dict.items():
    print(label, dir_)
    # No index_col is passed here (unlike the earlier default-split load); only
    # the columns named in `feats` survive the selection below.
    train_labels = pd.read_csv(dir_)
    
    df_train = create_features(train_labels, train_tracking, use_cols)[feats]
    # Relative path, so the file lands in the current working directory; the
    # DataFrame index is written as the first column (to_csv default).
    df_train.to_csv(f'{label}_feats.csv')

train /kaggle/input/nfl-split/train_labels.csv
train_all /kaggle/input/nfl-split/train_labels_all.csv
train_p2p /kaggle/input/nfl-split/train_labels_p2p.csv
train_p2g /kaggle/input/nfl-split/train_labels_p2g.csv
valid_all /kaggle/input/nfl-split/valid_labels_all.csv
valid_p2p /kaggle/input/nfl-split/valid_labels_p2p.csv
valid_p2g /kaggle/input/nfl-split/valid_labels_p2g.csv


In [9]:
# Display the frame left in `df_train` by the export loop, i.e. the features of
# the last split it processed.
df_train

Unnamed: 0,contact_id,contact,x_position_1,y_position_1,speed_1,distance_1,direction_1,orientation_1,acceleration_1,sa_1,x_position_2,y_position_2,speed_2,distance_2,direction_2,orientation_2,acceleration_2,sa_2,distance
0,58189_002566_0_38947_G,0,76.91,18.90,0.00,0.00,237.78,287.29,0.00,0.00,,,,,,,,,
1,58189_002566_0_41269_G,0,71.88,26.83,1.72,0.17,131.03,244.81,4.46,4.46,,,,,,,,,
2,58189_002566_0_41320_G,0,76.04,11.04,0.07,0.00,264.56,302.44,0.87,0.87,,,,,,,,,
3,58189_002566_0_42390_G,0,69.75,33.14,0.33,0.04,172.23,81.97,0.32,0.31,,,,,,,,,
4,58189_002566_0_42392_G,0,75.70,29.76,0.39,0.04,225.21,263.85,1.82,1.82,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26263,58564_000452_63_52600_G,0,69.99,28.92,0.46,0.04,249.27,235.60,0.34,0.24,,,,,,,,,
26264,58564_000452_63_52697_G,0,79.23,31.94,2.00,0.20,272.95,279.22,0.75,0.21,,,,,,,,,
26265,58564_000452_63_53491_G,0,71.69,28.81,0.95,0.10,258.64,238.82,0.97,0.97,,,,,,,,,
26266,58564_000452_63_53571_G,0,73.22,28.43,0.53,0.06,205.08,224.03,0.41,0.07,,,,,,,,,


In [10]:
!ls /kaggle/working

__notebook__.ipynb   train_p2g_feats.csv  valid_p2g_feats.csv
train_all_feats.csv  train_p2p_feats.csv  valid_p2p_feats.csv
train_feats.csv      valid_all_feats.csv


In [11]:
# Column list for the feature files. NOTE(review): this repeats, entry for
# entry, the `feats` list defined before the export loop — presumably so the
# test section can be run on its own; confirm, or drop the duplicate.
feats = ['contact_id', 'contact', 'x_position_1',
       'y_position_1', 'speed_1', 'distance_1', 'direction_1', 'orientation_1',
       'acceleration_1', 'sa_1', 'x_position_2', 'y_position_2', 'speed_2',
       'distance_2', 'direction_2', 'orientation_2', 'acceleration_2', 'sa_2',
       'distance']

## Test

In [12]:
# Read the sample submission and derive game_play / game / play / step and the
# two player-id columns from its contact_id.
labels = CFG.expand_contact_id(pd.read_csv(CFG.test_sub_dir))
# Tracking table read from test_player_tracking.csv.
test_tracking = pd.read_csv(CFG.test_tracking_dir)
# test_helmets = pd.read_csv("/kaggle/input/nfl-player-contact-detection/test_baseline_helmets.csv")
# test_video_metadata = pd.read_csv("/kaggle/input/nfl-player-contact-detection/test_video_metadata.csv")