In [1]:
import os
import gc
from glob import glob
import joblib
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score, matthews_corrcoef
from sklearn.metrics import confusion_matrix
from scipy.optimize import minimize
from IPython.display import display

In [2]:
!ls /kaggle/input/nfl-feats-tab/

p2g_all_feats.csv    train_feats.csv	  valid_all_feats.csv
p2p_all_feats.csv    train_p2g_feats.csv  valid_p2g_feats.csv
train_all_feats.csv  train_p2p_feats.csv  valid_p2p_feats.csv


In [3]:
!mkdir model
!mkdir preds

In [4]:
class CFG:
    name = 'yy'
    suffix = 'default_kfold_test_helmet'
    dist_thresh = 2.5
    
    input_dir = '/kaggle/input/nfl-player-contact-detection'
    split_dir = '/kaggle/input/nfl-split'
    feats_dir = '/kaggle/input/nfl-feats-tab'
    
    working_dir = '/kaggle/working'
    
    model_dir = os.path.join(working_dir, 'model')
    preds_dir = os.path.join(working_dir, 'preds')
    
    num_folds = 5
    
    xgb_params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'learning_rate':0.03,
        'tree_method':'hist'
    }
    
    
    # --------------------- FIXED --------------------- #
    train_labels_dir = os.path.join(input_dir, 'train_labels.csv')
    train_tracking_dir = os.path.join(input_dir, 'train_player_tracking.csv')
    test_sub_dir = os.path.join(input_dir, 'sample_submission.csv')
    test_tracking_dir = os.path.join(input_dir, 'test_player_tracking.csv')
    train_helmets_dir = os.path.join(input_dir, 'train_baseline_helmets.csv')
    test_helmets_dir = os.path.join(input_dir, 'test_baseline_helmets.csv')
    
    ## feats
    train_all_feats = os.path.join(feats_dir, 'train_feats.csv')
    train_feats_all = os.path.join(feats_dir, 'train_all_feats.csv')
    train_feats_p2p = os.path.join(feats_dir, 'train_feats_p2p.csv')
    train_feats_p2g = os.path.join(feats_dir, 'train_feats_p2g.csv')
    valid_feats_all = os.path.join(feats_dir, 'valid_all_feats.csv')
    valid_feats_p2p = os.path.join(feats_dir, 'valid_feats_p2p.csv')
    valid_feats_p2g = os.path.join(feats_dir, 'valid_feats_p2g.csv')
    feats_p2p = os.path.join(feats_dir, 'p2p_all_feats.csv')
    feats_p2g = os.path.join(feats_dir, 'p2g_all_feats.csv')
    
    feats_dict = {
        'train': train_all_feats,
        'train_all': train_feats_all,
        'train_p2p': train_feats_p2p,
        'train_p2g': train_feats_p2g,
        'valid_all': valid_feats_all,
        'valid_p2p': valid_feats_p2p,
        'valid_p2g': valid_feats_p2g,
        'p2p': feats_p2p,
        'p2g': feats_p2g
    }
    
    
    def expand_contact_id(df):
        df['game_play'] = df['contact_id'].str[:12]
        df['game'] = df['contact_id'].apply(lambda s: s.split('_')[0])
        df['play'] = df['contact_id'].apply(lambda s: s.split('_')[1])
        df['step'] = df['contact_id'].apply(lambda s: s.split('_')[2]).astype(int)
        df['nfl_player_id_1'] = df['contact_id'].apply(lambda s: s.split('_')[-2])
        df['nfl_player_id_2'] = df['contact_id'].apply(lambda s: s.split('_')[-1])
        return df
    
    def merge_helmet_views(helmets):
        df = helmets.drop(columns=['play_id', 'video'])
        on_cols = ['game_key', 'game_play', 'frame', 'nfl_player_id', 'player_label']
        df_view = df[df['view']=='Endzone'].merge(
            df[df['view']=='Sideline'], on=on_cols, how='outer', suffixes=['_end', '_side'])
        del df_view['view_end'], df_view['view_side'], df_view['player_label']
        return df_view
    
    def merge_label_helmet(label, helmet):
        if 'frame' not in label.columns:
            label['frame'] = np.round(label['step']/10*59.94+5*59.94).astype(int)
        helmet = helmet.copy(deep=True)
        helmet['nfl_player_id'] = helmet['nfl_player_id'].astype(str)
        df = label.merge(helmet, 
                         left_on=['game_play', 'frame', 'nfl_player_id_1'],
                         right_on=['game_play', 'frame', 'nfl_player_id'],
                         how='left')
        df = df.merge(helmet, 
                         left_on=['game_play', 'frame', 'nfl_player_id_2'],
                         right_on=['game_play', 'frame', 'nfl_player_id'],
                         how='left', suffixes=['_p1','_p2'])
        df = df.drop(columns=['nfl_player_id_p1', 'nfl_player_id_p2', 'game_key_p1', 'game_key_p2'])
        return df
    
    
    def get_groupkfold(train, n_splits):
        kf = GroupKFold(n_splits=n_splits)
        generator = kf.split(train, train['contact'], train['contact_id'].str[:5], )
        fold_series = []
        for fold, (idx_train, idx_valid) in enumerate(generator):
            fold_series.append(pd.Series(fold, index=idx_valid))
        fold_series = pd.concat(fold_series).sort_index()
        return fold_series
    
    
    def fit_xgboost(X_train, y_train, X_valid, y_valid, xgb_params):
        xgb_train = xgb.DMatrix(X_train, y_train)
        xgb_valid = xgb.DMatrix(X_valid, y_valid)
        display(pd.Series(y_valid).value_counts())
        evals = [(xgb_train, 'train'), (xgb_valid, 'valid')]

        model = xgb.train(
            xgb_params, 
            xgb_train,
            num_boost_round=10_000,
            early_stopping_rounds=100,
            evals=evals,
            verbose_eval=100,
        )
        return model
    
    def fit_xgboost_kfold(X, y, xgb_params, folds, name='', suffix=''):
        oof_pred = np.zeros(len(y), dtype=np.float32)
        for fold in sorted(CFG.folds.unique()):
            if fold == -1: continue
            idx_train = (CFG.folds!=fold)
            idx_valid = (CFG.folds==fold)
            x_train, y_train = X[idx_train], y[idx_train]
            x_valid, y_valid = X[idx_valid], y[idx_valid]
            print(f"Fold -- {fold} -- ")
            model = CFG.fit_xgboost(x_train, y_train, x_valid, y_valid, xgb_params)
            model_path = os.path.join(CFG.model_dir, f'xgb_fold_{fold}_{name}_{suffix}.joblib')
            joblib.dump(model, model_path)
            pred_i = model.predict(xgb.DMatrix(x_valid, y_valid))

            oof_pred[x_valid.index] = pred_i
            score = round(roc_auc_score(y_valid, pred_i), 5)
            print(f'Performance of the prediction: {score}\n')
            del model; gc.collect()

        np.save(os.path.join(CFG.preds_dir, f'oof_pred{name}{suffix}'), oof_pred)
        score = round(roc_auc_score(y, oof_pred), 5)
        print(f'All Performance of the prediction: {score}')
        return oof_pred
    

In [5]:
CFG.default_feats = CFG.feats_dict['train']
CFG.folds = CFG.get_groupkfold(pd.read_csv(CFG.default_feats, index_col=0), CFG.num_folds)

CFG.p2p_feats = CFG.feats_dict['p2p']
CFG.p2g_feats = CFG.feats_dict['p2g']
CFG.p2p_folds = CFG.get_groupkfold(pd.read_csv(CFG.p2p_feats, index_col=0), CFG.num_folds)
CFG.p2g_folds = CFG.get_groupkfold(pd.read_csv(CFG.p2g_feats, index_col=0), CFG.num_folds)

### train all

In [6]:
feats = ['contact_id', 'contact', 'G_flag', 'x_position_1',
       'y_position_1', 'speed_1', 'distance_1', 'direction_1', 'orientation_1',
       'acceleration_1', 'sa_1', 'x_position_2', 'y_position_2', 'speed_2',
       'distance_2', 'direction_2', 'orientation_2', 'acceleration_2', 'sa_2',
       'distance', ] + ['left_side_p1', 'width_side_p1', 'top_side_p1', 'height_side_p1',
       'left_end_p2', 'width_end_p2', 'top_end_p2', 'height_end_p2',
       'left_side_p2', 'width_side_p2', 'top_side_p2', 'height_side_p2']

In [7]:
df = pd.read_csv(CFG.default_feats, index_col=0)[feats]
df_ = df[(df['distance']<CFG.dist_thresh) | df['distance'].isna()].reset_index(drop=True)
X = df_.drop(columns=['contact_id', 'contact'])
y = df_['contact']
CFG.folds = CFG.get_groupkfold(df_, CFG.num_folds)

In [8]:
df_['contact'].value_counts()

0    700880
1     64427
Name: contact, dtype: int64

### p2p & p2g

In [9]:
# df_p2p = pd.read_csv(CFG.p2p_feats, index_col=0)
# df_p2g = pd.read_csv(CFG.p2g_feats, index_col=0)
# X_p2p = df_p2p.drop(columns=['contact_id', 'contact'])
# y_p2p = df_p2p['contact']
# X_p2g = df_p2g.drop(columns=['contact_id', 'contact'])
# y_p2g = df_p2g['contact']

In [10]:
# CFG.p2p_folds.value_counts(), CFG.p2g_folds.value_counts()

In [11]:
# %%time
# p2g_oof_pred = CFG.fit_xgboost_kfold(X_p2g, y_p2g, CFG.xgb_params, CFG.p2g_folds, CFG.name, 'p2g')

In [12]:
# %%time
# p2p_oof_pred = CFG.fit_xgboost_kfold(X_p2p, y_p2p, CFG.xgb_params, CFG.p2p_folds, CFG.name, 'p2p')

In [13]:
%%time
oof_pred = CFG.fit_xgboost_kfold(X, y, CFG.xgb_params, CFG.folds, CFG.name, 'all')

Fold -- 0 -- 


0    139185
1     14139
Name: contact, dtype: int64

[0]	train-auc:0.91065	valid-auc:0.90160
[100]	train-auc:0.93910	valid-auc:0.92579
[200]	train-auc:0.94926	valid-auc:0.93160
[300]	train-auc:0.95611	valid-auc:0.93326
[400]	train-auc:0.96064	valid-auc:0.93405
[500]	train-auc:0.96470	valid-auc:0.93434
[600]	train-auc:0.96744	valid-auc:0.93444
[700]	train-auc:0.96998	valid-auc:0.93452
[800]	train-auc:0.97243	valid-auc:0.93466
[900]	train-auc:0.97428	valid-auc:0.93470
[957]	train-auc:0.97552	valid-auc:0.93469
Performance of the prediction: 0.93469

Fold -- 1 -- 


0    139647
1     12571
Name: contact, dtype: int64

[0]	train-auc:0.91044	valid-auc:0.91303
[100]	train-auc:0.93712	valid-auc:0.93505
[200]	train-auc:0.94683	valid-auc:0.94097
[300]	train-auc:0.95343	valid-auc:0.94362
[400]	train-auc:0.95815	valid-auc:0.94495
[500]	train-auc:0.96255	valid-auc:0.94620
[600]	train-auc:0.96557	valid-auc:0.94670
[700]	train-auc:0.96845	valid-auc:0.94702
[800]	train-auc:0.97090	valid-auc:0.94726
[900]	train-auc:0.97332	valid-auc:0.94748
[1000]	train-auc:0.97495	valid-auc:0.94728
[1006]	train-auc:0.97507	valid-auc:0.94729
Performance of the prediction: 0.94729

Fold -- 2 -- 


0    141269
1     11922
Name: contact, dtype: int64

[0]	train-auc:0.90916	valid-auc:0.91855
[100]	train-auc:0.93687	valid-auc:0.93953
[200]	train-auc:0.94718	valid-auc:0.94404
[300]	train-auc:0.95402	valid-auc:0.94611
[400]	train-auc:0.95882	valid-auc:0.94711
[500]	train-auc:0.96281	valid-auc:0.94757
[600]	train-auc:0.96569	valid-auc:0.94808
[700]	train-auc:0.96814	valid-auc:0.94824
[800]	train-auc:0.97043	valid-auc:0.94825
[877]	train-auc:0.97192	valid-auc:0.94824
Performance of the prediction: 0.94824

Fold -- 3 -- 


0    140833
1     12388
Name: contact, dtype: int64

[0]	train-auc:0.91119	valid-auc:0.90741
[100]	train-auc:0.93850	valid-auc:0.93081
[200]	train-auc:0.94856	valid-auc:0.93643
[300]	train-auc:0.95544	valid-auc:0.93808
[400]	train-auc:0.96010	valid-auc:0.93917
[500]	train-auc:0.96377	valid-auc:0.93962
[600]	train-auc:0.96642	valid-auc:0.93996
[700]	train-auc:0.96908	valid-auc:0.93999
[800]	train-auc:0.97121	valid-auc:0.94010
[900]	train-auc:0.97344	valid-auc:0.94017
[1000]	train-auc:0.97529	valid-auc:0.94021
[1061]	train-auc:0.97627	valid-auc:0.94015
Performance of the prediction: 0.94015

Fold -- 4 -- 


0    139946
1     13407
Name: contact, dtype: int64

[0]	train-auc:0.91095	valid-auc:0.89523
[100]	train-auc:0.93971	valid-auc:0.92247
[200]	train-auc:0.94969	valid-auc:0.93001
[300]	train-auc:0.95660	valid-auc:0.93337
[400]	train-auc:0.96118	valid-auc:0.93481
[500]	train-auc:0.96468	valid-auc:0.93579
[600]	train-auc:0.96800	valid-auc:0.93603
[700]	train-auc:0.97041	valid-auc:0.93637
[800]	train-auc:0.97244	valid-auc:0.93640
[900]	train-auc:0.97443	valid-auc:0.93640
[938]	train-auc:0.97512	valid-auc:0.93637
Performance of the prediction: 0.93637

All Performance of the prediction: 0.94109
CPU times: user 43min 28s, sys: 4.28 s, total: 43min 32s
Wall time: 11min 11s


In [14]:
!ls model/

xgb_fold_0_yy_all.joblib  xgb_fold_2_yy_all.joblib  xgb_fold_4_yy_all.joblib
xgb_fold_1_yy_all.joblib  xgb_fold_3_yy_all.joblib


## Optimize

### train all

In [15]:
def func(x_list):
    score = matthews_corrcoef(df_['contact'], oof_pred>x_list[0])
    return -score

x0 = [0.5]
result = minimize(func, x0,  method="nelder-mead")
CFG.threshold = result.x[0]
print("score:", round(matthews_corrcoef(df_['contact'], oof_pred>CFG.threshold), 5))
print("threshold", round(CFG.threshold, 5))

score: 0.58078
threshold 0.31113


In [16]:
confusion_matrix(df_['contact'], (oof_pred>CFG.threshold).astype(int))

array([[672437,  28443],
       [ 22997,  41430]])

## p2p & p2g

In [17]:
# def func_p2g(x_list):
#     return -matthews_corrcoef(df_p2g['contact'], p2g_oof_pred>x_list[0])

# def func_p2p(x_list):
#     return -matthews_corrcoef(df_p2p['contact'], p2p_oof_pred>x_list[0])

# x_p2g = [0.5]
# res_p2g = minimize(func_p2g, x_p2g,  method="nelder-mead")
# CFG.p2g_threshold = res_p2g.x[0]
# print("p2g score:", round(matthews_corrcoef(df_p2g['contact'], p2g_oof_pred>CFG.p2g_threshold), 5))
# print("p2g threshold:", round(CFG.p2g_threshold, 5))

# x_p2p = [0.5]
# res_p2p = minimize(func_p2p, x_p2p,  method="nelder-mead")
# CFG.p2p_threshold = res_p2p.x[0]
# print("p2p score:", round(matthews_corrcoef(df_p2p['contact'], p2p_oof_pred>CFG.p2p_threshold), 5))
# print("p2p threshold:", round(CFG.p2p_threshold, 5))

## Pred

In [18]:
def pred_xgboost(X, data_dir, add_suffix=''):
    models = glob(os.path.join(data_dir, f'xgb_fold*{add_suffix}.joblib'))
    print(models)
    models = [joblib.load(model) for model in models]
    X_Dmatrix = xgb.DMatrix(X)
    preds = np.array([model.predict(X_Dmatrix) for model in models])
    preds = np.mean(preds, axis=0)
    return preds

In [19]:
def create_features(df_label, df_tracking, cols, merge_col="step"):
    df_combo = (
        df_label.astype({"nfl_player_id_1": str, "nfl_player_id_2": str})
        .merge(
            df_tracking[["game_play", merge_col, "nfl_player_id"] + cols]
                .astype({"nfl_player_id": "str"}),
            left_on=["game_play", merge_col, "nfl_player_id_1"],
            right_on=["game_play", merge_col, "nfl_player_id"],
            how="left",)
        .drop(columns=["nfl_player_id"])
        .merge(
            df_tracking[["game_play", merge_col, "nfl_player_id"] + cols]
                .astype({"nfl_player_id": "str"}),
            left_on=["game_play", merge_col, "nfl_player_id_2"],
            right_on=["game_play", merge_col, "nfl_player_id"],
            how="left",
            suffixes=['_1', '_2']
        )
        .drop(columns=["nfl_player_id"])
        .sort_values(["game_play", merge_col, "nfl_player_id_1", "nfl_player_id_2"])
        .reset_index(drop=True)
    )
    df_combo['G_flag'] = (df_combo['nfl_player_id_2']=='G').astype(int)
    
    df_combo['distance_x'] = np.abs(df_combo['x_position_1']-df_combo['x_position_2'])
    df_combo['distance_y'] = np.abs(df_combo['y_position_1']-df_combo['y_position_2'])
    dist_pow = df_combo['distance_x']**2 + df_combo['distance_y']**2
    df_combo['distance'] = np.sqrt(dist_pow)
        
    return df_combo

In [20]:
test_labels = CFG.expand_contact_id(pd.read_csv(CFG.test_sub_dir))
test_tracking = pd.read_csv(CFG.test_tracking_dir)
# test_helmets = pd.read_csv("/kaggle/input/nfl-player-contact-detection/test_baseline_helmets.csv")
# test_video_metadata = pd.read_csv("/kaggle/input/nfl-player-contact-detection/test_video_metadata.csv")
use_cols = [
    'x_position', 'y_position', 'speed', 'distance',
    'direction', 'orientation', 'acceleration', 'sa'
]

df_test = create_features(test_labels, test_tracking, use_cols)

In [21]:
df_test_pred = df_test[(df_test['distance']<CFG.dist_thresh) | df_test['distance'].isna()]

In [22]:
df_test.columns

Index(['contact_id', 'contact', 'game_play', 'game', 'play', 'step',
       'nfl_player_id_1', 'nfl_player_id_2', 'x_position_1', 'y_position_1',
       'speed_1', 'distance_1', 'direction_1', 'orientation_1',
       'acceleration_1', 'sa_1', 'x_position_2', 'y_position_2', 'speed_2',
       'distance_2', 'direction_2', 'orientation_2', 'acceleration_2', 'sa_2',
       'G_flag', 'distance_x', 'distance_y', 'distance'],
      dtype='object')

In [23]:
df_test_pred['frame'] = np.round(df_test_pred['step']/10*59.94+5*59.94).astype(int)
test_helmets = pd.read_csv(CFG.test_helmets_dir)
# merge two helmet views of the same player
df_test_helmets = CFG.merge_helmet_views(test_helmets)
# merge helmet views of two players
df_feats = CFG.merge_label_helmet(df_test_pred, df_test_helmets)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [24]:
df_feats[feats].columns

Index(['contact_id', 'contact', 'G_flag', 'x_position_1', 'y_position_1',
       'speed_1', 'distance_1', 'direction_1', 'orientation_1',
       'acceleration_1', 'sa_1', 'x_position_2', 'y_position_2', 'speed_2',
       'distance_2', 'direction_2', 'orientation_2', 'acceleration_2', 'sa_2',
       'distance', 'left_side_p1', 'width_side_p1', 'top_side_p1',
       'height_side_p1', 'left_end_p2', 'width_end_p2', 'top_end_p2',
       'height_end_p2', 'left_side_p2', 'width_side_p2', 'top_side_p2',
       'height_side_p2'],
      dtype='object')

In [25]:
sub_pred = pred_xgboost(
    df_feats[feats].drop(columns=['contact_id', 'contact', ]), 
    CFG.model_dir, f'{CFG.name}_all')
df_test.loc[df_feats.index, 'contact'] = (sub_pred>CFG.threshold).astype(int)
# df_test[['contact_id', 'contact']].to_csv('submission.csv', index=False)
df_test['contact'].value_counts()


['/kaggle/working/model/xgb_fold_2_yy_all.joblib', '/kaggle/working/model/xgb_fold_1_yy_all.joblib', '/kaggle/working/model/xgb_fold_3_yy_all.joblib', '/kaggle/working/model/xgb_fold_4_yy_all.joblib', '/kaggle/working/model/xgb_fold_0_yy_all.joblib']


0    48701
1      887
Name: contact, dtype: int64

In [26]:
df_test[['contact_id', 'contact']].to_csv('submission.csv', index=False)
df_test[['contact_id', 'contact']]

Unnamed: 0,contact_id,contact
0,58168_003392_0_37084_37211,0
1,58168_003392_0_37084_38556,0
2,58168_003392_0_37084_38567,0
3,58168_003392_0_37084_38590,0
4,58168_003392_0_37084_39947,0
...,...,...
49583,58172_003247_125_52521_52939,0
49584,58172_003247_125_52521_G,0
49585,58172_003247_125_52852_52939,0
49586,58172_003247_125_52852_G,0


In [27]:
# sub_pred = pred_xgboost(df_test.drop(columns=['contact_id', 'contact']), 
#                         CFG.model_dir, f'{CFG.name}_{CFG.suffix}')
# df_test['contact'] = (sub_pred > CFG.threshold).astype(int)
# df_test = CFG.expand_contact_id(df_test)
# df_test[['contact_id', 'contact']].to_csv('submission.csv', index=False)
# display(df_test[['contact_id', 'contact']].head())

In [28]:
!ls model/

xgb_fold_0_yy_all.joblib  xgb_fold_2_yy_all.joblib  xgb_fold_4_yy_all.joblib
xgb_fold_1_yy_all.joblib  xgb_fold_3_yy_all.joblib


In [29]:
# df_test_p2g = df_test[df_test['G_flag']==1]
# df_test_p2p = df_test[df_test['G_flag']==0]

In [30]:
# sub_pred_p2g = pred_xgboost(df_test_p2g.drop(columns=['contact_id', 'contact']),
#                            CFG.model_dir, f'{CFG.name}_p2g')
# sub_pred_p2p = pred_xgboost(df_test_p2p.drop(columns=['contact_id', 'contact']),
#                            CFG.model_dir, f'{CFG.name}_p2p')
# df_test_p2g['contact'] = (sub_pred_p2g > CFG.p2p_threshold).astype(int)
# df_test_p2p['contact'] = (sub_pred_p2p > CFG.p2g_threshold).astype(int)

In [31]:
# df_test_ = pd.concat([df_test_p2g, df_test_p2p]).sort_index()
# df_test_[['contact_id', 'contact']].to_csv('submission.csv', index=False)
# df_test_

In [32]:
# df_test_['contact'].value_counts()

In [33]:
# df_test_p2p['contact'].value_counts()

In [34]:
# df_test_p2g['contact'].value_counts()