In [1]:
import os
import gc
from glob import glob
import joblib
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score, matthews_corrcoef
from scipy.optimize import minimize
from IPython.display import display

In [2]:
!ls /kaggle/input/nfl-feats-tab/

train_all_feats.csv  train_p2p_feats.csv  valid_p2p_feats.csv
train_feats.csv      valid_all_feats.csv
train_p2g_feats.csv  valid_p2g_feats.csv


In [3]:
!mkdir model
!mkdir preds

In [4]:
class CFG:
    name = 'yy'
    suffix = 'default_kfold_test'
    
    input_dir = '/kaggle/input/nfl-player-contact-detection'
    split_dir = '/kaggle/input/nfl-split'
    feats_dir = '/kaggle/input/nfl-feats-tab'
    
    working_dir = '/kaggle/working'
    
    model_dir = os.path.join(working_dir, 'model')
    preds_dir = os.path.join(working_dir, 'preds')
    
    num_folds = 5
    
    xgb_params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'learning_rate':0.03,
        'tree_method':'hist'
    }
    
    
    # --------------------- FIXED --------------------- #
    train_labels_dir = os.path.join(input_dir, 'train_labels.csv')
    train_tracking_dir = os.path.join(input_dir, 'train_player_tracking.csv')
    test_sub_dir = os.path.join(input_dir, 'sample_submission.csv')
    test_tracking_dir = os.path.join(input_dir, 'test_player_tracking.csv')
    
    ## feats
    train_all_feats = os.path.join(feats_dir, 'train_feats.csv')
    train_feats_all = os.path.join(feats_dir, 'train_all_feats.csv')
    train_feats_p2p = os.path.join(feats_dir, 'train_feats_p2p.csv')
    train_feats_p2g = os.path.join(feats_dir, 'train_feats_p2g.csv')
    valid_feats_all = os.path.join(feats_dir, 'valid_all_feats.csv')
    valid_feats_p2p = os.path.join(feats_dir, 'valid_feats_p2p.csv')
    valid_feats_p2g = os.path.join(feats_dir, 'valid_feats_p2g.csv')
    
    feats_dict = {
        'train': train_all_feats,
        'train_all': train_feats_all,
        'train_p2p': train_feats_p2p,
        'train_p2g': train_feats_p2g,
        'valid_all': valid_feats_all,
        'valid_p2p': valid_feats_p2p,
        'valid_p2g': valid_feats_p2g
    }
    
    
    # --------------------- SPECIFY ----------------------- #
    default_feats = feats_dict['train']
    
    def expand_contact_id(df):
        df['game_play'] = df['contact_id'].str[:12]
        df['game'] = df['contact_id'].apply(lambda s: s.split('_')[0])
        df['play'] = df['contact_id'].apply(lambda s: s.split('_')[1])
        df['step'] = df['contact_id'].apply(lambda s: s.split('_')[2]).astype(int)
        df['nfl_player_id_1'] = df['contact_id'].apply(lambda s: s.split('_')[-2])
        df['nfl_player_id_2'] = df['contact_id'].apply(lambda s: s.split('_')[-1])
        return df
    
    def get_groupkfold(train, n_splits):
        kf = GroupKFold(n_splits=n_splits)
        generator = kf.split(train, train['contact'], train['contact_id'].str[:5], )
        fold_series = []
        for fold, (idx_train, idx_valid) in enumerate(generator):
            fold_series.append(pd.Series(fold, index=idx_valid))
        fold_series = pd.concat(fold_series).sort_index()
        return fold_series
    
    
    def fit_xgboost(X_train, y_train, X_valid, y_valid, xgb_params):
        xgb_train = xgb.DMatrix(X_train, y_train)
        xgb_valid = xgb.DMatrix(X_valid, y_valid)
        display(pd.Series(y_valid).value_counts())
        evals = [(xgb_train, 'train'), (xgb_valid, 'valid')]

        model = xgb.train(
            xgb_params, 
            xgb_train,
            num_boost_round=10_000,
            early_stopping_rounds=100,
            evals=evals,
            verbose_eval=100,
        )
        return model
    
    def fit_xgboost_kfold(X, y, xgb_params, name='', suffix=''):
        oof_pred = np.zeros(len(y), dtype=np.float32)
        for fold in sorted(CFG.folds.unique()):
            if fold == -1: continue
            idx_train = (CFG.folds!=fold)
            idx_valid = (CFG.folds==fold)
            x_train, y_train = X[idx_train], y[idx_train]
            x_valid, y_valid = X[idx_valid], y[idx_valid]
            print(f"Fold -- {fold} -- ")
            model = CFG.fit_xgboost(x_train, y_train, x_valid, y_valid, xgb_params)
            model_path = os.path.join(CFG.model_dir, f'xgb_fold_{fold}_{name}_{suffix}.joblib')
            joblib.dump(model, model_path)
            pred_i = model.predict(xgb.DMatrix(x_valid, y_valid))

            oof_pred[x_valid.index] = pred_i
            score = round(roc_auc_score(y_valid, pred_i), 5)
            print(f'Performance of the prediction: {score}\n')
            del model; gc.collect()

        np.save(os.path.join(CFG.preds_dir, f'oof_pred{name}{suffix}'), oof_pred)
        score = round(roc_auc_score(y, oof_pred), 5)
        print(f'All Performance of the prediction: {score}')
        return oof_pred
    
    folds = get_groupkfold(pd.read_csv(default_feats, index_col=0), num_folds)

In [5]:
CFG.default_feats = CFG.feats_dict['train']

In [6]:
df = pd.read_csv(CFG.default_feats, index_col=0)
X = df.drop(columns=['contact_id', 'contact'])
y = df['contact']

In [7]:
CFG.folds.value_counts()

3    948750
0    945714
2    945593
1    945461
4    936100
dtype: int64

In [8]:
def fit_xgboost_kfold(X, y, xgb_params, name='', suffix=''):
    oof_pred = np.zeros(len(y), dtype=np.float32)
    for fold in sorted(CFG.folds.unique()):
        if fold == -1: continue
        idx_train = (CFG.folds!=fold)
        idx_valid = (CFG.folds==fold)
        x_train, y_train = X[idx_train], y[idx_train]
        x_valid, y_valid = X[idx_valid], y[idx_valid]
        print(f"Fold -- {fold} -- ")
        model = CFG.fit_xgboost(x_train, y_train, x_valid, y_valid, xgb_params)
        model_path = os.path.join(CFG.model_dir, f'xgb_fold_{fold}_{name}_{suffix}.joblib')
        joblib.dump(model, model_path)
        pred_i = model.predict(xgb.DMatrix(x_valid, y_valid))
        
        oof_pred[x_valid.index] = pred_i
        score = round(roc_auc_score(y_valid, pred_i), 5)
        print(f'Performance of the prediction: {score}\n')
        del model; gc.collect()

    np.save(os.path.join(CFG.preds_dir, f'oof_pred{name}{suffix}'), oof_pred)
    score = round(roc_auc_score(y, oof_pred), 5)
    print(f'All Performance of the prediction: {score}')
    return oof_pred

In [9]:
%%time
oof_pred = CFG.fit_xgboost_kfold(X, y, CFG.xgb_params, CFG.name, CFG.suffix)

Fold -- 0 -- 


0    933661
1     12053
Name: contact, dtype: int64

[0]	train-auc:0.98333	valid-auc:0.98611
[100]	train-auc:0.98821	valid-auc:0.98979
[200]	train-auc:0.98976	valid-auc:0.99028
[300]	train-auc:0.99060	valid-auc:0.99061
[400]	train-auc:0.99136	valid-auc:0.99095
[500]	train-auc:0.99225	valid-auc:0.99118
[600]	train-auc:0.99285	valid-auc:0.99121
[657]	train-auc:0.99315	valid-auc:0.99118
Performance of the prediction: 0.99118

Fold -- 1 -- 


0    931485
1     13976
Name: contact, dtype: int64

[0]	train-auc:0.98288	valid-auc:0.98163
[100]	train-auc:0.98871	valid-auc:0.98729
[200]	train-auc:0.99035	valid-auc:0.98788
[300]	train-auc:0.99110	valid-auc:0.98809
[400]	train-auc:0.99174	valid-auc:0.98828
[500]	train-auc:0.99256	valid-auc:0.98851
[600]	train-auc:0.99322	valid-auc:0.98863
[700]	train-auc:0.99365	valid-auc:0.98866
[800]	train-auc:0.99407	valid-auc:0.98870
[900]	train-auc:0.99441	valid-auc:0.98869
[932]	train-auc:0.99450	valid-auc:0.98868
Performance of the prediction: 0.98869

Fold -- 2 -- 


0    934228
1     11365
Name: contact, dtype: int64

[0]	train-auc:0.98228	valid-auc:0.98587
[100]	train-auc:0.98821	valid-auc:0.99004
[200]	train-auc:0.98974	valid-auc:0.99063
[300]	train-auc:0.99060	valid-auc:0.99086
[400]	train-auc:0.99128	valid-auc:0.99110
[500]	train-auc:0.99216	valid-auc:0.99131
[600]	train-auc:0.99273	valid-auc:0.99140
[700]	train-auc:0.99327	valid-auc:0.99150
[800]	train-auc:0.99367	valid-auc:0.99148
[844]	train-auc:0.99383	valid-auc:0.99148
Performance of the prediction: 0.99148

Fold -- 3 -- 


0    932929
1     15821
Name: contact, dtype: int64

[0]	train-auc:0.98502	valid-auc:0.97934
[100]	train-auc:0.98942	valid-auc:0.98465
[200]	train-auc:0.99083	valid-auc:0.98527
[300]	train-auc:0.99173	valid-auc:0.98569
[400]	train-auc:0.99234	valid-auc:0.98583
[500]	train-auc:0.99314	valid-auc:0.98596
[600]	train-auc:0.99370	valid-auc:0.98599
[700]	train-auc:0.99414	valid-auc:0.98605
[800]	train-auc:0.99450	valid-auc:0.98590
[824]	train-auc:0.99456	valid-auc:0.98589
Performance of the prediction: 0.98589

Fold -- 4 -- 


0    924793
1     11307
Name: contact, dtype: int64

[0]	train-auc:0.98338	valid-auc:0.98035
[100]	train-auc:0.98890	valid-auc:0.98668
[200]	train-auc:0.99005	valid-auc:0.98744
[300]	train-auc:0.99077	valid-auc:0.98806
[400]	train-auc:0.99150	valid-auc:0.98827
[500]	train-auc:0.99239	valid-auc:0.98816
[563]	train-auc:0.99282	valid-auc:0.98783
Performance of the prediction: 0.98783

All Performance of the prediction: 0.98901
CPU times: user 2h 44min 11s, sys: 14.3 s, total: 2h 44min 26s
Wall time: 45min 57s


In [10]:
!ls model/

xgb_fold_0_yy_default_kfold_test.joblib
xgb_fold_1_yy_default_kfold_test.joblib
xgb_fold_2_yy_default_kfold_test.joblib
xgb_fold_3_yy_default_kfold_test.joblib
xgb_fold_4_yy_default_kfold_test.joblib


## Optimize

In [11]:
def func(x_list):
    score = matthews_corrcoef(df['contact'], oof_pred>x_list[0])
    return -score

x0 = [0.5]
result = minimize(func, x0,  method="nelder-mead")
CFG.threshold = result.x[0]
print("score:", round(matthews_corrcoef(df['contact'], oof_pred>CFG.threshold), 5))
print("threshold", round(CFG.threshold, 5))

score: 0.58343
threshold 0.29844


In [12]:
CFG.threshold

0.2984374999999998

## Pred

In [13]:
def pred_xgboost(X, data_dir, add_suffix=''):
    models = glob(os.path.join(data_dir, f'xgb_fold*{add_suffix}.joblib'))
    print(models)
    models = [joblib.load(model) for model in models]
    X_Dmatrix = xgb.DMatrix(X)
    preds = np.array([model.predict(X_Dmatrix) for model in models])
    preds = np.mean(preds, axis=0)
    return preds

In [14]:
def create_features(df_label, df_tracking, cols, merge_col="step"):
    df_combo = (
        df_label.astype({"nfl_player_id_1": str, "nfl_player_id_2": str})
        .merge(
            df_tracking[["game_play", merge_col, "nfl_player_id"] + cols]
                .astype({"nfl_player_id": "str"}),
            left_on=["game_play", merge_col, "nfl_player_id_1"],
            right_on=["game_play", merge_col, "nfl_player_id"],
            how="left",)
        .drop(columns=["nfl_player_id"])
        .merge(
            df_tracking[["game_play", merge_col, "nfl_player_id"] + cols]
                .astype({"nfl_player_id": "str"}),
            left_on=["game_play", merge_col, "nfl_player_id_2"],
            right_on=["game_play", merge_col, "nfl_player_id"],
            how="left",
            suffixes=['_1', '_2']
        )
        .drop(columns=["nfl_player_id"])
        .sort_values(["game_play", merge_col, "nfl_player_id_1", "nfl_player_id_2"])
        .reset_index(drop=True)
    )
    df_combo['G_flag'] = (df_combo['nfl_player_id_2']=='G').astype(int)
    
    df_combo['distance_x'] = np.abs(df_combo['x_position_1']-df_combo['x_position_2'])
    df_combo['distance_y'] = np.abs(df_combo['y_position_1']-df_combo['y_position_2'])
    dist_pow = df_combo['distance_x']**2 + df_combo['distance_y']**2
    df_combo['distance'] = np.sqrt(dist_pow)
        
    return df_combo

In [15]:
test_labels = CFG.expand_contact_id(pd.read_csv(CFG.test_sub_dir))
test_tracking = pd.read_csv(CFG.test_tracking_dir)
# test_helmets = pd.read_csv("/kaggle/input/nfl-player-contact-detection/test_baseline_helmets.csv")
# test_video_metadata = pd.read_csv("/kaggle/input/nfl-player-contact-detection/test_video_metadata.csv")
use_cols = [
    'x_position', 'y_position', 'speed', 'distance',
    'direction', 'orientation', 'acceleration', 'sa'
]

feats = ['contact_id', 'contact', 'G_flag', 'x_position_1',
       'y_position_1', 'speed_1', 'distance_1', 'direction_1', 'orientation_1',
       'acceleration_1', 'sa_1', 'x_position_2', 'y_position_2', 'speed_2',
       'distance_2', 'direction_2', 'orientation_2', 'acceleration_2', 'sa_2',
       'distance', ]

df_test = create_features(test_labels, test_tracking, use_cols)[feats]

In [16]:
df_test

Unnamed: 0,contact_id,contact,G_flag,x_position_1,y_position_1,speed_1,distance_1,direction_1,orientation_1,acceleration_1,sa_1,x_position_2,y_position_2,speed_2,distance_2,direction_2,orientation_2,acceleration_2,sa_2,distance
0,58168_003392_0_37084_37211,0,0,41.90,20.08,0.54,0.06,252.69,262.31,0.92,0.90,39.59,17.07,0.53,0.05,134.84,84.73,1.43,1.42,3.794232
1,58168_003392_0_37084_38556,0,0,41.90,20.08,0.54,0.06,252.69,262.31,0.92,0.90,41.93,30.61,0.67,0.05,232.50,227.00,1.82,1.61,10.530043
2,58168_003392_0_37084_38567,0,0,41.90,20.08,0.54,0.06,252.69,262.31,0.92,0.90,40.37,19.88,0.66,0.07,136.70,88.92,0.90,0.89,1.543017
3,58168_003392_0_37084_38590,0,0,41.90,20.08,0.54,0.06,252.69,262.31,0.92,0.90,40.33,25.28,0.52,0.06,141.08,100.37,0.59,0.58,5.431841
4,58168_003392_0_37084_39947,0,0,41.90,20.08,0.54,0.06,252.69,262.31,0.92,0.90,40.11,26.73,0.99,0.09,163.38,90.69,1.68,1.64,6.886697
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49583,58172_003247_125_52521_52939,0,0,23.44,4.04,1.41,0.15,163.22,185.42,0.75,-0.58,37.94,2.10,2.29,0.24,211.73,215.11,2.43,-2.43,14.629204
49584,58172_003247_125_52521_G,0,1,23.44,4.04,1.41,0.15,163.22,185.42,0.75,-0.58,,,,,,,,,
49585,58172_003247_125_52852_52939,0,0,32.67,2.18,2.34,0.24,113.19,119.09,1.03,-0.97,37.94,2.10,2.29,0.24,211.73,215.11,2.43,-2.43,5.270607
49586,58172_003247_125_52852_G,0,1,32.67,2.18,2.34,0.24,113.19,119.09,1.03,-0.97,,,,,,,,,


In [17]:
sub_pred = pred_xgboost(df_test.drop(columns=['contact_id', 'contact']), 
                        CFG.model_dir, f'{CFG.name}_{CFG.suffix}')
df_test['contact'] = (sub_pred > CFG.threshold).astype(int)
df_test = CFG.expand_contact_id(df_test)
df_test[['contact_id', 'contact']].to_csv('submission.csv', index=False)
display(df_test[['contact_id', 'contact']].head())

['/kaggle/working/model/xgb_fold_3_yy_default_kfold_test.joblib', '/kaggle/working/model/xgb_fold_4_yy_default_kfold_test.joblib', '/kaggle/working/model/xgb_fold_1_yy_default_kfold_test.joblib', '/kaggle/working/model/xgb_fold_0_yy_default_kfold_test.joblib', '/kaggle/working/model/xgb_fold_2_yy_default_kfold_test.joblib']


Unnamed: 0,contact_id,contact
0,58168_003392_0_37084_37211,0
1,58168_003392_0_37084_38556,0
2,58168_003392_0_37084_38567,0
3,58168_003392_0_37084_38590,0
4,58168_003392_0_37084_39947,0


In [18]:
df_test['contact'].value_counts()

0    48718
1      870
Name: contact, dtype: int64