In [1]:
import os
import gc
from glob import glob
import joblib
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score, matthews_corrcoef
from scipy.optimize import minimize
from IPython.display import display

In [2]:
!ls /kaggle/input/nfl-feats-tab/

p2g_all_feats.csv    train_feats.csv	  valid_all_feats.csv
p2p_all_feats.csv    train_p2g_feats.csv  valid_p2g_feats.csv
train_all_feats.csv  train_p2p_feats.csv  valid_p2p_feats.csv


In [3]:
!mkdir model
!mkdir preds

In [4]:
class CFG:
    name = 'yy'
    suffix = 'default_kfold_test'
    
    input_dir = '/kaggle/input/nfl-player-contact-detection'
    split_dir = '/kaggle/input/nfl-split'
    feats_dir = '/kaggle/input/nfl-feats-tab'
    
    working_dir = '/kaggle/working'
    
    model_dir = os.path.join(working_dir, 'model')
    preds_dir = os.path.join(working_dir, 'preds')
    
    num_folds = 5
    
    xgb_params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'learning_rate':0.03,
        'tree_method':'hist'
    }
    
    
    # --------------------- FIXED --------------------- #
    train_labels_dir = os.path.join(input_dir, 'train_labels.csv')
    train_tracking_dir = os.path.join(input_dir, 'train_player_tracking.csv')
    test_sub_dir = os.path.join(input_dir, 'sample_submission.csv')
    test_tracking_dir = os.path.join(input_dir, 'test_player_tracking.csv')
    
    ## feats
    train_all_feats = os.path.join(feats_dir, 'train_feats.csv')
    train_feats_all = os.path.join(feats_dir, 'train_all_feats.csv')
    train_feats_p2p = os.path.join(feats_dir, 'train_feats_p2p.csv')
    train_feats_p2g = os.path.join(feats_dir, 'train_feats_p2g.csv')
    valid_feats_all = os.path.join(feats_dir, 'valid_all_feats.csv')
    valid_feats_p2p = os.path.join(feats_dir, 'valid_feats_p2p.csv')
    valid_feats_p2g = os.path.join(feats_dir, 'valid_feats_p2g.csv')
    feats_p2p = os.path.join(feats_dir, 'p2p_all_feats.csv')
    feats_p2g = os.path.join(feats_dir, 'p2g_all_feats.csv')
    
    feats_dict = {
        'train': train_all_feats,
        'train_all': train_feats_all,
        'train_p2p': train_feats_p2p,
        'train_p2g': train_feats_p2g,
        'valid_all': valid_feats_all,
        'valid_p2p': valid_feats_p2p,
        'valid_p2g': valid_feats_p2g,
        'p2p': feats_p2p,
        'p2g': feats_p2g
    }
    
    
    def expand_contact_id(df):
        df['game_play'] = df['contact_id'].str[:12]
        df['game'] = df['contact_id'].apply(lambda s: s.split('_')[0])
        df['play'] = df['contact_id'].apply(lambda s: s.split('_')[1])
        df['step'] = df['contact_id'].apply(lambda s: s.split('_')[2]).astype(int)
        df['nfl_player_id_1'] = df['contact_id'].apply(lambda s: s.split('_')[-2])
        df['nfl_player_id_2'] = df['contact_id'].apply(lambda s: s.split('_')[-1])
        return df
    
    def get_groupkfold(train, n_splits):
        kf = GroupKFold(n_splits=n_splits)
        generator = kf.split(train, train['contact'], train['contact_id'].str[:5], )
        fold_series = []
        for fold, (idx_train, idx_valid) in enumerate(generator):
            fold_series.append(pd.Series(fold, index=idx_valid))
        fold_series = pd.concat(fold_series).sort_index()
        return fold_series
    
    
    def fit_xgboost(X_train, y_train, X_valid, y_valid, xgb_params):
        xgb_train = xgb.DMatrix(X_train, y_train)
        xgb_valid = xgb.DMatrix(X_valid, y_valid)
        display(pd.Series(y_valid).value_counts())
        evals = [(xgb_train, 'train'), (xgb_valid, 'valid')]

        model = xgb.train(
            xgb_params, 
            xgb_train,
            num_boost_round=10_000,
            early_stopping_rounds=100,
            evals=evals,
            verbose_eval=100,
        )
        return model
    
    def fit_xgboost_kfold(X, y, xgb_params, folds, name='', suffix=''):
        oof_pred = np.zeros(len(y), dtype=np.float32)
        for fold in sorted(CFG.folds.unique()):
            if fold == -1: continue
            idx_train = (CFG.folds!=fold)
            idx_valid = (CFG.folds==fold)
            x_train, y_train = X[idx_train], y[idx_train]
            x_valid, y_valid = X[idx_valid], y[idx_valid]
            print(f"Fold -- {fold} -- ")
            model = CFG.fit_xgboost(x_train, y_train, x_valid, y_valid, xgb_params)
            model_path = os.path.join(CFG.model_dir, f'xgb_fold_{fold}_{name}_{suffix}.joblib')
            joblib.dump(model, model_path)
            pred_i = model.predict(xgb.DMatrix(x_valid, y_valid))

            oof_pred[x_valid.index] = pred_i
            score = round(roc_auc_score(y_valid, pred_i), 5)
            print(f'Performance of the prediction: {score}\n')
            del model; gc.collect()

        np.save(os.path.join(CFG.preds_dir, f'oof_pred{name}{suffix}'), oof_pred)
        score = round(roc_auc_score(y, oof_pred), 5)
        print(f'All Performance of the prediction: {score}')
        return oof_pred
    

In [5]:
CFG.default_feats = CFG.feats_dict['train']
CFG.folds = CFG.get_groupkfold(pd.read_csv(CFG.default_feats, index_col=0), CFG.num_folds)

CFG.p2p_feats = CFG.feats_dict['p2p']
CFG.p2g_feats = CFG.feats_dict['p2g']
CFG.p2p_folds = CFG.get_groupkfold(pd.read_csv(CFG.p2p_feats, index_col=0), CFG.num_folds)
CFG.p2g_folds = CFG.get_groupkfold(pd.read_csv(CFG.p2g_feats, index_col=0), CFG.num_folds)

### train all

In [6]:
# df = pd.read_csv(CFG.default_feats, index_col=0)
# X = df.drop(columns=['contact_id', 'contact'])
# y = df['contact']

### p2p & p2g

In [7]:
df_p2p = pd.read_csv(CFG.p2p_feats, index_col=0)
df_p2g = pd.read_csv(CFG.p2g_feats, index_col=0)
X_p2p = df_p2p.drop(columns=['contact_id', 'contact'])
y_p2p = df_p2p['contact']
X_p2g = df_p2g.drop(columns=['contact_id', 'contact'])
y_p2g = df_p2g['contact']

In [8]:
CFG.p2p_folds.value_counts(), CFG.p2g_folds.value_counts()

(3    866250
 0    863478
 2    863310
 1    863247
 4    854700
 dtype: int64,
 3    82500
 0    82236
 1    82236
 4    82236
 2    81425
 dtype: int64)

In [9]:
# def fit_xgboost_kfold(X, y, xgb_params, name='', suffix=''):
#     oof_pred = np.zeros(len(y), dtype=np.float32)
#     for fold in sorted(CFG.folds.unique()):
#         if fold == -1: continue
#         idx_train = (CFG.folds!=fold)
#         idx_valid = (CFG.folds==fold)
#         x_train, y_train = X[idx_train], y[idx_train]
#         x_valid, y_valid = X[idx_valid], y[idx_valid]
#         print(f"Fold -- {fold} -- ")
#         model = CFG.fit_xgboost(x_train, y_train, x_valid, y_valid, xgb_params)
#         model_path = os.path.join(CFG.model_dir, f'xgb_fold_{fold}_{name}_{suffix}.joblib')
#         joblib.dump(model, model_path)
#         pred_i = model.predict(xgb.DMatrix(x_valid, y_valid))
        
#         oof_pred[x_valid.index] = pred_i
#         score = round(roc_auc_score(y_valid, pred_i), 5)
#         print(f'Performance of the prediction: {score}\n')
#         del model; gc.collect()

#     np.save(os.path.join(CFG.preds_dir, f'oof_pred{name}{suffix}'), oof_pred)
#     score = round(roc_auc_score(y, oof_pred), 5)
#     print(f'All Performance of the prediction: {score}')
#     return oof_pred

In [10]:
%%time
p2g_oof_pred = CFG.fit_xgboost_kfold(X_p2g, y_p2g, CFG.xgb_params, CFG.p2g_folds, CFG.name, 'p2g')



Fold -- 0 -- 




0    203976
1      8797
Name: contact, dtype: int64

[0]	train-auc:0.87825	valid-auc:0.84511
[100]	train-auc:0.91730	valid-auc:0.87485
[200]	train-auc:0.94095	valid-auc:0.87392
[220]	train-auc:0.94533	valid-auc:0.87224
Performance of the prediction: 0.87224





Fold -- 1 -- 


0    49942
1     2429
Name: contact, dtype: int64

[0]	train-auc:0.87108	valid-auc:0.85750
[100]	train-auc:0.90444	valid-auc:0.87936
[200]	train-auc:0.92176	valid-auc:0.88648
[300]	train-auc:0.93859	valid-auc:0.88780
[381]	train-auc:0.94783	valid-auc:0.88657
Performance of the prediction: 0.88649





Fold -- 2 -- 


0    40388
1     1104
Name: contact, dtype: int64

[0]	train-auc:0.86837	valid-auc:0.85883
[100]	train-auc:0.90363	valid-auc:0.88733
[200]	train-auc:0.92120	valid-auc:0.88165
[205]	train-auc:0.92184	valid-auc:0.88187
Performance of the prediction: 0.88187





Fold -- 3 -- 


0    14077
1      850
Name: contact, dtype: int64

[0]	train-auc:0.86792	valid-auc:0.88597
[100]	train-auc:0.90163	valid-auc:0.89976
[200]	train-auc:0.91968	valid-auc:0.90016
[262]	train-auc:0.93021	valid-auc:0.89953
Performance of the prediction: 0.89953





Fold -- 4 -- 


0    85436
1     3634
Name: contact, dtype: int64

[0]	train-auc:0.87286	valid-auc:0.84375
[100]	train-auc:0.90816	valid-auc:0.87646
[200]	train-auc:0.92638	valid-auc:0.87916
[300]	train-auc:0.94481	valid-auc:0.87490
[313]	train-auc:0.94635	valid-auc:0.87443
Performance of the prediction: 0.8744

All Performance of the prediction: 0.8755
CPU times: user 6min 3s, sys: 641 ms, total: 6min 4s
Wall time: 1min 34s


In [11]:
%%time
p2p_oof_pred = CFG.fit_xgboost_kfold(X_p2p, y_p2p, CFG.xgb_params, CFG.p2p_folds, CFG.name, 'p2p')



Fold -- 0 -- 


0    793093
1      9676
Name: contact, dtype: int64

[0]	train-auc:0.99060	valid-auc:0.99190
[100]	train-auc:0.99468	valid-auc:0.99449
[200]	train-auc:0.99589	valid-auc:0.99475
[300]	train-auc:0.99633	valid-auc:0.99499
[400]	train-auc:0.99668	valid-auc:0.99520
[500]	train-auc:0.99706	valid-auc:0.99539
[600]	train-auc:0.99742	valid-auc:0.99549
[700]	train-auc:0.99766	valid-auc:0.99552
[800]	train-auc:0.99789	valid-auc:0.99554
[900]	train-auc:0.99806	valid-auc:0.99554
[1000]	train-auc:0.99822	valid-auc:0.99555
[1100]	train-auc:0.99836	valid-auc:0.99555
[1106]	train-auc:0.99837	valid-auc:0.99555
Performance of the prediction: 0.99555





Fold -- 1 -- 


0    890456
1      9971
Name: contact, dtype: int64

[0]	train-auc:0.99106	valid-auc:0.98859
[100]	train-auc:0.99488	valid-auc:0.99375
[200]	train-auc:0.99603	valid-auc:0.99462
[300]	train-auc:0.99650	valid-auc:0.99490
[400]	train-auc:0.99684	valid-auc:0.99505
[500]	train-auc:0.99720	valid-auc:0.99511
[600]	train-auc:0.99748	valid-auc:0.99512
[700]	train-auc:0.99775	valid-auc:0.99512
[731]	train-auc:0.99782	valid-auc:0.99513
Performance of the prediction: 0.99513





Fold -- 2 -- 


0    837005
1      8892
Name: contact, dtype: int64

[0]	train-auc:0.99247	valid-auc:0.99102
[100]	train-auc:0.99491	valid-auc:0.99354
[200]	train-auc:0.99575	valid-auc:0.99403
[300]	train-auc:0.99615	valid-auc:0.99416
[400]	train-auc:0.99657	valid-auc:0.99538
[500]	train-auc:0.99694	valid-auc:0.99550
[600]	train-auc:0.99727	valid-auc:0.99559
[700]	train-auc:0.99755	valid-auc:0.99566
[800]	train-auc:0.99778	valid-auc:0.99570
[900]	train-auc:0.99798	valid-auc:0.99571
[1000]	train-auc:0.99813	valid-auc:0.99572
[1100]	train-auc:0.99828	valid-auc:0.99574
[1200]	train-auc:0.99841	valid-auc:0.99577
[1300]	train-auc:0.99854	valid-auc:0.99578
[1392]	train-auc:0.99864	valid-auc:0.99577
Performance of the prediction: 0.99577





Fold -- 3 -- 


0    859045
1      9504
Name: contact, dtype: int64

[0]	train-auc:0.99076	valid-auc:0.99260
[100]	train-auc:0.99463	valid-auc:0.99520
[200]	train-auc:0.99579	valid-auc:0.99528
[300]	train-auc:0.99634	valid-auc:0.99587
[400]	train-auc:0.99667	valid-auc:0.99597
[500]	train-auc:0.99702	valid-auc:0.99605
[600]	train-auc:0.99733	valid-auc:0.99608
[700]	train-auc:0.99761	valid-auc:0.99612
[800]	train-auc:0.99787	valid-auc:0.99616
[900]	train-auc:0.99804	valid-auc:0.99617
[1000]	train-auc:0.99821	valid-auc:0.99618
[1100]	train-auc:0.99834	valid-auc:0.99618
[1102]	train-auc:0.99834	valid-auc:0.99618
Performance of the prediction: 0.99618





Fold -- 4 -- 


0    883678
1      9665
Name: contact, dtype: int64

[0]	train-auc:0.99235	valid-auc:0.99152
[100]	train-auc:0.99473	valid-auc:0.99420
[200]	train-auc:0.99591	valid-auc:0.99478
[300]	train-auc:0.99640	valid-auc:0.99524
[400]	train-auc:0.99677	valid-auc:0.99544
[500]	train-auc:0.99713	valid-auc:0.99555
[600]	train-auc:0.99744	valid-auc:0.99559
[700]	train-auc:0.99767	valid-auc:0.99562
[800]	train-auc:0.99790	valid-auc:0.99564
[900]	train-auc:0.99807	valid-auc:0.99565
[1000]	train-auc:0.99824	valid-auc:0.99568
[1100]	train-auc:0.99838	valid-auc:0.99568
[1157]	train-auc:0.99845	valid-auc:0.99568
Performance of the prediction: 0.99568

All Performance of the prediction: 0.99563
CPU times: user 4h 2min 44s, sys: 57.7 s, total: 4h 3min 41s
Wall time: 1h 8min 42s


In [12]:
!ls model/

xgb_fold_0_yy_p2g.joblib  xgb_fold_2_yy_p2g.joblib  xgb_fold_4_yy_p2g.joblib
xgb_fold_0_yy_p2p.joblib  xgb_fold_2_yy_p2p.joblib  xgb_fold_4_yy_p2p.joblib
xgb_fold_1_yy_p2g.joblib  xgb_fold_3_yy_p2g.joblib
xgb_fold_1_yy_p2p.joblib  xgb_fold_3_yy_p2p.joblib


## Optimize

### train all

In [13]:
# def func(x_list):
#     score = matthews_corrcoef(df['contact'], oof_pred>x_list[0])
#     return -score

# x0 = [0.5]
# result = minimize(func, x0,  method="nelder-mead")
# CFG.threshold = result.x[0]
# print("score:", round(matthews_corrcoef(df['contact'], oof_pred>CFG.threshold), 5))
# print("threshold", round(CFG.threshold, 5))

## p2p & p2g

In [14]:
def func_p2g(x_list):
    return -matthews_corrcoef(df_p2g['contact'], p2g_oof_pred>x_list[0])

def func_p2p(x_list):
    return -matthews_corrcoef(df_p2p['contact'], p2p_oof_pred>x_list[0])

x_p2g = [0.5]
res_p2g = minimize(func_p2g, x_p2g,  method="nelder-mead")
CFG.p2g_threshold = res_p2g.x[0]
print("p2g score:", round(matthews_corrcoef(df_p2g['contact'], p2g_oof_pred>CFG.p2g_threshold), 5))
print("p2g threshold:", round(CFG.p2g_threshold, 5))

x_p2p = [0.5]
res_p2p = minimize(func_p2p, x_p2p,  method="nelder-mead")
CFG.p2p_threshold = res_p2p.x[0]
print("p2p score:", round(matthews_corrcoef(df_p2p['contact'], p2p_oof_pred>CFG.p2p_threshold), 5))
print("p2p threshold:", round(CFG.p2p_threshold, 5))

p2g score: 0.29475
p2g threshold: 0.06826
p2p score: 0.67806
p2p threshold: 0.31533


## Pred

In [15]:
def pred_xgboost(X, data_dir, add_suffix=''):
    models = glob(os.path.join(data_dir, f'xgb_fold*{add_suffix}.joblib'))
    print(models)
    models = [joblib.load(model) for model in models]
    X_Dmatrix = xgb.DMatrix(X)
    preds = np.array([model.predict(X_Dmatrix) for model in models])
    preds = np.mean(preds, axis=0)
    return preds

In [16]:
def create_features(df_label, df_tracking, cols, merge_col="step"):
    df_combo = (
        df_label.astype({"nfl_player_id_1": str, "nfl_player_id_2": str})
        .merge(
            df_tracking[["game_play", merge_col, "nfl_player_id"] + cols]
                .astype({"nfl_player_id": "str"}),
            left_on=["game_play", merge_col, "nfl_player_id_1"],
            right_on=["game_play", merge_col, "nfl_player_id"],
            how="left",)
        .drop(columns=["nfl_player_id"])
        .merge(
            df_tracking[["game_play", merge_col, "nfl_player_id"] + cols]
                .astype({"nfl_player_id": "str"}),
            left_on=["game_play", merge_col, "nfl_player_id_2"],
            right_on=["game_play", merge_col, "nfl_player_id"],
            how="left",
            suffixes=['_1', '_2']
        )
        .drop(columns=["nfl_player_id"])
        .sort_values(["game_play", merge_col, "nfl_player_id_1", "nfl_player_id_2"])
        .reset_index(drop=True)
    )
    df_combo['G_flag'] = (df_combo['nfl_player_id_2']=='G').astype(int)
    
    df_combo['distance_x'] = np.abs(df_combo['x_position_1']-df_combo['x_position_2'])
    df_combo['distance_y'] = np.abs(df_combo['y_position_1']-df_combo['y_position_2'])
    dist_pow = df_combo['distance_x']**2 + df_combo['distance_y']**2
    df_combo['distance'] = np.sqrt(dist_pow)
        
    return df_combo

In [17]:
test_labels = CFG.expand_contact_id(pd.read_csv(CFG.test_sub_dir))
test_tracking = pd.read_csv(CFG.test_tracking_dir)
# test_helmets = pd.read_csv("/kaggle/input/nfl-player-contact-detection/test_baseline_helmets.csv")
# test_video_metadata = pd.read_csv("/kaggle/input/nfl-player-contact-detection/test_video_metadata.csv")
use_cols = [
    'x_position', 'y_position', 'speed', 'distance',
    'direction', 'orientation', 'acceleration', 'sa'
]

feats = ['contact_id', 'contact', 'G_flag', 'x_position_1',
       'y_position_1', 'speed_1', 'distance_1', 'direction_1', 'orientation_1',
       'acceleration_1', 'sa_1', 'x_position_2', 'y_position_2', 'speed_2',
       'distance_2', 'direction_2', 'orientation_2', 'acceleration_2', 'sa_2',
       'distance', ]

df_test = create_features(test_labels, test_tracking, use_cols)[feats]

In [18]:
df_test

Unnamed: 0,contact_id,contact,G_flag,x_position_1,y_position_1,speed_1,distance_1,direction_1,orientation_1,acceleration_1,sa_1,x_position_2,y_position_2,speed_2,distance_2,direction_2,orientation_2,acceleration_2,sa_2,distance
0,58168_003392_0_37084_37211,0,0,41.90,20.08,0.54,0.06,252.69,262.31,0.92,0.90,39.59,17.07,0.53,0.05,134.84,84.73,1.43,1.42,3.794232
1,58168_003392_0_37084_38556,0,0,41.90,20.08,0.54,0.06,252.69,262.31,0.92,0.90,41.93,30.61,0.67,0.05,232.50,227.00,1.82,1.61,10.530043
2,58168_003392_0_37084_38567,0,0,41.90,20.08,0.54,0.06,252.69,262.31,0.92,0.90,40.37,19.88,0.66,0.07,136.70,88.92,0.90,0.89,1.543017
3,58168_003392_0_37084_38590,0,0,41.90,20.08,0.54,0.06,252.69,262.31,0.92,0.90,40.33,25.28,0.52,0.06,141.08,100.37,0.59,0.58,5.431841
4,58168_003392_0_37084_39947,0,0,41.90,20.08,0.54,0.06,252.69,262.31,0.92,0.90,40.11,26.73,0.99,0.09,163.38,90.69,1.68,1.64,6.886697
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49583,58172_003247_125_52521_52939,0,0,23.44,4.04,1.41,0.15,163.22,185.42,0.75,-0.58,37.94,2.10,2.29,0.24,211.73,215.11,2.43,-2.43,14.629204
49584,58172_003247_125_52521_G,0,1,23.44,4.04,1.41,0.15,163.22,185.42,0.75,-0.58,,,,,,,,,
49585,58172_003247_125_52852_52939,0,0,32.67,2.18,2.34,0.24,113.19,119.09,1.03,-0.97,37.94,2.10,2.29,0.24,211.73,215.11,2.43,-2.43,5.270607
49586,58172_003247_125_52852_G,0,1,32.67,2.18,2.34,0.24,113.19,119.09,1.03,-0.97,,,,,,,,,


In [19]:
# sub_pred = pred_xgboost(df_test.drop(columns=['contact_id', 'contact']), 
#                         CFG.model_dir, f'{CFG.name}_{CFG.suffix}')
# df_test['contact'] = (sub_pred > CFG.threshold).astype(int)
# df_test = CFG.expand_contact_id(df_test)
# df_test[['contact_id', 'contact']].to_csv('submission.csv', index=False)
# display(df_test[['contact_id', 'contact']].head())

In [20]:
!ls model/

xgb_fold_0_yy_p2g.joblib  xgb_fold_2_yy_p2g.joblib  xgb_fold_4_yy_p2g.joblib
xgb_fold_0_yy_p2p.joblib  xgb_fold_2_yy_p2p.joblib  xgb_fold_4_yy_p2p.joblib
xgb_fold_1_yy_p2g.joblib  xgb_fold_3_yy_p2g.joblib
xgb_fold_1_yy_p2p.joblib  xgb_fold_3_yy_p2p.joblib


In [21]:
df_test_p2g = df_test[df_test['G_flag']==1]
df_test_p2p = df_test[df_test['G_flag']==0]

In [22]:
sub_pred_p2g = pred_xgboost(df_test_p2g.drop(columns=['contact_id', 'contact']),
                           CFG.model_dir, f'{CFG.name}_p2g')
sub_pred_p2p = pred_xgboost(df_test_p2p.drop(columns=['contact_id', 'contact']),
                           CFG.model_dir, f'{CFG.name}_p2p')
df_test_p2g['contact'] = (sub_pred_p2g > CFG.p2g_threshold).astype(int)
df_test_p2p['contact'] = (sub_pred_p2p > CFG.p2p_threshold).astype(int)

['/kaggle/working/model/xgb_fold_4_yy_p2g.joblib', '/kaggle/working/model/xgb_fold_2_yy_p2g.joblib', '/kaggle/working/model/xgb_fold_1_yy_p2g.joblib', '/kaggle/working/model/xgb_fold_3_yy_p2g.joblib', '/kaggle/working/model/xgb_fold_0_yy_p2g.joblib']
['/kaggle/working/model/xgb_fold_4_yy_p2p.joblib', '/kaggle/working/model/xgb_fold_0_yy_p2p.joblib', '/kaggle/working/model/xgb_fold_1_yy_p2p.joblib', '/kaggle/working/model/xgb_fold_3_yy_p2p.joblib', '/kaggle/working/model/xgb_fold_2_yy_p2p.joblib']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [23]:
df_test_ = pd.concat([df_test_p2g, df_test_p2p]).sort_index()
df_test_[['contact_id', 'contact']].to_csv('submission.csv', index=False)
df_test_

Unnamed: 0,contact_id,contact,G_flag,x_position_1,y_position_1,speed_1,distance_1,direction_1,orientation_1,acceleration_1,sa_1,x_position_2,y_position_2,speed_2,distance_2,direction_2,orientation_2,acceleration_2,sa_2,distance
0,58168_003392_0_37084_37211,0,0,41.90,20.08,0.54,0.06,252.69,262.31,0.92,0.90,39.59,17.07,0.53,0.05,134.84,84.73,1.43,1.42,3.794232
1,58168_003392_0_37084_38556,0,0,41.90,20.08,0.54,0.06,252.69,262.31,0.92,0.90,41.93,30.61,0.67,0.05,232.50,227.00,1.82,1.61,10.530043
2,58168_003392_0_37084_38567,0,0,41.90,20.08,0.54,0.06,252.69,262.31,0.92,0.90,40.37,19.88,0.66,0.07,136.70,88.92,0.90,0.89,1.543017
3,58168_003392_0_37084_38590,0,0,41.90,20.08,0.54,0.06,252.69,262.31,0.92,0.90,40.33,25.28,0.52,0.06,141.08,100.37,0.59,0.58,5.431841
4,58168_003392_0_37084_39947,0,0,41.90,20.08,0.54,0.06,252.69,262.31,0.92,0.90,40.11,26.73,0.99,0.09,163.38,90.69,1.68,1.64,6.886697
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49583,58172_003247_125_52521_52939,0,0,23.44,4.04,1.41,0.15,163.22,185.42,0.75,-0.58,37.94,2.10,2.29,0.24,211.73,215.11,2.43,-2.43,14.629204
49584,58172_003247_125_52521_G,0,1,23.44,4.04,1.41,0.15,163.22,185.42,0.75,-0.58,,,,,,,,,
49585,58172_003247_125_52852_52939,0,0,32.67,2.18,2.34,0.24,113.19,119.09,1.03,-0.97,37.94,2.10,2.29,0.24,211.73,215.11,2.43,-2.43,5.270607
49586,58172_003247_125_52852_G,0,1,32.67,2.18,2.34,0.24,113.19,119.09,1.03,-0.97,,,,,,,,,


In [24]:
df_test_['contact'].value_counts()

0    48289
1     1299
Name: contact, dtype: int64

In [25]:
df_test_p2p['contact'].value_counts()

0    44617
1      659
Name: contact, dtype: int64

In [26]:
df_test_p2g['contact'].value_counts()

0    3672
1     640
Name: contact, dtype: int64