In [6]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from scipy.stats import entropy, kurtosis
import xgboost as xgb
import seaborn as sns
from xgboost import plot_importance
import gc
import warnings
import matplotlib.pyplot as plt
from math import *
import catboost
from catboost import Pool
from sklearn.preprocessing import MinMaxScaler
pd.set_option('display.max_columns', None)
from IPython.display import display
warnings.filterwarnings('ignore')
%matplotlib inline

In [13]:
# Locations of the input CSV files, all relative to the notebook directory.
root = "../data/"
even_file = os.path.join(root, "event.csv")
test_file = os.path.join(root, "test.csv")
train_file = os.path.join(root, "train.csv")
sample_file = os.path.join(root, "sample.csv")

# Read the train and test tables; the train length is remembered so the
# stacked frame can be split back into its two parts later on.
train_df = pd.read_csv(train_file, low_memory=False)
test_df = pd.read_csv(test_file, low_memory=False)
train_num = len(train_df)

# Stack train on top of test, then attach the event-file columns with a
# left join on event_id so every row of the stacked frame is kept.
data = pd.concat([train_df, test_df], ignore_index=True)
event = pd.read_csv(even_file)
data = data.merge(event, on='event_id', how='left')
gc.collect()


85

In [14]:
# Position columns weighted by the thetamc / phimc columns (each scaled by 0.01).
_x_theta = data['x'] * data['thetamc'] * 0.01
_y_phi = data['y'] * data['phimc'] * 0.01
data['angle*xy'] = _x_theta + _y_phi

# First version of energymc/t. The column is reassigned a few lines below, so
# this statement only determines where the column sits in the frame.
data['energymc/t'] = data['energymc'].div(data['t'])

data['x/t'] = data['x'].div(data['t'])

# Rescale t with a MinMaxScaler and redefine energymc/t against the scaled t.
ss = MinMaxScaler()
x1 = ss.fit_transform(data['t'].values.reshape(-1, 1))
data['energymc/t'] = data['energymc'].values.reshape(-1, 1) / x1

# Distance-related features
_dx = data['x'] - data['xcmc']
_dy = data['y'] - data['ycmc']

data['slope'] = data['y'].div(data['x'])

# Offsets from (xcmc, ycmc), normalised by the column value plus its mean.
data['x_cmc'] = _dx / (data['xcmc'] + data['xcmc'].mean())
data['y_cmc'] = _dy / (data['ycmc'] + data['ycmc'].mean())

# Euclidean distance, per-axis absolute distances and Manhattan distance
# between (x, y) and (xcmc, ycmc).
data['og'] = np.sqrt(_dy ** 2 + _dx ** 2)
data['og_2'] = _dx.abs()
data['og_3'] = _dy.abs()
data['MH'] = _dx.abs() + _dy.abs()

# Cosine of the angle between the vectors (x, y) and (xcmc, ycmc).
_dot = data['x'] * data['xcmc'] + data['y'] * data['ycmc']
_hit_norm = np.sqrt(data['x'] ** 2 + data['y'] ** 2)
_centre_norm = np.sqrt(data['xcmc'] ** 2 + data['ycmc'] ** 2)
data['Cosine'] = _dot / (_hit_norm * _centre_norm)

# Drop the scratch series so they do not linger in kernel memory.
del _x_theta, _y_phi, _dx, _dy, _dot, _hit_norm, _centre_norm
# The features above turned out to be of almost no use.

# Features built from rows ordered by t, q, x and y within event_id;
# the q- and t-based ones are the stronger features.
by_event_t = data.sort_values(['event_id', 't'])

# Centred rolling std over the sorted rows. The window runs over the whole
# sorted frame (no groupby); results are written back to `data` by index.
win = 7
for col in ('x', 'x/t', 'y'):
    rolled = by_event_t[col].rolling(win, min_periods=1, center=True).std()
    data[col + '_sort_std_win' + str(win)] = rolled.fillna(0)

# Difference to the previous row inside each event; the first row of an
# event has no predecessor and is filled with 0.
lag = 1
for col in ('x', 't'):
    stepped = by_event_t.groupby('event_id')[col].diff(periods=lag)
    data['diff_' + col + '_' + str(lag)] = stepped.fillna(0)

del by_event_t
print(data.shape)

# Same kind of features, this time with rows ordered by hit_id inside each event.
by_event_hit = data.sort_values(['event_id', 'hit_id'])
hit_t_groups = by_event_hit.groupby('event_id')['t']

for span in (1, 8, 12):
    # Centred rolling std of x/t over the whole sorted frame.
    rolled = by_event_hit['x/t'].rolling(span, min_periods=1, center=True).std()
    data['x/t_sort_std_win' + str(span) + '_hit'] = rolled.fillna(0)
    # t difference to the row `span` positions earlier within the same event.
    data['diff_t_hit_' + str(span)] = hit_t_groups.diff(periods=span).fillna(0)

# Change of og / x between consecutive sorted rows, divided by the
# consecutive t difference plus 1.
hit_dt = data['diff_t_hit_1'] + 1
data['speed_og_hit'] = by_event_hit['og'].diff(1) / hit_dt
data['speed_x_hit'] = by_event_hit['x'].diff(1) / hit_dt

del by_event_hit
print(data.shape)

# Features computed with rows ordered by q inside each event.
temp = data.sort_values(['event_id','q'])
for i in [1,35,12]:
    # Centred rolling std of t over the whole sorted frame (not per event).
    # NOTE(review): the i=1 window yields NaN for a one-sample std, so that
    # column is all zeros after fillna(0) - confirm it is wanted.
    data['t_sort_std_win'+str(i)+'_q'] = temp['t'].rolling(i, min_periods=1,center=True).std().fillna(0)
    # Differences of t and q to the row i positions earlier within the same event.
    data['diff_t_q_'+str(i)] = temp[['t','event_id']].groupby('event_id')['t'].diff(periods=i).fillna(0)
    data['diff_q_q_'+str(i)] = temp[['q','event_id']].groupby('event_id')['q'].diff(periods=i).fillna(0)
# Ratio of q to the next row's q. shift(-1) leaves the last row empty, so it is
# forward-filled. `.ffill()` replaces the deprecated `fillna(method='pad')`
# and produces the same values.
data['shift_q'] = temp['q']/temp['q'].shift(-1).ffill()
# Change of og between consecutive sorted rows over the consecutive t difference plus 1.
data['speed_og_hit_q'] = temp['og'].diff(1)/(data['diff_t_q_1']+1)
del temp
print(data.shape)

# x and y have few unique values, so rows are ordered by (x, t) and by (y, t).
temp = data.sort_values(['event_id','x','t'])

# Centred rolling std over the sorted frame. The window now matches the number
# in each column name: previously t_win5_sort_std_hit_x was computed with a
# window of 3 and was therefore an exact copy of t_win3_sort_std_hit_x.
data['og_win3_sort_std_hit_x'] = temp['og'].rolling(3, min_periods=1,center=True).std()
data['t_win3_sort_std_hit_x'] = temp['t'].rolling(3, min_periods=1,center=True).std()
data['t_win5_sort_std_hit_x'] = temp['t'].rolling(5, min_periods=1,center=True).std()

for i in [12,8,2,1]:
    # t difference to the row i positions earlier within the same event.
    data['diff_t_x_'+str(i)] = temp[['t','event_id']].groupby('event_id')['t'].diff(periods=i).fillna(0)
# Change of y over 1 / 2 sorted rows, divided by the matching t difference plus 1.
data['speed_y_hit_x'] = temp['y'].diff(1)/(data['diff_t_x_1']+1)
data['speed_y_hit_x3'] = temp['y'].diff(2)/(data['diff_t_x_2']+1)
del temp
print(data.shape)

# Same features as for the (x, t) ordering, with rows ordered by (y, t) inside each event.
temp = data.sort_values(['event_id','y','t'])

# Centred rolling std over the sorted frame. The window now matches the number
# in each column name: previously t_win7_sort_std_hit_y was computed with a
# window of 3 and was therefore an exact copy of t_win3_sort_std_hit_y.
data['og_win3_sort_std_hit_y'] = temp['og'].rolling(3, min_periods=1,center=True).std()
data['t_win3_sort_std_hit_y'] = temp['t'].rolling(3, min_periods=1,center=True).std()
data['t_win7_sort_std_hit_y'] = temp['t'].rolling(7, min_periods=1,center=True).std()
print(data.shape)

for i in [1,2,12]:
    # t difference to the row i positions earlier within the same event.
    data['diff_t_y_'+str(i)] = temp[['t','event_id']].groupby('event_id')['t'].diff(periods=i).fillna(0)
# Change of x over 1 / 2 sorted rows, divided by the matching t difference plus 1.
data['speed_x_hit_y'] = temp['x'].diff(1)/(data['diff_t_y_1']+1)
data['speed_x_hit_y2'] = temp['x'].diff(2)/(data['diff_t_y_2']+1)
del temp
print(data.shape)

# Ratios between features built from different sort orders. Each entry is
# (new column, numerator column, denominator column); 1 is added to the
# denominator before dividing.
speed_ratio_specs = (
    ('speed_xy_speed', 'speed_x_hit_y', 'speed_y_hit_x'),
    ('speed_xy_speed2', 'speed_x_hit_y2', 'speed_y_hit_x3'),
    ('speed_xy_diff3', 'diff_t_hit_8', 'diff_t_x_8'),
)
for out_col, num_col, den_col in speed_ratio_specs:
    data[out_col] = data[num_col] / (data[den_col] + 1)

# Rolling features over the whole dataset ordered by x, y, terror and q
# (no event grouping); the terror- and q-ordered ones are strong features.
base_cols = ['event_id', 't', 'q', 'x/t', 'y', 'x', 'og']

# (column suffix, columns copied before sorting, sort keys)
global_sorts = (
    ('X', base_cols, ['x', 'q']),
    ('Y', base_cols, ['y', 'q']),
    ('TQ', base_cols + ['terror'], ['terror', 'q']),
)
for tag, cols, keys in global_sorts:
    ordered = data[cols].sort_values(keys)
    # Centred 41-row rolling std of t in this ordering, written back by index.
    rolled = ordered['t'].rolling(41, min_periods=1, center=True).std()
    data['t_std_win41_' + tag] = rolled.fillna(0)
    del ordered
    print(data.shape)

# Ordering by q alone: rolling std of both q and t for two window sizes.
ordered = data[base_cols].sort_values(['q'])
for win in (7, 51):
    for col in ('q', 't'):
        rolled = ordered[col].rolling(win, min_periods=1, center=True).std()
        data[col + '_std_win' + str(win) + '_Q'] = rolled.fillna(0)
del ordered
print(data.shape)

(13559712, 32)
(13559712, 40)
(13559712, 51)
(13559712, 60)
(13559712, 63)
(13559712, 68)
(13559712, 72)
(13559712, 73)
(13559712, 74)
(13559712, 78)


In [15]:
# Undo the earlier stacking: the first train_num rows came from the train
# file, the remaining rows from the test file.
train_df = data.iloc[:train_num]
test_df = data.iloc[train_num:]
del data

# To limit overfitting, not every column of the event file is used as a feature.
# Identifier columns and the target itself are excluded as well.
excluded_columns = {
    'nhit', 'nhitreal', 'energymc', 'thetamc', 'phimc',
    'xcmc', 'ycmc', 'z', 'flag', 'index', 'index_x', 'index_y',
    'hit_id', 'event_id',
}
feature = [col for col in train_df.columns if col not in excluded_columns]
labels = train_df['flag'].values

In [None]:
# Train the XGBoost model with K-fold cross-validation
n_splits = 6
kf = KFold(n_splits=n_splits, shuffle=True, random_state=4399)
y_pp_xgb = np.zeros(len(test_df))          # test probabilities averaged over the folds
y_pp_xgb_stacking = np.zeros(len(labels))  # out-of-fold probabilities for the train rows
for fit_idx, val_idx in kf.split(train_df):
    print(">>>", train_df[feature].shape)

    # Slice this fold's fitting and held-out parts once and reuse them below.
    X_fit, y_fit = train_df[feature].iloc[fit_idx], labels[fit_idx]
    X_val, y_val = train_df[feature].iloc[val_idx], labels[val_idx]

    clf = xgb.XGBClassifier(
        tree_method='gpu_hist',
        max_depth=7,
        learning_rate=0.1,
        eval_metric='auc',
        n_estimators=2000,
        min_child_weight=5,
        max_bin=1024,
    )
    # The held-out fold is the last eval_set entry passed alongside early_stopping_rounds.
    clf.fit(
        X_fit, y_fit,
        eval_set=[(X_fit, y_fit), (X_val, y_val)],
        early_stopping_rounds=50,
        verbose=10,
    )

    val_pred = clf.predict(X_val)
    val_prob = clf.predict_proba(X_val)[:, 1]
    y_pp_xgb_stacking[val_idx] = val_prob

    # Both scores are computed on the held-out fold.
    fold_auc = metrics.roc_auc_score(y_val, val_prob)
    fold_acc = metrics.accuracy_score(y_val, val_pred)
    print("AUC Score (Train): %f" % fold_auc)
    print("ACC Score (Train): %f" % fold_acc)

    # Each fold contributes 1/n_splits of the final test probability.
    y_pp_xgb += clf.predict_proba(test_df[feature])[:, 1] / n_splits

In [None]:
# Train the CatBoost model with K-fold cross-validation
n_splits = 6
kf = KFold(n_splits=n_splits, shuffle=True, random_state=1)
y_pp_cat_stacking = np.zeros(len(labels))  # out-of-fold probabilities for the train rows
y_pp_cat = np.zeros(len(test_df))          # test probabilities averaged over the folds

# `cal_cols` is not assigned in any cell visible in this notebook, so a fresh
# "Restart & Run All" would stop here with a NameError. Use it when an earlier
# cell defined it; otherwise pass no categorical features.
try:
    cat_feature_cols = cal_cols
except NameError:
    cat_feature_cols = None

for train_index, test_index in kf.split(train_df):
    print ( ">>>", train_df.shape )
    model = catboost.CatBoostClassifier(
                               iterations=3000,
                               depth = 8,
                               learning_rate = 0.1,
                               custom_loss='AUC',
                               eval_metric='AUC',
                               task_type = "GPU",
                              )
    X_val = train_df[feature].iloc[test_index]
    train_pool = Pool(train_df[feature].iloc[train_index], labels[train_index], cat_features=cat_feature_cols)
    eval_pool = Pool(X_val, labels[test_index], cat_features=cat_feature_cols)

    model.fit(train_pool, eval_set=eval_pool, use_best_model=True, early_stopping_rounds=50, verbose=10)

    # Predict with the CatBoost model fitted above. The previous code called
    # `clf` here, i.e. the XGBoost classifier left over from the preceding cell,
    # so every "CatBoost" output was really an XGBoost prediction.
    y_pred = model.predict(X_val)
    y_predprob = model.predict_proba(X_val)[:, 1]

    y_pp_cat_stacking[test_index] = y_predprob

    auc = metrics.roc_auc_score(labels[test_index], y_predprob)
    # Accuracy needs class labels; it was previously given the probabilities.
    acc = metrics.accuracy_score(labels[test_index], y_pred)

    # Both scores are computed on the held-out fold.
    print("AUC Score (Train): %f" % auc)
    print("ACC Score (Train): %f" % acc)

    # Each fold contributes 1/n_splits of the final test probability.
    y_pp_cat += model.predict_proba(test_df[feature])[:, 1] / n_splits

In [None]:
# Save the XGB predicted probabilities for later blending
tt = pd.read_csv('../data/test.csv')
tt['flag_pre'] = y_pp_xgb

# Hard overrides based on t and q, applied in this order so that a later
# rule wins on rows matched by more than one of them.
override_rules = (
    (tt['t'] > 1850, 1),
    (tt['t'] < -900, 0),
    (tt['q'] < 0, 1),
)
for row_mask, forced_value in override_rules:
    tt.loc[row_mask, 'flag_pre'] = forced_value

# Output columns in file order: hit_id, flag_pred, event_id.
sub = tt[['hit_id', 'flag_pre', 'event_id']].rename(columns={'flag_pre': 'flag_pred'})
sub.to_csv('../result/xgb_prob.csv', index=False)

In [None]:
# Save the CAT predicted probabilities for later blending
# `tt` is the test table loaded in the previous cell; its flag_pre column is
# replaced here with the CatBoost probabilities.
tt['flag_pre'] = y_pp_cat

# Hard overrides based on t and q, applied in this order so that a later
# rule wins on rows matched by more than one of them.
override_rules = (
    (tt['t'] > 1850, 1),
    (tt['t'] < -900, 0),
    (tt['q'] < 0, 1),
)
for row_mask, forced_value in override_rules:
    tt.loc[row_mask, 'flag_pre'] = forced_value

# Output columns in file order: hit_id, flag_pred, event_id.
sub = tt[['hit_id', 'flag_pre', 'event_id']].rename(columns={'flag_pre': 'flag_pred'})
sub.to_csv('../result/cat_prob.csv', index=False)