In [None]:
import warnings
warnings.filterwarnings("ignore")

from evaluation_metric import *
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import gc

In [None]:
# Read the aggregated training features and the label table from disk.
# (Pickles can run code when loaded, so these files must come from a trusted source.)
train_data = pd.read_pickle('Data/train_data_aggV3.pkl')
train_labels = pd.read_pickle('Data/train_labels.pkl')

# Select the label rows by the feature frame's index so the labels follow
# the same row order as the features.
train_labels = train_labels.loc[train_data.index]

# Cell output: shapes of the feature frame and the aligned labels.
(train_data.shape, train_labels.shape)

In [None]:
# Cross-validated XGBoost training: every fold is trained once per seed, so this
# cell produces n_fold * n_seed boosters and one validation score for each.
n_fold = 5
n_seed = 2

# No shuffle / random_state is passed, so the stratified folds are deterministic
# and follow the stored row order of train_data.
kf = StratifiedKFold(n_splits=n_fold)

importances = []   # not populated in this cell
models = {}        # '<fold>-<seed>' -> trained xgb.Booster
df_scores = []     # (fold, seed, validation score) tuples

# Hyper-parameters shared by every run; only the random seed differs per model.
base_params = {
    'objective': 'binary:logistic',
    'tree_method': 'gpu_hist',
    'max_depth': 7,
    'subsample': 0.88,
    'colsample_bytree': 0.5,
    'gamma': 1.5,
    'min_child_weight': 8,
    'lambda': 70,
    'eta': 0.03,
}

for fold, (idx_tr, idx_va) in enumerate(kf.split(train_data, train_labels)):
    X_tr = train_data.iloc[idx_tr]
    X_va = train_data.iloc[idx_va]
    y_tr = train_labels.iloc[idx_tr]
    y_va = train_labels.iloc[idx_va]

    # The DMatrix objects and the watchlist depend only on the fold, so they are
    # built once here and reused for every seed.
    dtrain = xgb.DMatrix(data=X_tr, label=y_tr)
    dvalid = xgb.DMatrix(data=X_va, label=y_va)
    watchlist = [(dtrain, 'train'), (dvalid, 'eval')]

    for seed in range(n_seed):
        print(f'Fold: {fold} - seed: {seed}')
        key = f'{fold}-{seed}'
        params = {**base_params, 'random_state': seed}

        # Early stopping watches the last entry of the watchlist ('eval') using the
        # custom metric, which is maximised.
        bst = xgb.train(params, dtrain=dtrain, evals=watchlist, num_boost_round=20000,
                        early_stopping_rounds=2000, feval=xgb_amex, maximize=True,
                        verbose_eval=200)

        # xgb.train returns the booster from the LAST round, and Booster.predict
        # uses every tree by default. Restrict prediction to the best iteration so
        # the reported score is not diluted by the rounds trained after the optimum.
        val_pred = bst.predict(dvalid, iteration_range=(0, bst.best_iteration + 1))
        score = amex_metric(y_va.reset_index(drop=True),
                            pd.Series(val_pred).rename('prediction'))

        # Report the actual seed (this used to print a literal 0 for every run).
        print(f'Fold: {fold} - seed: {seed} - score {score:.2%}')
        models[key] = bst
        df_scores.append((fold, seed, score))

# Summary table built once after all runs: rows = folds, columns = seeds,
# plus a 'seed_mean' row and a 'fold_mean' column with the averages.
df_results = pd.DataFrame(df_scores, columns=['fold', 'seed', 'score']).pivot(
    index='fold', columns='seed', values='score')
df_results.loc['seed_mean'] = df_results.mean(numeric_only=True, axis=0)
df_results.loc[:, 'fold_mean'] = df_results.mean(numeric_only=True, axis=1)


In [None]:
# Display the fold x seed table of validation scores; the 'seed_mean' row and
# the 'fold_mean' column hold the averages.
df_results

In [None]:
# Load the test-set feature table from its pickle (only load pickles from a
# trusted source) and show its shape as a quick sanity check.
# NOTE(review): presumably built with the same aggregation as the training
# features ("aggV3") -- confirm the columns match train_data.
test_data = pd.read_pickle('Data/test_data_aggV3.pkl')
test_data.shape

In [None]:
# Score the test set with every trained booster.
dtest = xgb.DMatrix(data=test_data)

# Booster.predict uses all trees by default, which would include the rounds
# trained after the early-stopping optimum; limit each model to its best iteration.
prediction_list = [
    model.predict(dtest, iteration_range=(0, model.best_iteration + 1))
    for model in models.values()
]

# One row per test sample, one integer-labelled column per model, in the
# insertion order of `models`.
prediction_df = pd.DataFrame(prediction_list).T
prediction_df.index = test_data.index

In [None]:
# Display the per-model test predictions: rows are indexed like test_data,
# with one column per entry in `models`.
prediction_df