In [None]:
import os
def clear_pycache(cache_dir='__pycache__/'):
    """Delete the files directly inside ``cache_dir`` and return how many were removed.

    The directory itself and any sub-directories are left in place. A missing
    ``cache_dir`` is treated as "nothing to clean" instead of raising, so the
    notebook also runs on a fresh checkout.
    """
    if not os.path.isdir(cache_dir):
        return 0

    removed = 0
    for entry in os.listdir(cache_dir):
        path = os.path.join(cache_dir, entry)
        # os.remove() cannot delete a real directory; skip those (symlinks are still unlinked).
        if os.path.isdir(path) and not os.path.islink(path):
            continue
        os.remove(path)
        removed += 1
    return removed


# Clear cached bytecode before the imports below run.
# A helper is used (rather than a module-level variable named `dir`) so the
# builtin dir() is not shadowed for the rest of the notebook.
clear_pycache()

import warnings
warnings.filterwarnings("ignore")

from evaluation_metric import xgb_amex, amex_list

import pandas as pd
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb
import pickle
import gc

In [None]:
# Load the aggregated training table, indexed by customer_ID.
train_frame = pd.read_parquet('Data/train_data_aggV3.parquet').set_index('customer_ID')

# Split into the label column and the feature matrix ('cid' and 'S_2' are not used as features).
train_labels = train_frame['target']
train_data = train_frame.drop(columns=['target', 'cid', 'S_2'])

# Drop the combined frame's name and run the garbage collector before training.
del train_frame
gc.collect()
train_data.shape, train_labels.shape

In [None]:
# Class balance check: number of rows for each distinct value of the target label.
train_labels.value_counts()

In [None]:
def xgb_train(X_tr, y_tr, X_va, y_va, seed, num_boost_round=10000, early_stopping_rounds=200):
    """Train one XGBoost binary classifier with early stopping and score it on the validation split.

    Parameters
    ----------
    X_tr, y_tr
        Training features and labels.
    X_va, y_va
        Validation features and labels; used both as the early-stopping
        evaluation set and for the returned score.
    seed
        Value passed to XGBoost as ``random_state``.
    num_boost_round
        Upper bound on the number of boosting rounds.
    early_stopping_rounds
        Training stops once the monitored evaluation score (higher is better,
        ``maximize=True``) has not improved for this many rounds.

    Returns
    -------
    tuple
        ``(amex_score, model)``: the value of ``amex_list`` on the validation
        predictions made with the best number of trees, and the trained booster.
    """
    print("# of features:", X_tr.shape[1])

    dtrain = xgb.DMatrix(data=X_tr, label=y_tr)
    dvalid = xgb.DMatrix(data=X_va, label=y_va)

    # NOTE(review): 'gpu_hist' needs a CUDA build of XGBoost and is deprecated
    # from XGBoost 2.0 (device='cuda' + tree_method='hist'); kept as-is here.
    params = {
        'objective': 'binary:logistic',
        'tree_method': 'gpu_hist',
        'max_depth': 7,
        'subsample': 0.88,
        'colsample_bytree': 0.5,
        'gamma': 1.5,
        'min_child_weight': 8,
        'lambda': 70,
        'eta': 0.03,
        'scale_pos_weight': 2.86,  # presumably the negative/positive class ratio - TODO confirm
        'random_state': seed,
    }

    watchlist = [(dtrain, 'train'), (dvalid, 'eval')]

    model = xgb.train(
        params,
        dtrain=dtrain,
        num_boost_round=num_boost_round,
        evals=watchlist,
        early_stopping_rounds=early_stopping_rounds,
        feval=xgb_amex,
        maximize=True,
        verbose_eval=100,
    )

    # `Booster.best_ntree_limit` was removed in XGBoost 2.0. `best_iteration` is
    # 0-based, so the number of rounds to keep is best_iteration + 1, which is
    # also the exclusive upper bound expected by `iteration_range`.
    best_ntree_limit = model.best_iteration + 1

    print('best ntree_limit:', best_ntree_limit)
    print('best score:', model.best_score)
    pred = model.predict(dvalid, iteration_range=(0, best_ntree_limit))
    amex_score = amex_list(pred, y_va)

    return amex_score, model

In [None]:
# Cross-validation settings: number of stratified folds, and seeds trained per fold.
n_folds = 3
seeds = 1

models = {}      # "<fold>-<seed>" -> model returned by xgb_train
df_scores = []   # (fold, seed, score) tuples, one per trained model

kf = StratifiedKFold(n_splits=n_folds)
for fold, (idx_tr, idx_va) in enumerate(kf.split(train_data, train_labels)):

    # Positional row selection for this fold's train / validation parts.
    X_tr, y_tr = train_data.iloc[idx_tr], train_labels.iloc[idx_tr]
    X_va, y_va = train_data.iloc[idx_va], train_labels.iloc[idx_va]

    for seed in range(seeds):
        score, model = xgb_train(X_tr, y_tr, X_va, y_va, seed)

        key = f"{fold}-{seed}"
        models[key] = model
        df_scores.append((fold, seed, score))

        print(f"Fold {fold}:, Seed {seed}, Amex_score {score:.4f}")

In [None]:
# Persist every trained model. The context manager guarantees the file is
# flushed and closed even if pickling fails part-way.
with open("Models/xgboost_b2.pkl", "wb") as models_file:
    pickle.dump(models, models_file)

In [None]:
# Reshape the per-run scores into a table with one row per fold and one column per seed.
score_table = pd.DataFrame(df_scores, columns=['fold', 'seed', 'score'])
df_results = score_table.pivot(index='fold', columns='seed', values='score')

# Extra row: mean of each seed column over the folds.
df_results.loc['seed_mean'] = df_results.mean(axis=0, numeric_only=True)
# Extra column: mean of each row over the seeds (computed for the 'seed_mean' row as well).
df_results['fold_mean'] = df_results.mean(axis=1, numeric_only=True)
df_results

### Prediction

In [None]:
# Load the aggregated test table, index it by customer_ID and drop the
# 'cid' and 'S_2' columns, as was done for the training features.
test = (
    pd.read_parquet('Data/test_data_aggV3.parquet')
    .set_index('customer_ID')
    .drop(columns=['cid', 'S_2'])
)
dtest = xgb.DMatrix(data=test)

# Only the DMatrix is needed from here on; drop the DataFrame's name and run the garbage collector.
del test
gc.collect()

In [None]:
# `model` is whichever booster the cross-validation loop trained last.
# `iteration_range` is half-open [start, end) and `best_iteration` is 0-based,
# so the end must be best_iteration + 1 to include the best round's trees.
model.predict(dtest, iteration_range=(0, model.best_iteration + 1))