In [1]:
import os
def clear_pycache(cache_dir='__pycache__/'):
    """Delete the regular files that sit directly inside ``cache_dir``.

    A missing directory is treated as "nothing to clean" instead of raising,
    and sub-directories are skipped because ``os.remove`` cannot delete them.

    Parameters
    ----------
    cache_dir : str
        Directory to clean; defaults to the bytecode cache in the working directory.

    Returns
    -------
    int
        Number of files removed.
    """
    if not os.path.isdir(cache_dir):
        return 0

    removed = 0
    for entry in os.listdir(cache_dir):
        entry_path = os.path.join(cache_dir, entry)
        if os.path.isfile(entry_path):
            os.remove(entry_path)
            removed += 1
    return removed


# Wipe cached bytecode before the imports below run
# (presumably so local modules are recompiled from source -- TODO confirm intent).
clear_pycache()

# import warnings
# warnings.filterwarnings("ignore")

from evaluation_metric import xgb_amex, amex_metric_np
import cudf
cudf.set_allocator('managed')
import pandas as pd
from matplotlib import pyplot as plt
import xgboost as xgb
from xgboost import plot_importance
import pickle
import gc

In [2]:
# Identifier for this run; it is used further down to build the pickle file name under Models/.
model_id = 'm_xgb_pca50_FE'

In [3]:
# Read the feature table, the labels and the stored train/test index lists.
data = cudf.read_parquet('Data/train_agg_pca50_FE.parquet')
labels = cudf.read_parquet('Data/train_labels.parquet')
indx_df = cudf.read_parquet('Data/train_test_indx.parquet')

# test_indx can hold nulls (presumably padding, since it is the shorter column
# -- TODO confirm), so only its non-null entries are used for the lookup.
test_rows = indx_df.test_indx[indx_df.test_indx.notnull()]
train, test = data.loc[indx_df.train_indx], data.loc[test_rows]

# The full table is no longer needed once both splits exist.
del data, test_rows
gc.collect()

train.shape, test.shape

((367130, 643), (91783, 643))

In [4]:
def xgb_train(params, X_tr, y_tr, X_va, y_va,
              num_boost_round=2000, early_stopping_rounds=300, verbose_eval=100):
    """Train an XGBoost booster with early stopping on a validation set.

    Parameters
    ----------
    params : dict
        Booster parameters forwarded unchanged to ``xgb.train``.
    X_tr, y_tr
        Training features and labels, wrapped in an ``xgb.DMatrix``.
    X_va, y_va
        Validation features and labels, wrapped in an ``xgb.DMatrix``.
    num_boost_round : int, default 2000
        Upper bound on the number of boosting rounds.
    early_stopping_rounds : int, default 300
        Patience (in rounds) passed to ``xgb.train`` for early stopping.
    verbose_eval : int or bool, default 100
        Logging period passed to ``xgb.train``.

    Returns
    -------
    xgb.Booster
        The booster returned by ``xgb.train``.
    """
    dtrain = xgb.DMatrix(data=X_tr, label=y_tr)
    dvalid = xgb.DMatrix(data=X_va, label=y_va)

    # Validation set goes last: per the XGBoost docs, early stopping watches
    # the last entry of `evals` (and the last metric reported for it).
    watchlist = [(dtrain, 'train'), (dvalid, 'eval')]

    model = xgb.train(
        params,
        dtrain=dtrain,
        num_boost_round=num_boost_round,
        evals=watchlist,
        early_stopping_rounds=early_stopping_rounds,
        custom_metric=xgb_amex,
        maximize=True,  # the custom metric is a score: higher is better
        verbose_eval=verbose_eval,
    )

    # `best_ntree_limit` was removed in XGBoost 2.0 and was a tree count rather
    # than an iteration index; `best_iteration` matches the printed label.
    print('best iteration:', model.best_iteration)
    print('best score:', model.best_score)

    return model

In [5]:
# Booster configuration that is passed to xgb_train / xgb.train.
params = {
    # task and tree construction algorithm
    'objective': 'binary:logistic',
    'tree_method': 'hist',
    # tree size and row/column sampling
    'max_depth': 7,
    'subsample': 0.88,
    'colsample_bytree': 0.5,
    # regularisation
    'gamma': 1.5,
    'min_child_weight': 8,
    'lambda': 70,
    # step size and seed
    'eta': 0.03,
    'random_state': 42,
}

In [6]:
# Fit on the training split; the held-out split is passed as the validation set.
model = xgb_train(
    params,
    X_tr=train,
    y_tr=labels.loc[train.index],
    X_va=test,
    y_va=labels.loc[test.index],
)

[0]	train-logloss:0.67368	train-amex:0.71231	eval-logloss:0.67378	eval-amex:0.70368
[100]	train-logloss:0.23953	train-amex:0.78357	eval-logloss:0.24597	eval-amex:0.77162
[200]	train-logloss:0.21677	train-amex:0.79784	eval-logloss:0.22698	eval-amex:0.78222
[300]	train-logloss:0.20855	train-amex:0.81001	eval-logloss:0.22285	eval-amex:0.78565
[400]	train-logloss:0.20306	train-amex:0.81909	eval-logloss:0.22092	eval-amex:0.78889
[500]	train-logloss:0.19818	train-amex:0.82734	eval-logloss:0.21963	eval-amex:0.78992
[600]	train-logloss:0.19418	train-amex:0.83401	eval-logloss:0.21892	eval-amex:0.79102
[700]	train-logloss:0.19040	train-amex:0.84078	eval-logloss:0.21837	eval-amex:0.79155
[800]	train-logloss:0.18680	train-amex:0.84665	eval-logloss:0.21796	eval-amex:0.79165
[900]	train-logloss:0.18345	train-amex:0.85259	eval-logloss:0.21764	eval-amex:0.79181
[1000]	train-logloss:0.18040	train-amex:0.85840	eval-logloss:0.21745	eval-amex:0.79216
[1100]	train-logloss:0.17735	train-amex:0.86389	eval-lo

In [7]:
# Persist the trained booster. The `with` block guarantees the file handle is
# flushed and closed (the bare open() call left that to garbage collection),
# and the target directory is created first so a fresh checkout does not fail.
os.makedirs('Models', exist_ok=True)
with open('Models/' + model_id + '.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Plot the 50 features with the largest gain on a single square axes.
fig, ax = plt.subplots(figsize=(10, 10))
plot_importance(model, ax=ax, importance_type='gain', max_num_features=50);

In [None]:
# xgb.train hands back the booster from the last round, which (in XGBoost 1.x)
# includes the rounds trained after the best one while waiting for early
# stopping. Restrict prediction to the rounds up to and including the best.
model.predict(xgb.DMatrix(test), iteration_range=(0, model.best_iteration + 1))

In [None]:
# Display the label rows selected by the held-out split's index.
labels.loc[test.index]

In [22]:
# Look up the gain-based importance score of a single feature.
gain_scores = model.get_score(importance_type='gain')
gain_scores['FE3_max']

12.54299545288086

In [None]:
# Scratch list of the FE1-FE3 mean/min/max column names; it is only displayed,
# not assigned (presumably the engineered features in the training table -- TODO confirm).
['FE1_mean', 'FE1_min', 'FE1_max', 'FE2_mean', 'FE2_min', 'FE2_max',
       'FE3_mean', 'FE3_min', 'FE3_max']