In [26]:
import os
import zipfile
import time
import pickle
import gc

import pandas as pd
import numpy as np
from tqdm import tqdm

from utils import load_pickle, dump_pickle, get_feature_value, feature_spearmanr, feature_target_spearmanr, addCrossFeature, calibration
from utils import raw_data_path, feature_data_path, cache_pkl_path, analyse

In [27]:
# Name of the label column that the later training cell passes as the fit target.
target = 'is_trade'

# Pickle locations, built by plain string concatenation onto feature_data_path.
all_data_path = feature_data_path + 'all_data_all_features.pkl'
features_path = feature_data_path + 'features_0418_fewer.pkl'

# Load the full data table and the collection of feature names that is later
# used to select the model input columns.
all_data = load_pickle(all_data_path)
features = load_pickle(features_path)

# Last expression of the cell: show how many features were loaded.
len(features)

230

In [28]:
from sklearn.metrics import log_loss
import xgboost as xgb

# Split by the `day` column: fit on days 19-23, hold out day 24.
train_data = all_data[(all_data.day >= 19) & (all_data.day <= 23)]
test_data = all_data[all_data.day == 24]

# Slice the feature matrices and labels once and reuse them for fitting and
# scoring instead of re-selecting the feature columns on every call.
X_train, y_train = train_data[features], train_data[target]
X_test, y_test = test_data[features], test_data[target]

xgb_clf = xgb.XGBClassifier(objective='binary:logistic',

                             n_estimators=1000,
                             learning_rate=0.05,

                             max_depth=3,
                             min_child_weight=1e-3,
                             # `gamma` (alias `min_split_loss`) is XGBoost's minimum
                             # split-loss setting. The LightGBM-style `min_split_gain`
                             # keyword that used to be passed here is not an XGBoost
                             # parameter, so it has been removed.
                             gamma=0,

                             colsample_bytree=0.8,
                             subsample=0.7,

                             reg_lambda=10,

                             # NOTE(review): per the XGBoost parameter docs, max_bin
                             # applies to histogram-based tree methods; no tree_method
                             # is set here, so whether this has any effect depends on
                             # the installed version's default -- confirm.
                             max_bin=63,

                             n_jobs=6,
                             silent=False,
                             )


# Train with early stopping monitored on the day-24 hold-out.
# NOTE(review): the same day-24 rows choose the stopping round and produce
# loss_test below, so loss_test is an optimistic generalization estimate.
xgb_clf.fit(X_train, y_train,
            eval_set=[(X_test, y_test)],
            early_stopping_rounds=50,
            eval_metric='logloss',
            verbose=5,
            )

# NOTE(review): whether predict_proba restricts itself to the best-iteration
# trees after early stopping depends on the XGBoost version -- confirm.
loss_train = log_loss(y_train, xgb_clf.predict_proba(X_train))
loss_test = log_loss(y_test, xgb_clf.predict_proba(X_test))

# Last expression of the cell: display (train log-loss, hold-out log-loss).
loss_train, loss_test

[0]	validation_0-logloss:0.647791
Will train until validation_0-logloss hasn't improved in 50 rounds.
[5]	validation_0-logloss:0.475859
[10]	validation_0-logloss:0.362696
[15]	validation_0-logloss:0.284607
[20]	validation_0-logloss:0.229173
[25]	validation_0-logloss:0.189121
[30]	validation_0-logloss:0.159827
[35]	validation_0-logloss:0.138373
[40]	validation_0-logloss:0.122551
[45]	validation_0-logloss:0.1109
[50]	validation_0-logloss:0.102389
[55]	validation_0-logloss:0.096144
[60]	validation_0-logloss:0.091619
[65]	validation_0-logloss:0.088332
[70]	validation_0-logloss:0.08596
[75]	validation_0-logloss:0.08424
[80]	validation_0-logloss:0.083001
[85]	validation_0-logloss:0.082107
[90]	validation_0-logloss:0.081426
[95]	validation_0-logloss:0.08091
[100]	validation_0-logloss:0.080546
[105]	validation_0-logloss:0.080252
[110]	validation_0-logloss:0.079998
[115]	validation_0-logloss:0.079791
[120]	validation_0-logloss:0.07963
[125]	validation_0-logloss:0.079512
[130]	validation_0-loglo

(0.081850016589568336, 0.078226394381656503)