In [3]:
import os
import zipfile
import time
import pickle
import gc

import pandas as pd
import numpy as np
from tqdm import tqdm

from utils import load_pickle, dump_pickle, get_feature_value, feature_spearmanr, feature_target_spearmanr, addCrossFeature, calibration
from utils import raw_data_path, feature_data_path, cache_pkl_path, analyse

from sklearn.metrics import log_loss
import lightgbm as lgb
import xgboost as xgb

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit

In [4]:
# Location of the pickled dataset; feature_data_path is imported from the project's utils module.
all_data_path = feature_data_path + 'all_data_all_features.pkl'
# load_pickle is a utils helper — presumably it unpickles the file at the given path (verify in utils).
# The result is indexed below by column name and filtered on a `day` column.
all_data = load_pickle(all_data_path)

# Column of all_data passed as the label (y) to the model further down.
target = 'is_trade'

# Collection of column names selected from all_data as model inputs.
features = load_pickle(feature_data_path + 'features_0418_fewer.pkl')

# Last expression of the cell: displays how many feature names were loaded.
len(features)

230

In [5]:
def CustomCV(data, k=1):
    """Yield ``k`` sliding-window (train, test) splits based on ``data.day``.

    The training window spans ``6 - k`` consecutive days and starts at day
    ``19 + i`` for fold ``i``; the test fold is the single day right after the
    training window. With ``k=1`` this is train on days 19-23, test on day 24.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame exposing a ``day`` column.
    k : int, default 1
        Number of folds to generate.

    Yields
    ------
    (train_idx, test_idx) : tuple of numpy.ndarray
        *Positional* row indices into ``data``. scikit-learn treats the arrays
        produced by a ``cv`` iterable as positions, so index labels
        (``data[...].index``) are only correct when the frame happens to carry
        a default 0..n-1 index; positions are correct for any index.
    """
    train_days = 6 - k
    # Work on a plain array so the masks do not depend on the frame's index.
    days = np.asarray(data.day)
    for i in range(k):
        train_start_day = 19 + i
        train_end_day = train_start_day + train_days - 1
        test_day = train_end_day + 1
        train_mask = (days >= train_start_day) & (days <= train_end_day)
        test_mask = days == test_day
        yield np.flatnonzero(train_mask), np.flatnonzero(test_mask)
    

In [6]:
def CustomCV_19_22_23(data, k=1):
    """Yield one split: train on days 19-22, test on day 23.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame exposing a ``day`` column.
    k : int, default 1
        Unused; kept so the signature matches ``CustomCV``.

    Yields
    ------
    (train_idx, test_idx) : tuple of numpy.ndarray
        Positional row indices into ``data``. Positions (not index labels) are
        what scikit-learn expects from a ``cv`` iterable, so the split stays
        correct even when ``data`` does not have a default 0..n-1 index.
    """
    days = np.asarray(data.day)
    train_pos = np.flatnonzero((days >= 19) & (days <= 22))
    test_pos = np.flatnonzero(days == 23)
    yield train_pos, test_pos

In [7]:
def CustomCV_20_23_24(data, k=1):
    """Yield one split: train on days 20-23, test on day 24.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame exposing a ``day`` column.
    k : int, default 1
        Unused; kept so the signature matches ``CustomCV``.

    Yields
    ------
    (train_idx, test_idx) : tuple of numpy.ndarray
        Positional row indices into ``data``. Positions (not index labels) are
        what scikit-learn expects from a ``cv`` iterable, so the split stays
        correct even when ``data`` does not have a default 0..n-1 index.
    """
    days = np.asarray(data.day)
    train_pos = np.flatnonzero((days >= 20) & (days <= 23))
    test_pos = np.flatnonzero(days == 24)
    yield train_pos, test_pos

In [8]:
def CustomCV_19_23_24(data, k=1):
    """Yield one split: train on days 19-23, test on day 24.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame exposing a ``day`` column.
    k : int, default 1
        Unused; kept so the signature matches ``CustomCV``.

    Yields
    ------
    (train_idx, test_idx) : tuple of numpy.ndarray
        Positional row indices into ``data``. Positions (not index labels) are
        what scikit-learn expects from a ``cv`` iterable, so the split stays
        correct even when ``data`` does not have a default 0..n-1 index.
    """
    days = np.asarray(data.day)
    train_pos = np.flatnonzero((days >= 19) & (days <= 23))
    test_pos = np.flatnonzero(days == 24)
    yield train_pos, test_pos

In [13]:
if __name__ == '__main__':

    # Day-24 rows, passed to XGBoost as the early-stopping evaluation set.
    # NOTE(review): CustomCV(all_data, k=1) trains on days 19-23 and scores on
    # day 24, i.e. the same rows used here for early stopping. The reported CV
    # score is therefore likely optimistic — consider stopping on a different day.
    eval_data = all_data[all_data.day == 24]
    eval_set = [(eval_data[features], eval_data[target])]

    xgb_clf = xgb.XGBClassifier(objective='binary:logistic', n_jobs=6, silent=False)

    # Parameter combinations to search (only min_child_weight varies here).
    # NOTE(review): the output saved with this cell lists 4 candidates including
    # subsample=0.8, which this grid does not produce — re-run to refresh it.
    xgb_param_grad = {'n_estimators': (2000, ),
                      'learning_rate': (0.1, ),

                      'max_depth': (3, ),

                      'min_child_weight': (1e-3, 0.1),
                      'gamma': (0, ),

                      'colsample_bytree': (0.8, ),
                      'subsample': (0.9, ),
#                       'subsample_freq': (1, ),

#                       'reg_alpha': (0,),
                      'reg_lambda': (10,),

#                       'max_bin': (63, ),
                      }

    # refit=False: only the search results are kept, no final model is trained.
    clf = GridSearchCV(xgb_clf, param_grid=xgb_param_grad, scoring='neg_log_loss',
                       cv=CustomCV(all_data, k=1), n_jobs=-1, verbose=1, refit=False, return_train_score=True)

    # Extra keyword arguments are forwarded to XGBClassifier.fit for every candidate.
    clf.fit(all_data[features], all_data[target],
            eval_set=eval_set,
            early_stopping_rounds=50,
            eval_metric='logloss',
            verbose=20,
           )


    print('=====')
    print("Best parameters set found on development set:")
    print(clf.best_params_)

    # The score is the negated log loss (scoring='neg_log_loss'), so closer to 0 is better.
    print('=====')
    print("Best score found on development set:")
    print(clf.best_score_)


Fitting 1 folds for each of 4 candidates, totalling 4 fits
[0]	validation_0-logloss:0.604784
Will train until validation_0-logloss hasn't improved in 50 rounds.
[0]	validation_0-logloss:0.604784
Will train until validation_0-logloss hasn't improved in 50 rounds.
[0]	validation_0-logloss:0.60472
Will train until validation_0-logloss hasn't improved in 50 rounds.
[0]	validation_0-logloss:0.60472
Will train until validation_0-logloss hasn't improved in 50 rounds.
[20]	validation_0-logloss:0.118874
[20]	validation_0-logloss:0.118874
[20]	validation_0-logloss:0.118848
[20]	validation_0-logloss:0.118848
[40]	validation_0-logloss:0.08264
[40]	validation_0-logloss:0.08264
[40]	validation_0-logloss:0.08264
[40]	validation_0-logloss:0.08264
[60]	validation_0-logloss:0.079611
[60]	validation_0-logloss:0.079557
[60]	validation_0-logloss:0.079611
[60]	validation_0-logloss:0.079557
[80]	validation_0-logloss:0.078969
[80]	validation_0-logloss:0.078901
[80]	validation_0-logloss:0.078969
[80]	validatio

[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed: 11.6min finished


=====
Best parameters set found on development set:
{'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 0.001, 'n_estimators': 2000, 'reg_lambda': 10, 'subsample': 0.9}
=====
Best parameters set found on development set:
-0.0782573028031


In [14]:
# Tabulate the per-candidate grid-search results (one row per parameter set).
pd.DataFrame(clf.cv_results_)

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_colsample_bytree,param_gamma,param_learning_rate,param_max_depth,param_min_child_weight,param_n_estimators,param_reg_lambda,param_subsample,params,rank_test_score,split0_test_score,split0_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,645.740469,1.778122,-0.078282,-0.08162,0.8,0,0.1,3,0.001,2000,10,0.8,"{'colsample_bytree': 0.8, 'gamma': 0, 'learnin...",3,-0.078282,-0.08162,0.0,0.0,0.0,0.0
1,647.381613,0.713554,-0.078257,-0.081636,0.8,0,0.1,3,0.001,2000,10,0.9,"{'colsample_bytree': 0.8, 'gamma': 0, 'learnin...",1,-0.078257,-0.081636,0.0,0.0,0.0,0.0
2,618.056674,0.442253,-0.078282,-0.08162,0.8,0,0.1,3,0.1,2000,10,0.8,"{'colsample_bytree': 0.8, 'gamma': 0, 'learnin...",4,-0.078282,-0.08162,0.0,0.0,0.0,0.0
3,600.628813,0.997054,-0.078257,-0.081636,0.8,0,0.1,3,0.1,2000,10,0.9,"{'colsample_bytree': 0.8, 'gamma': 0, 'learnin...",1,-0.078257,-0.081636,0.0,0.0,0.0,0.0
