In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pylab as plt
from scipy.stats import pearsonr

from hyperopt import fmin, tpe, hp, partial


def find_time_ind(df):
    day_ind = [0]
    day_temp = df["TradingDay"][0]
    for i in range(1, len(df)):
        if df["TradingDay"][i] != day_temp:
            day_ind.append(i)
            day_temp = df["TradingDay"][i]
    return day_ind

In [2]:
# input cell
train = pd.read_csv('0805rolling5.csv')
# train = pd.read_csv('08053before_rolling.csv')
# train = pd.read_csv('08052acou_and_factors.csv')
x_column = ["factor"+str(i)+'_'+str(j) for i in range(0,17) for j in range(5)]
# x_column = ["factor"+str(i) for i in range(0,17)]
# for i in range(12):
#     for j in range(5):
#         x_column.append("10M" + str(i) +'_'+str(j))
# for i in range(12):
#     x_column.append("10M" + str(i))

train = train.sort_values(by = ["TradingDay", "SecuCode"], ascending = [True, True])
train.reset_index(drop=True, inplace=True)
X = train[x_column]
target = ['NextReturnCate']
y = train[target]
y += 1
yp = train["NextReturn"]
time_ind = find_time_ind(train)

In [3]:
# initializing
train_days = 775
delta_days = -776
X_train = X[time_ind[0]:time_ind[train_days]]
X_test = X[time_ind[train_days]:time_ind[train_days+delta_days]]
y_train = y[time_ind[0]:time_ind[train_days]]
y_test = y[time_ind[train_days]:time_ind[train_days+delta_days]]
yp_train = yp[time_ind[0]:time_ind[train_days]]
yp_test = yp[time_ind[train_days]:time_ind[train_days+delta_days]]
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test)
ddev = xgb.DMatrix(data=X_test, label=y_test)

evallist = [(ddev, 'eval'), (dtrain, 'train')]


In [4]:
# 自定义hyperopt的参数空间
space = {"max_depth": hp.randint("max_depth", 25),
         "n_estimators": hp.randint("n_estimators", 30),
         'learning_rate': hp.uniform('learning_rate', 1e-3, 5e-1),
         "subsample": hp.randint("subsample", 5),
         "min_child_weight": hp.randint("min_child_weight", 6),
         "alpha": hp.uniform("alpha", 0, 1),
         "lambda": hp.uniform("lambda", 0, 1)
         }

def argsDict_tranform(argsDict, isPrint=False):
    argsDict["max_depth"] = argsDict["max_depth"] + 5    # from 5 to 30
    argsDict['n_estimators'] = argsDict['n_estimators'] + 10  # from 10 to 40
    argsDict["learning_rate"] = argsDict["learning_rate"] * 0.02 + 0.001   # from 0.001 to 0.01
    argsDict["subsample"] = argsDict["subsample"] * 0.1 + 0.5  # from 0.5 to 1
    argsDict["min_child_weight"] = argsDict["min_child_weight"] + 1 # from 1 to 6
    argsDict["alpha"] = argsDict["alpha"] * 20
    argsDict["lambda"] = argsDict["lambda"] * 50
    if isPrint:
        print(argsDict)
    else:
        pass

    return argsDict

In [5]:
from sklearn.metrics import mean_squared_error, zero_one_loss

def xgboost_factory(argsDict):
    argsDict = argsDict_tranform(argsDict)
    
    params = {'nthread': -1,  # 进程数
              'max_depth': argsDict['max_depth'],  # 最大深度
              'n_estimators': argsDict['n_estimators'],  # 树的数量
              'eta': argsDict['learning_rate'],  # 学习率
              'subsample': argsDict['subsample'],  # 采样数
              'min_child_weight': argsDict['min_child_weight'],  # 终点节点最小样本占比的和
              'objective': 'reg:squarederror',
              'silent': 0,  # 是否显示
              'gamma': 0,  # 是否后剪枝
              'colsample_bytree': 0.7,  # 样本列采样
              'alpha': argsDict['alpha'],  # L1 正则化
              'lambda': argsDict['lambda'],  # L2 正则化
              'scale_pos_weight': 0,  # 取值>0时,在数据不平衡时有助于收敛
              'seed': 100,  # 随机种子
              'tree_method':'exact'
              }

    xrf = xgb.train(params, dtrain, params['n_estimators'], evallist,early_stopping_rounds=100)

    return get_tranformer_score(xrf)


def get_tranformer_score(tranformer):
    
    xrf = tranformer
    pred = xrf.predict(ddev, ntree_limit=xrf.best_ntree_limit)
  
    return -pearsonr(np.array(yp_test), pred)[0]

In [6]:
# 开始使用hyperopt进行自动调参
algo = partial(tpe.suggest, n_startup_jobs=1)
best = fmin(xgboost_factory, space, algo=algo, max_evals=100, pass_expr_memo_ctrl=None)
store = best
print("the best parameter before transformation is:" + best)
print("the best parameter after transformation is:" + argsDict_tranform(best))


[0]	eval-rmse:0.956051	train-rmse:0.929969           

Multiple eval metrics have been passed: 'train-rmse' will be used for early stopping.


Will train until train-rmse hasn't improved in 100 rounds.

[1]	eval-rmse:0.95452	train-rmse:0.927153            

[2]	eval-rmse:0.952931	train-rmse:0.924781           

[3]	eval-rmse:0.951267	train-rmse:0.922233           

[4]	eval-rmse:0.949732	train-rmse:0.919717           

[5]	eval-rmse:0.94814	train-rmse:0.917359            

[6]	eval-rmse:0.946586	train-rmse:0.914954           

[7]	eval-rmse:0.945101	train-rmse:0.91262            

[8]	eval-rmse:0.943708	train-rmse:0.910312           

[9]	eval-rmse:0.942272	train-rmse:0.908002           

[10]	eval-rmse:0.940817	train-rmse:0.905658          

[11]	eval-rmse:0.939378	train-rmse:0.903297          

[12]	eval-rmse:0.93797	train-rmse:0.900979           

[13]	eval-rmse:0.936629	train-rmse:0.898672          

[14]	eval-rmse:0.935321	train-rmse:0.896432          

[15]	eval-rmse:0.933924	tra

Will train until train-rmse hasn't improved in 100 rounds.                          

[1]	eval-rmse:0.957483	train-rmse:0.931853                                          

[2]	eval-rmse:0.957267	train-rmse:0.931654                                          

[3]	eval-rmse:0.956645	train-rmse:0.931152                                          

[4]	eval-rmse:0.95617	train-rmse:0.930648                                           

[5]	eval-rmse:0.955806	train-rmse:0.930378                                          

[6]	eval-rmse:0.955542	train-rmse:0.93007                                           

[7]	eval-rmse:0.955347	train-rmse:0.929669                                          

[8]	eval-rmse:0.955146	train-rmse:0.929208                                          

[9]	eval-rmse:0.954879	train-rmse:0.928775                                          

[10]	eval-rmse:0.954493	train-rmse:0.928312                                         

[11]	eval-rmse:0.954168	train-rmse:0.927897           

Multiple eval metrics have been passed: 'train-rmse' will be used for early stopping. 


Will train until train-rmse hasn't improved in 100 rounds.                            

[1]	eval-rmse:0.955954	train-rmse:0.930598                                            

[2]	eval-rmse:0.955272	train-rmse:0.930008                                            

[3]	eval-rmse:0.95475	train-rmse:0.928809                                             

[4]	eval-rmse:0.953763	train-rmse:0.927729                                            

[5]	eval-rmse:0.953048	train-rmse:0.926891                                            

[6]	eval-rmse:0.952365	train-rmse:0.92637                                             

[7]	eval-rmse:0.951538	train-rmse:0.925738                                            

[8]	eval-rmse:0.950771	train-rmse:0.92491                                             

[9]	eval-rmse:0.950002	train-rmse:0.923948                                            

[10]	eval-rmse:0.94921	train-rm

[16]	eval-rmse:0.932749	train-rmse:0.899434                                           

[17]	eval-rmse:0.93146	train-rmse:0.897635                                            

[0]	eval-rmse:0.955211	train-rmse:0.929548                                            

Multiple eval metrics have been passed: 'train-rmse' will be used for early stopping. 


Will train until train-rmse hasn't improved in 100 rounds.                            

[1]	eval-rmse:0.952695	train-rmse:0.926349                                            

[2]	eval-rmse:0.950164	train-rmse:0.923641                                            

[3]	eval-rmse:0.947767	train-rmse:0.920658                                            

[4]	eval-rmse:0.945418	train-rmse:0.917964                                            

[5]	eval-rmse:0.943061	train-rmse:0.915242                                            

[6]	eval-rmse:0.940778	train-rmse:0.91274                                             

[7]	eval-rmse:0.938502	train-rm

[23]	eval-rmse:0.937152	train-rmse:0.906331                                           

[24]	eval-rmse:0.936302	train-rmse:0.905341                                           

[25]	eval-rmse:0.935559	train-rmse:0.904344                                           

[26]	eval-rmse:0.93479	train-rmse:0.903365                                            

[27]	eval-rmse:0.934051	train-rmse:0.902382                                           

[28]	eval-rmse:0.933275	train-rmse:0.901364                                           

[29]	eval-rmse:0.932518	train-rmse:0.900363                                           

[30]	eval-rmse:0.931791	train-rmse:0.899404                                           

[31]	eval-rmse:0.931018	train-rmse:0.898436                                           

[32]	eval-rmse:0.930245	train-rmse:0.897485                                           

[33]	eval-rmse:0.929496	train-rmse:0.896532                                           

[34]	eval-rmse:0.928758	train-rm

Multiple eval metrics have been passed: 'train-rmse' will be used for early stopping. 


Will train until train-rmse hasn't improved in 100 rounds.                            

[1]	eval-rmse:0.957589	train-rmse:0.931919                                            

[2]	eval-rmse:0.957403	train-rmse:0.931728                                            

[3]	eval-rmse:0.957062	train-rmse:0.931445                                            

[4]	eval-rmse:0.956585	train-rmse:0.930956                                            

[5]	eval-rmse:0.956196	train-rmse:0.930591                                            

[6]	eval-rmse:0.955926	train-rmse:0.93034                                             

[7]	eval-rmse:0.955689	train-rmse:0.93007                                             

[8]	eval-rmse:0.955509	train-rmse:0.929741                                            

[9]	eval-rmse:0.955324	train-rmse:0.92936                                             

[10]	eval-rmse:0.955144	train-r

[13]	eval-rmse:0.943227	train-rmse:0.91599                                            

[14]	eval-rmse:0.942268	train-rmse:0.914912                                           

[15]	eval-rmse:0.941315	train-rmse:0.913877                                           

[16]	eval-rmse:0.940424	train-rmse:0.912851                                           

[17]	eval-rmse:0.939388	train-rmse:0.911823                                           

[18]	eval-rmse:0.938414	train-rmse:0.910805                                           

[19]	eval-rmse:0.937504	train-rmse:0.909779                                           

[20]	eval-rmse:0.93656	train-rmse:0.90871                                             

[21]	eval-rmse:0.935621	train-rmse:0.90763                                            

[22]	eval-rmse:0.934712	train-rmse:0.906626                                           

[23]	eval-rmse:0.933808	train-rmse:0.905651                                           

[24]	eval-rmse:0.932894	train-rm

[14]	eval-rmse:0.938451	train-rmse:0.903218                                           

[0]	eval-rmse:0.955704	train-rmse:0.930188                                            

Multiple eval metrics have been passed: 'train-rmse' will be used for early stopping. 


Will train until train-rmse hasn't improved in 100 rounds.                            

[1]	eval-rmse:0.95402	train-rmse:0.927635                                             

[2]	eval-rmse:0.952086	train-rmse:0.92548                                             

[3]	eval-rmse:0.950145	train-rmse:0.923193                                            

[4]	eval-rmse:0.948396	train-rmse:0.920907                                            

[5]	eval-rmse:0.946525	train-rmse:0.918815                                            

[6]	eval-rmse:0.944746	train-rmse:0.916686                                            

[7]	eval-rmse:0.942959	train-rmse:0.914659                                            

[8]	eval-rmse:0.941248	train-rm

[11]	eval-rmse:0.94105	train-rmse:0.91393                                             

[12]	eval-rmse:0.939848	train-rmse:0.912594                                           

[13]	eval-rmse:0.938401	train-rmse:0.911153                                           

[14]	eval-rmse:0.937197	train-rmse:0.90985                                            

[15]	eval-rmse:0.93579	train-rmse:0.908388                                            

[16]	eval-rmse:0.934681	train-rmse:0.907048                                           

[17]	eval-rmse:0.93356	train-rmse:0.905783                                            

[18]	eval-rmse:0.932424	train-rmse:0.904445                                           

[19]	eval-rmse:0.931111	train-rmse:0.902994                                           

[20]	eval-rmse:0.92996	train-rmse:0.901725                                            

[21]	eval-rmse:0.92873	train-rmse:0.900341                                            

[22]	eval-rmse:0.927546	train-rm

Multiple eval metrics have been passed: 'train-rmse' will be used for early stopping. 


Will train until train-rmse hasn't improved in 100 rounds.                            

[1]	eval-rmse:0.952903	train-rmse:0.92666                                             

[2]	eval-rmse:0.950487	train-rmse:0.924301                                            

[3]	eval-rmse:0.948196	train-rmse:0.921447                                            

[4]	eval-rmse:0.946096	train-rmse:0.919157                                            

[5]	eval-rmse:0.943683	train-rmse:0.916512                                            

[6]	eval-rmse:0.94149	train-rmse:0.914149                                             

[7]	eval-rmse:0.93929	train-rmse:0.911875                                             

[8]	eval-rmse:0.937241	train-rmse:0.90955                                             

[9]	eval-rmse:0.935118	train-rmse:0.907154                                            

[10]	eval-rmse:0.933182	train-r

[3]	eval-rmse:0.950395	train-rmse:0.923425                                            

[4]	eval-rmse:0.948663	train-rmse:0.921163                                            

[5]	eval-rmse:0.946843	train-rmse:0.919082                                            

[6]	eval-rmse:0.945055	train-rmse:0.916989                                            

[7]	eval-rmse:0.943355	train-rmse:0.915049                                            

[8]	eval-rmse:0.94168	train-rmse:0.913065                                             

[9]	eval-rmse:0.940076	train-rmse:0.911097                                            

[10]	eval-rmse:0.938344	train-rmse:0.909095                                           

[11]	eval-rmse:0.936782	train-rmse:0.907079                                           

[12]	eval-rmse:0.93523	train-rmse:0.905117                                            

[13]	eval-rmse:0.933774	train-rmse:0.903187                                           

[14]	eval-rmse:0.932323	train-rm

[9]	eval-rmse:0.938652	train-rmse:0.911692                                         

[10]	eval-rmse:0.936813	train-rmse:0.90999                                         

[11]	eval-rmse:0.93513	train-rmse:0.908037                                         

[12]	eval-rmse:0.933452	train-rmse:0.906203                                        

[13]	eval-rmse:0.931534	train-rmse:0.9041                                          

[14]	eval-rmse:0.930063	train-rmse:0.902557                                        

[15]	eval-rmse:0.928381	train-rmse:0.900694                                        

[16]	eval-rmse:0.926722	train-rmse:0.898789                                        

[17]	eval-rmse:0.925185	train-rmse:0.897258                                        

[18]	eval-rmse:0.923764	train-rmse:0.895752                                        

[0]	eval-rmse:0.955809	train-rmse:0.930212                                         

Multiple eval metrics have been passed: 'train-rmse' will be used

Will train until train-rmse hasn't improved in 100 rounds.                          

[1]	eval-rmse:0.953459	train-rmse:0.927098                                          

[2]	eval-rmse:0.951428	train-rmse:0.924931                                          

[3]	eval-rmse:0.949355	train-rmse:0.922336                                          

[4]	eval-rmse:0.947421	train-rmse:0.920049                                          

[5]	eval-rmse:0.945345	train-rmse:0.917752                                          

[6]	eval-rmse:0.943327	train-rmse:0.915474                                          

[7]	eval-rmse:0.941421	train-rmse:0.913384                                          

[8]	eval-rmse:0.939536	train-rmse:0.911176                                          

[9]	eval-rmse:0.937734	train-rmse:0.909004                                          

[10]	eval-rmse:0.935951	train-rmse:0.90681                                          

[11]	eval-rmse:0.934211	train-rmse:0.904589           

TypeError: must be str, not dict

In [7]:
best

{'alpha': 0.6145289893746854,
 'lambda': 0.848151241595682,
 'learning_rate': 0.48805216458277617,
 'max_depth': 0,
 'min_child_weight': 5,
 'n_estimators': 3,
 'subsample': 2}

In [8]:
argsDict_tranform(best)

{'alpha': 12.290579787493707,
 'lambda': 42.4075620797841,
 'learning_rate': 0.010761043291655524,
 'max_depth': 5,
 'min_child_weight': 6,
 'n_estimators': 13,
 'subsample': 0.7}

In [7]:
best

{'alpha': 182.91075892028954,
 'lambda': 305.363687026092,
 'learning_rate': 0.0011815142243762745,
 'max_depth': 17,
 'min_child_weight': 4,
 'n_estimators': 30,
 'subsample': 0.56}

In [49]:
 xgboost_factory(argsDict_tranform(best))

[0]	eval-rmse:0.887142	train-rmse:0.91346
Multiple eval metrics have been passed: 'train-rmse' will be used for early stopping.

Will train until train-rmse hasn't improved in 100 rounds.
[1]	eval-rmse:0.876897	train-rmse:0.894919
[2]	eval-rmse:0.868053	train-rmse:0.878668
[3]	eval-rmse:0.859192	train-rmse:0.86251
[4]	eval-rmse:0.851483	train-rmse:0.848265
[5]	eval-rmse:0.844992	train-rmse:0.833192
[6]	eval-rmse:0.838924	train-rmse:0.820392
[7]	eval-rmse:0.833219	train-rmse:0.807377
[8]	eval-rmse:0.828818	train-rmse:0.794724
[9]	eval-rmse:0.824214	train-rmse:0.783454
[10]	eval-rmse:0.820525	train-rmse:0.772966
[11]	eval-rmse:0.817106	train-rmse:0.762479
[12]	eval-rmse:0.814711	train-rmse:0.752645
[13]	eval-rmse:0.812286	train-rmse:0.743479
[14]	eval-rmse:0.810634	train-rmse:0.734102
[15]	eval-rmse:0.808537	train-rmse:0.72533
[16]	eval-rmse:0.807054	train-rmse:0.716354
[17]	eval-rmse:0.805628	train-rmse:0.707871
[18]	eval-rmse:0.804132	train-rmse:0.699963
[19]	eval-rmse:0.803122	train-r

-0.06243747878596423

In [31]:
bst=xgb.train(params,dtrain=dtrain,num_boost_round=1,early_stopping_rounds=None,evals=[(dtrain,'train'),(ddev,'test')],verbose_eval=3)


[0]	train-rmse:0.926187	test-rmse:0.896756


In [36]:
xrf = xgb.train(params, dtrain,num_boost_round=1,early_stopping_rounds=None)

In [None]:
# first time training

bst=xgb.train(params=xgb_param,dtrain=dtrain,num_boost_round=1,early_stopping_rounds=None,evals=[(dtrain,'train'),(ddev,'test')],verbose_eval=3)
pred = bst.predict(dtest)
predtrain = bst.predict(dtrain)
print(pearsonr(np.array(yp_train), predtrain))
print(pearsonr(np.array(yp_test), pred))


# first time training - continued
for kk in range(29):
    bst=xgb.train(params=xgb_param,dtrain=dtrain,num_boost_round=1,early_stopping_rounds=None,evals=[(dtrain,'train'),(ddev,'test')],verbose_eval=3, xgb_model = bst)
    pred = bst.predict(dtest)
    predtrain = bst.predict(dtrain)
print(pearsonr(np.array(yp_train), predtrain))
# print(pearsonr(np.array(y_train.T)[0], predtrain))
print(pearsonr(np.array(yp_test), pred))
# print(pearsonr(np.array(y_test.T)[0], pred))

# concatenate predictions

g =  train[["TradingDay", "SecuCode"]][time_ind[train_days]:time_ind[train_days+delta_days]]
g["NextCatPred"] = pred
k = g.pivot_table(columns='SecuCode', index='TradingDay', values='NextCatPred').reset_index().sort_values("TradingDay")


In [None]:
# second traininng and so on

# change dataset
drift = 1
while drift * delta_days + train_days + delta_days < len(time_ind):
    X_train = X[time_ind[drift * delta_days]:time_ind[train_days + drift * delta_days]]
    X_test = X[time_ind[train_days + drift * delta_days]:time_ind[train_days+delta_days + drift * delta_days]]
    y_train = y[time_ind[drift * delta_days]:time_ind[train_days + drift * delta_days]]
    y_test = y[time_ind[train_days + drift * delta_days]:time_ind[train_days+delta_days + drift * delta_days]]
    yp_train = yp[time_ind[drift * delta_days]:time_ind[train_days + drift * delta_days]]
    yp_test = yp[time_ind[train_days + drift * delta_days]:time_ind[train_days+delta_days + drift * delta_days]]
    dtrain = xgb.DMatrix(data=X_train, label=y_train)
    dtest = xgb.DMatrix(data=X_test)
    ddev = xgb.DMatrix(data=X_test, label=y_test)
    bst=xgb.train(params=xgb_param,dtrain=dtrain,num_boost_round=1,early_stopping_rounds=None,evals=[(dtrain,'train'),(ddev,'test')],verbose_eval=3)

    for kk in range(29):
        bst=xgb.train(params=xgb_param,dtrain=dtrain,num_boost_round=1,early_stopping_rounds=None,evals=[(dtrain,'train'),(ddev,'test')],verbose_eval=3, xgb_model = bst)
#         bst=xgb.train(params=xgb_param,dtrain=dtrain,num_boost_round=1,early_stopping_rounds=None,evals=[(dtrain,'train'),(ddev,'test')],verbose_eval=3)
        pred = bst.predict(dtest)
        predtrain = bst.predict(dtrain)
    print(pearsonr(np.array(yp_train), predtrain))
    # print(pearsonr(np.array(y_train.T)[0], predtrain))
    print(pearsonr(np.array(yp_test), pred))
    # print(pearsonr(np.array(y_test.T)[0], pred))
    gt =  train[["TradingDay", "SecuCode"]][time_ind[train_days + drift * delta_days]:time_ind[train_days+delta_days + drift * delta_days]]
    gt["NextCatPred"] = pred
    kt = gt.pivot_table(columns='SecuCode', index='TradingDay', values='NextCatPred').reset_index().sort_values("TradingDay")
    k = pd.concat([k, kt],ignore_index=True)
    drift += 1

# the last dataset
if drift * delta_days + train_days + delta_days > len(time_ind):
    X_train = X[time_ind[drift * delta_days]:time_ind[train_days + drift * delta_days]]
    X_test = X[time_ind[train_days + drift * delta_days]:]
    y_train = y[time_ind[drift * delta_days]:time_ind[train_days + drift * delta_days]]
    y_test = y[time_ind[train_days + drift * delta_days]:]
    yp_train = yp[time_ind[drift * delta_days]:time_ind[train_days + drift * delta_days]]
    yp_test = yp[time_ind[train_days + drift * delta_days]:]
    dtrain = xgb.DMatrix(data=X_train, label=y_train)
    dtest = xgb.DMatrix(data=X_test)
    ddev = xgb.DMatrix(data=X_test, label=y_test)
    bst=xgb.train(params=xgb_param,dtrain=dtrain,num_boost_round=1,early_stopping_rounds=None,evals=[(dtrain,'train'),(ddev,'test')],verbose_eval=3)

    for kk in range(29):
        bst=xgb.train(params=xgb_param,dtrain=dtrain,num_boost_round=1,early_stopping_rounds=None,evals=[(dtrain,'train'),(ddev,'test')],verbose_eval=3, xgb_model = bst)
        pred = bst.predict(dtest)
        predtrain = bst.predict(dtrain)
    print(pearsonr(np.array(yp_train), predtrain))
    # print(pearsonr(np.array(y_train.T)[0], predtrain))
    print(pearsonr(np.array(yp_test), pred))
    # print(pearsonr(np.array(y_test.T)[0], pred))
    gt =  train[["TradingDay", "SecuCode"]][time_ind[train_days + drift * delta_days]:]
    gt["NextCatPred"] = pred
    kt = gt.pivot_table(columns='SecuCode', index='TradingDay', values='NextCatPred').reset_index().sort_values("TradingDay")
    k = pd.concat([k, kt],ignore_index=True)

k.set_index("TradingDay").to_csv("0805back_input_30_30.csv")