In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import os
from tqdm import tqdm
import pyTSL as pt
import warnings
import datetime as dt
warnings.filterwarnings('ignore')
from Config import factor_config as fc
from Config import model_config as mc

# Table of the daily main ("head") IC futures contract, loaded once at import time.
# read_file() below looks rows up by the 'trade_date' column and reads the contract
# code from the 'IC' column.
head_future = pd.read_excel(os.path.join(fc.head_path, 'IC_head_future.xlsx')) # daily main contract

def read_file(date):
    '''
    Build the tick-data file name of the main contract to load for ``date``.

    The contract code is read from the row that *precedes* ``date`` in the
    module-level ``head_future`` table (columns ``trade_date`` and ``IC``);
    the file name is ``<contract>_<YYYYMMDD>.tdf``.

    Parameters
    ----------
    date : datetime-like
        Trading day. Must provide ``strftime`` and be comparable with the
        values stored in ``head_future['trade_date']``.

    Returns
    -------
    str or None
        The file name, or ``None`` when no preceding row exists: either
        ``date`` is the first row of ``head_future`` or it is not listed
        there at all.
    '''
    # Locate the row by *position*: the lookup below is positional (iloc), so
    # index labels must not be used (they only coincide with positions for a
    # default RangeIndex).
    positions = np.flatnonzero((head_future['trade_date'] == date).to_numpy())

    # No match, or match on the very first row -> there is no previous row
    if len(positions) == 0 or positions[0] == 0:
        return None

    contract = head_future['IC'].iloc[positions[0] - 1]
    return f"{contract}_{date.strftime('%Y%m%d')}.tdf"


def get_data():
    '''
    Load the factor table and the per-minute VWAP returns and align them.

    Steps
    -----
    1. Read ``factor_data.xlsx`` (one ``minute`` column plus the factor
       columns) and keep the days with ``mc.start_date < day <= mc.end_date``.
    2. For every such day, load the tick file named by ``read_file`` from
       ``fc.kline_data``, aggregate ticks into a per-minute VWAP
       (``sum(price * vol) / sum(vol)``) and compute its
       ``mc.yield_periods``-period percentage change within that day.
    3. Attach the ``mc.model_factor`` columns by minute and drop rows whose
       return is missing.

    Returns
    -------
    pandas.DataFrame
        Columns ``minute``, ``return`` and one column per entry of
        ``mc.model_factor``.

    Raises
    ------
    ValueError
        If no day in the selected range yields any return data.
    '''
    # factor_data.xlsx is a cached export; per the export code that used to be
    # commented out here, it was built from the per-factor pickles
    # ``<fc.factor_save_path>/<factor>.pkl`` indexed by 'minute'.
    factor_df = pd.read_excel('factor_data.xlsx')

    # Convert first, so that the ``.dt`` accessor below never depends on how
    # read_excel happened to infer the column type.
    factor_df['minute'] = pd.to_datetime(factor_df['minute'])
    factor_days = factor_df['minute'].dt.normalize()

    file_dates = factor_days.drop_duplicates()
    file_dates = file_dates[(file_dates > mc.start_date) & (file_dates <= mc.end_date)]
    file_dates = file_dates.reset_index(drop=True)

    factor_df = factor_df[factor_days.isin(file_dates)].set_index('minute')

    # Build the per-day return series
    return_list = list()
    for d_ in tqdm(file_dates, desc='Getting data'):

        # Pick the tick file of the day's main contract
        f_path = read_file(d_)
        if f_path is None:
            # No previous row in head_future -> contract unknown; skip the day
            # instead of failing inside os.path.join.
            continue
        f_return = pd.read_pickle(os.path.join(fc.kline_data, f_path))[['date', 'price', 'vol']].copy()

        # Tick timestamp -> minute bucket ('min' is the non-deprecated alias of 'T')
        tick_time = pd.to_datetime(f_return['date'].map(pt.DoubleToDatetime))
        f_return['minute'] = tick_time.dt.floor('min')

        # Per-minute VWAP = sum(price * vol) / sum(vol); minutes without
        # volume give 0/0 = NaN and are dropped.
        f_return['amount'] = f_return['price'] * f_return['vol']
        per_minute = f_return.groupby('minute')[['amount', 'vol']].sum()
        vwap = (per_minute['amount'] / per_minute['vol']).dropna().to_frame('vwap')

        # Return over mc.yield_periods bars, computed within the day only.
        # NOTE(review): pct_change(k) is the trailing return (t-k -> t). If the
        # factors at minute t only use information up to t, a forward-looking
        # target would need an extra shift(-k) -- confirm against the factor
        # definitions.
        vwap['return'] = vwap['vwap'].pct_change(mc.yield_periods)

        return_list.append(vwap.reset_index()[['minute', 'return']])

    if not return_list:
        raise ValueError('No return data available for the selected date range')

    return_df = pd.concat(return_list).set_index('minute')

    # Attach the factors by minute. Selecting the columns by name keeps the
    # factor <-> column mapping correct even if the Excel column order differs
    # from mc.model_factor (a bare DataFrame assignment pairs them by position).
    factor_cols = list(mc.model_factor)
    return_df[factor_cols] = factor_df[factor_cols]

    return_df = return_df.dropna(subset=['return'])
    return return_df.reset_index('minute')

In [2]:
def _standardize_rows(frame):
    '''Standardise every row of ``frame`` across its columns with StandardScaler.'''
    # StandardScaler works column-wise, hence the transpose before and after.
    scaled = StandardScaler().fit_transform(frame.T)
    return pd.DataFrame(scaled, index=frame.columns, columns=frame.index).T


def train_xgb():
    '''
    Rolling-window XGBoost regression of ``return`` on ``mc.model_factor``.

    The data from ``get_data`` is restricted to bars at or after
    09:30 + ``mc.start_minute`` minutes. Window boundaries are the month-end
    dates between ``mc.start_date`` and ``mc.end_date``: window ``i`` trains on
    ``mc.train_month`` months starting at month-end ``i`` and tests on the
    following ``mc.test_month`` months. For every window the train/test MSE is
    recorded and the test predictions are collected.

    Outputs (written to ``mc.result_save_path``)
    --------------------------------------------
    * ``XGB_performance_<k>min.xlsx`` - train/test MSE per test window.
    * ``xgb_<k>min.pkl`` - test predictions indexed by minute, column
      ``pred_<k>min`` (``k = mc.yield_periods``).

    Raises
    ------
    ValueError
        If no window could be trained (date range too short or no data).
    '''
    # Load returns + factors
    data = get_data()

    # Intraday filter: keep bars from 09:30 + start_minute onwards. A fixed
    # anchor date is enough, only the time of day is used.
    start_time = (dt.datetime(2000, 1, 1, 9, 30) + dt.timedelta(minutes=mc.start_minute)).time()
    # .copy() so that the column assignment below targets an independent frame
    data = data[data['minute'].dt.time >= start_time].copy()

    # Model (refit on every window)
    xgb_model = xgb.XGBRegressor(**mc.xgb_params)

    # Window start dates. Both lists must have one entry per window; slicing
    # the test starts with ``[train_month:-1]`` only matched when
    # test_month == 1 and left unused, all-NaN columns in the report otherwise.
    date = pd.date_range(mc.start_date, mc.end_date, freq='M', inclusive='both')
    n_windows = max(len(date) - (mc.train_month + mc.test_month), 0)
    train_date = date[:n_windows]
    test_date = date[mc.train_month: mc.train_month + n_windows]

    # Index by calendar day so windows can be cut with label slices
    data['date'] = data['minute'].dt.normalize()
    data = data.set_index('date')

    # Result containers
    pred_list = list()  # test predictions per window
    per_index = pd.MultiIndex.from_tuples([
        ('训练集', 'MSE'),
        ('测试集', 'MSE')
    ])
    per_res = pd.DataFrame(index=per_index, columns=test_date)  # model performance

    # Walk through the windows
    for i in tqdm(range(len(train_date)), desc='Training xgb'):

        # Train / test day ranges (right end excluded)
        train_d = pd.date_range(train_date[i], train_date[i] + pd.offsets.MonthEnd(mc.train_month), inclusive='left')
        test_d = pd.date_range(test_date[i], test_date[i] + pd.offsets.MonthEnd(mc.test_month), inclusive='left')

        train_set = data.loc[train_d[0]: train_d[-1]]
        test_set = data.loc[test_d[0]: test_d[-1]]
        if train_set.empty or test_set.empty:
            # Nothing to fit or evaluate; the window's MSE cells stay NaN
            continue

        train_y = train_set['return']
        test_y = test_set['return']
        minute = test_set['minute']

        # Standardisation is per row (one bar across its factors), so no
        # statistics are carried over between the train and test sets.
        train_x = _standardize_rows(train_set[mc.model_factor])
        test_x = _standardize_rows(test_set[mc.model_factor])

        # Fit
        xgb_model.fit(train_x, train_y)

        # Predict
        pred_y_test = xgb_model.predict(test_x)
        pred_y_train = xgb_model.predict(train_x)

        # Evaluate
        mse_test = mean_squared_error(test_y, pred_y_test)
        mse_train = mean_squared_error(train_y, pred_y_train)

        # Store predictions
        pred_res = pd.DataFrame(pred_y_test, index=minute, columns=[f'pred_{mc.yield_periods}min'])
        pred_list.append(pred_res)

        # Store performance
        per_res.loc[('训练集', 'MSE'), test_d[0]] = mse_train
        per_res.loc[('测试集', 'MSE'), test_d[0]] = mse_test

    if not pred_list:
        raise ValueError('No training window produced predictions; check the date range and the data')

    # Write results; create the target folder first so a missing directory
    # does not discard a finished training run.
    os.makedirs(mc.result_save_path, exist_ok=True)

    per_res = per_res.T
    per_res.to_excel(os.path.join(mc.result_save_path, f'XGB_performance_{mc.yield_periods}min.xlsx'))

    pred_df = pd.concat(pred_list)
    pred_df.to_pickle(os.path.join(mc.result_save_path, f'xgb_{mc.yield_periods}min.pkl'))

In [3]:
if __name__ == '__main__':

    # Run the rolling XGBoost training and write the result files
    train_xgb()

Getting data: 100%|██████████| 483/483 [01:13<00:00,  6.58it/s]
Training xgb: 100%|██████████| 21/21 [00:15<00:00,  1.33it/s]
