In [1]:
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm
from multiprocessing import Pool, cpu_count
from itertools import chain
import lightgbm as lgb
from sklearn.metrics import accuracy_score


# Location of the local backtest framework; it is put on the import path so
# the two framework imports below can be resolved.
backtest_frame_path = '/home/liuqize/QuantFrame'
# Guard against piling up duplicate entries when this cell is re-run.
if backtest_frame_path not in sys.path:
    sys.path.append(backtest_frame_path)
try:
    from MLMethods import *
    from Factors import TSFeatureEngineer
except Exception as exc:
    # Best-effort import: the notebook keeps loading, but the real cause is
    # reported instead of being hidden, and KeyboardInterrupt / SystemExit
    # are no longer swallowed (a bare `except:` caught those too).
    error_msg = 'Please check the path of backtest_frame'
    print(f'{error_msg}: {exc!r}')

# Base directory for this notebook's data: the generated parquet files are
# written to / read from here, and the label CSVs are read from its
# raw_data/train sub-directory.
data_path = '/data/user_home/liuqize/aft_data'

# Automatically reload imported modules before each cell execution, so edits
# to local modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# 读取数据

## raw feature and label data

In [46]:
# Load the raw feature table and replace whatever index it was stored with
# by a fresh 0..n-1 index, then preview the first rows.
raw_feature_df = pd.read_parquet('./original.parquet').reset_index(drop=True)
raw_feature_df.head(3)

Unnamed: 0,date,time,sym,morning,SOIR_1,SOIR_2,SOIR_3,SOIR_4,SOIR_5,SOIR,...,MPC_skew_4,MPC_skew_5,MAX,RSJ,Dispersion,AVEslope,slope_ask,slope_bid,slope_diff,patience_all
0,0,09:40:03,0,1,0.761194,-0.552239,-0.905512,0.394439,-0.993958,-0.26492,...,3.027714,1.613805,1.344113,0.445405,8.836248e-18,390.84753,32.151661,87.561082,-55.409421,-0.462853
1,0,09:40:06,0,1,0.078534,-0.82235,-0.820359,-0.188736,-0.99456,-1.345968,...,3.027714,1.613805,1.344113,0.445405,3.361027e-17,373.334714,22.236818,87.334239,-65.097421,-0.594111
2,0,09:40:09,0,1,0.681416,-0.727483,-0.912023,0.380224,-0.994,-0.494495,...,3.027714,1.613805,1.344113,0.445405,1.61004e-17,385.95579,28.9738,87.674945,-58.701145,-0.50323


In [None]:
# Names of the raw feature columns: everything after the first four columns
# (date, time, sym, morning according to the preview of raw_feature_df).
raw_features = raw_feature_df.columns[4:].tolist()

In [None]:
import pandas as pd
import statsmodels.api as sm

def check_random_walk(df):
    """Run an Augmented Dickey-Fuller test on every column of ``df`` and print the results.

    For each column the ADF statistic, the p-value and the critical values
    are printed. Missing values are dropped from a column before it is
    tested, matching the ADF screening loop used elsewhere in this notebook
    (``adfuller`` does not accept NaN input).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are numeric series to be tested.

    Returns
    -------
    None
        Results are only printed.
    """
    for col in df.columns:
        # Run the ADF test on the non-missing observations of this column.
        result = sm.tsa.stattools.adfuller(df[col].dropna())
        print('Column:', col)
        print('ADF Statistic: %f' % result[0])
        print('p-value: %f' % result[1])
        print('Critical Values:')
        # result[4] maps a significance level label to its critical value.
        for key, value in result[4].items():
            print('\t%s: %.3f' % (key, value))

In [17]:

import statsmodels.api as sm

# Screen every feature with an ADF test, using only the rows where date == 0.
features_cols = raw_feature_df.columns[4:].tolist()
tmp_df = raw_feature_df[raw_feature_df['date'] == 0]
need_diff_cols = []
for col in tqdm(features_cols):
    result = sm.tsa.stattools.adfuller(tmp_df[col].dropna())
    p_value = result[1]
    # A p-value above 5% means the unit-root hypothesis is not rejected, so
    # the column is recorded as one that needs differencing.
    if p_value > 0.05:
        need_diff_cols.append(col)


  0%|          | 0/125 [00:00<?, ?it/s]

100%|██████████| 125/125 [01:48<00:00,  1.15it/s]


In [65]:
# Fixed list of columns that need differencing. This rebinds need_diff_cols
# to a hard-coded value — presumably the saved outcome of the ADF screening,
# kept here so the slow test need not be re-run (TODO confirm).
need_diff_cols = [
    # price / depth differences
    'PD_Bid', 'PD_Ask', 'AD_p',
    'CD_p_1', 'CD_p_2', 'CD_p_3', 'CD_p_4', 'CD_p_5',
    'PD_bid_1_2', 'PD_bid_1_3', 'PD_bid_1_4', 'PD_bid_1_5',
    'PD_ask_1_2', 'PD_ask_1_3', 'PD_ask_1_4', 'PD_ask_1_5',
    'PD_diff_bid_1', 'PD_diff_bid_2', 'PD_diff_bid_3', 'PD_diff_bid_4',
    'PD_diff_ask_1', 'PD_diff_ask_2', 'PD_diff_ask_3', 'PD_diff_ask_4',
    # VWAP family
    'VWAP_ask_4', 'VWAP_ask_5',
    'VWAP_diff_bid_1', 'VWAP_diff_bid_2', 'VWAP_diff_bid_3', 'VWAP_diff_bid_4', 'VWAP_diff_bid_5',
    'VWAP_diff_ask_1', 'VWAP_diff_ask_2', 'VWAP_diff_ask_3', 'VWAP_diff_ask_4', 'VWAP_diff_ask_5',
    # MPC family
    'MPC_max_1', 'MPC_max_2', 'MPC_max_3', 'MPC_max_4', 'MPC_max_5',
    'MPC_skew_1', 'MPC_skew_2', 'MPC_skew_3', 'MPC_skew_4', 'MPC_skew_5',
    # remaining
    'MAX', 'RSJ',
]

# 生成特征

In [66]:
# Build a combined "<date>_<sym>" key so each (date, symbol) pair can be
# treated as one group; both parts are converted to strings first.
key_parts = ['date', 'sym']
raw_feature_df[key_parts] = raw_feature_df[key_parts].astype(str)
raw_feature_df['date_sym'] = raw_feature_df['date'].add('_').add(raw_feature_df['sym'])

# Working frame: group key, timestamp and the feature columns, re-indexed 0..n-1.
df = raw_feature_df[['date_sym', 'time', *features_cols]].reset_index(drop=True)

In [67]:
# Split the feature columns into groups of at most chunk_size names; each
# group gets its own independent copy of the frame (keys + that group).
chunk_size = 10
chunk_starts = range(0, len(features_cols), chunk_size)
sub_feature_cols = [features_cols[start:start + chunk_size] for start in chunk_starts]
sub_dfs = [df[['date_sym', 'time', *cols]].copy() for cols in sub_feature_cols]

In [70]:
import contextlib
from io import StringIO

def calc_features(factor_df):
    """Run ``TSFeatureEngineer.momentum_feature`` for every feature column of ``factor_df``.

    The first two columns of ``factor_df`` are treated as keys (the calls
    below pass 'date_sym' as the group column and 'time' as the time column);
    every remaining column is a feature. Features named in the hard-coded
    differencing list are processed with ``delta='diff'``, all others with
    ``delta=None``. Anything the helper prints is discarded, and duplicated
    column names are dropped after each call (first occurrence wins).

    Returns the resulting frame without its first two columns.
    """
    diff_cols = frozenset([
        'PD_Bid', 'PD_Ask', 'AD_p', 'CD_p_1', 'CD_p_2', 'CD_p_3', 'CD_p_4',
        'CD_p_5', 'PD_bid_1_2', 'PD_bid_1_3', 'PD_bid_1_4', 'PD_bid_1_5', 'PD_ask_1_2', 'PD_ask_1_3',
        'PD_ask_1_4', 'PD_ask_1_5', 'PD_diff_bid_1', 'PD_diff_bid_2', 'PD_diff_bid_3', 'PD_diff_bid_4',
        'PD_diff_ask_1', 'PD_diff_ask_2', 'PD_diff_ask_3', 'PD_diff_ask_4', 'VWAP_ask_4', 'VWAP_ask_5',
        'VWAP_diff_bid_1', 'VWAP_diff_bid_2', 'VWAP_diff_bid_3', 'VWAP_diff_bid_4', 'VWAP_diff_bid_5',
        'VWAP_diff_ask_1', 'VWAP_diff_ask_2', 'VWAP_diff_ask_3', 'VWAP_diff_ask_4', 'VWAP_diff_ask_5',
        'MPC_max_1', 'MPC_max_2', 'MPC_max_3', 'MPC_max_4', 'MPC_max_5', 'MPC_skew_1', 'MPC_skew_2',
        'MPC_skew_3', 'MPC_skew_4', 'MPC_skew_5', 'MAX', 'RSJ',
    ])
    windows = (5, 10, 20, 60, 120)

    # The column list is fixed up front, so columns added by the helper are
    # not themselves fed back into it.
    for col in factor_df.columns.tolist()[2:]:
        delta = 'diff' if col in diff_cols else None
        # Silence whatever the helper writes to stdout.
        with contextlib.redirect_stdout(StringIO()):
            factor_df = TSFeatureEngineer.momentum_feature(
                factor_df, col, 'date_sym', 'time', window=list(windows), delta=delta
            )
        first_occurrence = ~factor_df.columns.duplicated()
        factor_df = factor_df.loc[:, first_occurrence]

    non_key_cols = factor_df.columns.tolist()[2:]
    return factor_df[non_key_cols]

In [71]:
# Use half of the reported CPUs, but never fewer than one worker: on a
# single-CPU host cpu_count() // 2 is 0 and Pool(0) raises ValueError.
ncpu = max(1, cpu_count() // 2)
with Pool(ncpu) as p:
    # imap yields results in the same order as sub_dfs, so rst_dfs lines up
    # with the column chunks; tqdm only reports progress.
    rst_dfs = list(tqdm(p.imap(calc_features, sub_dfs), total=len(sub_dfs)))

100%|██████████| 13/13 [09:07<00:00, 42.11s/it]  


In [76]:

# Glue the per-chunk result frames together side by side. ignore_index=True
# on the column axis replaces the column labels with 0..n-1.
df = pd.concat(rst_dfs, axis='columns', ignore_index=True)


In [83]:
# Restore the column labels: the names of all chunk frames, in chunk order.
column_names = [name for part in rst_dfs for name in part.columns.tolist()]
df.columns = column_names

# Put the date / time / sym key columns in front of the features. The two
# frames are aligned on their index.
# NOTE(review): this assumes the computed frames kept the row index of
# raw_feature_df — confirm.
key_frame = raw_feature_df[['date', 'time', 'sym']]
df = pd.concat([key_frame, df], axis=1)

# Persist the full feature table.
df.to_parquet(os.path.join(data_path, 'all_features.parquet'))


# 筛选特征

In [2]:
# Reload the saved feature table from the data directory.
all_features_path = os.path.join(data_path, 'all_features.parquet')
df = pd.read_parquet(all_features_path)

In [4]:
# Fraction of missing values per column; keep only the columns where fewer
# than 10% of the rows are missing.
na_info = df.isna().mean()
kept_cols = na_info.loc[na_info < 0.1].index.tolist()
df = df[kept_cols]


In [10]:
from glob import glob
# Read every file under raw_data/train as CSV and stack them into one frame.
# glob returns paths in arbitrary order, so the list is sorted to make the
# row order of target_df reproducible from run to run.
train_files = sorted(glob(os.path.join(data_path, 'raw_data/train/*')))
target_df = pd.concat([pd.read_csv(f) for f in train_files])
# Label columns used as prediction targets.
target_cols = ['label_5', 'label_10', 'label_20', 'label_40', 'label_60']
# The per-file indexes repeat after stacking; replace them with 0..n-1.
target_df.reset_index(drop=True, inplace=True)

In [11]:
# Attach the label columns to the feature rows by key. A left join keeps
# every feature row; rows without a matching label get NaN labels.
# NOTE(review): the key columns must have compatible dtypes in both frames — confirm.
merge_keys = ['date', 'time', 'sym']
label_frame = target_df[merge_keys + target_cols]
df = pd.merge(df, label_frame, on=merge_keys, how='left')

In [None]:
# LightGBM settings for the three-class classifier.
params = dict(
    objective='multiclass',      # multi-class classification task
    num_class=3,                 # number of classes (three-way classification)
    metric='multi_logloss',      # multi-class log loss
    boosting_type='gbdt',
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.9,
)

In [23]:
# Feature columns by position: drop the first 3 and the last 5 columns.
# NOTE(review): assumes df is laid out as date/time/sym, features, then the
# five label columns — confirm.
features_cols = df.columns[3:-5].tolist()

In [32]:
# 分离特征和目标
# 避免列名中含有 () 或 _ 
feature2num = {col: i for i, col in enumerate(features_cols)}
num2feature = {i: col for i, col in enumerate(features_cols)}
df.rename(columns=feature2num, inplace=True)

new_feature_cols = list(range(len(features_cols)))

In [33]:

from sklearn.model_selection import train_test_split
# Train on the first label column.
target = target_cols[0]
X, y = df[new_feature_cols], df[target]

# Ordered split without shuffling: the first 80% of the rows are used for
# training and the last 20% for validation.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=False)

# Wrap both parts as LightGBM datasets.
trn_dataset = lgb.Dataset(data=X_train, label=y_train)
val_dataset = lgb.Dataset(data=X_val, label=y_val)

In [34]:
# 定义LightGBM的参数
params = {
    'objective': 'multiclass',  # 多分类任务
    'num_class': 3,  # 类别数，这里是三分类任务
    'metric': 'multi_logloss',  # 多分类损失函数
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'n_estimators': 200
}
num_round = 200
bst = lgb.train(params, train_set=trn_dataset, valid_sets=[val_dataset],num_boost_round = num_round)


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 608151
[LightGBM] [Info] Number of data points in the train set: 1959020, number of used features: 3046
[LightGBM] [Info] Start training from score -1.865817
[LightGBM] [Info] Start training from score -0.370680
[LightGBM] [Info] Start training from score -1.864554
[1]	valid_0's multi_logloss: 0.733296
[2]	valid_0's multi_logloss: 0.725076
[3]	valid_0's multi_logloss: 0.717464
[4]	valid_0's multi_logloss: 0.710515
[5]	valid_0's multi_logloss: 0.704341
[6]	valid_0's multi_logloss: 0.698603
[7]	valid_0's multi_logloss: 0.692854
[8]	valid_0's multi_logloss: 0.687818
[9]	valid_0's multi_logloss: 0.68287
[10]	valid_0's multi_logloss: 0.67884
[11]	valid_0's multi_logloss: 0.674537
[12]	valid_0's multi_logloss: 0.670253
[13]	valid_0's multi_logloss: 0.666895
[14]	valid_0's multi_logloss: 0.663846
[15]	valid_0's multi_logloss: 0.660547
[16]	valid_0's multi_logloss: 0.657872
[17]	valid_0's multi_logloss: 0.65

In [37]:
# Predict on the validation rows; each row of y_pred holds one score per class.
y_pred = bst.predict(X_val, num_iteration=bst.best_iteration)
# Predicted class = position of the largest score in each row.
y_pred_max = list(np.argmax(y_pred, axis=1))
accuracy = accuracy_score(y_val, y_pred_max)


0.7652601811109636

In [39]:
# Split-based importance, paired with the (integer-id) columns of X.
feature_importance = bst.feature_importance(importance_type='split')
importance_df = pd.DataFrame(
    data={'Feature': X.columns, 'Importance': feature_importance},
)


In [51]:
# Rank features by importance (highest first), translate the integer ids
# back to the original feature names, and keep only features whose
# importance is greater than 10.
importance_df = (
    importance_df
    .sort_values(by='Importance', ascending=False)
    .assign(FeatureName=lambda frame: frame['Feature'].map(num2feature))
    .query('Importance > 10')
)

In [57]:
# Selected feature names that are not raw features. The order in which they
# appear in importance_df is preserved (and duplicates removed): the former
# set difference produced a hash-dependent order, so the column order of
# anything built from this list changed from run to run.
raw_feature_set = set(raw_features)
important_feature = list(dict.fromkeys(
    name
    for name in importance_df.FeatureName.tolist()
    if name not in raw_feature_set
))


In [59]:

# Give the feature columns their original names back, then save the key
# columns together with the selected features.
df.rename(columns=num2feature, inplace=True)
selected_cols = ['date', 'time', 'sym', *important_feature]
important_features_path = os.path.join(data_path, 'important_features.parquet')
df[selected_cols].to_parquet(important_features_path)