In [1]:
import sys
import os
import warnings
import glob
import re
from argparse import ArgumentParser
from tqdm.notebook import tqdm
sys.path.append(r'/home/datamake94/秒级高频策略/ML_Project')
from model_training_20240814 import *

os.environ['CUDA_VISIBLE_DEVICES'] = '8'

In [10]:
class UMPDataset_out(torch.utils.data.Dataset):
    """Dataset exposing one item per intraday second of a single trading date.

    Each item is ``([factor_data, industry_dummy], sec)`` where ``factor_data``
    is whatever ``load_all_data`` returns for that second and ``industry_dummy``
    is the one-hot industry matrix built once in ``__init__``.

    Args:
        date: Trading date, interpolated into the stock-list file name.
        valid_dict: Mapping ``period -> (min_se, max_se, valid_ind)``; the
            tuple is forwarded to ``load_all_data``.
    """

    def __init__(self, date, valid_dict) -> None:
        self.date = date
        self.valid_dict = valid_dict
        # NOTE(review): `future_ret` is not a parameter here; it is resolved from
        # the enclosing (notebook-global) namespace at construction time, so the
        # caller must have set it before instantiating this class.
        self.sec_list = params.sec_list_dict[future_ret]
        # The universe of stocks is fixed per date by the stock-list file.
        with open(r'/home/datamake94/ITdev/YangYe_t0/stocklist/stockList_%s.txt'%date,'r') as f:
            self.stock_list = self._parse_stock_codes(f.read())

        pre_date = get_pre_date(date)
        industry_slice = Dataset.industry_table.loc[pre_date]
        # Stocks missing from the industry table get an all-False (all-zero) row.
        self.industry_dummy = torch.from_numpy(pd.get_dummies(industry_slice).reindex(self.stock_list).fillna(False).values).float()

    @staticmethod
    def _parse_stock_codes(text):
        """Return the sorted, de-duplicated integer codes from ``code=...`` lines.

        Blank lines are ignored, so the result does not depend on whether the
        text ends with a newline (the previous ``split('\\n')[:-1]`` approach
        dropped the last stock when the trailing newline was missing and raised
        on empty lines).
        """
        codes = {int(line.split('=')[0]) for line in text.splitlines() if line.strip()}
        return sorted(codes)

    def __getitem__(self, index):
        sec = self.sec_list[index]
        # Seconds before 100000 use the 93000 entry of valid_dict; all later
        # seconds use the 100000 entry.
        period = 93000 if sec < 100000 else 100000
        min_se, max_se, valid_ind = self.valid_dict[period]
        factor_data = load_all_data(self.date,self.stock_list, sec, model_training=True,valid_ind=valid_ind,max_se=max_se,min_se=min_se,import_se=True)
        return [factor_data,self.industry_dummy],sec

    def __len__(self):
        return len(self.sec_list)

def get_out_factor_list(factor_list,
                        static_factor_path='/home/intern1/hft_factor_comb/backup_test/factor_list_update20231204.txt',
                        static_index_start=764):
    '''Rename factors according to the live-trading naming convention.

    The trailing entries of ``factor_list`` are treated as static factors: as
    many entries as there are lines in ``static_factor_path`` whose index (the
    integer after ``=``) is at least ``static_index_start``. Those trailing
    entries are replaced by the upper-cased names read from the file. All
    preceding ("daily") entries are lower-cased and, when they start with one
    of the known research prefixes, have that prefix swapped for the
    live-trading prefix.

    Args:
        factor_list: Ordered factor names, daily factors first, static last.
        static_factor_path: Text file of ``name=index`` lines.
        static_index_start: Smallest index that counts as a static factor.

    Returns:
        A new list: renamed daily factors followed by upper-cased static names.
    '''
    factor_prefix_dict= {
            'ysw_orderbook1':'ob1',
            'ysw_pv_a':'sec2',
            'ysw_pv_b':'sec2',
            'ysw_graph':'graph1',
            'ysw_orderbook2':'ob2',
            'ysw_pv2':'sec1',
            'yy_order_basic':'order_basic',
            'yy_order_ls1':'order_ls1',
            'yy_trans_basic':'trans_basic',
            'yy_trans_ls1':'trans_ls1',
            'yy_orderbook3':'ob3',
            'yy_pv4':'sec4',
            }
    static_factor=[]
    with open(static_factor_path, 'r', encoding='utf-8') as file:
        for line in file:
            entry = line.strip()
            if not entry:
                # Tolerate blank lines (e.g. a trailing empty line).
                continue
            parts = entry.split('=')
            if int(parts[1]) >= static_index_start:
                static_factor.append(parts[0])

    # Compute the cut explicitly: `factor_list[:-0]` would be empty, which
    # silently discarded every factor when no static factor was found.
    n_daily = max(len(factor_list) - len(static_factor), 0)
    factor_list_daily = [x.lower() for x in factor_list[:n_daily]]
    for i, name in enumerate(factor_list_daily):
        for key, new_prefix in factor_prefix_dict.items():
            if name.startswith(key):
                factor_list_daily[i] = new_prefix + name[len(key):]
                # One rename per factor; never re-match the rewritten name.
                break
    return factor_list_daily+[x.upper() for x in static_factor]

def get_valid_dict(date):
    """Build the per-horizon, per-period normalisation/selection info for ``date``.

    Reads the min/max CSVs and the per-model feature-map files for ``date``
    from the fixed production directory and returns

        ``{future_ret: {period: (min_se, max_se, valid_ind)}}``

    where ``min_se`` / ``max_se`` are float tensors loaded from the CSVs (the
    same tensors are shared by every entry) and ``valid_ind`` is a boolean
    tensor, aligned with the renamed full factor list, marking which factors
    appear in that model's feature-map file.

    NOTE(review): iterates over ``future_ret_list`` and ``period_list`` taken
    from the enclosing (notebook-global) namespace, not from arguments.
    """
    load_file_path = r'/home/datamake94/ITdev/YangYe_t0'
    min_se = pd.read_csv(os.path.join(load_file_path, 'min_tensor', 'min_tensor_%s.csv'%date),index_col=0)
    max_se = pd.read_csv(os.path.join(load_file_path, 'max_tensor', 'max_tensor_%s.csv'%date),index_col=0)
    min_se = torch.from_numpy(min_se.values).squeeze().float()
    max_se = torch.from_numpy(max_se.values).squeeze().float()
    all_factor_list = get_out_factor_list(get_default_factor_list(if_static=True))

    # Horizon label -> model number used in the feature-map file names.
    model_index = {'1m':1, '5m':2, '15s':3}

    valid_dict = {}
    for future_ret in future_ret_list:
        future_ret_idx = model_index[future_ret]
        valid_dict[future_ret] = {}
        for period in period_list:
            with open(os.path.join(load_file_path, 'featureMap', date, f'factor_list_{date}_{period}_model{future_ret_idx}.txt')) as f:
                # Each line is `name=position`; only the name is needed. A set
                # keeps the membership test below O(1) per factor.
                selected = {line.split('=')[0] for line in f}
            valid_ind = torch.tensor([name in selected for name in all_factor_list], dtype=torch.bool)
            valid_dict[future_ret][period] = (min_se, max_se, valid_ind)

    return valid_dict


def calc_args_model_dict(store_date, period_list, future_ret_list):
    """Load the TorchScript model for every (horizon, period) combination.

    For each pair, ``params.period`` and ``params.future_ret`` are overwritten
    with the current values before the model file is loaded, so after the call
    ``params`` holds the last combination visited.

    Args:
        store_date: Accepted for call compatibility; not used in the body.
        period_list: Period identifiers that appear in the model file names.
        future_ret_list: Horizon labels; each must be '1m', '5m' or '15s'.

    Returns:
        dict mapping ``(future_ret, period)`` to the loaded model.
    """
    test_model_path = r'/home/datamake94/决策库/trade_strategy/秒频_yy实盘20240427_t0/实盘模型_测试'
    horizon_to_index = {'1m':1, '5m':2, '15s':3}

    loaded_models = {}
    for future_ret in future_ret_list:
        for period in period_list:
            # Keep the shared params object in sync with the model being loaded.
            params.period = period
            params.future_ret = future_ret

            ret_idx = horizon_to_index[future_ret]
            model_file = os.path.join(test_model_path, f'model{ret_idx}_{period}.pt')
            loaded_models[(future_ret, period)] = torch.jit.load(model_file)

    return loaded_models

def output_jit_model(period_list, future_ret_list, store_date, save_score_name, info_dict, model_dict):
    """Trace every model to TorchScript and write it with its base data files.

    For each (period, future_ret) pair this traces
    ``model_dict[(future_ret, period)].model`` on random inputs, verifies on
    three fresh random inputs that the traced model reproduces the eager
    output exactly, and saves it as ``model{idx}_{period}.pt``. It also writes
    ``min_tensor.csv`` / ``max_tensor.csv`` (rewritten on every iteration) and
    a ``factor_list_{period}_model{idx}.txt`` file listing the selected
    factors as ``name=position`` lines.

    Args:
        period_list: Period identifiers to export.
        future_ret_list: Horizon labels ('1m', '5m', '15s').
        store_date: Used, with ``save_score_name``, to name the output folder.
        save_score_name: Suffix of the output folder name.
        info_dict: ``(future_ret, period) -> (min_se, max_se, valid_ind,
            all_factor_list, factor_num)``.
        model_dict: ``(future_ret, period) ->`` object exposing a ``.model``
            attribute that is the module to trace.

    Raises:
        AssertionError: If the traced model's output differs from the eager one.

    NOTE(review): the output root ``jit_model_output_path`` is read from the
    enclosing (notebook-global) namespace.
    """
    print('正在输出jit模型和基础数据')
    save_file_path = os.path.join(jit_model_output_path, rf"{store_date}:{save_score_name}")
    os.makedirs(save_file_path, exist_ok=True)
    model_index = {'1m':1, '5m':2, '15s':3}
    for period in period_list:
        for future_ret in future_ret_list:
            min_se, max_se, valid_ind, all_factor_list, factor_num = info_dict[(future_ret, period)]
            test_data = torch.randn([5000, factor_num])
            # Random 0/1 matrix of shape 5000 x 200. The upper bound of
            # torch.randint is exclusive, so it must be 2 to ever produce a 1
            # (the previous bound of 1 always produced an all-zero matrix).
            industry_dummy = torch.randint(0, 2, [5000, 200])
            model = model_dict[(future_ret, period)].model.eval()
            jit_model = torch.jit.trace(model, (test_data, industry_dummy))

            # Sanity check: three independent random inputs must give exactly
            # the same result from the eager and the traced model.
            for _ in range(3):
                test_data = torch.randn([5000, factor_num])
                industry_dummy = torch.randint(0, 2, [5000, 200])
                score = model(test_data, industry_dummy)
                score_jit = jit_model(test_data, industry_dummy)
                assert (score == score_jit).float().mean() == 1.

            model_idx = model_index[future_ret]
            torch.jit.save(jit_model, rf"{save_file_path}/model{model_idx}_{period}.pt")

            # Write the base data, indexed by the live-trading factor names.
            IT_all_factor = get_out_factor_list(all_factor_list)
            min_se_output = pd.Series(min_se.numpy(), index=IT_all_factor)
            max_se_output = pd.Series(max_se.numpy(), index=IT_all_factor)
            min_se_output.to_csv(rf"{save_file_path}/min_tensor.csv")
            max_se_output.to_csv(rf"{save_file_path}/max_tensor.csv")

            valid_factor_list = [IT_all_factor[i] for i in range(len(IT_all_factor)) if valid_ind[i] == True]
            # The context manager closes the file even if a write fails.
            with open(rf'{save_file_path}/factor_list_{period}_model{model_idx}.txt','w') as file:
                for i, factor_name in enumerate(valid_factor_list):
                    file.write(factor_name+'='+str(i)+'\n')

    return

In [13]:
# Load the exported JIT models, then score every second of the test dates and
# store one feather file per (horizon, date).
period_list=[93000,100000]
future_ret_list=['1m','5m','15s']
save_score_name = 'yy_industry_94calc_test2'

score_output_path = r'/home/datamake94/data_nb7/sec_score_output_final'
jit_model_output_path = r'/home/datamake94/data_nb7/jit_model_output_final'
year_month_list=get_year_month(params.date_list_all)
month_list= year_month_list[year_month_list.index('202405'):year_month_list.index('202406')][::2]

for month in month_list:
    store_date=get_first_date(month,params.date_list_all)
    # Training window starts 24 entries before `month` in year_month_list,
    # floored at 20200430.
    begin_date=get_first_date(year_month_list[year_month_list.index(month)-24],params.date_list_all)
    begin_date=begin_date if begin_date>='20200430' else '20200430'
    end_date=get_first_date(year_month_list[year_month_list.index(month)+2],params.date_list_all)
    print('训练集数据起始日为{}，模型样本外预测期为{}——{}'.format(begin_date,store_date,end_date))

    date_list = get_date_list(begin_date,store_date)
    # NOTE(review): the out-of-sample range is overridden with two fixed dates;
    # the range-based alternative is get_date_list(store_date, end_date).
    test_date_list = ['20240821','20240822']

    model_dict = calc_args_model_dict(store_date, period_list, future_ret_list)

    # Step 1 (disabled): export the JIT models and base data.
    # output_jit_model(period_list, future_ret_list, store_date, save_score_name, info_dict, model_dict)

    # Step 2: write the model scores. Models and inputs stay on the CPU.
    for date in tqdm(test_date_list[:], desc=month):
        total_valid_dict = get_valid_dict(date)
        for future_ret in future_ret_list:
            save_score_path = os.path.join(score_output_path, rf"{future_ret}:{save_score_name}")
            os.makedirs(save_score_path, exist_ok=True)
            out_all=[]
            valid_dict = total_valid_dict[future_ret]
            dm = UMPDataset_out(date,valid_dict)
            factor_stock_list=dm.stock_list
            print(len(factor_stock_list))
            # batch_size=1 with an unwrapping collate_fn: each batch is exactly
            # one dataset item, i.e. ([factor_data, industry_dummy], sec).
            test_dataloader=DataLoaderX(dm,batch_size=1,collate_fn=lambda x:x[0],num_workers=20,shuffle=False,drop_last=False)
            # Pure inference: disable autograd so no graph is built per batch.
            with torch.no_grad():
                for batch in tqdm(test_dataloader, desc=rf"{date},{future_ret}"):
                    data,sec=batch
                    # Same period rule as UMPDataset_out.__getitem__.
                    if sec<100000:
                        period=93000
                    else:
                        period=100000
                    factor_data,industry_dummy = data
                    out_list=model_dict[(future_ret, period)](factor_data,industry_dummy).cpu().numpy()[:,0]
                    out_series=pd.Series(out_list,index=factor_stock_list,name=sec)
                    out_all.append(out_series)

            # Rows = seconds, columns = stock codes (as strings, required by feather).
            all_factor=pd.concat(out_all,axis=1).T
            all_factor.columns=all_factor.columns.astype(str)
            all_factor.index.name='second'
            all_factor.reset_index(drop=False).to_feather(save_score_path+'/%s.fea'%date)

训练集数据起始日为20220505，模型样本外预测期为20240506——20240701


202405:   0%|          | 0/2 [00:00<?, ?it/s]

4623


20240821,1m:   0%|          | 0/2736 [00:00<?, ?it/s]

4623


20240821,5m:   0%|          | 0/2736 [00:00<?, ?it/s]

4623


20240821,15s:   0%|          | 0/2736 [00:00<?, ?it/s]

4618


20240822,1m:   0%|          | 0/2736 [00:00<?, ?it/s]

4618


20240822,5m:   0%|          | 0/2736 [00:00<?, ?it/s]

4618


20240822,15s:   0%|          | 0/2736 [00:00<?, ?it/s]

In [3]:
# Compare the notebook-generated scores with the scores produced by the
# production run for one horizon and one second of 20240821.
future_ret = '5m'
time = 144000
# NOTE(review): this mapping is zero-based ('1m' -> 0), whereas get_valid_dict,
# calc_args_model_dict and output_jit_model use a one-based mapping ('1m' -> 1).
# Confirm which numbering the alphas_* files actually use.
future_ret_idx = {'1m':0, '5m':1, '15s':2}[future_ret]

# NOTE(review): scores are read from the '...94calc_test' folder, while the
# scoring cell writes to '...94calc_test2' - confirm this is the intended run.
test1 = pd.read_feather(r'/home/datamake94/data_nb7/sec_score_output_final/%s:yy_industry_94calc_test/20240821.fea'%future_ret).set_index('second')
test2 = pd.read_csv(rf'/home/datamake94/ITdev/YangYe_t0/shishi_fea_result/20240821/alphas_20240821_model{future_ret_idx}_{time*1000}.csv').set_index('code')
# Feather stored the stock codes as strings; cast back to int so they can
# align with the 'code' index of test2.
test1.columns = test1.columns.astype(int)

# Cross-sectional correlation between the two score vectors at `time`.
test1.loc[time].corr(test2['score'])

0.3497515681213395

In [15]:
# Load the same-named model file from the test folder and from the production
# folder and run both on identical inputs to see whether they agree.
model1 = torch.jit.load(r'/home/datamake94/决策库/trade_strategy/秒频_yy实盘20240427_t0/实盘模型_测试/model2_100000.pt')
model2 = torch.jit.load(r'/home/datamake94/ITdev/YangYe_t0/model/model2_100000.pt')

# NOTE(review): factor_data and industry_dummy are not defined in this cell;
# they are whatever the last iteration of the scoring loop left in the kernel.
test_result1 = model1(factor_data, industry_dummy)
test_result2 = model2(factor_data, industry_dummy)

In [16]:
# Display the output of the model loaded from the test folder.
test_result1

tensor([[-0.3004],
        [-0.0091],
        [ 0.0489],
        ...,
        [ 0.0255],
        [-0.1793],
        [-0.1227]], grad_fn=<NativeBatchNormBackward0>)

In [17]:
# Display the output of the model loaded from the production folder.
test_result2

tensor([[-0.3004],
        [-0.0091],
        [ 0.0489],
        ...,
        [ 0.0255],
        [-0.1793],
        [-0.1227]], grad_fn=<NativeBatchNormBackward0>)

In [23]:
# Collapse duplicate stock rows of the 20240823 pre-hold file by summing them.
# NOTE(review): the result is written back to the same path it was read from,
# so the original file is overwritten.
rq_prehold = pd.read_csv(r'/home/datamake94/ITdev/YangYe_t0/rq_prehold/20240823_rq_prehold.csv',index_col=0)
# Use integer stock codes as the index so sorting and grouping are numeric.
rq_prehold.index = rq_prehold.index.astype(int)
rq_prehold = rq_prehold.sort_index()
rq_prehold.index.name = 'stock'
rq_prehold = rq_prehold.reset_index()
# One row per stock: all columns are summed across duplicates.
rq_prehold = rq_prehold.groupby('stock').sum()
rq_prehold.to_csv(r'/home/datamake94/ITdev/YangYe_t0/rq_prehold/20240823_rq_prehold.csv')

In [21]:
# Display the pre-hold table currently held in the kernel.
rq_prehold

Unnamed: 0,vol,debt_vol,rq_vol
34,0,-100,100
712,0,-700,700
901,1700,0,0
2739,0,-100,100
300066,0,-200,200
300079,4300,0,0
300110,0,-100,100
300131,0,-700,700
300299,0,-100,100
300315,0,-3600,3600


In [27]:
# Current target-hold file: show only rows with a non-zero balance_vol or a
# non-zero total_rq_vol.
target_vol1 = pd.read_csv(r'/home/datamake94/ITdev/YangYe_t0/target_vol/20240823_targethold.csv')
target_vol1[(target_vol1['balance_vol']!=0)|(target_vol1['total_rq_vol']!=0)]

Unnamed: 0.1,Unnamed: 0,balance_vol,balance_vol_rq,total_rq_vol
24,34,-0.0,-100.0,100.0
207,702,1800.0,0.0,0.0
214,712,-0.0,-700.0,700.0
813,2405,-0.0,-500.0,500.0
975,2607,-0.0,-3500.0,3500.0
1080,2738,-0.0,-100.0,100.0
1081,2739,-0.0,-100.0,100.0
1120,2792,-0.0,-100.0,100.0
1392,300066,-0.0,-200.0,200.0
1404,300079,-0.0,-200.0,200.0


In [26]:
# Backup (.bak) copy of the target-hold file, filtered the same way, for a
# side-by-side comparison with the current file.
target_vol2 = pd.read_csv(r'/home/datamake94/ITdev/YangYe_t0/target_vol/20240823_targethold.csv.bak')
target_vol2[(target_vol2['balance_vol']!=0)|(target_vol2['total_rq_vol']!=0)]

Unnamed: 0.1,Unnamed: 0,balance_vol,balance_vol_rq,total_rq_vol
207,702,2300.0,0.0,0.0
813,2405,-0.0,-500.0,500.0
975,2607,-0.0,-3500.0,3500.0
1080,2738,-0.0,-100.0,100.0
1120,2792,-0.0,-100.0,100.0
1404,300079,-0.0,-200.0,200.0
1448,300134,-0.0,-200.0,200.0
1558,300261,-0.0,-500.0,400.0
1597,300310,-0.0,-100.0,100.0
1607,300323,-0.0,-300.0,300.0
