In [18]:
import pandas as pd
import numpy as np
import os
import sys
import time
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import QuantileTransformer
import torch.nn as nn
np.random.seed(42)
import torch
torch.manual_seed(42)
from scipy.stats import spearmanr
import os

from sklearn.model_selection import KFold
from torch.utils.data import TensorDataset, DataLoader



In [2]:
#导入数据
y_label = 'y3_label'
raw_data_path = '/home/laiminzhi/data/xydata/'
xydata = pd.read_hdf(raw_data_path+'xy_data_2836.h5')
xydata.index = xydata.index.set_names(['code1','date','code'], level=[0, 1,2])
# 删除不需要的索引层
xydata = xydata.droplevel('code1')

xydata_limit = pd.read_hdf(raw_data_path+'xy_data_2836_udlimit.h5')
selected_feature = pd.read_csv('selected_feature.csv',index_col=0)['AlphaName'].to_list()

xydata = xydata.loc[:,[y_label]+selected_feature]
# 把xydata_limit的ud_limit_h2列加到xydata中
xydata = xydata.join(xydata_limit['ud_limit_h2'])
xydata_final = xydata[xydata['ud_limit_h2']!=1]

#除去inf
xydata_final = xydata_final.replace([np.inf, -np.inf], 0.0)

xydata_final = xydata_final.drop(columns='ud_limit_h2')

In [3]:
xydata_final = xydata_final.sort_index(level='date')
train = xydata_final.loc[:'20211231',]
test = xydata_final.loc['20220104':,]

y_train = train[y_label].values
y_test = test[y_label].values

X_train = train.drop(y_label,axis=1).values

X_test = test.drop(y_label,axis=1).values

In [44]:
#train中第19和第186行数据
train_row_19 = train.iloc[19,:]
train_row_186 = train.iloc[186,:]
print(train_row_19)
print(train_row_186)


y3_label    0.011725
x_62        0.000000
x_204       0.000000
x_240       0.000000
x_114       0.000000
              ...   
x_152       0.000000
x_128       0.000000
x_176       0.000000
x_119       0.000000
x_215       0.000000
Name: (20180102, 000032.SZ), Length: 72, dtype: float64
y3_label   -0.023619
x_62        0.000190
x_204       0.321479
x_240       0.028028
x_114      -0.283466
              ...   
x_152       0.292579
x_128       0.292579
x_176      -0.083632
x_119      -0.045022
x_215      -0.045022
Name: (20180102, 000731.SZ), Length: 72, dtype: float64


In [5]:
class MLP(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(MLP, self).__init__()
        
        self.hidden1 = nn.Linear(input_size, hidden_size)
        
        self.hidden2 = nn.Linear(hidden_size, hidden_size)

        self.output = nn.Linear(hidden_size, output_size)
        
        self.silu = nn.SiLU()

    def forward(self, x):
        x = self.silu(self.hidden1(x))
        x = self.silu(self.hidden2(x))
        x = self.output(x)
        
        return x
    
# 创建模型实例
# 假设输入大小为10，输出大小为1
input_size = X_train.shape[1]
hidden_size = 50
output_size = 1

model = MLP(input_size, hidden_size, output_size)

# 打印模型结构
print(model)

MLP(
  (hidden1): Linear(in_features=71, out_features=50, bias=True)
  (hidden2): Linear(in_features=50, out_features=50, bias=True)
  (output): Linear(in_features=50, out_features=1, bias=True)
  (silu): SiLU()
)


In [61]:
#训练以上模型
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

epochs = 30
patience = 10

k_folds = 2

batch_size = 3000

kf = KFold(n_splits=k_folds, shuffle=False)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model_name = "model1"

from sklearn.model_selection import train_test_split
# 划分训练集和验证集
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.15,shuffle=False)

In [62]:
model = model.to(device)

X_train_tensor = torch.tensor(X_train,dtype=torch.float32,device=device)
Y_train_tensor = torch.tensor(y_train,dtype=torch.float32,device=device)

X_val_tensor = torch.tensor(X_train,dtype=torch.float32,device=device)
Y_val_tensor = torch.tensor(y_train,dtype=torch.float32,device=device)

train_dataset = TensorDataset(X_train_tensor,Y_train_tensor)
valid_dataset = TensorDataset(X_val_tensor,Y_val_tensor)

train_loader = DataLoader(dataset=train_dataset,batch_size=batch_size,shuffle=False)
valid_loader = DataLoader(dataset=valid_dataset,batch_size=batch_size,shuffle=False)

bad_epoch = 0
early_stop_epoch = 0

valid_loss_min = float("inf")

for epoch in range(epochs):
    train_loss = []
    model.train()
    for i,(X_train_tensor,Y_train_tensor) in enumerate(train_loader):
        optimizer.zero_grad()
        Y_pred = model(X_train_tensor)
        Y_pred = Y_pred.view(-1)
        mask  = Y_pred < 100
        Y_pred = torch.where(mask, Y_pred, torch.tensor(0,dtype=torch.float32,device=device))
        Y_train_tensor = torch.where(mask, Y_train_tensor, torch.tensor(0,dtype=torch.float32,device=device))

        loss = criterion(Y_pred,Y_train_tensor)
        train_loss.append(loss.item())

        if loss>1000:
            print(i)
            print("loss:",loss.item())
            #找出Y_pred>1000的值对应的index
            Y_pred = Y_pred.view(-1)
            invalid_pred_index = torch.where(Y_pred>1000)
            print("invalid_pred_index:",invalid_pred_index)
            invalid_X_train_tensor = X_train_tensor[invalid_pred_index]
            invalid_Y_train_tensor = Y_train_tensor[invalid_pred_index]
            print("invalid_X_train_tensor:",invalid_X_train_tensor)
            print("invalid_Y_train_tensor:",invalid_Y_train_tensor)
            print("invalid_Y_pred:",Y_pred[invalid_pred_index])
            
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch+1}/{epochs}], Loss: {np.mean(train_loss):.4f}')

    model.eval()
    with torch.no_grad():
        valid_loss = []
        for X_val_tensor,Y_val_tensor in valid_loader:
            Y_pred = model(X_val_tensor)


            Y_pred = Y_pred.view(-1)
            mask  = Y_pred < 100
            Y_pred = torch.where(mask, Y_pred, torch.tensor(0,dtype=torch.float32,device=device))
            Y_val_tensor = torch.where(mask, Y_val_tensor, torch.tensor(0,dtype=torch.float32,device=device))

            test_loss = criterion(Y_pred,Y_val_tensor)
            valid_loss.append(test_loss.item())

        valid_loss_cur = np.mean(valid_loss)

        if valid_loss_cur < valid_loss_min:
            valid_loss_min = valid_loss_cur
            bad_epoch = 0
            early_stop_epoch = epoch
            torch.save(model.state_dict(), './mlp_ensemble/model/' + model_name + '.pth')
            print("model saved!")
        else:
            bad_epoch += 1
            print(bad_epoch)
            if bad_epoch >= patience:    # 如果验证集指标连续patience个epoch没有提升，就停掉训练
                print(" The training stops early in epoch {}".format(epoch))
                torch.save(model.state_dict(), './mlp_ensemble/model/' + model_name + '.pth')
                break

        print(f'Epoch [{epoch+1}/{epochs}], Validation Loss: {np.mean(valid_loss):.4f}')
        

Epoch [1/30], Loss: 0.2743
model saved!
Epoch [1/30], Validation Loss: 0.0785
Epoch [2/30], Loss: 0.0493
model saved!
Epoch [2/30], Validation Loss: 0.0291
Epoch [3/30], Loss: 0.1142
model saved!
Epoch [3/30], Validation Loss: 0.0139
Epoch [4/30], Loss: 0.0637
1
Epoch [4/30], Validation Loss: 0.0725
Epoch [5/30], Loss: 0.1962
2
Epoch [5/30], Validation Loss: 0.0681
Epoch [6/30], Loss: 0.0357
model saved!
Epoch [6/30], Validation Loss: 0.0021
Epoch [7/30], Loss: 0.1336
1
Epoch [7/30], Validation Loss: 0.0021
Epoch [8/30], Loss: 0.0059
model saved!
Epoch [8/30], Validation Loss: 0.0014
Epoch [9/30], Loss: 0.0031
model saved!
Epoch [9/30], Validation Loss: 0.0013
Epoch [10/30], Loss: 0.0065
model saved!
Epoch [10/30], Validation Loss: 0.0013
Epoch [11/30], Loss: 0.0055
model saved!
Epoch [11/30], Validation Loss: 0.0013
Epoch [12/30], Loss: 0.0110
1
Epoch [12/30], Validation Loss: 0.0024
Epoch [13/30], Loss: 0.0033
2
Epoch [13/30], Validation Loss: 0.0022
Epoch [14/30], Loss: 0.1173
3
Epo

In [63]:
#测试数据
model.load_state_dict(torch.load('./mlp_ensemble/model/' + model_name + '.pth'))
model.eval()
with torch.no_grad():
    X_test_tensor = torch.tensor(X_test,dtype=torch.float32,device=device)
    Y_test_tensor = torch.tensor(y_test,dtype=torch.float32,device=device)
    y_pred = model(X_test_tensor)
    test_loss = criterion(y_pred,Y_test_tensor.view(-1,1))
    print(f'Test Loss: {test_loss.item()}')
y_pred = y_pred.cpu().numpy()


Test Loss: 0.0073842573910951614


In [64]:
from scipy.stats import spearmanr
def calculate_spearman(group):
    return spearmanr(group['Y_hat'], group['Y'])[0]

#把y_label和y_test还原回dataframe
y_test_df = pd.DataFrame(y_test, index=test.index, columns=['Y'])
y_pred_df = pd.DataFrame(y_pred, index=test.index, columns=['Y_hat'])
result = pd.concat([y_test_df,y_pred_df],axis=1)
#计算result的rank ic
spearman_correlations = result.groupby(level='date').apply(calculate_spearman)
rank_avg = spearman_correlations.mean()
print('rank_avg:',rank_avg)

rank_avg: 0.010176391815162148


In [65]:
grouped_df = result.groupby(level='date')
    
for name,group in grouped_df:
    group = group.reset_index(drop=False)
    group.to_csv('./mlp_ensemble/prediction/'+model_name+'.csv')
print("data saved!")

def cal_alpha(XY):
    #查看pnl如何计算
    d1 = XY.copy()
    enterRatio = 0.9
    exitRatio = 0.9
    ## 1) calculate yestRank;
    d1['yestRank'] = d1.groupby('date')['yest'].rank(method='average',na_option='keep',ascending=True,pct=True)
    rtnMat = pd.pivot_table(data=d1,index='date',columns='code',values='y',dropna=False)
    yestMat = pd.pivot_table(data=d1,index='date',columns='code',values='yest',dropna=False)
    yestRankMat = pd.pivot_table(data=d1,index='date',columns='code',values='yestRank',dropna=False).fillna(0)
    posiMat = pd.DataFrame(np.full(yestRankMat.shape,fill_value=0),index=yestRankMat.index,columns=yestRankMat.columns)
    ud_limitMat = pd.pivot_table(data=d1,index='date',columns='code',values='ud_limit_h2',dropna=False).fillna(0)

    ## 2) calPosiMat： ## no buy if up_limit && no sell if down_limit;
    for i,row_index in enumerate(posiMat.index):
        if (i==0):
            continue
        flag1 = (yestRankMat.iloc[i,:]>enterRatio)
        flag2 = (posiMat.iloc[i-1,:]==0) & (ud_limitMat.iloc[i,:]==1)
        posiMat.loc[row_index,(~flag2 & flag1)] = 1

        flag3 = (yestRankMat.iloc[i,:]>exitRatio) & (yestRankMat.iloc[i,:]<=enterRatio)
        flag4 = (posiMat.iloc[i-1,:]==1)
        posiMat.loc[row_index,(flag3 & flag4)] = 1

        flag5 = (posiMat.iloc[i-1,:]==1) & (posiMat.iloc[i,:]==0) & (ud_limitMat.iloc[i,:]==-1)
        posiMat.loc[row_index,flag5] = 1
        
        if (i== (posiMat.shape[0]-1)):## position=0 if yest=NA on last day;
            flag6 = yestMat.iloc[i,:].isna()
            posiMat.loc[row_index,flag6] = 0


    pnlMat = rtnMat * posiMat
    pnlVec = pnlMat.sum(axis=1)/(posiMat==1).sum(axis=1)
    alpha = pnlVec.mean()*1e4
    return alpha

def get_XY(yest,xy):
    #xy是原始xy文件，yest是合并后的预测文件
    universe = 'univ_tradable'

    XY = xy.loc[xy[universe]==1,:'ud_limit_h4']
    XY= XY.rename(columns={'y1':'y'})
    XY = pd.merge(XY, yest,on=['date','code'],how='inner')

    ##---- 1. benchmark ----##
    XY['yest'] = XY['Y_hat']
    return XY

import os
xy = pd.read_hdf("/home/laiminzhi/wenbin/DL_stock_combo/data/xy_data/xy_data_new.h5")
all_files = [pd.read_csv(f'./ridgeCV/{f}',
                            dtype={'date':str})for f in sorted(os.listdir('./ridgeCV'))]
yest = pd.concat(all_files, axis=0) #贴合所有的预测值
XY = get_XY(yest,xy)
alpha = cal_alpha(XY)
print('alpha:',alpha)

data saved!
alpha: -5.244877101848944
