In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [13]:
# Feature column names: 18 market fields followed by the 32 generated
# columns Feature_1 .. Feature_32.
feature_columns = [
    'Bid or Low Price',
    'Ask or High Price',
    'Price or Bid/Ask Average',
    'Volume',
    'Returns',
    'Bid',
    'Ask',
    'Shares Outstanding',
    'Cumulative Factor to Adjust Prices',
    'Cumulative Factor to Adjust Shares/Vol',
    'Open Price',
    'NASDAQ Number of Trades',
    'Returns without Dividends',
    'Value-Weighted Return-incl. dividends',
    'Value-Weighted Return-excl. dividends',
    'Equal-Weighted Return-incl. dividends',
    'Equal-Weighted Return-excl. dividends',
    'Return on the S&P 500 Index',
    *(f'Feature_{idx}' for idx in range(1, 33)),
]

# Hyper-parameters
window_size = 5                   # length of the time window
input_dim = len(feature_columns)  # LSTM input size = number of features per stock per day
hidden_dim = 128                  # size of the LSTM hidden state
num_stocks = 30                   # number of stocks
mlp_dim = 64                      # hidden width of each per-stock MLP head
num_classes = 3                   # number of output neurons of each classification head

# Model definition
class MultiStockLSTM(nn.Module):
    """Shared LSTM encoder followed by one classification head per stock.

    Every stock's window is run through the same LSTM. The LSTM output at the
    last time step is then fed to that stock's own two-layer MLP head.

    Args:
        input_dim: Number of features per stock per time step.
        hidden_dim: Size of the LSTM hidden state.
        mlp_dim: Width of the hidden layer inside each classification head.
        num_classes: Number of logits produced by each head.
        num_stocks: Number of stocks, i.e. number of heads that are built.
    """

    def __init__(self, input_dim, hidden_dim, mlp_dim, num_classes, num_stocks=30):
        super().__init__()
        self.num_stocks = num_stocks
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        # One independent classifier per stock, each emitting `num_classes` logits.
        self.classifiers = nn.ModuleList([
            nn.Sequential(
                nn.Linear(hidden_dim, mlp_dim),
                nn.ReLU(),
                nn.Linear(mlp_dim, num_classes)
            ) for _ in range(num_stocks)
        ])

    def forward(self, x):
        """Compute per-stock class logits.

        Args:
            x: Tensor of shape (batch, window, num_stocks, num_features).

        Returns:
            Tensor of shape (batch, num_stocks, num_classes).

        Raises:
            ValueError: If `x` is not 4-D, or if its stock dimension differs
                from the number of heads the model was built with.
        """
        if x.dim() != 4:
            raise ValueError(
                f"expected a 4-D input (batch, window, stocks, features), got {x.dim()}-D"
            )
        batch_size, window_size, num_stocks, num_features = x.shape
        if num_stocks != self.num_stocks:
            # Without this check the per-stock slicing below would silently
            # route rows of one stock to the head of another.
            raise ValueError(
                f"input has {num_stocks} stocks but the model was built for {self.num_stocks}"
            )

        # Fold the stock axis into the batch axis so all stocks share one LSTM
        # pass; row b * num_stocks + s holds stock s of sample b.
        x = x.permute(0, 2, 1, 3).reshape(batch_size * num_stocks, window_size, num_features)
        lstm_out, _ = self.lstm(x)

        # Keep only the last time step and unfold back to (batch, stock, hidden).
        lstm_last = lstm_out[:, -1, :].reshape(batch_size, num_stocks, lstm_out.size(-1))

        outputs = [head(lstm_last[:, i, :]) for i, head in enumerate(self.classifiers)]
        return torch.stack(outputs, dim=1)



In [29]:
# Load the training file
train_file_path = 'GraphAutoencoderoutput_Train_data2.0.xlsx'  # replace with the actual path of the training file
train_df = pd.read_excel(train_file_path)

# Load the test file
# NOTE(review): test_df is only loaded in this cell; none of the fill / scaling
# steps below are applied to it.
test_file_path = 'GraphAutoencoderoutput_TEST_data2.0.xlsx'  # replace with the actual path of the test file
test_df = pd.read_excel(test_file_path)

# Handle missing values with a forward fill. The fill is done per ticker so a
# gap in one stock is never filled with the preceding row of a different stock.
# Assumes rows of each ticker appear in chronological order -- TODO confirm.
train_df[feature_columns] = (
    train_df.groupby('Ticker Symbol', sort=False)[feature_columns].ffill()
)

# Step 1: standardize all training rows together, regardless of stock.
scaler_all = StandardScaler()
train_df[feature_columns] = scaler_all.fit_transform(train_df[feature_columns])

# Step 2: one scaler per stock, fitted on that stock's (already globally
# scaled) training rows. Any other data therefore has to go through
# scaler_all first and the matching per-stock scaler second.
stock_list = train_df['Ticker Symbol'].unique()
scalers = {stock: StandardScaler() for stock in stock_list}

for stock in stock_list:
    mask = train_df['Ticker Symbol'] == stock  # boolean row mask for this stock
    # Write through .loc so the original frame is modified, not a copy.
    train_df.loc[mask, feature_columns] = scalers[stock].fit_transform(
        train_df.loc[mask, feature_columns]
    )

# Group the training rows by 'Names Date'.
grouped_train = train_df.groupby('Names Date')
print(len(grouped_train))  # number of distinct dates

# Containers for the sequence samples.
# TODO: the loop below only inspects the first date group and stops; nothing is
# appended to X_train / y_train yet.
X_train = []
y_train = []

for date, group in grouped_train:
    X = group[feature_columns].values  # feature matrix of this date as an ndarray
    y = group['Class'].values          # labels of this date

    break


273


In [None]:


# Initialisation: build the model from the hyper-parameters defined in the
# configuration cell (all four dimension arguments are required).
model = MultiStockLSTM(
    input_dim=input_dim,
    hidden_dim=hidden_dim,
    mlp_dim=mlp_dim,
    num_classes=num_classes,
    num_stocks=num_stocks,
)
optimizer = optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()  # created once instead of once per stock per batch

# NOTE(review): train_loader is not defined in any visible cell; it must yield
# (X_batch, y_batch) pairs before this cell can run. Presumably y_batch holds
# integer class indices of shape [batch, num_stocks] -- confirm.

# Training loop
for epoch in range(30):
    model.train()
    for X_batch, y_batch in train_loader:  # X_batch: [batch, window_size, num_stocks, input_dim]
        optimizer.zero_grad()
        outputs = model(X_batch)  # [batch, num_stocks, num_classes]

        # Mean cross-entropy over every (sample, stock) pair. Each stock
        # contributes the same number of samples, so this equals the average
        # of the per-stock mean losses.
        loss = criterion(outputs.reshape(-1, outputs.size(-1)), y_batch.reshape(-1))

        loss.backward()
        optimizer.step()