### 此模型问题

- 对所有股票都使用相同权重
- 对测试集和训练集分别进行标准化，会导致训练集的分布泄露
    - 最好先对训练集整体进行标准化(fit_transform)，用此scaler对测试集整体进行标准化(transform)
    - 再对每只股票分别设置scaler，在每支股票的测试集上进行标准化(fit_transform)，再在对应的测试集上进行标准化(transform)
- 在保存预测结果时似乎将当天的涨跌作为真实值，实际上应该将第二天的涨跌作为真实值
    - 不知在最先的分类中是否有提前解决这一问题

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# 读取文件一
file1_path = 'GraphAutoencoderoutput_data2.0.xlsx'  # 替换为文件一的实际路径
df = pd.read_excel(file1_path)

# 特征列名
feature_columns = [
    'Bid or Low Price', 'Ask or High Price', 'Price or Bid/Ask Average', 'Volume', 'Returns',
    'Bid', 'Ask', 'Shares Outstanding', 'Cumulative Factor to Adjust Prices',
    'Cumulative Factor to Adjust Shares/Vol', 'Open Price', 'NASDAQ Number of Trades',
    'Returns without Dividends', 'Value-Weighted Return-incl. dividends',
    'Value-Weighted Return-excl. dividends', 'Equal-Weighted Return-incl. dividends',
    'Equal-Weighted Return-excl. dividends', 'Return on the S&P 500 Index',
    'Feature_1', 'Feature_2', 'Feature_3', 'Feature_4', 'Feature_5', 'Feature_6',
    'Feature_7', 'Feature_8', 'Feature_9', 'Feature_10', 'Feature_11', 'Feature_12',
    'Feature_13', 'Feature_14', 'Feature_15', 'Feature_16', 'Feature_17', 'Feature_18',
    'Feature_19', 'Feature_20', 'Feature_21', 'Feature_22', 'Feature_23', 'Feature_24',
    'Feature_25', 'Feature_26', 'Feature_27', 'Feature_28', 'Feature_29', 'Feature_30',
    'Feature_31', 'Feature_32'
]

# 处理缺失值，使用前向填充
df[feature_columns] = df[feature_columns].fillna(method='ffill')

# 按 PERMNO 分组处理数据
grouped = df.groupby('PERMNO')

# 设置时间窗口大小为5
window_size = 5

# 组织数据为序列样本
X_sequence = []
y_sequence = []

for permno, group in grouped:
    group = group.sort_values(by='Names Date')
    print(group)
    X = group[feature_columns].values
    y = group['Class'].values # 这里的 Class 是下跌、持平、上涨标签
    
    # 标准化特征数据
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    for i in range(len(X_scaled) - window_size):
        X_sequence.append(X_scaled[i:i+window_size]) # 这里分为多个独立窗口
        y_sequence.append(y[i+window_size])

X_sequence = np.array(X_sequence)
y_sequence = np.array(y_sequence) + 1  # 将标签值从 [-1, 0, 1] 映射到 [0, 1, 2] -1 为下跌，0为持平，1为上涨

# 检查数据中是否存在 NaN 或无穷大值
print("Checking for NaN in X_sequence:", np.isnan(X_sequence).any())
print("Checking for NaN in y_sequence:", np.isnan(y_sequence).any())
print("Checking for infinity in X_sequence:", np.isinf(X_sequence).any())
print("Checking for infinity in y_sequence:", np.isinf(y_sequence).any())

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X_sequence, y_sequence, test_size=0.2, random_state=42, shuffle=False)

# 转换为PyTorch张量
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# 创建数据加载器
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True) # 注意这里打乱了训练数据，此时独立窗口长度数值的影响增大
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# 定义LSTM+MLP模型
class LSTMMLPModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, mlp_dim, num_classes):
        super(LSTMMLPModel, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.mlp = nn.Sequential(
            nn.Linear(hidden_dim, mlp_dim),
            nn.ReLU(),
            nn.Linear(mlp_dim, num_classes)
        )

    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        lstm_out = lstm_out[:, -1, :]  # 取最后一个时间步的隐藏状态
        return self.mlp(lstm_out)

# 初始化模型、损失函数和优化器
input_dim = len(feature_columns)
hidden_dim = 128
mlp_dim = 64
num_classes = 3  # 上涨/持平/下跌
model = LSTMMLPModel(input_dim, hidden_dim, mlp_dim, num_classes)

# 模型权重初始化
def init_weights(m):
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)
        if m.bias is not None:
            nn.init.zeros_(m.bias)
    elif isinstance(m, nn.LSTM):
        for param in m.parameters():
            if len(param.shape) >= 2:
                nn.init.xavier_uniform_(param)
            else:
                nn.init.zeros_(param)

model.apply(init_weights)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# 训练模型
num_epochs = 30
for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * X_batch.size(0)
    train_loss /= len(train_loader.dataset)

    # 评估模型
    model.eval()
    test_loss = 0.0
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            test_loss += loss.item() * X_batch.size(0)
            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.numpy())
            all_labels.extend(y_batch.numpy())
    test_loss /= len(test_loader.dataset)
    
    # 将标签值从 [0, 1, 2] 映射回 [-1, 0, 1]
    all_labels = np.array(all_labels) - 1
    all_preds = np.array(all_preds) - 1
    
    test_f1 = f1_score(all_labels, all_preds, average='weighted')

    print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}, Test F1: {test_f1:.4f}')

# 计算并输出最终的测试集F1分数
all_labels = np.array(all_labels) - 1
all_preds = np.array(all_preds) - 1
final_f1 = f1_score(all_labels, all_preds, average='weighted')
print(f'Final Test F1 Score: {final_f1:.4f}')

  df[feature_columns] = df[feature_columns].fillna(method='ffill')


      PERMNO Names Date Ticker Symbol  PERMCO  Bid or Low Price  \
0      10107 2020-09-01          MSFT    8048        224.429993   
30     10107 2020-09-02          MSFT    8048        227.350006   
60     10107 2020-09-03          MSFT    8048        214.960205   
90     10107 2020-09-04          MSFT    8048        205.190002   
120    10107 2020-09-08          MSFT    8048        202.199997   
...      ...        ...           ...     ...               ...   
1770   10107 2020-11-24          MSFT    8048        208.860001   
1800   10107 2020-11-25          MSFT    8048        212.460007   
1830   10107 2020-11-27          MSFT    8048        214.039993   
1860   10107 2020-11-30          MSFT    8048        210.835007   
1890   10107 2020-12-01          MSFT    8048        213.350006   

      Ask or High Price  Price or Bid/Ask Average    Volume   Returns  \
0            227.449997                227.270004  25729137  0.007715   
30           232.860001                231.649994

In [5]:
print(X_sequence)
print(y_sequence)

[[[ 2.24463976  1.91836983  2.27068896 ...  0.05938668 -0.06317475
   -0.05841732]
  [ 2.68226764  2.73914031  2.92169161 ... -2.01032452 -1.61667131
    0.99400567]
  [ 0.82538498  2.20055699  0.78883677 ...  0.05938668 -0.06317475
   -0.05841732]
  [-0.63889357  0.53927864  0.33551147 ... -0.85407993 -0.36117583
   -1.32261452]
  [-1.08701128 -0.72447873 -1.38712236 ...  0.05938668 -0.06317475
   -0.05841732]]

 [[ 2.68226764  2.73914031  2.92169161 ... -2.01032452 -1.61667131
    0.99400567]
  [ 0.82538498  2.20055699  0.78883677 ...  0.05938668 -0.06317475
   -0.05841732]
  [-0.63889357  0.53927864  0.33551147 ... -0.85407993 -0.36117583
   -1.32261452]
  [-1.08701128 -0.72447873 -1.38712236 ...  0.05938668 -0.06317475
   -0.05841732]
  [-0.41258787  0.00524886 -0.10443756 ... -1.68150314 -0.3992601
   -2.67054682]]

 [[ 0.82538498  2.20055699  0.78883677 ...  0.05938668 -0.06317475
   -0.05841732]
  [-0.63889357  0.53927864  0.33551147 ... -0.85407993 -0.36117583
   -1.32261452]
 