# In [11]:
import json
import pandas as pd
import numpy as np
from torch_geometric.data import Data
import torch
import torch.nn as nn
import torch_geometric.nn as pyg_nn
from datetime import datetime, timedelta
import os

# Date range to process. The main loop below walks it one calendar day at a
# time and includes both end points.
start_date = datetime(2020, 9, 1)
end_date = datetime(2020, 12, 1)

# Input and output file locations (relative to the working directory).
json_file_path = "ticker_train_data.json"
xlsx_file_path = "train_stock_data.xlsx"
output_file_path = "GraphAutoencoderoutput_data.xlsx"

# Load the JSON records. The encoding is given explicitly so that decoding
# does not depend on the platform's default locale encoding.
# NOTE(review): the loop below expects a list of dicts carrying "Date" and
# "Affected Companies" keys -- confirm against the data file.
with open(json_file_path, 'r', encoding="utf-8") as f:
    json_data = json.load(f)

# Load the Excel sheet; the loop below reads its "Names Date" and
# "Ticker Symbol" columns.
df_excel = pd.read_excel(xlsx_file_path)

# Graph autoencoder model definition
class GraphAutoencoder(nn.Module):
    """Autoencoder over a graph, built from ``GCNConv`` layers.

    Encoder: two graph convolutions, ``input_dim -> hidden_dim ->
    embedding_dim``, with a ReLU after the first one.
    Decoder: one graph convolution ``embedding_dim -> hidden_dim`` (ReLU),
    a linear layer ``hidden_dim -> num_nodes``, and finally a sigmoid over
    the product of the decoded matrix with its own transpose.
    """

    def __init__(self, input_dim, hidden_dim, embedding_dim, num_nodes):
        super().__init__()
        self.num_nodes = num_nodes
        # Encoder layers
        self.gc1 = pyg_nn.GCNConv(input_dim, hidden_dim)
        self.gc2 = pyg_nn.GCNConv(hidden_dim, embedding_dim)
        # Decoder layers; the linear layer maps every node to num_nodes values
        self.gc3 = pyg_nn.GCNConv(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, num_nodes)

    def forward(self, x, edge_index):
        """Run the encoder and decoder.

        Returns:
            A pair ``(embedding, reconstructed_adj)``: the output of the
            second encoder layer, and the sigmoid of the decoded node
            matrix multiplied by its transpose.
        """
        # Encode
        encoder_hidden = self.gc1(x, edge_index).relu()
        embedding = self.gc2(encoder_hidden, edge_index)
        # Decode
        decoder_hidden = self.gc3(embedding, edge_index).relu()
        decoded = self.fc(decoder_hidden)
        pair_scores = torch.matmul(decoded, decoded.t())
        reconstructed_adj = pair_scores.sigmoid()
        return embedding, reconstructed_adj

# Main processing pipeline: for every calendar day in [start_date, end_date]
# fit a fresh graph autoencoder on that day's companies and attach the node
# embeddings to the matching Excel rows.

EMBEDDING_DIM = 32   # embedding width; one Feature_<k> column per dimension
HIDDEN_DIM = 16      # hidden width of the GCN layers
LEARNING_RATE = 0.01
# Number of optimizer steps per day. 1 keeps the original training budget;
# raise it if the embeddings should reflect more than a single update.
NUM_EPOCHS = 1
FEATURE_COLUMNS = [f"Feature_{k + 1}" for k in range(EMBEDDING_DIM)]

# One-hot node features: negative -> [1,0,0], positive -> [0,1,0]; every
# other value (including "neutral") falls back to [0,0,1].
SENTIMENT_ONE_HOT = {
    "negative": [1.0, 0.0, 0.0],
    "positive": [0.0, 1.0, 0.0],
}
NEUTRAL_ONE_HOT = [0.0, 0.0, 1.0]


def _embed_companies(companies_info):
    """Fit a GraphAutoencoder on one day's companies and return embeddings.

    Args:
        companies_info: mapping of company name to sentiment label.

    Returns:
        ``(company_to_idx, node_features)`` where ``company_to_idx`` maps a
        company to its row in ``node_features``, a NumPy array with
        ``EMBEDDING_DIM`` columns. Both are empty for an empty mapping.
    """
    companies = list(companies_info)
    num_nodes = len(companies)
    company_to_idx = {company: idx for idx, company in enumerate(companies)}
    if num_nodes == 0:
        # No nodes means nothing to train on.
        return company_to_idx, np.zeros((0, EMBEDDING_DIM), dtype=np.float32)

    one_hot_rows = []
    for company in companies:
        sentiment = companies_info[company]
        # isinstance guard: a non-string label can never match a key and
        # must not raise on an unhashable value.
        if isinstance(sentiment, str):
            one_hot_rows.append(SENTIMENT_ONE_HOT.get(sentiment, NEUTRAL_ONE_HOT))
        else:
            one_hot_rows.append(NEUTRAL_ONE_HOT)
    x = torch.tensor(one_hot_rows, dtype=torch.float)

    # Reconstruction target: every ordered pair (i, j) with i != j is an
    # edge of weight 1.0.
    adj_matrix = torch.ones((num_nodes, num_nodes)) - torch.eye(num_nodes)
    # nonzero() lists those pairs in row-major order as an (E, 2) tensor;
    # transposing always yields shape (2, E), also when E == 0 (one node).
    edge_index = adj_matrix.nonzero().t().contiguous()

    model = GraphAutoencoder(
        input_dim=3,
        hidden_dim=HIDDEN_DIM,
        embedding_dim=EMBEDDING_DIM,
        num_nodes=num_nodes,
    )
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
    criterion = nn.MSELoss()  # mean squared reconstruction error

    model.train()
    for _ in range(NUM_EPOCHS):
        optimizer.zero_grad()
        _, reconstructed_adj = model(x, edge_index)
        loss = criterion(reconstructed_adj, adj_matrix)
        loss.backward()
        optimizer.step()

    # Re-encode with the updated weights. Taking the embedding from the
    # forward pass that precedes optimizer.step() would export the
    # untrained model's output and make the training above dead work.
    model.eval()
    with torch.no_grad():
        embedding, _ = model(x, edge_index)
    return company_to_idx, embedding.numpy()


def _attach_features(date_df, company_to_idx, node_features):
    """Return ``date_df`` with the Feature_<k> columns filled in.

    Rows whose "Ticker Symbol" is a key of ``company_to_idx`` receive that
    company's embedding; all other rows receive zeros.
    """
    features = np.zeros((len(date_df), EMBEDDING_DIM), dtype=float)
    if company_to_idx and len(date_df) > 0:
        # map() leaves NaN for tickers that are not in the mapping.
        node_idx = date_df["Ticker Symbol"].map(company_to_idx).to_numpy(dtype=float)
        mentioned = ~np.isnan(node_idx)
        features[mentioned] = node_features[node_idx[mentioned].astype(int)]
    # One assign() call instead of one .loc write per cell.
    return date_df.assign(**dict(zip(FEATURE_COLUMNS, features.T)))


# Index the JSON records by date once; setdefault keeps the first record of
# a date, matching a first-match linear scan.
json_by_date = {}
for item in json_data:
    json_by_date.setdefault(item.get("Date"), item)

# Stringify the date column once instead of once per processed day.
names_date_text = df_excel["Names Date"].astype(str)

all_dates_data = []
current_date = start_date

while current_date <= end_date:
    date_str = current_date.strftime("%Y-%m-%d")
    print(f"Processing date: {date_str}")

    # Rows whose "Names Date" text contains the date; date_str holds only
    # digits and hyphens, so a literal match equals the regex match.
    date_df = df_excel[names_date_text.str.contains(date_str, regex=False)].copy()

    # Days without Excel rows contribute nothing to the output. They are
    # skipped because pandas deprecates concatenating empty frames.
    if not date_df.empty:
        date_data = json_by_date.get(date_str)
        if date_data:
            # "or {}" also covers an explicit null in the JSON record.
            companies_info = date_data.get("Affected Companies") or {}
            company_to_idx, node_features = _embed_companies(companies_info)
        else:
            # No JSON record for this day: every company gets zeros.
            company_to_idx, node_features = {}, None
        all_dates_data.append(_attach_features(date_df, company_to_idx, node_features))

    current_date += timedelta(days=1)

# Merge all days and save to the new Excel file.
if all_dates_data:
    final_df = pd.concat(all_dates_data)
else:
    # Nothing matched: still write a sheet with the expected header row.
    final_df = pd.DataFrame(columns=df_excel.columns.tolist() + FEATURE_COLUMNS)
final_df.to_excel(output_file_path, index=False)

print(f"Data processing complete. Output saved to {output_file_path}")

# --- Recorded cell output (truncated by the notebook export) ---
# Processing date: 2020-09-01
# Processing date: 2020-09-02
# Processing date: 2020-09-03
# Processing date: 2020-09-04
# Processing date: 2020-09-05
# Processing date: 2020-09-06
# Processing date: 2020-09-07
# Processing date: 2020-09-08
# Processing date: 2020-09-09
# Processing date: 2020-09-10
# Processing date: 2020-09-11
# Processing date: 2020-09-12
# Processing date: 2020-09-13
# Processing date: 2020-09-14
# Processing date: 2020-09-15
# Processing date: 2020-09-16
# Processing date: 2020-09-17
# Processing date: 2020-09-18
# Processing date: 2020-09-19
# Processing date: 2020-09-20
# Processing date: 2020-09-21
# Processing date: 2020-09-22
# Processing date: 2020-09-23
# Processing date: 2020-09-24
# Processing date: 2020-09-25
# Processing date: 2020-09-26
# Processing date: 2020-09-27
# Processing date: 2020-09-28
# Processing date: 2020-09-29
# Processing date: 2020-09-30
# Processing date: 2020-10-01
# Processing date: 2020-10-02
# Processing date: 2020-10-03
# Processing date: 2020-10-04
# Processing date: 2020-10-05
# Processing date: 202
#
#   final_df = pd.concat(all_dates_data)
#
#
# Data processing complete. Output saved to GraphAutoencoderoutput_data.xlsx
