In [None]:
import torch
# Probe CUDA once, report what was found, and select the matching torch device.
cuda_ok = torch.cuda.is_available()
gpu_name = torch.cuda.get_device_name(0) if cuda_ok else None
print('GPU 可用：', cuda_ok, '，设备名：', gpu_name)
device = torch.device('cuda') if cuda_ok else torch.device('cpu')


GPU 可用： False ，设备名： None


In [None]:
# 导入embedding和stock label文件
import pandas as pd
from google.colab import drive
import torch
# Mount Google Drive so the files under "My Drive" are reachable.
drive.mount('/content/drive')

# Location of the serialized embeddings (.pt file).
pt_path = '/content/drive/My Drive/545 Group/datasets/embeddings_split0.pt'

# Deserialize the embeddings.
# NOTE(review): weights_only=False allows torch.load to unpickle arbitrary
# Python objects, which can run code embedded in the file. Only load files
# that you produced yourself or otherwise trust.
embeddings = torch.load(pt_path, weights_only=False)

# Report the container type and its size. Explicit type checks are used
# instead of a bare `except:` so that unrelated errors are not swallowed.
print(type(embeddings))
if hasattr(embeddings, 'shape'):
    print(embeddings.shape)
elif isinstance(embeddings, dict):
    print(embeddings.keys())
else:
    print(len(embeddings))

# Load the stock labels; the "Date" column is parsed into datetimes.
stock_path = '/content/drive/My Drive/545 Group/orgi/labeled_stock.csv'

df_labels = pd.read_csv(stock_path, parse_dates=["Date"])

# Render the label dates as "YYYY-MM-DD" strings so they can be joined against
# string-typed date keys, then keep only the join key and the label.
df_labels["date"] = df_labels["Date"].dt.strftime("%Y-%m-%d")
df_labels = df_labels[["date", "label"]]

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
<class 'list'>
1943232


In [None]:
# Preview the first rows of the label table.
df_labels.head()

Unnamed: 0,date,label
0,2010-01-04,1
1,2010-01-05,1
2,2010-01-06,1
3,2010-01-07,1
4,2010-01-08,1


In [None]:
# 把embedding转成dataframe

import pandas as pd
import numpy as np

# Flatten the loaded samples into a two-column frame: one row per sample,
# holding its 'date' value and its embedding converted to a plain Python list.
dates, vectors = [], []
for sample in embeddings:
    dates.append(sample['date'])
    vectors.append(np.asarray(sample['embedding']).tolist())
df1 = pd.DataFrame({'date': dates, 'embedding': vectors})
del dates, vectors  # the frame holds the data now; drop the temporaries

print(df1.shape)  # (n_samples, 2)
display(df1.head())


(1943232, 2)


Unnamed: 0,date,embedding
0,2023-12-16 23:00:00 UTC,"[-0.90576171875, -0.751953125, -0.9228515625, ..."
1,2023-12-12 00:00:00 UTC,"[-0.88232421875, -0.423583984375, -0.860839843..."
2,2023-12-12 00:00:00 UTC,"[-0.76904296875, -0.422119140625, -0.766113281..."
3,2023-12-07 00:00:00 UTC,"[-0.79150390625, -0.2430419921875, -0.08081054..."
4,2023-12-07 00:00:00 UTC,"[-0.371826171875, -0.320068359375, -0.9609375,..."


In [None]:
# Normalise the news timestamps: keep only the calendar day, shift it forward
# by one day, and store the result back as a "YYYY-MM-DD" string.
df = df1.copy()

# The first 10 characters of the raw string are the "YYYY-MM-DD" part; the
# time of day and the trailing " UTC" are discarded before parsing.
df['date'] = (
    pd.to_datetime(df1['date'].str.slice(0, 10), format='%Y-%m-%d')
    .add(pd.Timedelta(days=1))
    .dt.strftime('%Y-%m-%d')
)

# Keep an ndarray version of every embedding next to the list version.
df['emb_arr'] = df['embedding'].apply(np.asarray)

print(df.shape)
display(df.head())

(1943232, 3)


Unnamed: 0,date,embedding,emb_arr
0,2023-12-17,"[-0.90576171875, -0.751953125, -0.9228515625, ...","[-0.90576171875, -0.751953125, -0.9228515625, ..."
1,2023-12-13,"[-0.88232421875, -0.423583984375, -0.860839843...","[-0.88232421875, -0.423583984375, -0.860839843..."
2,2023-12-13,"[-0.76904296875, -0.422119140625, -0.766113281...","[-0.76904296875, -0.422119140625, -0.766113281..."
3,2023-12-08,"[-0.79150390625, -0.2430419921875, -0.08081054...","[-0.79150390625, -0.2430419921875, -0.08081054..."
4,2023-12-08,"[-0.371826171875, -0.320068359375, -0.9609375,...","[-0.371826171875, -0.320068359375, -0.9609375,..."


In [None]:
# Inspect how many rows share a date.
date_col = df['date']

# Total rows, distinct dates, and the surplus of rows over distinct dates.
total = df.shape[0]
unique_dates = date_col.nunique()
dup_count = total - unique_dates
print(f'总行数: {total}，唯一日期数: {unique_dates}，重复日期条目数: {dup_count}')

# Number of rows per date.
counts = date_col.value_counts()
print(counts)

# Cross-check: rows flagged as duplicates (the first occurrence of each date
# is not counted as a duplicate).
dup_rows = date_col.duplicated().sum()
print(f'使用duplicated()算出的重复行数: {dup_rows}')


总行数: 1943232，唯一日期数: 5089，重复日期条目数: 1938143
date
2023-12-12    47950
2023-12-17    43651
2023-12-13    43420
2023-12-14    26925
2023-12-08    25769
              ...  
2012-07-09        1
2009-06-07        1
2014-03-09        1
2012-12-16        1
2012-01-01        1
Name: count, Length: 5089, dtype: int64
使用duplicated()算出的重复行数: 1938143


In [None]:
import torch
import numpy as np
import pandas as pd

# 1) Inner-join every news embedding with the label of its date; rows whose
#    date has no entry in df_labels are dropped by the inner join.
merged = pd.merge(df[['date','emb_arr']], df_labels, on='date', how='inner')
# merged columns: ['date', 'emb_arr', 'label']

# 2) Build one (N_i, D) float32 tensor and one integer label per date.
sequences = []
labels    = []

for date, group in merged.groupby('date'):
    # Stack the day's vectors in NumPy and convert to a tensor once, instead
    # of creating a separate tensor for every single news item.
    day_matrix = np.stack(group['emb_arr'].to_numpy()).astype(np.float32)
    seq = torch.from_numpy(day_matrix)
    sequences.append(seq)
    # The label of the first row is used for the whole day.
    # NOTE(review): assumes df_labels has a single label per date — confirm.
    labels.append(int(group['label'].iloc[0]))




In [None]:
# Preview the first rows of the merged news/label table.
merged.head()

Unnamed: 0,date,emb_arr,label
0,2023-12-13,"[-0.88232421875, -0.423583984375, -0.860839843...",2
1,2023-12-13,"[-0.76904296875, -0.422119140625, -0.766113281...",2
2,2023-12-08,"[-0.79150390625, -0.2430419921875, -0.08081054...",2
3,2023-12-08,"[-0.371826171875, -0.320068359375, -0.9609375,...",2
4,2023-12-06,"[-0.89013671875, -0.4248046875, -0.6533203125,...",2


In [None]:
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# ——— Baseline features: mean-pool every day's sequence ———
# sequences: list of Tensors, each (N_i, D)
# labels:    list of int


def report_baseline(title, y_true, y_pred):
    """Print accuracy, the per-class report and the confusion matrix of one model.

    Args:
        title: Header line printed before the metrics.
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
    """
    print(title)
    print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
    # zero_division=0 keeps the reported 0.0 for classes that are never
    # predicted, without emitting an undefined-metric warning per average.
    print(classification_report(y_true, y_pred, digits=4, zero_division=0))
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))


# Average each sequence over its first axis to get one (D,) vector per day.
X = np.stack([seq.mean(dim=0).cpu().numpy() for seq in sequences], axis=0)  # (num_days, D)
y = np.array(labels)                                                         # (num_days,)

# ——— Train / validation split ———
# NOTE(review): this is a random stratified split over days; if the days are
# time-ordered, a chronological split may be the more honest evaluation.
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# ——— Random forest baseline ———
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    class_weight='balanced',
    n_jobs=-1,
    random_state=42
)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_val)

report_baseline("=== Random Forest ===", y_val, y_pred_rf)

# ——— XGBoost baseline ———
# `use_label_encoder` and `scale_pos_weight` were dropped: XGBoost reports
# both as unused for this model. The metric is the multi-class log loss
# because the captured report shows three classes.
xgb = XGBClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=6,
    eval_metric='mlogloss',
    random_state=42,
    n_jobs=-1
)
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_val)

report_baseline("\n=== XGBoost ===", y_val, y_pred_xgb)

=== Random Forest ===
Accuracy: 0.5640
              precision    recall  f1-score   support

           0     0.4751    0.3071    0.3731       280
           1     0.0000    0.0000    0.0000        18
           2     0.5953    0.7708    0.6718       397

    accuracy                         0.5640       695
   macro avg     0.3568    0.3593    0.3483       695
weighted avg     0.5315    0.5640    0.5341       695

Confusion Matrix:
 [[ 86   0 194]
 [  4   0  14]
 [ 91   0 306]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== XGBoost ===
Accuracy: 0.5885
              precision    recall  f1-score   support

           0     0.5174    0.3714    0.4324       280
           1     0.0000    0.0000    0.0000        18
           2     0.6174    0.7683    0.6846       397

    accuracy                         0.5885       695
   macro avg     0.3783    0.3799    0.3724       695
weighted avg     0.5611    0.5885    0.5653       695

Confusion Matrix:
 [[104   0 176]
 [  5   0  13]
 [ 92   0 305]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# ——————————————————————————————————————————————
# 1) Preprocessing: build sequences and labels
# ——————————————————————————————————————————————
# Expects `merged` to exist already, i.e.
# merged = pd.merge(df[['date','emb_arr']], df_labels, on='date', how='inner')

sequences, labels = [], []
for date, group in merged.groupby('date'):
    # Stack the day's vectors in NumPy and convert to a tensor once, instead
    # of creating a separate tensor for every single news item.
    day_matrix = np.stack(group['emb_arr'].to_numpy()).astype(np.float32)
    seq = torch.from_numpy(day_matrix)  # shape (N_i, D)
    sequences.append(seq)
    # The label of the first row is used for the whole day.
    labels.append(int(group['label'].iloc[0]))

# ——————————————————————————————————————————————
# 2) Dataset + collate_fn
# ——————————————————————————————————————————————
class DateNewsDataset(Dataset):
    """Map-style dataset pairing one sequence with one label per index.

    Args:
        sequences: Indexable collection of per-item sequences.
        labels: Indexable collection of labels, aligned with ``sequences``.

    Raises:
        ValueError: If ``sequences`` and ``labels`` differ in length.
    """

    def __init__(self, sequences, labels):
        # __len__ is driven by the labels while __getitem__ indexes both
        # collections, so a length mismatch would surface late or not at all.
        if len(sequences) != len(labels):
            raise ValueError(
                f"sequences and labels must have the same length, "
                f"got {len(sequences)} and {len(labels)}"
            )
        self.sequences = sequences
        self.labels    = labels

    def __len__(self):
        """Return the number of (sequence, label) pairs."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``(sequence, label)`` pair stored at ``idx``."""
        return self.sequences[idx], self.labels[idx]

MAX_LEN = 512  # upper bound on the number of news items kept per day
import random

def sample_seq(s, N):
    """Return ``s`` limited to at most ``N`` rows along its first dimension.

    A tensor with ``N`` rows or fewer is returned unchanged. Otherwise ``N``
    distinct row indices are drawn with ``random.sample`` and applied in
    ascending order, so the kept rows preserve their original relative order.
    """
    n_rows = s.size(0)
    if n_rows > N:
        keep = random.sample(range(n_rows), N)
        keep.sort()
        return s[keep]  # index the tensor with the list of row positions
    return s

def collate_fn(batch):
    """Collate ``(sequence, label)`` pairs into padded tensors.

    Every sequence is first limited to ``MAX_LEN`` rows via ``sample_seq``,
    then the batch is zero-padded to the longest remaining sequence.

    Returns:
        A tuple ``(padded_seqs, mask, labels)``: the padded batch
        (batch-first), a boolean mask that is True on real rows and False on
        padding, and the labels as a ``torch.long`` tensor.
    """
    seqs, labs = zip(*batch)
    clipped = [sample_seq(seq, MAX_LEN) for seq in seqs]
    padded_seqs = pad_sequence(clipped, batch_first=True, padding_value=0.0)

    # Position t of row i is real data exactly when t < length of sequence i.
    lengths = torch.tensor([seq.size(0) for seq in clipped])
    steps = torch.arange(padded_seqs.size(1))
    mask = steps.unsqueeze(0) < lengths.unsqueeze(1)

    labels = torch.tensor(labs, dtype=torch.long)
    return padded_seqs, mask, labels

# ——————————————————————————————————————————————
# 3) 划分 train/val & DataLoader
# ——————————————————————————————————————————————
# Stratified 80/20 split of the per-day sequences and their labels.
seq_train, seq_val, lab_train, lab_val = train_test_split(
    sequences, labels, test_size=0.2, random_state=42, stratify=labels
)

train_ds = DateNewsDataset(seq_train, lab_train)
val_ds   = DateNewsDataset(seq_val,   lab_val)

# Both loaders share everything except shuffling.
loader_kwargs = dict(batch_size=16, collate_fn=collate_fn, drop_last=False)
train_loader = DataLoader(train_ds, shuffle=True, **loader_kwargs)
val_loader   = DataLoader(val_ds, shuffle=False, **loader_kwargs)







In [None]:


# ——————————————————————————————————————————————
# 4) 定义 TransformerClassifier
# ——————————————————————————————————————————————

class TransformerClassifier(nn.Module):
    """Transformer encoder over a sequence of vectors with a classification head.

    A learned positional embedding is added to the input, the result goes
    through a ``nn.TransformerEncoder``, the encoder output is mean-pooled
    over the sequence axis, and a two-layer MLP produces the class logits.

    Args:
        embed_dim: Size E of each input vector (also the encoder ``d_model``).
        num_classes: Number of output logits.
        nhead: Number of attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
        dim_feedforward: Hidden size of each encoder layer's feed-forward part.
        dropout: Dropout rate used in the encoder layers and the head.
        max_pos: Number of rows in the positional embedding table, i.e. the
            longest sequence length the model accepts.
    """

    def __init__(self, embed_dim, num_classes,
                 nhead=8, num_layers=2, dim_feedforward=512,
                 dropout=0.1, max_pos=5000):
        super().__init__()
        # Learned position vectors, one per sequence index.
        self.pos_emb = nn.Embedding(max_pos, embed_dim)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim, nhead=nhead,
            dim_feedforward=dim_feedforward, dropout=dropout,
            batch_first=True
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers)
        self.classifier = nn.Sequential(
            nn.Linear(embed_dim, 256),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
            nn.Linear(256, num_classes)
        )

    def forward(self, x, mask=None):
        """Compute class logits for a batch of sequences.

        Args:
            x: Tensor of shape (B, S, E).
            mask: Optional boolean tensor of shape (B, S), True on real
                positions and False on padding.

        Returns:
            Tensor of shape (B, num_classes) with the class logits.

        Raises:
            IndexError: If S exceeds the size of the positional table.
        """
        B, S, E = x.shape
        if S > self.pos_emb.num_embeddings:
            raise IndexError(
                f"sequence length {S} exceeds max_pos={self.pos_emb.num_embeddings}"
            )
        # positions: (B, S), every row is [0, 1, ..., S-1]
        positions = torch.arange(S, device=x.device).unsqueeze(0).expand(B, S)
        x = x + self.pos_emb(positions)   # add the position vectors

        # The encoder expects True on positions to IGNORE, hence the inversion.
        key_mask = None if mask is None else (~mask)
        x_enc = self.encoder(x, src_key_padding_mask=key_mask)

        if mask is None:
            x_mean = x_enc.mean(dim=1)
        else:
            # Average over real positions only, so the pooled vector does not
            # depend on how much padding the batch happens to contain.
            # NOTE(review): the mask is cropped to the encoder output length
            # in case a PyTorch version returns fewer than S positions on its
            # nested-tensor path — confirm whether this is still needed.
            valid = mask[:, :x_enc.size(1)].unsqueeze(-1)          # (B, S, 1)
            summed = x_enc.masked_fill(~valid, 0.0).sum(dim=1)     # (B, E)
            counts = valid.sum(dim=1).clamp(min=1).to(x_enc.dtype) # (B, 1)
            x_mean = summed / counts
        return self.classifier(x_mean)


# ——————————————————————————————————————————————
# 5) 初始化模型、优化器、损失函数
# ——————————————————————————————————————————————


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
embed_dim   = seq_train[0].shape[1]
# NOTE(review): assumes the labels are the contiguous ids 0..K-1, as required
# by CrossEntropyLoss with K logits — confirm.
num_classes = len(set(labels))

# Report the longest raw sequence, but size the positional table by what the
# model can actually receive: collate_fn limits every sequence to MAX_LEN
# rows, so positions beyond that would only add never-trained parameters.
longest = max(seq.size(0) for seq in sequences)
print("最长序列长度：", longest)
max_pos = min(longest, MAX_LEN)

model = TransformerClassifier(
    embed_dim, num_classes,
    nhead=8, num_layers=2, dim_feedforward=512, dropout=0.1,
    max_pos=max_pos     # covers every position a collated batch can contain
).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-4)
criterion = nn.CrossEntropyLoss()

# ——————————————————————————————————————————————
# 6) 训练 + 验证循环
# ——————————————————————————————————————————————
num_epochs = 5
for epoch in range(1, num_epochs + 1):
    # ---- training pass: one optimizer step per batch ----
    model.train()
    train_loss = 0.0
    train_bar = tqdm(train_loader, desc=f"Epoch {epoch} train")
    for x_batch, mask_batch, y_batch in train_bar:
        x_batch = x_batch.to(device)
        mask_batch = mask_batch.to(device)
        y_batch = y_batch.to(device)

        optimizer.zero_grad()
        logits = model(x_batch, mask_batch)
        loss   = criterion(logits, y_batch)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
    # Mean of the per-batch losses.
    train_loss = train_loss / len(train_loader)

    # ---- validation pass: no gradients, accumulate loss and accuracy ----
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        val_bar = tqdm(val_loader, desc=f"Epoch {epoch} val")
        for x_batch, mask_batch, y_batch in val_bar:
            x_batch = x_batch.to(device)
            mask_batch = mask_batch.to(device)
            y_batch = y_batch.to(device)

            logits = model(x_batch, mask_batch)
            val_loss += criterion(logits, y_batch).item()
            preds = logits.argmax(dim=1)
            correct += (preds == y_batch).sum().item()
            total   += y_batch.size(0)
    val_loss = val_loss / len(val_loader)
    val_acc  = correct / total

    print(f"[Epoch {epoch}] train_loss: {train_loss:.4f}  val_loss: {val_loss:.4f}  val_acc: {val_acc:.4f}")


最长序列长度： 47950


Epoch 1 train: 100%|██████████| 174/174 [01:51<00:00,  1.57it/s]
  output = torch._nested_tensor_from_mask(
Epoch 1 val: 100%|██████████| 44/44 [00:06<00:00,  7.30it/s]


[Epoch 1] train_loss: 0.8041  val_loss: 0.8732  val_acc: 0.5367


Epoch 2 train: 100%|██████████| 174/174 [01:49<00:00,  1.59it/s]
Epoch 2 val: 100%|██████████| 44/44 [00:05<00:00,  7.88it/s]


[Epoch 2] train_loss: 0.7926  val_loss: 0.9028  val_acc: 0.5122


Epoch 3 train: 100%|██████████| 174/174 [01:55<00:00,  1.50it/s]
Epoch 3 val: 100%|██████████| 44/44 [00:09<00:00,  4.69it/s]


[Epoch 3] train_loss: 0.7842  val_loss: 0.8597  val_acc: 0.5266


Epoch 4 train: 100%|██████████| 174/174 [02:10<00:00,  1.34it/s]
Epoch 4 val: 100%|██████████| 44/44 [00:08<00:00,  5.14it/s]


[Epoch 4] train_loss: 0.7746  val_loss: 0.8733  val_acc: 0.4993


Epoch 5 train: 100%|██████████| 174/174 [02:41<00:00,  1.08it/s]
Epoch 5 val: 100%|██████████| 44/44 [00:08<00:00,  5.30it/s]

[Epoch 5] train_loss: 0.7615  val_loss: 0.8671  val_acc: 0.5252



