<a href="https://colab.research.google.com/github/Hijuli66/33/blob/master/fusion_results.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so later cells can read and write files there.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:

# 2. Recursively list every *.csv file under the "33" project folder on Drive.
#    Error output from find is discarded (2>/dev/null).
!find "/content/drive/MyDrive/33" -type f -name "*.csv" 2>/dev/null


/content/drive/MyDrive/33/Images/QaTa-dataset/pair_table.csv
/content/drive/MyDrive/33/Images/Models/images_features.csv
/content/drive/MyDrive/33/Text/text_features.csv
/content/drive/MyDrive/33/Text/keywords.csv
/content/drive/MyDrive/33/Data/train.csv
/content/drive/MyDrive/33/Data/val.csv
/content/drive/MyDrive/33/Data/test.csv
/content/drive/MyDrive/33/Data/test_predictions.csv
/content/drive/MyDrive/33/Data/reports2.csv
/content/drive/MyDrive/33/Data/reports1.csv
/content/drive/MyDrive/33/Data/reports_merged.csv


In [None]:
# 挂载 Google Drive
from google.colab import drive
drive.mount('/content/drive')


# ==========================================
# 阶段 0：安装必要的包
# ==========================================
print("阶段 0：正在安装依赖包...")
!pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu118 -q
!pip install transformers sentence-transformers open-clip-torch ftfy scikit-learn -q
print("阶段 0 完成：所有依赖包已安装\n")

import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import os

# ==========================================
# Stage 1: load the pre-computed image/text feature tables and pair them by image_id
# ==========================================
print("阶段 1：加载图像和文本特征文件...")
img_df = pd.read_csv("/content/drive/MyDrive/33/Images/Models/images_features.csv")
text_df = pd.read_csv("/content/drive/MyDrive/33/Text/text_features.csv")
print("加载完成，图像样本:", len(img_df), "文本样本:", len(text_df))

# Inner join on image_id. Columns that exist in both files (feature_vector, label)
# are disambiguated with the _img / _text suffixes.
paired_df = img_df.merge(text_df, on='image_id', how='inner', suffixes=('_img', '_text'))
print(f"成功配对样本数: {len(paired_df)}")

# Only the image-side label is kept below. Report (without aborting) any pair whose
# two labels disagree, because that would indicate the pairing itself is wrong.
# Note: rows where either label is NaN also count as a mismatch.
n_label_mismatch = int((paired_df['label_img'] != paired_df['label_text']).sum())
if n_label_mismatch:
    print(f"警告: {n_label_mismatch} 个配对样本的图像标签与文本标签不一致")

# Keep the image features, text features and the image-side label, then rename.
# This is a chained, non-mutating expression, so re-running the cell is idempotent.
df = (
    paired_df[['image_id', 'feature_vector_img', 'feature_vector_text', 'label_img']]
    .rename(columns={
        'feature_vector_img': 'img_vec',
        'feature_vector_text': 'text_vec',
        'label_img': 'label'
    })
)

print("最终使用的列名:", df.columns.tolist())

# Parsing helper
def str_to_vec(s, expected_dim):
    """Parse a comma-separated feature string into a float32 vector.

    Args:
        s: The serialized vector, e.g. ``"0.1,0.2,0.3"`` or ``"[0.1,0.2,0.3]"``.
            A missing value (anything ``pd.isna`` reports as NA) is accepted.
        expected_dim: Number of components the vector must have.

    Returns:
        A 1-D ``np.float32`` array of length ``expected_dim``. Missing input
        yields an all-zero vector.

    Raises:
        ValueError: If the parsed vector does not have ``expected_dim`` components.
    """
    if pd.isna(s):
        return np.zeros(expected_dim, dtype=np.float32)

    text = str(s).strip()
    # Strip each bracket only if it is actually present. Slicing [1:-1]
    # unconditionally would eat the last digit of an unterminated "[1,2,35".
    if text.startswith('['):
        text = text[1:]
    if text.endswith(']'):
        text = text[:-1]

    vec = np.fromstring(text, sep=',', dtype=np.float32)
    # Fail here, with the offending size, instead of later inside np.stack.
    if vec.shape[0] != expected_dim:
        raise ValueError(
            f"expected a vector of {expected_dim} values, parsed {vec.shape[0]}"
        )
    return vec

print("正在解析特征向量（约10-20秒）...")
# Decode every serialized vector; the second argument is the dimension that
# str_to_vec uses for the zero vector it returns on missing input.
df['img_vec'] = df['img_vec'].apply(str_to_vec, args=(576,))    # image features: 576-d
df['text_vec'] = df['text_vec'].apply(str_to_vec, args=(768,))  # text features: 768-d

# Spot-check the first row of each modality.
first_img_vec = df['img_vec'].iloc[0]
first_text_vec = df['text_vec'].iloc[0]
print("第一条图像特征形状:", first_img_vec.shape)   # expected (576,)
print("第一条文本特征形状:", first_text_vec.shape)  # expected (768,)
print("前5个图像特征值:", first_img_vec[:5])


def _unit_rows(matrix):
    """Scale each row to unit L2 norm; the 1e-8 guards all-zero rows against division by zero."""
    row_norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    return matrix / (row_norms + 1e-8)


# Stack the per-row vectors into 2-D matrices and L2-normalize them row-wise.
labels = df['label'].values.astype(np.float32)
img_feats = _unit_rows(np.stack(df['img_vec'].values))
text_feats = _unit_rows(np.stack(df['text_vec'].values))

print(f"最终特征形状 → 图像: {img_feats.shape} | 文本: {text_feats.shape} | 标签: {labels.shape}")
print("阶段 1 完成：数据加载、解析、归一化全部成功\n")

# ==========================================
# Stage 2: stratified train / validation / test split
# ==========================================
print("阶段 2：智能划分数据集（8:1:1）...")
all_idx = np.arange(len(df))

# First hold out 10% of all rows as the test set, stratified on the label.
train_val_idx, test_idx = train_test_split(
    all_idx,
    test_size=0.1,
    random_state=42,
    stratify=labels,
)
# 0.1111 of the remaining 90% is roughly 10% of the full data, giving about 8:1:1.
train_idx, val_idx = train_test_split(
    train_val_idx,
    test_size=0.1111,
    random_state=42,
    stratify=labels[train_val_idx],
)

# Slice every array with the same three index sets so the rows stay aligned.
split_indices = (train_idx, val_idx, test_idx)
train_img, val_img, test_img = (img_feats[part] for part in split_indices)
train_text, val_text, test_text = (text_feats[part] for part in split_indices)
train_labels, val_labels, test_labels = (labels[part] for part in split_indices)

print(f"训练集: {len(train_idx)} | 验证集: {len(val_idx)} | 测试集: {len(test_idx)}")
print("阶段 2 完成\n")

# ==========================================
# Stage 3: define the fusion model and the data loaders
# ==========================================
print("阶段 3：定义融合模型和 DataLoader...")

class SimpleFusion(nn.Module):
    """Late-fusion binary classifier over one image vector and one text vector.

    Each modality is linearly projected to ``proj_dim`` features, the two
    projections are concatenated, and a small MLP maps the fused vector to a
    single sigmoid probability.

    Args:
        img_dim: Size of the last dimension of the image feature input.
        text_dim: Size of the last dimension of the text feature input.
        hidden: Width of the classifier's hidden layer.
        proj_dim: Width of each per-modality projection. The classifier input
            is ``2 * proj_dim``. The default of 256 reproduces the original
            fixed 256 / 512 layout, so existing checkpoints still load.
    """

    def __init__(self, img_dim=576, text_dim=768, hidden=512, proj_dim=256):
        super().__init__()
        self.proj_img = nn.Linear(img_dim, proj_dim)
        self.proj_text = nn.Linear(text_dim, proj_dim)
        self.classifier = nn.Sequential(
            # Input is the concatenation of both projections.
            nn.Linear(2 * proj_dim, hidden),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden, 1)
        )

    def forward(self, img_feat, text_feat):
        """Return probabilities in [0, 1] with a trailing dimension of size 1.

        The sigmoid is applied here, so callers should pair this model with a
        probability-based loss such as ``nn.BCELoss`` rather than a logit-based one.
        """
        i = self.proj_img(img_feat)
        t = self.proj_text(text_feat)
        fused = torch.cat([i, t], dim=-1)
        return torch.sigmoid(self.classifier(fused))

# Seed torch before any weights are created so that weight initialization, DataLoader
# shuffling and dropout draw from a fixed seed (the same value as the split's random_state).
# NOTE(review): some GPU kernels may still be non-deterministic; confirm if bit-exact
# repeatability is required.
torch.manual_seed(42)
model = SimpleFusion().cuda()
print("模型已定义并移到 GPU")

class MyDataset(Dataset):
    """In-memory dataset of aligned (image feature, text feature, label) triples.

    All three inputs are copied into float32 tensors at construction time;
    indexing returns the matching entry of each as a 3-tuple.
    """

    def __init__(self, img_f, text_f, labels):
        self.img_f, self.text_f, self.labels = (
            torch.tensor(array, dtype=torch.float32)
            for array in (img_f, text_f, labels)
        )

    def __len__(self):
        # The label tensor defines how many samples there are.
        return len(self.labels)

    def __getitem__(self, idx):
        return tuple(store[idx] for store in (self.img_f, self.text_f, self.labels))

# Wrap each split's arrays in a dataset.
train_dataset, val_dataset, test_dataset = (
    MyDataset(img_part, text_part, label_part)
    for img_part, text_part, label_part in (
        (train_img, train_text, train_labels),
        (val_img, val_text, val_labels),
        (test_img, test_text, test_labels),
    )
)

# Only the training loader shuffles. The evaluation loaders keep row order so
# predictions line up with val_labels / test_labels.
eval_loader_kwargs = dict(batch_size=128, shuffle=False)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, **eval_loader_kwargs)
test_loader = DataLoader(test_dataset, **eval_loader_kwargs)
print("阶段 3 完成：模型和 DataLoader \n")

# ==========================================
# Stage 4: train the fusion model, checkpoint the best epoch, stop early on a plateau
# ==========================================
print("阶段 4：开始训练融合模型（最多 30 个 epoch，验证集 AUC 最高时保存）...")
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
criterion = nn.BCELoss()  # BCELoss expects probabilities in [0, 1], not logits

MAX_EPOCHS = 30
PATIENCE = 5  # consecutive epochs without a new best val AUC before stopping

# Start below any reachable AUC so the first epoch always writes a checkpoint.
best_val_auc = float("-inf")
best_model_path = "/content/drive/MyDrive/33/Matching/best_fusion_model.pth"
# torch.save does not create missing parent directories.
os.makedirs(os.path.dirname(best_model_path), exist_ok=True)

epochs_since_best = 0
for epoch in range(MAX_EPOCHS):
    model.train()
    train_loss = 0
    for img_batch, text_batch, label_batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{MAX_EPOCHS}"):
        img_batch, text_batch = img_batch.cuda(), text_batch.cuda()
        # Add a trailing dimension so the labels match the model's (batch, 1) output.
        label_batch = label_batch.cuda().unsqueeze(1)
        pred = model(img_batch, text_batch)
        loss = criterion(pred, label_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # Validation pass
    model.eval()
    val_preds, val_true = [], []
    with torch.no_grad():
        for img_batch, text_batch, label_batch in val_loader:
            pred = model(img_batch.cuda(), text_batch.cuda()).cpu().numpy()
            val_preds.extend(pred.flatten())
            val_true.extend(label_batch.numpy())

    val_auc = roc_auc_score(val_true, val_preds)
    print(f"Epoch {epoch+1:2d} | Train Loss: {train_loss/len(train_loader):.4f} | Val AUC: {val_auc:.4f}")

    if val_auc > best_val_auc:
        best_val_auc = val_auc
        epochs_since_best = 0
        torch.save(model.state_dict(), best_model_path)
        print("保存最佳模型（当前最高 Val AUC）")
    else:
        # Early stopping: give up once the validation AUC has plateaued.
        epochs_since_best += 1
        if epochs_since_best >= PATIENCE:
            print(f"验证集 AUC 连续 {PATIENCE} 个 epoch 未提升，提前停止训练")
            break

print("阶段 4 完成：训练结束，最佳模型已保存\n")

# ==========================================
# Stage 5: reload the best checkpoint and score the held-out test set
# ==========================================
print("阶段 5：加载最佳模型，在测试集上进行最终评估")
best_state = torch.load(best_model_path)
model.load_state_dict(best_state)
model.eval()

test_preds = []
with torch.no_grad():
    # Labels from the loader are not needed here; test_labels is used for the metrics below.
    for img_batch, text_batch, _batch_labels in tqdm(test_loader, desc="测试集推理"):
        batch_scores = model(img_batch.cuda(), text_batch.cuda())
        test_preds.extend(batch_scores.cpu().numpy().ravel())

test_preds = np.array(test_preds)
# Threshold the predicted probability at 0.5 to get a hard 0/1 decision.
test_pred_label = (test_preds > 0.5).astype(int)

model_auc = roc_auc_score(test_labels, test_preds)
model_acc = accuracy_score(test_labels, test_pred_label)
print(f"模型在测试集上的 AUC: {model_auc:.4f} | Acc: {model_acc:.4f}")
print("阶段 5 完成\n")

# ==========================================
# Stage 6: cosine-similarity baseline in the projected space
# ==========================================
# NOTE(review): the two projection layers are trained only through the classifier's
# BCE loss; nothing in this notebook explicitly aligns them with each other. The cosine
# of the two projections is therefore not guaranteed to rank positive samples higher.
# The recorded run of this cell reports AUC 0.0078 — confirm what this score is meant
# to measure before treating it as a baseline.
print("阶段 6：使用训练好的投影层计算余弦相似度 baseline（投影到同一空间）...")

model.eval()
with torch.no_grad():
    # Move the whole test split to the GPU in one go.
    test_img_tensor = torch.tensor(test_img, dtype=torch.float32).cuda()
    test_text_tensor = torch.tensor(test_text, dtype=torch.float32).cuda()

    # Apply only the per-modality projections (skip the classifier head), then
    # L2-normalize each row so the row-wise dot product equals the cosine similarity.
    proj_img = torch.nn.functional.normalize(model.proj_img(test_img_tensor), p=2, dim=1)     # (N, 256)
    proj_text = torch.nn.functional.normalize(model.proj_text(test_text_tensor), p=2, dim=1)  # (N, 256)

    cos_sim = torch.sum(proj_img * proj_text, dim=1).cpu().numpy()  # (N,)

# Metrics: AUC on the raw similarity, accuracy after thresholding the similarity at 0.
baseline_pred_label = (cos_sim > 0.0).astype(int)
baseline_auc = roc_auc_score(test_labels, cos_sim)
baseline_acc = accuracy_score(test_labels, baseline_pred_label)

print(f"【投影后余弦相似度 Baseline】 AUC: {baseline_auc:.4f} | Acc: {baseline_acc:.4f}")
print("阶段 6 完成\n")

# ==========================================
# Stage 7: write the per-sample test results to Drive
# ==========================================
print("阶段 7：保存最终结果到 Google Drive...")
# One row per test sample; every column is ordered by test_idx.
result_columns = {
    "image_id": df['image_id'].iloc[test_idx].to_numpy(),
    "similarity": cos_sim,
    "fusion_confidence": test_preds,
    "diagnosis": test_pred_label,
    "true_label": test_labels,
}
result_df = pd.DataFrame(result_columns)
result_df.to_csv("/content/drive/MyDrive/33/Matching/final_fusion_results.csv", index=False)
print("阶段 7 完成：最终结果已保存到 final_fusion_results.csv")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
阶段 0：正在安装依赖包...
阶段 0 完成：所有依赖包已安装

阶段 1：加载图像和文本特征文件...
加载完成，图像样本: 9219 文本样本: 9603
成功配对样本数: 9219
最终使用的列名: ['image_id', 'img_vec', 'text_vec', 'label']
正在解析特征向量（约10-20秒）...
第一条图像特征形状: (576,)
第一条文本特征形状: (768,)
前5个图像特征值: [0.48215514 0.5700968  0.36772284 0.97950566 0.8299695 ]
最终特征形状 → 图像: (9219, 576) | 文本: (9219, 768) | 标签: (9219,)
阶段 1 完成：数据加载、解析、归一化全部成功！

阶段 2：智能划分数据集（8:1:1）...
训练集: 7375 | 验证集: 922 | 测试集: 922
阶段 2 完成

阶段 3：定义融合模型和 DataLoader...
模型已定义并移到 GPU
阶段 3 完成：模型和 DataLoader 

阶段 4：开始训练融合模型（最多 30 个 epoch，验证集 AUC 最高时保存）...


Epoch 1/30: 100%|██████████| 116/116 [00:00<00:00, 379.56it/s]


Epoch  1 | Train Loss: 0.0852 | Val AUC: 1.0000
   → 保存最佳模型！（当前最高 Val AUC）


Epoch 2/30: 100%|██████████| 116/116 [00:00<00:00, 367.34it/s]


Epoch  2 | Train Loss: 0.0025 | Val AUC: 1.0000


Epoch 3/30: 100%|██████████| 116/116 [00:00<00:00, 372.33it/s]


Epoch  3 | Train Loss: 0.0012 | Val AUC: 1.0000


Epoch 4/30: 100%|██████████| 116/116 [00:00<00:00, 390.91it/s]


Epoch  4 | Train Loss: 0.0004 | Val AUC: 1.0000


Epoch 5/30: 100%|██████████| 116/116 [00:00<00:00, 383.14it/s]


Epoch  5 | Train Loss: 0.0001 | Val AUC: 1.0000


Epoch 6/30: 100%|██████████| 116/116 [00:00<00:00, 373.71it/s]


Epoch  6 | Train Loss: 0.0001 | Val AUC: 1.0000


Epoch 7/30: 100%|██████████| 116/116 [00:00<00:00, 396.11it/s]


Epoch  7 | Train Loss: 0.0000 | Val AUC: 1.0000


Epoch 8/30: 100%|██████████| 116/116 [00:00<00:00, 392.82it/s]


Epoch  8 | Train Loss: 0.0000 | Val AUC: 1.0000


Epoch 9/30: 100%|██████████| 116/116 [00:00<00:00, 377.06it/s]


Epoch  9 | Train Loss: 0.0000 | Val AUC: 1.0000


Epoch 10/30: 100%|██████████| 116/116 [00:00<00:00, 392.02it/s]


Epoch 10 | Train Loss: 0.0000 | Val AUC: 1.0000


Epoch 11/30: 100%|██████████| 116/116 [00:00<00:00, 382.72it/s]


Epoch 11 | Train Loss: 0.0000 | Val AUC: 1.0000


Epoch 12/30: 100%|██████████| 116/116 [00:00<00:00, 375.79it/s]


Epoch 12 | Train Loss: 0.0000 | Val AUC: 1.0000


Epoch 13/30: 100%|██████████| 116/116 [00:00<00:00, 383.22it/s]


Epoch 13 | Train Loss: 0.0000 | Val AUC: 1.0000


Epoch 14/30: 100%|██████████| 116/116 [00:00<00:00, 384.43it/s]


Epoch 14 | Train Loss: 0.0000 | Val AUC: 1.0000


Epoch 15/30: 100%|██████████| 116/116 [00:00<00:00, 366.30it/s]


Epoch 15 | Train Loss: 0.0000 | Val AUC: 1.0000


Epoch 16/30: 100%|██████████| 116/116 [00:00<00:00, 386.10it/s]


Epoch 16 | Train Loss: 0.0000 | Val AUC: 1.0000


Epoch 17/30: 100%|██████████| 116/116 [00:00<00:00, 383.49it/s]


Epoch 17 | Train Loss: 0.0000 | Val AUC: 1.0000


Epoch 18/30: 100%|██████████| 116/116 [00:00<00:00, 312.35it/s]


Epoch 18 | Train Loss: 0.0000 | Val AUC: 1.0000


Epoch 19/30: 100%|██████████| 116/116 [00:00<00:00, 360.80it/s]


Epoch 19 | Train Loss: 0.0000 | Val AUC: 1.0000


Epoch 20/30: 100%|██████████| 116/116 [00:00<00:00, 375.73it/s]


Epoch 20 | Train Loss: 0.0000 | Val AUC: 1.0000


Epoch 21/30: 100%|██████████| 116/116 [00:00<00:00, 377.85it/s]


Epoch 21 | Train Loss: 0.0000 | Val AUC: 1.0000


Epoch 22/30: 100%|██████████| 116/116 [00:00<00:00, 382.61it/s]


Epoch 22 | Train Loss: 0.0000 | Val AUC: 1.0000


Epoch 23/30: 100%|██████████| 116/116 [00:00<00:00, 376.54it/s]


Epoch 23 | Train Loss: 0.0000 | Val AUC: 1.0000


Epoch 24/30: 100%|██████████| 116/116 [00:00<00:00, 315.29it/s]


Epoch 24 | Train Loss: 0.0000 | Val AUC: 1.0000


Epoch 25/30: 100%|██████████| 116/116 [00:00<00:00, 307.06it/s]


Epoch 25 | Train Loss: 0.0000 | Val AUC: 1.0000


Epoch 26/30: 100%|██████████| 116/116 [00:00<00:00, 309.76it/s]


Epoch 26 | Train Loss: 0.0000 | Val AUC: 1.0000


Epoch 27/30: 100%|██████████| 116/116 [00:00<00:00, 284.53it/s]


Epoch 27 | Train Loss: 0.0000 | Val AUC: 1.0000


Epoch 28/30: 100%|██████████| 116/116 [00:00<00:00, 307.11it/s]


Epoch 28 | Train Loss: 0.0000 | Val AUC: 1.0000


Epoch 29/30: 100%|██████████| 116/116 [00:00<00:00, 309.35it/s]


Epoch 29 | Train Loss: 0.0000 | Val AUC: 1.0000


Epoch 30/30: 100%|██████████| 116/116 [00:00<00:00, 265.42it/s]


Epoch 30 | Train Loss: 0.0000 | Val AUC: 1.0000
阶段 4 完成：训练结束，最佳模型已保存

阶段 5：加载最佳模型，在测试集上进行最终评估


测试集推理: 100%|██████████| 8/8 [00:00<00:00, 510.58it/s]


模型在测试集上的 AUC: 1.0000 | Acc: 1.0000
阶段 5 完成

阶段 6：使用训练好的投影层计算余弦相似度 baseline（投影到同一空间）...
【投影后余弦相似度 Baseline】 AUC: 0.0078 | Acc: 0.3970
阶段 6 完成（这才是真正的强 baseline！）

阶段 7：保存最终结果到 Google Drive...
阶段 7 完成：最终结果已保存到 final_fusion_results.csv


In [None]:
# ==========================================
# Diagnostics only: print the column names and a few sample rows of each feature file
# ==========================================
import pandas as pd


def _preview_feature_csv(path, kind, rule):
    """Read a feature CSV, print a short summary of it, and return the frame.

    Args:
        path: Location of the CSV file to read.
        kind: Modality name interpolated into the printed headings.
        rule: Single character repeated 60 times as the closing separator line.

    Returns:
        The DataFrame read from ``path``.
    """
    print(f"正在读取{kind}特征文件...")
    frame = pd.read_csv(path)
    print(f"\n【{kind}特征文件】列名：")
    print(frame.columns.tolist())
    print(f"\n【{kind}特征文件】前 3 行完整数据：")
    print(frame.head(3))
    print(f"\n总行数：{len(frame)}")
    print(rule * 60)
    return frame


# Same report for both modalities; only the path, heading and separator differ.
img_df = _preview_feature_csv("/content/drive/MyDrive/33/Images/Models/images_features.csv", "图像", "-")
text_df = _preview_feature_csv("/content/drive/MyDrive/33/Text/text_features.csv", "文本", "=")

正在读取图像特征文件...

【图像特征文件】列名：
['image_id', 'feature_vector', 'label']

【图像特征文件】前 3 行完整数据：
          image_id                                     feature_vector  label
0  normal_4980.png  0.48215514,0.5700968,0.36772284,0.97950566,0.8...      0
1  normal_4976.png  0.3330781,0.34160233,0.56426513,0.7748047,0.52...      0
2  normal_4986.png  0.23818964,0.8397683,0.077592514,-0.03271132,0...      0

总行数：9219
------------------------------------------------------------
正在读取文本特征文件...

【文本特征文件】列名：
['image_id', 'feature_vector', 'label']

【文本特征文件】前 3 行完整数据：
         image_id                                     feature_vector  label
0   covid_424.png  0.16321910917758942,0.022436531260609627,-0.07...      1
1  covid_4161.png  0.1978515386581421,0.10673397779464722,0.56492...      1
2  covid_4188.png  0.025793712586164474,-0.401252806186676,-0.218...      1

总行数：9603
