<a href="https://colab.research.google.com/github/Heng1222/Ohsumed_classification/blob/main/Model/task3_MeSH_LoRA_train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install torchinfo

Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl.metadata (21 kB)
Downloading torchinfo-1.8.0-py3-none-any.whl (23 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.8.0


In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaModel, RobertaTokenizer
from torch.optim import AdamW
from peft import LoraConfig, get_peft_model, TaskType
import matplotlib.pyplot as plt
from torchinfo import summary
from tqdm import tqdm

# 1. MeSH Semantic Loss (WSL) from the paper.
# Formula: L_WSL = (1/|P|) * sum over pairs (i, j) in P of
#          (CosSim(v_i, v_j) - WUP(s_i, s_j))^2
class MeSHSemanticLoss(nn.Module):
    """Mean squared error between embedding cosine similarity and a WUP target.

    The module has no parameters; it only defines the loss computation.
    """

    def __init__(self):
        super().__init__()

    def forward(self, vec_i, vec_j, target_wup):
        """Return the mean squared gap between CosSim(vec_i, vec_j) and target_wup.

        Args:
            vec_i: Embeddings of the first element of each pair.
            vec_j: Embeddings of the second element of each pair (same shape as
                ``vec_i``).
            target_wup: Target similarity per pair. A tensor holding one value
                per pair in any layout (e.g. ``(B,)`` or ``(B, 1)``), or anything
                that broadcasts against the cosine-similarity tensor.

        Returns:
            A scalar tensor with the mean of the squared differences.
        """
        # Cosine similarity along dim=1 (the function's default dimension).
        cos_sim = F.cosine_similarity(vec_i, vec_j)

        # A (B, 1) target would silently broadcast against the (B,) similarities
        # into a (B, B) matrix and average over every cross pair. When the target
        # holds exactly one value per similarity, align its shape first.
        if (
            torch.is_tensor(target_wup)
            and target_wup.shape != cos_sim.shape
            and target_wup.numel() == cos_sim.numel()
        ):
            target_wup = target_wup.reshape(cos_sim.shape)

        # Squared error (CosSim - WUP)^2 averaged over all pairs.
        return torch.mean((cos_sim - target_wup) ** 2)

# 2. Dataset class wrapping the term pairs read from the CSV.
class SemanticPairDataset(Dataset):
    """Pre-tokenized (word_i, word_j, wup_similarity) pairs.

    Both text columns are tokenized once, up front, in ``__init__``; indexing
    afterwards only slices the stored tensors.

    Args:
        dataframe: Frame with the columns ``word_i``, ``word_j`` and
            ``wup_similarity``.
        tokenizer: Callable invoked as ``tokenizer(texts, padding=True,
            truncation=True, max_length=..., return_tensors="pt")`` whose result
            supports ``["input_ids"]`` and ``["attention_mask"]`` lookups.
        max_length: Truncation limit forwarded to the tokenizer.
    """

    def __init__(self, dataframe, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.max_length = max_length

        def encode_column(column_name):
            # Tokenize a whole text column in a single tokenizer call.
            texts = dataframe[column_name].tolist()
            return tokenizer(
                texts,
                padding=True,
                truncation=True,
                max_length=max_length,
                return_tensors="pt",
            )

        self.encoded_i = encode_column('word_i')
        self.encoded_j = encode_column('word_j')

        # Target similarities, stored as a float32 tensor.
        similarity_values = dataframe['wup_similarity'].values.astype(float)
        self.wup_sim = torch.tensor(similarity_values, dtype=torch.float)

    def __len__(self):
        """Number of pairs (one per target similarity value)."""
        return self.wup_sim.shape[0]

    def __getitem__(self, idx):
        """Return the tensors of pair ``idx`` as a flat dict."""
        enc_i, enc_j = self.encoded_i, self.encoded_j
        return dict(
            input_ids_i=enc_i["input_ids"][idx],
            attention_mask_i=enc_i["attention_mask"][idx],
            input_ids_j=enc_j["input_ids"][idx],
            attention_mask_j=enc_j["attention_mask"][idx],
            wup_sim=self.wup_sim[idx],
        )

def _pair_loss(model, criterion, batch, device):
    """Compute the semantic loss for one batch of term pairs.

    Each side of the pair is run through ``model`` separately (side ``i`` first,
    then side ``j``); the hidden state at sequence position 0 is used as the
    embedding of that side.
    """
    def embed(side):
        outputs = model(
            input_ids=batch[f"input_ids_{side}"].to(device),
            attention_mask=batch[f"attention_mask_{side}"].to(device),
        )
        return outputs.last_hidden_state[:, 0, :]

    emb_i = embed("i")
    emb_j = embed("j")
    return criterion(emb_i, emb_j, batch["wup_sim"].to(device))


def run_training(
    csv_path,
    sample_frac=0.03,
    train_frac=0.9,
    batch_size=32,
    lr=5e-5,
    num_epochs=40,
    patience=3,
    min_delta=0.0001,
    seed=42,
    output_dir="roberta_semantic_lora",
):
    """Fine-tune roberta-base with LoRA so that embedding cosine similarity matches WUP.

    Args:
        csv_path: CSV file with the columns ``word_i``, ``word_j`` and
            ``wup_similarity``.
        sample_frac: Fraction of the CSV rows actually used (kept small to speed
            up training and debugging).
        train_frac: Fraction of the sampled rows used for training; the rest is
            the held-out split.
        batch_size: Batch size for both data loaders.
        lr: AdamW learning rate.
        num_epochs: Maximum number of epochs (early stopping may end sooner).
        patience: Number of consecutive epochs without sufficient improvement
            of the held-out loss before training stops.
        min_delta: Minimum decrease of the held-out loss that counts as an
            improvement.
        seed: Seed for the pandas sampling and for the torch global RNG.
        output_dir: Directory for the adapter saved at the end of training
            (weights of the last epoch run). The adapter with the best held-out
            loss is saved separately to ``f"{output_dir}_best"``.

    Raises:
        ValueError: If the train or held-out split ends up empty.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Seed torch before the LoRA weights are created so adapter initialisation,
    # dropout and DataLoader shuffling are repeatable across runs.
    torch.manual_seed(seed)

    # 5. Data split (train / held-out). Done before loading the model so a bad
    # CSV fails fast.
    full_df = pd.read_csv(csv_path)
    full_df = full_df.sample(frac=sample_frac, random_state=seed)  # subsample to speed up training/debugging
    train_df = full_df.sample(frac=train_frac, random_state=seed)  # training split
    test_df = full_df.drop(train_df.index)                         # held-out split
    if train_df.empty or test_df.empty:
        # Otherwise this surfaces later as a tokenizer failure on an empty list
        # or a ZeroDivisionError when averaging over zero batches.
        raise ValueError(
            f"Empty data split (train={len(train_df)}, test={len(test_df)}) "
            f"for '{csv_path}'; provide more rows or adjust sample_frac/train_frac."
        )

    # Initialise the model and tokenizer.
    model_name = "roberta-base"
    tokenizer = RobertaTokenizer.from_pretrained(model_name)
    base_model = RobertaModel.from_pretrained(model_name)

    # 4. LoRA configuration (r=16, alpha=32).
    lora_config = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["query", "key", "value", "output.dense"],
        lora_dropout=0.1,
    )

    model = get_peft_model(base_model, lora_config)
    model.to(device)

    # ==================================
    print(summary(model))
    print("test size:", test_df.shape)
    print("train size:", train_df.shape)
    print(train_df.describe())
    print(test_df.describe())
    # ==================================

    train_loader = DataLoader(
        SemanticPairDataset(train_df, tokenizer=tokenizer),
        batch_size=batch_size,
        shuffle=True,
    )
    val_loader = DataLoader(
        SemanticPairDataset(test_df, tokenizer=tokenizer),
        batch_size=batch_size,
    )

    # Only parameters that require gradients are handed to the optimizer; the
    # remaining parameters never receive gradients.
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer = AdamW(trainable_params, lr=lr)
    criterion = MeSHSemanticLoss()

    # Per-epoch losses, kept for the plot below.
    train_losses, val_losses = [], []

    # Early-stopping state.
    best_val_loss = float('inf')
    patience_counter = 0
    best_dir = f"{output_dir}_best"

    # 6. Training loop.
    for epoch in range(num_epochs):
        model.train()
        total_train_loss = 0.0
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1} Training"):
            optimizer.zero_grad()
            loss = _pair_loss(model, criterion, batch, device)
            loss.backward()
            optimizer.step()
            total_train_loss += loss.item()

        # Evaluation on the held-out split: eval mode, no gradients.
        model.eval()
        total_val_loss = 0.0
        with torch.no_grad():
            for batch in tqdm(val_loader, desc=f"Epoch {epoch+1} Validation"):
                total_val_loss += _pair_loss(model, criterion, batch, device).item()

        # Averages are taken over batches, not over individual samples.
        avg_train = total_train_loss / len(train_loader)
        avg_val = total_val_loss / len(val_loader)
        train_losses.append(avg_train)
        val_losses.append(avg_val)

        print(f"Epoch {epoch+1}: Train Loss={avg_train:.6f}, Test Loss={avg_val:.6f}")

        # Early-stopping check: only a drop larger than min_delta counts.
        if avg_val < best_val_loss - min_delta:
            best_val_loss = avg_val
            patience_counter = 0
            # Keep a copy of the best adapter seen so far.
            model.save_pretrained(best_dir)
        else:
            patience_counter += 1
            print(f"Early Stopping: Validation loss has not improved for {patience_counter} epochs.")
            if patience_counter >= patience:
                print(f"Early stopping triggered after {epoch+1} epochs. Best validation loss: {best_val_loss:.6f}")
                break

    # 7. Plot the loss curves.
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(train_losses, label='Train Loss')
    ax.plot(val_losses, label='Val Loss')
    ax.set_title('Training and Validation Semantic Loss')
    ax.set_xlabel('Epochs')
    ax.set_ylabel('Loss')
    ax.legend()
    plt.show()

    # Save the LoRA adapter as it is after the last epoch that ran; the best
    # checkpoint lives in best_dir.
    model.save_pretrained(output_dir)
    print(f"LoRA Adapter saved to '{output_dir}'.")
# Run: start training on 'mesh_dataset.csv' when this cell/script is executed directly.
if __name__ == "__main__":
    run_training('mesh_dataset.csv')


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Layer (type:depth-idx)                                                 Param #
PeftModel                                                              --
├─LoraModel: 1-1                                                       --
│    └─RobertaModel: 2-1                                               --
│    │    └─RobertaEmbeddings: 3-1                                     (39,000,576)
│    │    └─RobertaEncoder: 3-2                                        86,971,392
│    │    └─RobertaPooler: 3-3                                         (590,592)
Total params: 126,562,560
Trainable params: 1,916,928
Non-trainable params: 124,645,632
test size: (600, 3)
train size: (5400, 3)
       wup_similarity
count     5400.000000
mean         0.200246
std          0.077916
min          0.110000
25%          0.170000
50%          0.180000
75%          0.200000
max          1.000000
       wup_similarity
count      600.000000
mean         0.200333
std          0.076369
min          0.120000
25%          0

Epoch 1 Training: 100%|██████████| 169/169 [00:41<00:00,  4.05it/s]
Epoch 1 Validation: 100%|██████████| 19/19 [00:01<00:00,  9.98it/s]


Epoch 1: Train Loss=0.142182, Test Loss=0.014496


Epoch 2 Training: 100%|██████████| 169/169 [00:40<00:00,  4.20it/s]
Epoch 2 Validation: 100%|██████████| 19/19 [00:01<00:00, 10.55it/s]


Epoch 2: Train Loss=0.010518, Test Loss=0.013138


Epoch 3 Training:  33%|███▎      | 55/169 [00:13<00:27,  4.17it/s]