<a href="https://colab.research.google.com/github/Heng1222/Ohsumed_classification/blob/main/Model/task1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import torch
import re
import requests
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel

# =================================================================
# [DATA CHANGE: read the dataset from a GitHub URL]
# =================================================================
url = "https://media.githubusercontent.com/media/Heng1222/Ohsumed_classification/refs/heads/main/classification_data/ohsumed_dataset.csv"

def fetch_and_parse_data(url, timeout=30):
    """Download the raw dataset text from *url* and parse it into a DataFrame.

    Records are extracted with a regular expression rather than a CSV reader.
    Each record is expected to look like ``<title>.,"<abstract>",<label>``,
    where the abstract may span several lines and the label is ``C`` followed
    by digits.

    Args:
        url: Address of the raw dataset file.
        timeout: Seconds to wait on the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``title``, ``abstract`` and
        ``label``, one row per regex match. The three columns are present
        even when nothing matched.

    Raises:
        requests.exceptions.RequestException: On connection failure, timeout,
            or an HTTP error status.
    """
    print("正在從 GitHub 下載並解析資料...")
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of regex-parsing an error page into an
    # empty frame that only breaks later with an unrelated-looking KeyError.
    response.raise_for_status()
    response.encoding = 'utf-8'
    content = response.text

    # Record layout: (title).,"(multi-line abstract)",(label)
    pattern = r'(.*?)\.,"([\s\S]*?)",(C\d+)'
    records = [
        {
            'title': title.strip(),
            # Abstracts span several lines in the file; flatten them.
            'abstract': abstract.replace('\n', ' ').strip(),
            'label': label.strip(),
        }
        for title, abstract, label in re.findall(pattern, content)
    ]
    # Naming the columns keeps the schema stable when there are no matches.
    return pd.DataFrame(records, columns=['title', 'abstract', 'label'])

# Download the dataset and parse it into one row per record.
df_all = fetch_and_parse_data(url)

# -----------------------------------------------------------------
# Experiment settings: choose which column supplies the input text.
# DATA CHANGE: to probe another field (e.g. the abstract), edit text_col.
# -----------------------------------------------------------------
label_col = 'label'
text_col = 'title'  # the baseline currently uses titles only

print(f"資料讀取完成，共 {len(df_all)} 筆。")

# Step 1 - label handling: map the string labels to integer class ids.
le = LabelEncoder()
y_encoded = le.fit_transform(df_all[label_col])

# Step 2 - stratified 80/20 split (ratio suggested by the paper and the data
# distribution). Stratifying on the encoded labels is required because the
# classes are imbalanced.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    df_all[text_col], y_encoded,
    stratify=y_encoded,
    random_state=42,
    test_size=0.2,
)

# 3. Load the pretrained model as-is (no fine-tuning).
model_name = "roberta-base"

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device)
model.eval()  # switch the module to evaluation mode

# 4. Feature-extraction function (core of the baseline: linear probing)
def get_embeddings(texts, batch_size=32):
    """Encode *texts* with the module-level ``model`` and return one vector each.

    Texts are tokenized and run through the model in batches with gradient
    tracking disabled. The hidden state at position 0 of the last layer
    (RoBERTa's ``<s>`` token, its CLS equivalent) is used as the
    representation of each text.

    Args:
        texts: Strings to encode. A pandas Series, NumPy array, list or any
            other iterable of strings is accepted.
        batch_size: Number of texts sent through the model per forward pass.

    Returns:
        A 2-D ``numpy.ndarray`` with one row per input text. For empty input
        the result has shape ``(0, model.config.hidden_size)``.
    """
    # Series / ndarray expose tolist(); anything else is materialized by list().
    items = texts.tolist() if hasattr(texts, "tolist") else list(texts)
    if not items:
        # np.vstack([]) raises, so return a correctly shaped empty matrix.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    all_embeddings = []
    with torch.no_grad():
        for start in tqdm(range(0, len(items), batch_size), desc="提取 Embedding"):
            batch = items[start : start + batch_size]
            # Pad to the longest text in the batch; cut anything over 128 tokens.
            inputs = tokenizer(batch, padding=True, truncation=True, max_length=128, return_tensors="pt").to(device)
            outputs = model(**inputs)
            # Take the <s> token (RoBERTa's CLS) as the representative vector.
            emb = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            all_embeddings.append(emb)
    return np.vstack(all_embeddings)

print("提取訓練集 Embedding...")
X_train = get_embeddings(X_train_text)
print("提取測試集 Embedding...")
X_test = get_embeddings(X_test_text)

# 5. Train logistic regression (follows the paper's control-group design).
# `multi_class` is intentionally not passed: it is deprecated since
# scikit-learn 1.5, and with the lbfgs solver the default already fits a
# multinomial model when the target has more than two classes.
print("訓練 Logistic Regression 分類器...")
clf = LogisticRegression(max_iter=1000, solver='lbfgs')
clf.fit(X_train, y_train)

# 6. Produce the evaluation report
y_pred = clf.predict(X_test)
print("\n=== 學長 Baseline RoBERTa 重現報告 (Ohsumed 資料集) ===")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Macro F1: {f1_score(y_test, y_pred, average='macro'):.4f}")
print("\n各類別詳細指標：")
# Passing `labels` explicitly keeps the rows aligned with target_names even if
# some class never appears in y_test / y_pred; without it the report raises on
# a label-count mismatch.
print(classification_report(
    y_test,
    y_pred,
    labels=np.arange(len(le.classes_)),
    target_names=le.classes_,
))