In [10]:
# Mount Google Drive so the checkpoint folder below becomes reachable.
from google.colab import drive
drive.mount('/content/drive')

import os, pprint
MODEL_DIR = "/content/drive/MyDrive/rob_ft_nli_final"

# Report whether MODEL_DIR exists; list its contents when it does, otherwise
# print a hint that the path should be checked.
model_dir_found = os.path.isdir(MODEL_DIR)
print("目录存在吗？", model_dir_found)
if not model_dir_found:
    print("路径有误；检查文件夹实际位置/名称")
else:
    pprint.pprint(os.listdir(MODEL_DIR))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
目录存在吗？ True
['config.json',
 'model.safetensors',
 'tokenizer_config.json',
 'special_tokens_map.json',
 'vocab.json',
 'merges.txt',
 'tokenizer.json',
 'training_args.bin']


In [11]:
!pip install -q datasets evaluate

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch, pandas as pd, evaluate, numpy as np
from tqdm import tqdm

# 1) Data: "evaluation" split of the potsawee/wiki_bio_gpt3_hallucination dataset.
ds = load_dataset("potsawee/wiki_bio_gpt3_hallucination", split="evaluation")

# 2) Model: tokenizer + sequence-classification checkpoint loaded from MODEL_DIR,
#    moved to GPU when one is available and switched to eval mode.
MODEL_DIR = "/content/drive/MyDrive/rob_ft_nli_final"
tok   = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device).eval()

# Class index -> lower-case NLI label name.
# The label order differs between NLI checkpoints, so prefer the mapping stored
# in the checkpoint's own config. Only when the config does not carry the three
# NLI class names (e.g. it only has generic "LABEL_0"-style placeholders) fall
# back to the hard-coded order below, and say so loudly because a wrong order
# silently corrupts every downstream prediction.
DEFAULT_ID2NAME = {0:"entailment", 1:"neutral", 2:"contradiction"}
config_id2name = {
    int(idx): str(name).lower()
    for idx, name in (getattr(model.config, "id2label", None) or {}).items()
}
if set(config_id2name.values()) == set(DEFAULT_ID2NAME.values()):
    ID2NAME = config_id2name
else:
    ID2NAME = DEFAULT_ID2NAME
    print(
        "WARNING: model.config.id2label has no entailment/neutral/contradiction names "
        f"({config_id2name}); falling back to the assumed order {DEFAULT_ID2NAME}. "
        "Verify this against the label order used during fine-tuning."
    )

def prem(text, max_tok=350):
    """Build the premise string from raw table text.

    The text is split on whitespace, at most the first ``max_tok`` pieces are
    kept, and they are re-joined with single spaces. Note that the cap counts
    whitespace-separated words, not tokenizer tokens; it is applied before the
    text is handed to the tokenizer.

    Args:
        text: Source text to turn into a premise.
        max_tok: Maximum number of whitespace-separated words to keep.

    Returns:
        The whitespace-normalised, word-capped premise string.
    """
    words = text.split()
    kept_words = words[:max_tok]
    return " ".join(kept_words)

# 3) Inference
# Pair every GPT-3 sentence with the (word-capped) wiki-bio premise of its
# record, and store the binary gold label: 0 for "accurate", 1 for anything else.
batch, gold_bin, pred_bin = [], [], []
for record in tqdm(ds, desc="Build"):
    context = prem(record["wiki_bio_text"])
    for sentence, annotation in zip(record["gpt3_sentences"], record["annotation"]):
        batch.append((context, sentence))
        gold_bin.append(int(annotation != "accurate"))

# Run the classifier over the pairs in fixed-size mini-batches.
B = 32
for start in tqdm(range(0, len(batch), B), desc="Infer"):
    chunk = batch[start:start + B]
    premises = [pair[0] for pair in chunk]
    hypotheses = [pair[1] for pair in chunk]
    enc = tok(premises, hypotheses, truncation=True, padding=True, return_tensors="pt").to(device)
    with torch.no_grad():
        class_ids = model(**enc).logits.argmax(1).cpu().tolist()
    # Only a predicted "contradiction" counts as a hallucination (1).
    # NOTE(review): "neutral" predictions are scored as non-hallucinated (0);
    # confirm this is the intended binarisation.
    for class_id in class_ids:
        pred_bin.append(1 if ID2NAME[class_id] == "contradiction" else 0)

# 4) Evaluation
# Each metric is loaded and computed in turn; precision/recall/F1 use binary
# averaging, accuracy takes no extra arguments.
metric_kwargs = (
    ("accuracy", {}),
    ("precision", {"average": "binary"}),
    ("recall", {"average": "binary"}),
    ("f1", {"average": "binary"}),
)
scores = {}
for metric_name, extra in metric_kwargs:
    metric = evaluate.load(metric_name)
    scores[metric_name] = metric.compute(predictions=pred_bin, references=gold_bin, **extra)[metric_name]
acc, prec, rec, f1 = scores["accuracy"], scores["precision"], scores["recall"], scores["f1"]

print(f"Accuracy={acc:.3f}  Precision={prec:.3f}  Recall={rec:.3f}  F1={f1:.3f}")

# 5) Save CSV: one row per (premise, hypothesis) pair with gold and predicted labels.
results = pd.DataFrame(batch, columns=["premise", "hypothesis"])
results["gold_label"] = gold_bin
results["pred_label"] = pred_bin
results.to_csv("roberta_hallucination.csv", index=False)
print("CSV saved → roberta_hallucination.csv")

Build: 100%|██████████| 238/238 [00:00<00:00, 6227.93it/s]
Infer: 100%|██████████| 60/60 [00:44<00:00,  1.34it/s]


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.56k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.38k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

Accuracy=0.379  Precision=0.795  Recall=0.201  F1=0.321
CSV saved → roberta_hallucination.csv


In [12]:
# Search /content (recursively) for files named roberta_hallucination.csv.
!find /content -name roberta_hallucination.csv

/content/roberta_hallucination.csv


In [13]:
# NOTE: `drive` is imported here but not used in this cell.
from google.colab import files, drive

# Download the results CSV to the local machine
files.download("/content/roberta_hallucination.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>