In [1]:
!pip install -q transformers datasets torch scikit-learn evaluate accelerate sentence-transformers


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)
import torch
import pandas as pd
import evaluate
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import numpy as np



In [None]:
# LOAD & PREPROCESS DATASET
# ============================================================
print("Loading dataset...")
dataset = load_dataset("potsawee/wiki_bio_gpt3_hallucination")

# Dataset has only one split: "evaluation"
dataset = dataset["evaluation"]

# Preprocess labels and texts
def preprocess_labels(example):
    label_map = {
        "accurate": "No Hallucination",
        "minor_inaccurate": "Partial Hallucination",
        "major_inaccurate": "Hallucinating"
    }
    example["label_text"] = label_map[example["annotation"][0]]

    # Keep a numeric label for training
    num_label_map = {"No Hallucination": 0, "Partial Hallucination": 1, "Hallucinating": 2}
    example["label"] = num_label_map[example["label_text"]]

    example["generated_text"] = " ".join(example["gpt3_sentences"])
    example["reference_text"] = example["wiki_bio_text"]
    return example

dataset = dataset.map(preprocess_labels)
dataset = dataset.remove_columns([
    "annotation", "gpt3_sentences", "wiki_bio_test_idx", "gpt3_text_samples"
])

# Split into train/test manually
dataset = dataset.train_test_split(test_size=0.2, seed=42)
print(dataset)

Loading dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/evaluation-00000-of-00001-e91191b8f(…):   0%|          | 0.00/2.56M [00:00<?, ?B/s]

Generating evaluation split:   0%|          | 0/238 [00:00<?, ? examples/s]

Map:   0%|          | 0/238 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['gpt3_text', 'wiki_bio_text', 'label_text', 'label', 'generated_text', 'reference_text'],
        num_rows: 190
    })
    test: Dataset({
        features: ['gpt3_text', 'wiki_bio_text', 'label_text', 'label', 'generated_text', 'reference_text'],
        num_rows: 48
    })
})


In [None]:
# LOAD TOKENIZER & MODEL
# ============================================================
model_name = "microsoft/deberta-v3-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

# TOKENIZATION
# ============================================================
def tokenize_function(examples):
    return tokenizer(
        examples["generated_text"],
        examples["reference_text"],
        truncation=True,
        padding="max_length",
        max_length=512
    )

print("Tokenizing dataset...")
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/286M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/286M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Tokenizing dataset...


Map:   0%|          | 0/190 [00:00<?, ? examples/s]

Map:   0%|          | 0/48 [00:00<?, ? examples/s]

In [None]:
# TRAINING CONFIGURATION
# ============================================================
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(-1)
    return {
        "accuracy": accuracy_metric.compute(predictions=preds, references=labels)["accuracy"],
        "f1_macro": f1_metric.compute(predictions=preds, references=labels, average="macro")["f1"],
    }

training_args = TrainingArguments(
    output_dir="./deberta-hallucination",
    eval_strategy="epoch",
    save_strategy="epoch",          # ✓ Model will save every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    load_best_model_at_end=True,    # ✓ Best model will be loaded
    metric_for_best_model="accuracy",
    logging_dir="./logs",
    report_to="none"
)

# TRAIN THE MODEL
# ============================================================
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

print("\nStarting fine-tuning...\n")
trainer.train()

# EVALUATE MODEL
# ============================================================
print("\nEvaluating on test set...")
eval_results = trainer.evaluate()  # fine-tuned model used
print("\n Evaluation Results:")
print(eval_results)

# SAVE MODEL  (FIXED ✔)
# ============================================================
save_path = "./deberta-v3-small-hallucination"
trainer.save_model(save_path)
tokenizer.save_pretrained(save_path)

print(f"\n Model saved successfully to: {save_path}")

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.



Starting fine-tuning...





Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,No log,0.946019,0.604167,0.251082
2,No log,0.923418,0.604167,0.251082
3,No log,0.96047,0.5625,0.3625
4,No log,0.946175,0.604167,0.407104
5,No log,0.989999,0.5625,0.388889
6,No log,0.979777,0.604167,0.435703
7,No log,1.053726,0.625,0.430913
8,No log,1.063838,0.604167,0.435703
9,No log,1.075605,0.583333,0.476403
10,No log,1.128134,0.604167,0.483333





Evaluating on test set...





 Evaluation Results:
{'eval_loss': 1.0537259578704834, 'eval_accuracy': 0.625, 'eval_f1_macro': 0.43091334894613587, 'eval_runtime': 64.6568, 'eval_samples_per_second': 0.742, 'eval_steps_per_second': 0.093, 'epoch': 10.0}

 Model saved successfully to: ./deberta-v3-small-hallucination


In [None]:
# INFERENCE FUNCTION
# ============================================================
def check_hallucination(generated_text, reference_text):
    inputs = tokenizer(
        generated_text,
        reference_text,
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors="pt"
    )
    with torch.no_grad():
        logits = model(**inputs).logits   # trained model used to classify any new pair of texts
    pred = torch.argmax(logits, dim=-1).item()
    mapping = {0: "Not hallucinating", 1: "Minor Hallucination", 2: "Hallucinating"}
    return mapping[pred]


In [None]:
# AUTOMATIC TEST EVALUATION
# ============================================================
print("\nRunning hallucination detection on test set...")
predictions, truths = [], []

for ex in dataset["test"]:
    inputs = tokenizer(
        ex["generated_text"], ex["reference_text"],
        truncation=True, padding=True, max_length=512,
        return_tensors="pt"
    )
    with torch.no_grad():
        logits = model(**inputs).logits   # trained model used
    pred = torch.argmax(logits, dim=-1).item()

    predictions.append(pred)
    truths.append(ex["label"])

acc = accuracy_score(truths, predictions)
print(f"\n Final Accuracy on Test Set: {acc*100:.2f}%\n")

print("Detailed Classification Report:")
print(classification_report(
    truths,
    predictions,
    target_names=["No Hallucination", "Partial Hallucination", "Hallucinating"]
))

cm = confusion_matrix(truths, predictions)
print("\n Confusion Matrix:")
print(cm)


Running hallucination detection on test set...

 Final Accuracy on Test Set: 62.50%

Detailed Classification Report:
                       precision    recall  f1-score   support

     No Hallucination       0.50      0.67      0.57        12
Partial Hallucination       0.69      0.76      0.72        29
        Hallucinating       0.00      0.00      0.00         7

             accuracy                           0.62        48
            macro avg       0.40      0.48      0.43        48
         weighted avg       0.54      0.62      0.58        48


 Confusion Matrix:
[[ 8  4  0]
 [ 7 22  0]
 [ 1  6  0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
print("\n Sample Predictions:")
for i in range(3):
    print(f"\nExample {i+1}:")
    print("Generated:", dataset['test'][i]['generated_text'][:200], "...")
    print("Reference:", dataset['test'][i]['reference_text'][:200], "...")
    label_names = ["No Hallucination", "Partial Hallucination", "Hallucinating"]
    print("True Label:", label_names[dataset["test"][i]["label"]])
    print("Predicted Label:", label_names[predictions[i]])


 Sample Predictions:

Example 1:
Generated: Jean Hugo (1894–1984) was a French painter, illustrator, engraver, and sculptor. He was born in Paris, the son of the writer Victor Hugo and his wife, Juliette Drouet. He was the grandson of the poet, ...
Reference: Jean Hugo (19 November 1894 - 21 June 1984) was a painter, illustrator, theatre designer, and author. He was born in Paris and died in his home at the Mas de Fourques, near Lunel, France. Brought up i ...
True Label: No Hallucination
Predicted Label: No Hallucination

Example 2:
Generated: Jeanine Riley (born Jeanine Marie Riley, October 13, 1940) is an American actress, singer, and dancer. She is best known for her roles as Billie Jo Bradley on the television series Petticoat Junction  ...
Reference: Jeanine Riley (born October 1, 1940 in Madera, California, USA) is an American actress. Riley has appeared in guest roles on numerous television series ("Route 66", "The Man from U.N.C.L.E.", "The Wil ...
True Label: Partial Halluci