In [3]:
# Environment and GPU check (Torch)

import os
import json
from pathlib import Path

import torch

# Report the PyTorch build and whether it can see a CUDA runtime.
print("Torch version:", torch.__version__)
cuda_available = torch.cuda.is_available()
print("CUDA available:", cuda_available)

# Use the first CUDA device when there is one; otherwise fall back to the CPU
# so the remaining cells still run.
if cuda_available:
    print("CUDA device:", torch.cuda.get_device_name(0))
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

print("Using device:", device)

# Run a small matmul on the selected device to confirm tensors can be created
# and multiplied there.
a = torch.randn(1024, 1024, device=device)
b = torch.randn(1024, 1024, device=device)
c = a @ b

# Label the result after the device that actually did the work, so a CPU
# fallback is not reported as a GPU computation.
matmul_label = "GPU" if device.type == "cuda" else "CPU"
print(f"{matmul_label} matmul result (mean):", c.mean().item())


Torch version: 2.8.0+cu128
CUDA available: True
CUDA device: NVIDIA RTX 6000 Ada Generation
Using device: cuda:0
GPU matmul result (mean): -0.00133326998911798


In [4]:
# Sanity Check: GPU tensor location

# The matmul result must live on the same kind of device that was selected.
# Comparing against `device` keeps the check meaningful on CPU-only machines,
# where an "or device is CPU" escape hatch would accept any tensor.
print("Tensor device check:", c.device)
assert c.device.type == device.type, f"expected a {device.type} tensor, got {c.device}"


Tensor device check: cuda:0


In [5]:
# Install/verify required packages (datasets, tqdm)

import subprocess
import sys

# Install into the interpreter that runs this kernel. Passing the command as an
# argument list avoids shell quoting problems (e.g. an interpreter path that
# contains spaces), and check_call raises CalledProcessError when pip exits
# with a non-zero status, so the confirmation below is only printed on success.
subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "datasets", "tqdm"])
print("Installed/verified: datasets, tqdm")


Installed/verified: datasets, tqdm


In [6]:
# Sanity Check: Import check

from datasets import load_dataset
from tqdm import tqdm

# Evaluate both checks first, then report them.
load_dataset_ok = callable(load_dataset)
tqdm_ok = tqdm is not None

print("datasets.load_dataset available:", load_dataset_ok)
print("tqdm imported:", tqdm_ok)


datasets.load_dataset available: True
tqdm imported: True


In [8]:
# Configure dataset source and output paths

# Dataset source: change HF_REPO / SPLIT to use a different RAGTruth variant or split.
HF_REPO = "wandb/RAGTruth-processed"
SPLIT = "train"
# Row cap for quick iteration: None keeps every row, an integer (e.g., 2000) truncates.
LIMIT = None

# Output layout, relative to the current working directory: ../data/{raw,processed,samples}
DATA_DIR = Path("..") / "data"
RAW_DIR, PROCESSED_DIR, SAMPLES_DIR = (
    DATA_DIR / folder for folder in ("raw", "processed", "samples")
)

# Files written by the later cells, and the number of rows kept in the sample file.
RAW_OUT = RAW_DIR / "ragtruth_raw.jsonl"
PROCESSED_OUT = PROCESSED_DIR / "ragtruth_processed.jsonl"
SAMPLE_OUT = SAMPLES_DIR / "ragtruth_sample.jsonl"
SAMPLE_N = 50

# Create the output folders up front; already-existing folders are left alone.
RAW_DIR.mkdir(parents=True, exist_ok=True)
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
SAMPLES_DIR.mkdir(parents=True, exist_ok=True)

# Echo the effective configuration.
for caption, setting in (
    ("HF_REPO:", HF_REPO),
    ("SPLIT:", SPLIT),
    ("RAW_OUT:", RAW_OUT),
    ("PROCESSED_OUT:", PROCESSED_OUT),
    ("SAMPLE_OUT:", SAMPLE_OUT),
):
    print(caption, setting)


HF_REPO: wandb/RAGTruth-processed
SPLIT: train
RAW_OUT: ../data/raw/ragtruth_raw.jsonl
PROCESSED_OUT: ../data/processed/ragtruth_processed.jsonl
SAMPLE_OUT: ../data/samples/ragtruth_sample.jsonl


In [9]:
# Sanity Check: Paths exist

# Each output folder must be on disk before any file is written into it.
for required_dir in (RAW_DIR, PROCESSED_DIR, SAMPLES_DIR):
    assert required_dir.exists()
print("All output directories exist.")


All output directories exist.


In [10]:
# Load the dataset from Hugging Face and inspect columns

# Load the configured split and report its size.
ds = load_dataset(HF_REPO, split=SPLIT)
print("Loaded dataset:", HF_REPO, "| split:", SPLIT)
print("Number of rows:", len(ds))

first_row = ds[0]
column_names = list(first_row.keys())
print("First row keys:", column_names)

# Preview at most the first 10 columns of the first row; any value whose text
# form is longer than 200 characters is cut and marked as truncated.
for column in column_names[:10]:
    rendered = str(first_row[column])
    if len(rendered) > 200:
        rendered = rendered[:200] + " ... (truncated)"
    print(f"- {column}: {rendered}")


Repo card metadata block was not found. Setting CardData to empty.


Loaded dataset: wandb/RAGTruth-processed | split: train
Number of rows: 15090
First row keys: ['id', 'query', 'context', 'output', 'task_type', 'quality', 'model', 'temperature', 'hallucination_labels', 'hallucination_labels_processed', 'input_str']
- id: 0
- query: Summarize the following news within 116 words:
- context: Seventy years ago, Anne Frank died of typhus in a Nazi concentration camp at the age of 15. Just two weeks after her supposed death on March 31, 1945, the Bergen-Belsen concentration camp where she ha ... (truncated)
- output: The Anne Frank House has revealed that Anne Frank and her older sister, Margot, likely died at least a month earlier than previously believed. The sisters, who were imprisoned in Nazi concentration ca ... (truncated)
- task_type: Summary
- quality: good
- model: gpt-4-0613
- temperature: 0.699999988079071
- hallucination_labels: []
- hallucination_labels_processed: {'evident_conflict': 0, 'baseless_info': 0}


In [11]:
# Sanity Check: Dataset non-empty

# Stop early if the split came back empty.
row_count = len(ds)
assert row_count > 0
print("Dataset looks non-empty and readable.")


Dataset looks non-empty and readable.


In [12]:
# Save raw dataset snapshot to data/raw (JSONL)

def write_jsonl(path: Path, rows):
    """Write ``rows`` to ``path`` as JSON Lines (one JSON document per line).

    Missing parent directories are created. The file is opened in ``"w"`` mode,
    so existing content is replaced. Non-ASCII characters are written as-is
    (``ensure_ascii=False``) using UTF-8.

    Args:
        path: Destination file.
        rows: Iterable of JSON-serializable objects; each becomes one line.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as sink:
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in rows
        )

# Collect the rows as plain dicts, stopping once LIMIT rows have been taken
# (LIMIT of None means no cap), then write them out as a JSONL snapshot.
raw_rows = []
for position, record in enumerate(tqdm(ds, desc="Saving raw JSONL")):
    limit_reached = LIMIT is not None and position >= LIMIT
    if limit_reached:
        break
    raw_rows.append(dict(record))

write_jsonl(RAW_OUT, raw_rows)
print(f"Saved raw JSONL: {RAW_OUT} ({len(raw_rows)} rows)")


Saving raw JSONL: 100%|██████████| 15090/15090 [00:01<00:00, 15008.87it/s]


Saved raw JSONL: ../data/raw/ragtruth_raw.jsonl (15090 rows)


In [13]:
# Sanity Check: Raw file exists and has content

# The snapshot must be on disk and contain at least one byte.
assert RAW_OUT.exists()
raw_size = RAW_OUT.stat().st_size
print("Raw file size (bytes):", raw_size)
assert raw_size > 0

# Show the first record, clipped to 200 characters.
with RAW_OUT.open("r", encoding="utf-8") as raw_file:
    first_line = raw_file.readline().strip()
clip_marker = "..." if len(first_line) > 200 else ""
print("First raw JSONL line (truncated):", first_line[:200] + clip_marker)


Raw file size (bytes): 115342141
First raw JSONL line (truncated): {"id": "0", "query": "Summarize the following news within 116 words:", "context": "Seventy years ago, Anne Frank died of typhus in a Nazi concentration camp at the age of 15. Just two weeks after her ...


In [23]:
# Define normalization helpers (stable schema for RAG judge)

def safe_str(x):
    """Coerce ``x`` to text.

    ``None`` becomes the empty string, ``str`` instances are returned unchanged,
    and every other value is converted with ``str()``.
    """
    if x is None:
        return ""
    return x if isinstance(x, str) else str(x)

def pick_first_present(row, keys):
    """Return the value of the first key in ``keys`` that ``row`` actually fills.

    A key counts as filled when it is in ``row`` and its value is neither
    ``None`` nor the empty string. Other falsy values (``0``, ``[]``, ``{}``)
    are treated as present and returned.

    Args:
        row: Mapping to look values up in.
        keys: Candidate keys, in priority order.

    Returns:
        The first filled value, or ``None`` when no candidate key is filled.
    """
    for k in keys:
        if k not in row:
            continue
        value = row[k]
        # Use identity and type checks instead of `value in (None, "")`: that
        # form compares with `==`, which raises for array-like values whose
        # equality is element-wise (e.g. NumPy arrays).
        if value is None or (isinstance(value, str) and value == ""):
            continue
        return value
    return None

def join_context(ctx):
    """Flatten a context value into one stripped string.

    * ``None`` gives ``""``.
    * A string is returned stripped.
    * A list is rendered item by item: ``None`` items are skipped, strings are
      stripped, dicts contribute the first truthy value found under ``"text"``,
      ``"content"``, ``"passage"`` or ``"chunk"`` (or the text form of the whole
      dict when none of those is set), and anything else is converted with
      ``safe_str``. Empty pieces are dropped and the rest are joined with a
      ``---`` separator line surrounded by blank lines.
    * Any other value is converted with ``safe_str`` and stripped.
    """
    if ctx is None:
        return ""
    if isinstance(ctx, str):
        return ctx.strip()
    if not isinstance(ctx, list):
        return safe_str(ctx).strip()

    text_keys = ("text", "content", "passage", "chunk")

    def as_text(item):
        # Render one non-None list item as stripped text.
        if isinstance(item, str):
            return item.strip()
        if isinstance(item, dict):
            # Fall back to the whole dict when no text-like key holds a truthy value.
            chosen = next(
                (item[name] for name in text_keys if name in item and item[name]),
                item,
            )
            return safe_str(chosen).strip()
        return safe_str(item).strip()

    rendered = (as_text(item) for item in ctx if item is not None)
    return "\n\n---\n\n".join(piece for piece in rendered if piece)

def normalize_row(row, idx):
    """Map one dataset row onto the fixed schema used by the rest of the notebook.

    Each output field is taken from the first filled candidate column (see
    ``pick_first_present``):

    * ``example_id``: the row's id as text, or ``"ragtruth_<idx>"`` when the row
      has none.
    * ``question``: stripped text, or ``"[MISSING_QUESTION]"`` when no candidate
      column is filled.
    * ``context``: flattened with ``join_context``.
    * ``answer``: stripped text, or ``""`` when absent.
    * ``task`` / ``label``: stripped text, or ``None`` when absent. The label
      goes through ``str()``, so a dict label is stored as its Python repr text.
    * ``meta``: the ``row`` object itself (not a copy), kept for traceability.

    Args:
        row: Mapping holding one dataset record.
        idx: Position of the row, used only to build the fallback id.

    Returns:
        A dict with keys ``example_id``, ``task``, ``question``, ``context``,
        ``answer``, ``label`` and ``meta``.
    """

    def first_of(*names):
        # First filled value among `names`, in the order given.
        return pick_first_present(row, names)

    def stripped_or(value, fallback):
        # Stripped text form of `value`, or `fallback` when nothing was found.
        return fallback if value is None else safe_str(value).strip()

    rid = first_of("id", "example_id", "uid", "qid")
    example_id = f"ragtruth_{idx}" if rid is None else safe_str(rid)

    # Question-like columns are tried first, then the looser fallbacks.
    question = stripped_or(
        first_of("question", "query", "prompt", "instruction", "input", "source", "document"),
        "[MISSING_QUESTION]",
    )
    context = join_context(
        first_of("context", "contexts", "retrieved_context", "evidence", "passages")
    )
    answer = stripped_or(
        first_of("answer", "output", "response", "generation", "model_output"),
        "",
    )
    # Optional fields stay None when the row does not provide them.
    task = stripped_or(
        first_of("task", "task_type", "dataset", "source_dataset", "benchmark"),
        None,
    )
    label = stripped_or(
        first_of(
            "label",
            "hallucination_labels_processed",
            "human_label",
            "faithfulness",
            "groundedness",
            "is_hallucination",
        ),
        None,
    )

    return {
        "example_id": example_id,
        "task": task,
        "question": question,
        "context": context,
        "answer": answer,
        "label": label,
        "meta": row,
    }

print("Normalization helpers defined.")


Normalization helpers defined.


In [24]:
# Sanity Check: Normalize a few rows and inspect

def _clip(text, width=200):
    """Return the first ``width`` characters of ``text``, plus "..." when it was cut."""
    return text[:width] + ("..." if len(text) > width else "")

# Normalize up to three rows and print a clipped view of each.
preview = [normalize_row(dict(ds[pos]), pos) for pos in range(min(3, len(ds)))]

for i, ex in enumerate(preview):
    print("\n--- Normalized example", i, "---")
    print("example_id:", ex["example_id"])
    print("task:", ex["task"])
    print("question:", _clip(ex["question"]))
    print("context preview:", _clip(ex["context"]))
    print("answer preview:", _clip(ex["answer"]))
    print("label:", ex["label"])

# Every previewed example must carry the three core text fields.
core_fields = ("question", "context", "answer")
assert all(field in ex for ex in preview for field in core_fields)
print("Normalization preview looks OK.")



--- Normalized example 0 ---
example_id: 0
task: Summary
question: Summarize the following news within 116 words:
context preview: Seventy years ago, Anne Frank died of typhus in a Nazi concentration camp at the age of 15. Just two weeks after her supposed death on March 31, 1945, the Bergen-Belsen concentration camp where she ha...
answer preview: The Anne Frank House has revealed that Anne Frank and her older sister, Margot, likely died at least a month earlier than previously believed. The sisters, who were imprisoned in Nazi concentration ca...
label: {'evident_conflict': 0, 'baseless_info': 0}

--- Normalized example 1 ---
example_id: 1
task: Summary
question: Summarize the following news within 116 words:
context preview: Seventy years ago, Anne Frank died of typhus in a Nazi concentration camp at the age of 15. Just two weeks after her supposed death on March 31, 1945, the Bergen-Belsen concentration camp where she ha...
answer preview: New research released by the Anne Frank H

In [25]:
# Build processed dataset and write JSONL outputs

# Number of rows to process: the whole split, or at most LIMIT rows when set.
n_total = len(ds)
if LIMIT is not None:
    n_total = min(LIMIT, n_total)

processed_rows = [
    normalize_row(dict(ds[pos]), pos)
    for pos in tqdm(range(n_total), desc="Normalizing rows")
]
# The sample file holds the first SAMPLE_N processed rows (none for a non-positive SAMPLE_N).
sample_rows = processed_rows[:max(SAMPLE_N, 0)]

write_jsonl(PROCESSED_OUT, processed_rows)
write_jsonl(SAMPLE_OUT, sample_rows)

print(f"Saved processed JSONL: {PROCESSED_OUT} ({len(processed_rows)} rows)")
print(f"Saved sample JSONL: {SAMPLE_OUT} ({len(sample_rows)} rows)")


Normalizing rows: 100%|██████████| 15090/15090 [00:01<00:00, 10276.54it/s]


Saved processed JSONL: ../data/processed/ragtruth_processed.jsonl (15090 rows)
Saved sample JSONL: ../data/samples/ragtruth_sample.jsonl (50 rows)


In [26]:
# Sanity Check: Read back processed file and validate schema

# Both output files must have been written before reading them back.
for written_file in (PROCESSED_OUT, SAMPLE_OUT):
    assert written_file.exists()

def read_first_n_jsonl(path: Path, n=3):
    """Parse and return up to the first ``n`` lines of a JSONL file.

    Reading stops at end of file, so fewer than ``n`` records are returned for
    a short file. Each line read is passed to ``json.loads``.

    Args:
        path: JSONL file to read (UTF-8).
        n: Maximum number of lines to parse.

    Returns:
        A list of the parsed records, in file order.
    """
    with path.open("r", encoding="utf-8") as source:
        # zip() stops as soon as either the counter or the file runs out.
        return [json.loads(raw_line) for _, raw_line in zip(range(n), source)]

# Re-read the first few processed records from disk.
rows_back = read_first_n_jsonl(PROCESSED_OUT, n=3)
print("Read back", len(rows_back), "rows from processed file.")

# Each record must expose every field of the normalized schema.
required_keys = {"example_id", "question", "context", "answer", "label", "meta", "task"}
for i, record in enumerate(rows_back):
    missing = required_keys - set(record)
    print(f"Row {i} missing keys:", missing)
    assert not missing

print("Processed schema looks consistent.")


Read back 3 rows from processed file.
Row 0 missing keys: set()
Row 1 missing keys: set()
Row 2 missing keys: set()
Processed schema looks consistent.
