In [18]:
!pip install torch torchvision transformers tqdm requests datasets accelerate bitsandbytes tensorboard torch-tb-profiler

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [19]:
# --- Safe flags for Apple-silicon ---
# Environment variables are set in this cell, ahead of the cell that imports torch/transformers.
import os, platform
os.environ["TOKENIZERS_PARALLELISM"]           = "false"   # avoid fork-after-tokenizer bug
os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.9"     # leave 10 % headroom, prevents sudden kills
# NOTE(review): confirm that one of the installed libraries actually reads this variable.
os.environ["FLASH_ATTENTION_FORCE_DISABLE"]    = "1"       # disable Flash-Attn v2 path

In [20]:
import os
import json
import torch
import platform
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer, logging

from torch.utils.tensorboard import SummaryWriter
# TensorBoard writer shared by the training cell; events are written under runs/halueval_llama
tb_writer = SummaryWriter("runs/halueval_llama")

# Reduce verbosity of transformers (`logging` here is the one imported from transformers above)
logging.set_verbosity_error()

# Pick the compute device by preference: Apple MPS first, then CUDA, otherwise CPU
device = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

print(f"Using device: {device}") # change this to foundry gpu if needed

Using device: mps


In [21]:
import torch, os, platform, psutil, time
# Environment snapshot: torch version and currently available system RAM (bytes / 1e9, printed as GB)
print("Torch:", torch.__version__, "  Free RAM:", psutil.virtual_memory().available/1e9, "GB")
# Whether torch reports the MPS backend as available
print("MPS cap:", torch.backends.mps.is_available())

Torch: 2.7.0   Free RAM: 2.966667264 GB
MPS cap: True


In [22]:
# Platform string (OS / architecture) next to the torch version
print(platform.platform(), torch.__version__)
# MPS availability as reported by torch
print("MPS available:", torch.backends.mps.is_available())

macOS-13.4-arm64-arm-64bit 2.7.0
MPS available: True


In [23]:
# Define the base model using Llama from Hugging Face
class LlamaBaseNet(nn.Module):
    """Hugging Face backbone with a linear classification head on top.

    The backbone is executed under ``torch.no_grad()``, so only ``classifier``
    receives gradients through this module. The feature used for
    classification is the hidden state of the last non-padding token of each
    sequence.

    Args:
        model_name: Hugging Face model id passed to ``AutoTokenizer`` / ``AutoModel``.
        num_classes: Number of output logits of the classification head.
    """

    def __init__(self, model_name="meta-llama/Llama-2-7b-hf", num_classes=2):
        super().__init__()
        # Load Llama model and tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.backbone = AutoModel.from_pretrained(model_name)
        # self.backbone = self.backbone.half()           # fp16
        # NOTE(review): the backbone forward runs under no_grad, so checkpointing
        # presumably has no effect on memory here -- confirm before relying on it.
        self.backbone.gradient_checkpointing_enable()  # save RAM

        # If the tokenizer doesn't have a padding token, set it
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        # Get hidden size from config
        self.hidden_size = self.backbone.config.hidden_size

        # Classification head
        self.classifier = nn.Linear(self.hidden_size, num_classes)

    def forward(self, texts):
        """Compute class logits and the pooled backbone feature.

        Args:
            texts: Either a ``(batch, seq)`` tensor of token ids (already
                tokenized and padded with the tokenizer's pad id) or raw text
                accepted by the tokenizer (a string or list of strings).

        Returns:
            Tuple ``(logits, features)`` where ``features`` is the hidden state
            of the last non-padding token per sequence and ``logits`` is the
            classifier output for those features.
        """
        if isinstance(texts, torch.Tensor):
            # If input is already tokenized
            inputs = {'input_ids': texts}
        else:
            # If input is raw text
            inputs = self.tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=512)

        target_device = self.classifier.weight.device
        inputs = {k: v.to(target_device) for k, v in inputs.items()}
        input_ids = inputs['input_ids']

        # Pre-tokenized input carries no attention mask; derive it from the pad id
        # so padded positions are masked out instead of being attended to.
        if inputs.get('attention_mask') is None:
            inputs['attention_mask'] = torch.ne(input_ids, self.tokenizer.pad_token_id).long()
        attention_mask = inputs['attention_mask']

        # Get model outputs
        with torch.no_grad():  # Don't compute gradients for the backbone
            outputs = self.backbone(**inputs)

        last_hidden_states = outputs.last_hidden_state
        batch_size, seq_len = input_ids.shape

        # Rightmost attended position per row. Unlike "number of real tokens - 1",
        # this is correct for both left- and right-padded batches and never
        # yields -1 for a row that is entirely padding (it falls back to 0).
        positions = torch.arange(seq_len, device=target_device, dtype=torch.int32)
        last_token_idx = (positions * attention_mask.to(torch.int32)).argmax(dim=-1)

        # Get the hidden state for the last token in each sequence
        row_idx = torch.arange(batch_size, device=target_device)
        features = last_hidden_states[row_idx, last_token_idx]

        # Apply classifier
        logits = self.classifier(features)

        return logits, features

In [24]:
# hugging face auth

from huggingface_hub import login
from dotenv import load_dotenv
import os

# Load variables from a .env file (if any), then read the Hugging Face token from the environment
load_dotenv()
hf_token = os.getenv("HUGGING_FACE_KEY")
# NOTE(review): hf_token is None when HUGGING_FACE_KEY is unset -- confirm how login() behaves in that case
login(token=hf_token)

In [25]:
# Load HaluEval dataset from Hugging Face
from datasets import load_dataset

def _halueval_records(item):
    """Convert one raw HaluEval row into a list of flat JSONL records.

    Each record has the keys ``question``, ``response`` and ``is_hallucination``.
    Rows that carry both a correct and a hallucinated answer produce two
    records (labels 0 and 1). Field names follow the ones used by
    ``HaluEvalDataset`` in this notebook.
    """
    paired_schemas = (
        ("question", "right_answer", "hallucinated_answer"),                 # qa
        ("dialogue_history", "right_response", "hallucinated_response"),     # dialogue
        ("document", "right_summary", "hallucinated_summary"),               # summarization
    )
    for prompt_key, gold_key, halluc_key in paired_schemas:
        if prompt_key in item and gold_key in item:
            prompt = item[prompt_key]
            records = [{'question': prompt, 'response': item[gold_key], 'is_hallucination': 0}]
            if item.get(halluc_key) is not None:
                records.append({'question': prompt, 'response': item[halluc_key], 'is_hallucination': 1})
            return records

    if 'chatgpt_response' in item:
        # General split: a single answer plus a yes/no flag.
        # NOTE(review): the flag is assumed to live in the `hallucination` column -- confirm against the dataset card.
        flag = str(item.get('hallucination', '')).strip().lower()
        return [{
            'question': item.get('user_query', ''),
            'response': item['chatgpt_response'],
            'is_hallucination': 1 if flag == 'yes' else 0,
        }]

    # Fallback: the generic instruction/output/label layout this function originally expected.
    return [{
        'question': item.get('instruction', ''),
        'response': item.get('output', ''),
        'is_hallucination': 1 if item.get('label') == 'hallucinated' else 0,
    }]


def _write_halueval_jsonl(path, items):
    """Write every record derived from ``items`` to ``path``, one JSON object per line."""
    with open(path, 'w', encoding='utf-8') as f:
        for item in items:
            for record in _halueval_records(item):
                f.write(json.dumps(record) + '\n')


def _concat_text_files(target, sources):
    """Concatenate ``sources`` into ``target`` line by line (no whole-file reads)."""
    with open(target, 'w', encoding='utf-8') as outfile:
        for source in sources:
            with open(source, 'r', encoding='utf-8') as infile:
                for line in infile:
                    outfile.write(line)


def prepare_halueval_data_from_hf():
    """Load HaluEval dataset from Hugging Face and export it as JSONL.

    For each category the ``data`` split is divided 80/20 (seed 42) and written
    to ``data/halueval/<category>_{train,test}.jsonl``; the per-category files
    are then merged into ``train.jsonl`` and ``test.jsonl``. Because the split
    happens on raw rows, both answers of a paired row end up in the same split.

    Returns:
        Tuple ``(train_path, test_path)`` of the merged JSONL files.
    """
    print("Loading HaluEval dataset from Hugging Face...")

    # Create output directory
    output_dir = "data/halueval"
    os.makedirs(output_dir, exist_ok=True)

    # Process each split
    categories = ["qa", "dialogue", "summarization", "general"]

    # Prepare train and test sets
    for category in categories:
        print(f"Loading {category} dataset...")
        # Load the dataset for this category
        dataset = load_dataset("pminervini/HaluEval", category)

        # The dataset has a 'data' split containing all examples
        data = dataset['data']

        # Split into train/test (80/20 split)
        splits = data.train_test_split(test_size=0.2, seed=42)

        # Save as jsonl
        _write_halueval_jsonl(f"{output_dir}/{category}_train.jsonl", splits['train'])
        _write_halueval_jsonl(f"{output_dir}/{category}_test.jsonl", splits['test'])

    # Merge all training data
    print("Merging all training data...")
    _concat_text_files(
        f"{output_dir}/train.jsonl",
        [f"{output_dir}/{category}_train.jsonl" for category in categories],
    )

    # Merge all test data
    print("Merging all test data...")
    _concat_text_files(
        f"{output_dir}/test.jsonl",
        [f"{output_dir}/{category}_test.jsonl" for category in categories],
    )

    print("HaluEval dataset preparation complete!")
    print(f"Train data: {output_dir}/train.jsonl")
    print(f"Test data: {output_dir}/test.jsonl")

    return f"{output_dir}/train.jsonl", f"{output_dir}/test.jsonl"

# Export the JSONL files and keep the paths of the merged train/test files
train_data_path, test_data_path = prepare_halueval_data_from_hf()

Loading HaluEval dataset from Hugging Face...
Loading qa dataset...
Loading dialogue dataset...
Loading summarization dataset...
Loading general dataset...
Merging all training data...
Merging all test data...
HaluEval dataset preparation complete!
Train data: data/halueval/train.jsonl
Test data: data/halueval/test.jsonl


In [26]:
# 3. Define the Epinet
class EpiNet(nn.Module):
    """Trainable MLP mapping (features, z) to a per-class correction.

    The input is the concatenation of the base features and the epistemic
    index ``z``; hidden layers use ReLU and the output layer is linear.
    """

    def __init__(self, feature_dim, z_dim, hidden_dims, num_classes):
        super().__init__()
        widths = [feature_dim + z_dim] + hidden_dims + [num_classes]
        n_linear = len(widths) - 1
        stack = []
        for i in range(n_linear):
            stack.append(nn.Linear(widths[i], widths[i + 1]))
            # ReLU between layers only; the output layer stays linear
            if i + 1 < n_linear:
                stack.append(nn.ReLU())
        self.mlp = nn.Sequential(*stack)

    def forward(self, features, z):
        # Detach so no gradient flows back into whatever produced the features
        joint = torch.cat([features.detach(), z], dim=1)
        return self.mlp(joint)

# 4. Define the PriorNet
class PriorNet(nn.Module):
    """Fixed, randomly initialised linear map from (features, z) to class scores.

    All parameters have ``requires_grad`` switched off right after construction.
    """

    def __init__(self, feature_dim, z_dim, num_classes):
        super().__init__()
        self.fc = nn.Linear(feature_dim + z_dim, num_classes)
        # Freeze every parameter of this module (only `fc` holds any)
        self.requires_grad_(False)

    def forward(self, features, z):
        joint = torch.cat([features.detach(), z], dim=1)
        return self.fc(joint)

In [27]:
# 5. Wrap into an Epistemic Neural Network
class EpistemicNN(nn.Module):
    """Combine base logits with the epinet correction and an optional prior term.

    ``forward(x, z)`` returns ``base_logits + epinet(features, z) + prior(features, z)``,
    where the prior term is 0 when no (truthy) prior module was supplied.
    """

    def __init__(self, base: LlamaBaseNet, epinet: EpiNet, prior: PriorNet=None):
        super().__init__()
        self.base, self.epinet, self.prior = base, epinet, prior

    def forward(self, x, z):
        base_logits, feats = self.base(x)        # base logits & features
        correction = self.epinet(feats, z)       # learnable correction
        if self.prior:
            prior_term = self.prior(feats, z)
        else:
            prior_term = 0
        return base_logits + correction + prior_term

# 6. Sampling epistemic index z
def sample_z(batch_size, z_dim, device):
    """Draw a (batch_size, z_dim) tensor of standard-normal epistemic indices on `device`."""
    shape = (batch_size, z_dim)
    return torch.randn(shape, device=device)

In [28]:
def train_enn(model, dataloader, epochs, lr, λ, z_dim, device, writer):
    """Train an epistemic network with cross-entropy and one sampled z per example.

    Args:
        model: Module called as ``model(x_batch, z)`` and returning class logits.
        dataloader: Iterable of ``(x_batch, y_batch)`` tensor pairs.
        epochs: Number of passes over ``dataloader``.
        lr: Adam learning rate.
        λ: Adam ``weight_decay``.
        z_dim: Dimensionality of the epistemic index passed to ``sample_z``.
        device: Device (string or ``torch.device``) batches are moved to.
        writer: TensorBoard-style writer with ``add_scalar``/``flush``; may be
            ``None`` to disable scalar logging.

    Returns:
        The same ``model`` object, left in training mode.

    Raises:
        ValueError: If ``dataloader`` yields no batches in an epoch.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=λ)
    model.train()

    device_type = torch.device(device).type
    # Running step counter: equals epoch * batches_per_epoch + batch_idx, but
    # does not require the dataloader to implement __len__.
    global_step = 0

    for epoch in range(epochs):
        total_loss = 0.0
        correct = 0
        total = 0
        num_batches = 0

        for batch_idx, (x_batch, y_batch) in enumerate(dataloader):
            # Move tensors to device
            x_batch = x_batch.to(device, non_blocking=False)
            y_batch = y_batch.to(device)

            # Sample epistemic indices
            z = sample_z(len(y_batch), z_dim, device)

            # Forward pass
            logits = model(x_batch, z)
            loss = F.cross_entropy(logits, y_batch)

            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Track accuracy
            loss_value = loss.item()
            total_loss += loss_value
            num_batches += 1
            predicted = logits.argmax(dim=1)
            total += y_batch.size(0)
            correct += predicted.eq(y_batch).sum().item()
            acc = 100. * correct / total

            if batch_idx % 10 == 0:
                print(f"Epoch {epoch+1}/{epochs}, Batch {batch_idx}, Loss: {loss_value:.4f}, "
                      f"Acc: {acc:.2f}%")

            if writer is not None:
                writer.add_scalar("train/loss", loss_value, global_step)
                writer.add_scalar("train/acc",  acc,        global_step)
                if global_step % 50 == 0:
                    writer.flush()

            global_step += 1

        if num_batches == 0:
            # Previously this surfaced as a NameError on `batch_idx` below.
            raise ValueError("train_enn: dataloader yielded no batches")

        print(f"Epoch {epoch+1}/{epochs}, Average Loss: {total_loss/num_batches:.4f}, "
              f"Accuracy: {100.*correct/total:.2f}%")

        # Release cached allocator blocks on the backend actually in use;
        # calling the MPS API on a non-MPS host is expected to fail.
        if device_type == "mps":
            torch.mps.empty_cache()
        elif device_type == "cuda":
            torch.cuda.empty_cache()

    return model

# Also update the evaluation function
def evaluate_enn(model, dataloader, z_dim, device, num_samples=10):
    """Evaluate accuracy and logit variance across sampled epistemic indices.

    For every batch the model is run ``num_samples`` times with fresh ``z``
    draws. The prediction is the argmax of the mean logits; the uncertainty of
    an example is the variance of its logits across draws, summed over classes.

    Args:
        model: Module called as ``model(x_batch, z)`` returning class logits.
        dataloader: Iterable of ``(x_batch, y_batch)`` tensor pairs.
        z_dim: Dimensionality of the epistemic index.
        device: Device batches are moved to.
        num_samples: Number of z draws per batch (must be >= 1).

    Returns:
        Tuple ``(accuracy_percent, mean_uncertainty)``.

    Raises:
        ValueError: If ``num_samples < 1`` or the dataloader yields no examples.
    """
    if num_samples < 1:
        raise ValueError("evaluate_enn: num_samples must be at least 1")

    model.eval()
    total = 0
    correct = 0
    epistemic_uncertainty = []

    with torch.no_grad():
        for x_batch, y_batch in dataloader:
            x_batch = x_batch.to(device, non_blocking=False)
            y_batch = y_batch.to(device)
            batch_size = len(x_batch)

            # One forward pass per sampled z -> [num_samples, batch_size, num_classes]
            stacked_logits = torch.stack([
                model(x_batch, sample_z(batch_size, z_dim, device))
                for _ in range(num_samples)
            ])

            # Mean prediction
            predicted = stacked_logits.mean(dim=0).argmax(dim=1)
            total += y_batch.size(0)
            correct += predicted.eq(y_batch).sum().item()

            # Variance across samples, summed over classes -> [batch_size].
            # With a single sample the unbiased estimator is NaN, so fall back
            # to the biased one (which yields 0) in that case only.
            uncertainty = stacked_logits.var(dim=0, unbiased=num_samples > 1).sum(dim=1)
            epistemic_uncertainty.append(uncertainty)

    if total == 0:
        # Previously surfaced as ZeroDivisionError / an error from torch.cat([]).
        raise ValueError("evaluate_enn: dataloader yielded no examples")

    accuracy = 100. * correct / total
    avg_uncertainty = torch.cat(epistemic_uncertainty).mean().item()

    print(f"Test Accuracy: {accuracy:.2f}%")
    print(f"Average Epistemic Uncertainty: {avg_uncertainty:.4f}")

    return accuracy, avg_uncertainty

In [29]:
# Hyperparameters
num_classes = 2  # Binary classification for hallucination detection
z_dim = 16  # dimensionality of the epistemic index z
hidden_dims = [128, 64]  # hidden layer widths of the EpiNet MLP
lr = 1e-5  # Low learning rate; note the backbone runs under no_grad, so only the heads receive gradients
epochs = 3  # Reduce epochs for faster training with large model
λ = 1e-5  # passed to train_enn, which uses it as Adam weight_decay
batch_size = 1  # Smaller batch size for large model
max_length = 128  # read by HaluEvalDataset.__getitem__ as the padding/truncation length
import random

# Choose a Llama model - use a smaller version if memory is limited
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # Start with smaller model for testing

class HaluEvalDataset(Dataset):
    """Map raw HaluEval rows to ``(input_ids, label)`` pairs.

    Rows with both a correct and a hallucinated answer (QA, dialogue,
    summarisation) pick one of the two at random on every access
    (label 0 = factual, 1 = hallucination). Rows from the general split carry
    a single answer whose label comes from the row's yes/no flag.

    Args:
        data_items: Indexable collection of row dicts.
        tokenizer: Callable tokenizer returning a mapping with ``"input_ids"``.
        max_length: Padding/truncation length. When ``None`` (default) the
            module-level ``max_length`` is looked up at access time, which is
            the behaviour callers relied on before this parameter existed.
    """

    def __init__(self, data_items, tokenizer, max_length=None):
        self.data = data_items
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        gold_label = 0  # label used when the non-hallucinated/only answer is chosen

        # ---------- 1. pick the *content* string ----------
        if "question" in item:          # QA
            content = f"Context: {item['knowledge']}\nQuestion: {item['question']}"
            gold   = item["right_answer"]
            halluc = item["hallucinated_answer"]

        elif "dialogue_history" in item:      # Dialogue
            content = f"Context: {item['knowledge']}\nDialogue: {item['dialogue_history']}"
            gold   = item["right_response"]
            halluc = item["hallucinated_response"]

        elif "document" in item:              # Summarisation
            content = f"Document: {item['document']}"
            gold   = item["right_summary"]
            halluc = item["hallucinated_summary"]

        else:                                 # General split
            content = f"User query: {item['user_query']}"
            # general-split has only one answer plus a Yes/No label
            gold   = item["chatgpt_response"]
            halluc = None                     # there is no alt-answer
            # Use the row's own flag instead of always labelling the answer factual.
            # NOTE(review): assumes the flag column is named `hallucination` -- confirm against the dataset card.
            flag = str(item.get("hallucination", "no")).strip().lower()
            gold_label = 1 if flag == "yes" else 0

        # ---------- 2. choose gold vs hallucinated version ----------
        if halluc is not None and torch.rand(1).item() > 0.5:
            text  = f"{content}\nAnswer: {halluc}"
            label = 1            # hallucination
        else:
            text  = f"{content}\nAnswer: {gold}"
            label = gold_label   # factual, unless a general-split row is flagged

        # ---------- 3. tokenise ----------
        limit = self.max_length if self.max_length is not None else max_length
        enc = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=limit,
            return_tensors="pt",
        )

        return enc["input_ids"].squeeze(0), torch.tensor(label, dtype=torch.long)
    
# Initialize models
# The base network, epinet and prior are built separately, moved to `device`,
# then wrapped into a single EpistemicNN.
print("Initializing models...")
base = LlamaBaseNet(model_name, num_classes).to(device)
feature_dim = base.hidden_size
epinet = EpiNet(feature_dim, z_dim, hidden_dims, num_classes).to(device)
prior = PriorNet(feature_dim, z_dim, num_classes).to(device)
enn = EpistemicNN(base, epinet, prior).to(device)

# Create train/test splits from the loaded HaluEval dataset
# Note: rows are loaded again via load_dataset here; the JSONL paths returned by
# prepare_halueval_data_from_hf() are not read in this cell.
print("Loading dataset...")
all_data = []
categories = ["qa", "dialogue", "summarization", "general"]

for category in categories:
    print(f"Loading {category} dataset...")
    dataset = load_dataset("pminervini/HaluEval", category)
    # Rows of all categories are pooled into one Python list
    all_data.extend(dataset['data'])

# Shuffle and split the data
# Fixed seed so the 80/20 split over the pooled rows is repeatable
random.seed(42)
random.shuffle(all_data)
split_idx = int(0.8 * len(all_data))
train_data = all_data[:split_idx]
test_data = all_data[split_idx:]

# -------------------- build proper Dataset objects --------------------
# Both datasets share the tokenizer owned by the base network
train_dataset = HaluEvalDataset(train_data, base.tokenizer)
test_dataset  = HaluEvalDataset(test_data,  base.tokenizer)

trainloader = DataLoader(train_dataset,
                         batch_size=batch_size,
                         shuffle=True)

testloader  = DataLoader(test_dataset,
                         batch_size=batch_size,
                         shuffle=False)

print(f"Training samples: {len(train_data)}")
print(f"Testing samples: {len(test_data)}")

Initializing models...
Loading dataset...
Loading qa dataset...
Loading dialogue dataset...
Loading summarization dataset...
Loading general dataset...
Training samples: 27605
Testing samples: 6902


In [30]:
# small_loader = torch.utils.data.DataLoader(train_dataset, batch_size=1, shuffle=False)
# enn.eval();                # no grads
# x,y = next(iter(small_loader))
# z   = sample_z(1, z_dim, device)
# with torch.no_grad():      # forward only
#     _ = enn(x.to(device), z)
# print("✓ tiny batch ran")

In [31]:
import torch, gc

# Only touch the MPS allocator when the backend exists; the notebook also
# supports CUDA/CPU, where the MPS call is expected to fail.
if torch.backends.mps.is_available():
    torch.mps.empty_cache()   # releases unused cached blocks
gc.collect()              # Python-side garbage collection


1402

In [42]:
%load_ext tensorboard
%tensorboard --logdir runs

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 95975), started 0:00:15 ago. (Use '!kill 95975' to kill it.)

In [None]:
# Train the model first
# train_enn returns the model object it was given, so `trained_model` refers to `enn`
print("Training model...")
trained_model = train_enn(enn, trainloader, epochs, lr, λ, z_dim, device, tb_writer)

# Then evaluate the model
# Uses evaluate_enn's default number of z samples per batch
print("Evaluating model...")
accuracy, uncertainty = evaluate_enn(trained_model, testloader, z_dim, device)

# Save the model
# The state dict covers the whole EpistemicNN, including the base network it wraps
torch.save(trained_model.state_dict(), "enn_halueval_model.pt")
print("Model saved to enn_halueval_model.pt")
# Close the TensorBoard writer once training and evaluation are done
tb_writer.close()

Training model...




Epoch 1/3, Batch 0, Loss: 4.2329, Acc: 0.00%
Epoch 1/3, Batch 10, Loss: 1.6306, Acc: 36.36%
Epoch 1/3, Batch 20, Loss: 0.1659, Acc: 57.14%
Epoch 1/3, Batch 30, Loss: 1.9210, Acc: 51.61%
Epoch 1/3, Batch 40, Loss: 0.4872, Acc: 56.10%
Epoch 1/3, Batch 50, Loss: 1.0253, Acc: 52.94%
Epoch 1/3, Batch 60, Loss: 2.2024, Acc: 54.10%
Epoch 1/3, Batch 70, Loss: 3.1466, Acc: 56.34%
Epoch 1/3, Batch 80, Loss: 1.1493, Acc: 50.62%
Epoch 1/3, Batch 90, Loss: 0.5534, Acc: 52.75%
Epoch 1/3, Batch 100, Loss: 0.0941, Acc: 51.49%
Epoch 1/3, Batch 110, Loss: 0.0243, Acc: 53.15%
Epoch 1/3, Batch 120, Loss: 0.6817, Acc: 54.55%
Epoch 1/3, Batch 130, Loss: 0.0532, Acc: 53.44%
Epoch 1/3, Batch 140, Loss: 2.5350, Acc: 53.19%
Epoch 1/3, Batch 150, Loss: 0.9787, Acc: 54.30%
Epoch 1/3, Batch 160, Loss: 3.2568, Acc: 54.66%
Epoch 1/3, Batch 170, Loss: 0.0961, Acc: 55.56%
Epoch 1/3, Batch 180, Loss: 0.0854, Acc: 56.35%
Epoch 1/3, Batch 190, Loss: 0.1460, Acc: 56.54%
Epoch 1/3, Batch 200, Loss: 0.3086, Acc: 55.72%
Epoc

In [14]:
# Test with custom examples
def predict_with_uncertainty(model, texts, z_dim=16, num_samples=10):
    """Make predictions with uncertainty estimation.

    The model is run ``num_samples`` times with fresh epistemic indices; class
    probabilities are averaged and their variance across draws (summed over
    classes) is reported as the uncertainty.

    Args:
        model: Module called as ``model(texts, z)`` returning class logits.
        texts: A list of input strings; a single string is treated as a batch of one.
        z_dim: Dimensionality of the epistemic index.
        num_samples: Number of z draws (must be >= 1).

    Returns:
        Tuple of numpy arrays ``(predicted_class, uncertainty, hallucination_prob)``,
        one entry per input; ``hallucination_prob`` is the mean probability of class 1.

    Raises:
        ValueError: If ``texts`` is empty or ``num_samples < 1``.
    """
    # A bare string would otherwise be treated as a batch of len(str) items.
    if isinstance(texts, str):
        texts = [texts]
    if len(texts) == 0:
        raise ValueError("predict_with_uncertainty: texts must not be empty")
    if num_samples < 1:
        raise ValueError("predict_with_uncertainty: num_samples must be at least 1")

    device = next(model.parameters()).device
    model.eval()

    all_probs = []

    with torch.no_grad():
        for _ in range(num_samples):
            # Sample z for each input
            z = sample_z(len(texts), z_dim, device)

            # Get model predictions
            logits = model(texts, z)
            all_probs.append(torch.softmax(logits, dim=1))

    # Stack all predictions [num_samples, batch_size, num_classes]
    stacked_probs = torch.stack(all_probs)

    # Calculate mean and variance; with one sample the unbiased variance is NaN,
    # so use the biased estimator (0) in that case only.
    mean_probs = stacked_probs.mean(dim=0)
    uncertainty = stacked_probs.var(dim=0, unbiased=num_samples > 1).sum(dim=1)

    # Get class predictions
    predicted_class = mean_probs.argmax(dim=1)

    # Convert to numpy for easier handling
    predicted_class = predicted_class.cpu().numpy()
    uncertainty = uncertainty.cpu().numpy()
    hallucination_prob = mean_probs[:, 1].cpu().numpy()  # Assuming class 1 is hallucination

    return predicted_class, uncertainty, hallucination_prob

# Example inputs - add your own examples here
# Note: these use a "Q: ... A: ..." layout, which differs from the
# "Context/Question/Answer" strings that HaluEvalDataset builds for training.
sample_texts = [
    "Q: What is the capital of France? A: Paris is the capital of France.",
    "Q: How many planets are in our solar system? A: There are 9 planets in our solar system.",
    "Q: Who wrote 'The Great Gatsby'? A: F. Scott Fitzgerald wrote 'The Great Gatsby'.",
    "Q: What is the boiling point of water? A: Water boils at 130 degrees Celsius at sea level.",
    "Q: What's the smallest prime number? A: The smallest prime number is 2.",
    "Q: What is the tallest mountain? A: Mount Kilimanjaro is the tallest mountain in the world.",
]

# Make predictions with uncertainty
# Requires `trained_model` from the training cell; default z_dim / num_samples are used
classes, uncertainties, hallucination_probs = predict_with_uncertainty(trained_model, sample_texts)

# Print results
print("\n===== HALLUCINATION DETECTION RESULTS =====")
print("0 = Factual, 1 = Hallucination\n")

# One report per example: predicted class, mean class-1 probability, uncertainty
for i, (text, cls, uncertainty, prob) in enumerate(zip(sample_texts, classes, uncertainties, hallucination_probs)):
    print(f"Example {i+1}:")
    print(f"Text: {text}")
    print(f"Prediction: {'Hallucination' if cls == 1 else 'Factual'}")
    print(f"Hallucination Probability: {prob:.4f}")
    print(f"Epistemic Uncertainty: {uncertainty:.4f}")
    print("-" * 50)

# Print summary
print("\nSummary:")
print(f"Total examples: {len(sample_texts)}")
# classes holds 0/1 predictions, so the sum counts predicted hallucinations
print(f"Detected hallucinations: {sum(classes)}")
print(f"Average uncertainty: {uncertainties.mean():.4f}")

# Examples of highest and lowest uncertainty
most_uncertain_idx = uncertainties.argmax()
least_uncertain_idx = uncertainties.argmin()
print(f"\nMost uncertain: '{sample_texts[most_uncertain_idx]}'")
print(f"Least uncertain: '{sample_texts[least_uncertain_idx]}'")

KeyboardInterrupt: 