In [1]:
!pip install torch torchvision transformers tqdm requests datasets accelerate bitsandbytes tensorboard torch-tb-profiler openai anthropic google-generativeai


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
# --- Safe flags for Apple-silicon ---
import os, platform
os.environ["TOKENIZERS_PARALLELISM"]           = "false"   # avoid fork-after-tokenizer bug
os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.9"     # leave 10 % headroom, prevents sudden kills
os.environ["FLASH_ATTENTION_FORCE_DISABLE"]    = "1"       # disable Flash-Attn v2 path

In [3]:
import os
import json
import torch
import platform
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer, logging

from torch.utils.tensorboard import SummaryWriter
tb_writer = SummaryWriter("runs/halueval_llama")

# Reduce verbosity of transformers
logging.set_verbosity_error()

# Check if GPU is available
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(f"Using device: {device}") # change this to foundry gpu if needed

  from .autonotebook import tqdm as notebook_tqdm


Using device: mps


In [4]:
import torch, os, platform, psutil, time
print("Torch:", torch.__version__, "  Free RAM:", psutil.virtual_memory().available/1e9, "GB")
print("MPS cap:", torch.backends.mps.is_available())

Torch: 2.7.0   Free RAM: 5.742149632 GB
MPS cap: True


In [5]:
print(platform.platform(), torch.__version__)
print("MPS available:", torch.backends.mps.is_available())

macOS-13.4-arm64-arm-64bit 2.7.0
MPS available: True


In [6]:
# Define the base model using Llama from Hugging Face
class LlamaBaseNet(nn.Module):
    def __init__(self, model_name="meta-llama/Llama-2-7b-hf", num_classes=2):
        super().__init__()
        # Load Llama model and tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.backbone = AutoModel.from_pretrained(model_name)
        # self.backbone = self.backbone.half()           # fp16
        self.backbone.gradient_checkpointing_enable()  # save RAM
        
        # If the tokenizer doesn't have a padding token, set it
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
            
        # Get hidden size from config
        self.hidden_size = self.backbone.config.hidden_size
        
        # Classification head
        self.classifier = nn.Linear(self.hidden_size, num_classes)

    def forward(self, texts):
        # Tokenize and move to device
        if isinstance(texts, torch.Tensor):
            # If input is already tokenized
            inputs = {'input_ids': texts}
        else:
            # If input is raw text
            inputs = self.tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=512)
        
        inputs = {k: v.to(self.classifier.weight.device) for k, v in inputs.items()}
        
        # Get model outputs
        with torch.no_grad():  # Don't compute gradients for the backbone
            outputs = self.backbone(**inputs)
            
        # Use the last hidden state of the last token for classification
        last_hidden_states = outputs.last_hidden_state
        sequence_lengths = torch.ne(inputs['input_ids'], self.tokenizer.pad_token_id).sum(-1) - 1
        batch_size = last_hidden_states.shape[0]
        
        # Get the hidden state for the last token in each sequence
        features = last_hidden_states[torch.arange(batch_size), sequence_lengths]
        
        # Apply classifier
        logits = self.classifier(features)
        
        return logits, features

In [7]:
# hugging face auth

from huggingface_hub import login
from dotenv import load_dotenv
import os

load_dotenv()
hf_token = os.getenv("HUGGING_FACE_KEY")
login(token=hf_token)

In [8]:
# Load HaluEval dataset from Hugging Face
from datasets import load_dataset

def prepare_halueval_data_from_hf():
    """Load HaluEval dataset from Hugging Face"""
    print("Loading HaluEval dataset from Hugging Face...")
    
    # Create output directory
    output_dir = "data/halueval"
    os.makedirs(output_dir, exist_ok=True)
    
    # Process each split
    categories = ["qa", "dialogue", "summarization", "general"]
    
    # Prepare train and test sets
    for category in categories:
        print(f"Loading {category} dataset...")
        # Load the dataset for this category
        dataset = load_dataset("pminervini/HaluEval", category)
        
        # The dataset has a 'data' split containing all examples
        data = dataset['data']
        
        # Split into train/test (80/20 split)
        splits = data.train_test_split(test_size=0.2, seed=42)
        
        # Save as jsonl
        with open(f"{output_dir}/{category}_train.jsonl", 'w', encoding='utf-8') as f:
            for item in splits['train']:
                formatted_item = {
                    'question': item.get('instruction', ''),
                    'response': item.get('output', ''),
                    'is_hallucination': 1 if item.get('label') == 'hallucinated' else 0
                }
                f.write(json.dumps(formatted_item) + '\n')
        
        with open(f"{output_dir}/{category}_test.jsonl", 'w', encoding='utf-8') as f:
            for item in splits['test']:
                formatted_item = {
                    'question': item.get('instruction', ''),
                    'response': item.get('output', ''),
                    'is_hallucination': 1 if item.get('label') == 'hallucinated' else 0
                }
                f.write(json.dumps(formatted_item) + '\n')
    
    # Merge all training data
    print("Merging all training data...")
    with open(f"{output_dir}/train.jsonl", 'w', encoding='utf-8') as outfile:
        for category in categories:
            with open(f"{output_dir}/{category}_train.jsonl", 'r', encoding='utf-8') as infile:
                outfile.write(infile.read())
    
    # Merge all test data
    print("Merging all test data...")
    with open(f"{output_dir}/test.jsonl", 'w', encoding='utf-8') as outfile:
        for category in categories:
            with open(f"{output_dir}/{category}_test.jsonl", 'r', encoding='utf-8') as infile:
                outfile.write(infile.read())
    
    print("HaluEval dataset preparation complete!")
    print(f"Train data: {output_dir}/train.jsonl")
    print(f"Test data: {output_dir}/test.jsonl")
    
    return f"{output_dir}/train.jsonl", f"{output_dir}/test.jsonl"

# Run the function to get the paths
train_data_path, test_data_path = prepare_halueval_data_from_hf()

Loading HaluEval dataset from Hugging Face...
Loading qa dataset...
Loading dialogue dataset...
Loading summarization dataset...
Loading general dataset...
Merging all training data...
Merging all test data...
HaluEval dataset preparation complete!
Train data: data/halueval/train.jsonl
Test data: data/halueval/test.jsonl


In [9]:
# 3. Define the Epinet
class EpiNet(nn.Module):
    def __init__(self, feature_dim, z_dim, hidden_dims, num_classes):
        super().__init__()
        dims = [feature_dim + z_dim] + hidden_dims + [num_classes]
        layers = []
        for in_d, out_d in zip(dims, dims[1:]):
            layers += [nn.Linear(in_d, out_d), nn.ReLU()]
        self.mlp = nn.Sequential(*layers[:-1])  # drop final ReLU

    def forward(self, features, z):
        # stop-gradient on features
        features = features.detach()
        x = torch.cat([features, z], dim=1)
        return self.mlp(x)

# 4. Define the PriorNet
class PriorNet(nn.Module):
    def __init__(self, feature_dim, z_dim, num_classes):
        super().__init__()
        # Fixed random weights
        self.fc = nn.Linear(feature_dim + z_dim, num_classes)
        for p in self.parameters():
            p.requires_grad = False  # fix weights

    def forward(self, features, z):
        features = features.detach()
        x = torch.cat([features, z], dim=1)
        return self.fc(x)

In [10]:
# 5. Wrap into an Epistemic Neural Network
class EpistemicNN(nn.Module):
    def __init__(self, base: LlamaBaseNet, epinet: EpiNet, prior: PriorNet=None):
        super().__init__()
        self.base = base
        self.epinet = epinet
        self.prior = prior

    def forward(self, x, z):
        logits, features = self.base(x)         # base logits & features
        δ = self.epinet(features, z)            # learnable correction
        σP = self.prior(features, z) if self.prior else 0
        return logits + δ + σP

# 6. Sampling epistemic index z
def sample_z(batch_size, z_dim, device):
    # Gaussian prior
    return torch.randn(batch_size, z_dim, device=device)

In [28]:
def train_enn(model, dataloader, epochs, lr, λ, z_dim, device, writer):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=λ)
    model.train()
    
    for epoch in range(epochs):
        total_loss = 0
        correct = 0
        total = 0
        
        for batch_idx, (x_batch, y_batch) in enumerate(dataloader):
            # Move tensors to device
            x_batch = x_batch.to(device, non_blocking=False)
            y_batch = y_batch.to(device)
            
            # Sample epistemic indices
            z = sample_z(len(y_batch), z_dim, device)

            # Forward pass
            logits = model(x_batch, z)
            loss = F.cross_entropy(logits, y_batch)

            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            # Track accuracy
            total_loss += loss.item()
            _, predicted = logits.max(1)
            total += y_batch.size(0)
            correct += predicted.eq(y_batch).sum().item()
            
            if batch_idx % 10 == 0:
                print(f"Epoch {epoch+1}/{epochs}, Batch {batch_idx}, Loss: {loss.item():.4f}, "
                      f"Acc: {100.*correct/total:.2f}%")
                
            avg_loss = total_loss / (batch_idx + 1)
            acc      = 100. * correct / total
            global_step = epoch * len(dataloader) + batch_idx
            writer.add_scalar("train/loss", loss.item(), global_step)
            writer.add_scalar("train/acc",  acc,      global_step)

            if global_step % 50 == 0:
                writer.flush()
                
        print(f"Epoch {epoch+1}/{epochs}, Average Loss: {total_loss/(batch_idx+1):.4f}, "
              f"Accuracy: {100.*correct/total:.2f}%")

        torch.mps.empty_cache()
    
    return model

# Also update the evaluation function
def evaluate_enn(model, dataloader, z_dim, device, num_samples=10):
    model.eval()
    total = 0
    correct = 0
    epistemic_uncertainty = []
    
    with torch.no_grad():
        for x_batch, y_batch in dataloader:
            x_batch = x_batch.to(device, non_blocking=False)
            y_batch = y_batch.to(device)
            batch_size = len(x_batch)
            
            # Sample multiple z for each input
            all_logits = []
            for _ in range(num_samples):
                z = sample_z(batch_size, z_dim, device)
                logits = model(x_batch, z)
                all_logits.append(logits)
            
            # Stack all predictions
            stacked_logits = torch.stack(all_logits)  # [num_samples, batch_size, num_classes]
            
            # Mean prediction
            mean_logits = stacked_logits.mean(dim=0)
            _, predicted = mean_logits.max(1)
            total += y_batch.size(0)
            correct += predicted.eq(y_batch).sum().item()
            
            # Calculate uncertainty - variance across samples
            uncertainty = stacked_logits.var(dim=0).sum(dim=1)  # [batch_size]
            epistemic_uncertainty.append(uncertainty)
    
    accuracy = 100. * correct / total
    avg_uncertainty = torch.cat(epistemic_uncertainty).mean().item()
    
    print(f"Test Accuracy: {accuracy:.2f}%")
    print(f"Average Epistemic Uncertainty: {avg_uncertainty:.4f}")
    
    return accuracy, avg_uncertainty

In [29]:
# Hyperparameters
num_classes = 2  # Binary classification for hallucination detection
z_dim = 16
hidden_dims = [128, 64]
lr = 1e-5  # Lower learning rate for LLM fine-tuning
epochs = 3  # Reduce epochs for faster training with large model
λ = 1e-5
batch_size = 1  # Smaller batch size for large model
max_length = 128
import random

# Choose a Llama model - use a smaller version if memory is limited
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # Start with smaller model for testing

class HaluEvalDataset(Dataset):
    def __init__(self, data_items, tokenizer):
        self.data = data_items
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]

        # ---------- 1. pick the *content* string ----------
        if "question" in item:          # QA
            content = f"Context: {item['knowledge']}\nQuestion: {item['question']}"
            gold   = item["right_answer"]
            halluc = item["hallucinated_answer"]

        elif "dialogue_history" in item:      # Dialogue
            content = f"Context: {item['knowledge']}\nDialogue: {item['dialogue_history']}"
            gold   = item["right_response"]
            halluc = item["hallucinated_response"]

        elif "document" in item:              # Summarisation
            content = f"Document: {item['document']}"
            gold   = item["right_summary"]
            halluc = item["hallucinated_summary"]

        else:                                 # General split
            content = f"User query: {item['user_query']}"
            # general‐split has only one answer plus a Yes/No label
            gold   = item["chatgpt_response"]
            halluc = None                     # there is no alt-answer

        # ---------- 2. choose gold vs hallucinated version ----------
        if halluc is not None and torch.rand(1).item() > 0.5:
            text  = f"{content}\nAnswer: {halluc}"
            label = 1            # hallucination
        else:
            text  = f"{content}\nAnswer: {gold}"
            label = 0            # factual

        # ---------- 3. tokenise ----------
        enc = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=max_length,
            return_tensors="pt",
        )

        return enc["input_ids"].squeeze(0), torch.tensor(label, dtype=torch.long)
    
# Initialize models
print("Initializing models...")
base = LlamaBaseNet(model_name, num_classes).to(device)
feature_dim = base.hidden_size
epinet = EpiNet(feature_dim, z_dim, hidden_dims, num_classes).to(device)
prior = PriorNet(feature_dim, z_dim, num_classes).to(device)
enn = EpistemicNN(base, epinet, prior).to(device)

# Create train/test splits from the loaded HaluEval dataset
print("Loading dataset...")
all_data = []
categories = ["qa", "dialogue", "summarization", "general"]

for category in categories:
    print(f"Loading {category} dataset...")
    dataset = load_dataset("pminervini/HaluEval", category)
    all_data.extend(dataset['data'])

# Shuffle and split the data
random.seed(42)
random.shuffle(all_data)
split_idx = int(0.8 * len(all_data))
train_data = all_data[:split_idx]
test_data = all_data[split_idx:]

# -------------------- build proper Dataset objects --------------------
train_dataset = HaluEvalDataset(train_data, base.tokenizer)
test_dataset  = HaluEvalDataset(test_data,  base.tokenizer)

trainloader = DataLoader(train_dataset,
                         batch_size=batch_size,
                         shuffle=True)

testloader  = DataLoader(test_dataset,
                         batch_size=batch_size,
                         shuffle=False)

print(f"Training samples: {len(train_data)}")
print(f"Testing samples: {len(test_data)}")

Initializing models...
Loading dataset...
Loading qa dataset...
Loading dialogue dataset...
Loading summarization dataset...
Loading general dataset...
Training samples: 27605
Testing samples: 6902


In [30]:
# small_loader = torch.utils.data.DataLoader(train_dataset, batch_size=1, shuffle=False)
# enn.eval();                # no grads
# x,y = next(iter(small_loader))
# z   = sample_z(1, z_dim, device)
# with torch.no_grad():      # forward only
#     _ = enn(x.to(device), z)
# print("✓ tiny batch ran")

In [31]:
import torch, gc

torch.mps.empty_cache()   # releases unused cached blocks
gc.collect()              # Python-side garbage collection


1402

In [43]:
# Train the model first
print("Training model...")
trained_model = train_enn(enn, trainloader, epochs, lr, λ, z_dim, device, tb_writer)

# Then evaluate the model
print("Evaluating model...")
accuracy, uncertainty = evaluate_enn(trained_model, testloader, z_dim, device)

# Save the model
torch.save(trained_model.state_dict(), "enn_halueval_model.pt")
print("Model saved to enn_halueval_model.pt")
tb_writer.close()

Training model...




Epoch 1/3, Batch 0, Loss: 4.2329, Acc: 0.00%
Epoch 1/3, Batch 10, Loss: 1.6306, Acc: 36.36%
Epoch 1/3, Batch 20, Loss: 0.1659, Acc: 57.14%
Epoch 1/3, Batch 30, Loss: 1.9210, Acc: 51.61%
Epoch 1/3, Batch 40, Loss: 0.4872, Acc: 56.10%
Epoch 1/3, Batch 50, Loss: 1.0253, Acc: 52.94%
Epoch 1/3, Batch 60, Loss: 2.2024, Acc: 54.10%
Epoch 1/3, Batch 70, Loss: 3.1466, Acc: 56.34%
Epoch 1/3, Batch 80, Loss: 1.1493, Acc: 50.62%
Epoch 1/3, Batch 90, Loss: 0.5534, Acc: 52.75%
Epoch 1/3, Batch 100, Loss: 0.0941, Acc: 51.49%
Epoch 1/3, Batch 110, Loss: 0.0243, Acc: 53.15%
Epoch 1/3, Batch 120, Loss: 0.6817, Acc: 54.55%
Epoch 1/3, Batch 130, Loss: 0.0532, Acc: 53.44%
Epoch 1/3, Batch 140, Loss: 2.5350, Acc: 53.19%
Epoch 1/3, Batch 150, Loss: 0.9787, Acc: 54.30%
Epoch 1/3, Batch 160, Loss: 3.2568, Acc: 54.66%
Epoch 1/3, Batch 170, Loss: 0.0961, Acc: 55.56%
Epoch 1/3, Batch 180, Loss: 0.0854, Acc: 56.35%
Epoch 1/3, Batch 190, Loss: 0.1460, Acc: 56.54%
Epoch 1/3, Batch 200, Loss: 0.3086, Acc: 55.72%
Epoc

In [11]:
Z_DIM          = 16              # must match training
SAMPLES_Z      = 30              # how many z draws per candidate
TRADE_OFF_LAMB = 0.5             # higher ⇒ penalise uncertainty more
# DEVICE         = "mps" if torch.backends.mps.is_available() else (
#                  "cuda" if torch.cuda.is_available() else "cpu")
DEVICE = "cpu"

base   = LlamaBaseNet("TinyLlama/TinyLlama-1.1B-Chat-v1.0", num_classes=2).to(DEVICE)
epinet = EpiNet(base.hidden_size, Z_DIM, [128, 64], num_classes=2).to(DEVICE)
prior  = PriorNet(base.hidden_size, Z_DIM, num_classes=2).to(DEVICE)
enn    = EpistemicNN(base, epinet, prior).to(DEVICE)

state_dict = torch.load("enn_halueval_model.pt", map_location=DEVICE)
enn.load_state_dict(state_dict)
enn.eval()

@torch.no_grad()
def score_answer(raw_text):
    """Return (prob_factual, variance) for one candidate string."""
    logits_all = []
    for _ in range(SAMPLES_Z):
        z = sample_z(1, Z_DIM, DEVICE)
        logits = enn(raw_text, z)          # ← only one tensor comes back
        logits_all.append(logits)          # shape [1,2]

    logits_stacked = torch.stack(logits_all)      # [S,1,2]
    mean_logits    = logits_stacked.mean(0).squeeze()   # [2]
    variance       = logits_stacked.var(0).sum().item()
    prob_factual   = torch.softmax(mean_logits, dim=-1)[0].item()
    return prob_factual, variance

def select_best(question, answers, lam=TRADE_OFF_LAMB):
    rows = []
    for ans in answers:
        text = f"{question}\nAnswer: {ans}"           # keep template same as training
        p, var = score_answer(text)
        score  = p - lam * var
        rows.append((score, p, var, ans))
    rows.sort(reverse=True)
    return rows[0], rows     # (best_row, all_rows)

In [None]:
# MAKE SURE TO BENCHMARK
from dotenv import load_dotenv
import os
from openai import OpenAI

load_dotenv()

OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

client = OpenAI(api_key=OPENAI_API_KEY)
def get_openai_ensemble(question, temps=[0.01, 0.3, 0.7, 1.0], model="gpt-4o"):
    answers = []
    for temp in temps:
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": question}],
            temperature=temp,
            max_completion_tokens=64
        )
        answers.append(response.choices[0].message.content)
    return answers

# Example usage
question = "Who discovered the fibonacci sequence"
ensemble_answers = get_openai_ensemble(question)
for i, ans in enumerate(ensemble_answers):
    print(f"Temperature {['0.01','0.3','0.7','1.0'][i]}: {ans}")

Temperature 0.01: The Fibonacci sequence is named after the Italian mathematician Leonardo of Pisa, who is also known as Fibonacci. He introduced the sequence to the Western world in his 1202 book "Liber Abaci" (The Book of Calculation). However, the sequence had been previously described in Indian mathematics. Indian mathematicians such as Ping
Temperature 0.3: The Fibonacci sequence is named after the Italian mathematician Leonardo of Pisa, who is also known as Fibonacci. He introduced the sequence to the Western world in his 1202 book "Liber Abaci" (The Book of Calculation). However, the sequence had been previously described in Indian mathematics. Indian mathematicians such as Ping
Temperature 0.7: The Fibonacci sequence is named after the Italian mathematician Leonardo of Pisa, who is also known as Fibonacci. He introduced the sequence to the Western world in his 1202 book "Liber Abaci" (The Book of Calculation). However, the sequence had been previously described in Indian mathem

In [13]:
# currently all the same answer, so nothing crazy

best, table = select_best(question, ensemble_answers)
print("===== ENSEMBLE EVALUATION =====")
print(f"Best answer (score={best[0]:+.3f}, p_factual={best[1]:.3f}, var={best[2]:.3f}):")
print(best[3])
print("\nAll answers ranked by score (score = p_factual - λ·var):")
for rank, (score, p_factual, var, ans) in enumerate(table, 1):
    print(f"\n{rank}. Temperature {['0.01','0.3','0.7','1.0'][rank-1]}:")
    print(f"   Score: {score:+.3f} | p_factual: {p_factual:.3f} | var: {var:.3f}")
    print(f"   Answer: {ans}")

===== ENSEMBLE EVALUATION =====
Best answer (score=+0.687, p_factual=0.688, var=0.003):
The Fibonacci sequence is named after the Italian mathematician Leonardo of Pisa, also known as Fibonacci. He introduced this sequence to the Western world in his 1202 book "Liber Abaci" (The Book of Calculation). However, the sequence had been previously described in Indian mathematics. In particular, it appeared in the works

All answers ranked by score (score = p_factual - λ·var):

1. Temperature 0.01:
   Score: +0.687 | p_factual: 0.688 | var: 0.003
   Answer: The Fibonacci sequence is named after the Italian mathematician Leonardo of Pisa, also known as Fibonacci. He introduced this sequence to the Western world in his 1202 book "Liber Abaci" (The Book of Calculation). However, the sequence had been previously described in Indian mathematics. In particular, it appeared in the works

2. Temperature 0.3:
   Score: +0.430 | p_factual: 0.432 | var: 0.005
   Answer: The Fibonacci sequence is named a

In [None]:
# MAKE SURE TO BENCHMARK
from dotenv import load_dotenv
import os
from openai import OpenAI
from anthropic import Anthropic
import google.genai as genai

load_dotenv()

# Load API keys and initialize clients
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
ANTHROPIC_API_KEY = os.getenv('ANTHROPIC_API_KEY')
GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')

print("Environment check:")
print(f"OpenAI API key loaded: {'Yes' if OPENAI_API_KEY else 'No'}")
print(f"Anthropic API key loaded: {'Yes' if ANTHROPIC_API_KEY else 'No'}")
print(f"Google API key loaded: {'Yes' if GOOGLE_API_KEY else 'No'}")

# Initialize clients if keys are available
openai_client = OpenAI(api_key=OPENAI_API_KEY) if OPENAI_API_KEY else None
anthropic_client = Anthropic(api_key=ANTHROPIC_API_KEY) if ANTHROPIC_API_KEY else None
google_client = genai.Client(api_key=GOOGLE_API_KEY)

# if GOOGLE_API_KEY:
#     genai.configure(api_key=GOOGLE_API_KEY)

def get_multi_provider_ensemble(question, providers=None):
    """Get answers from multiple providers with different settings.
    
    Args:
        question (str): The question to ask
        providers (list): List of tuples (provider, model, temp)
                        If None, uses default configuration
    
    Default providers:
    [
        ("openai", "gpt-4", 0.3),
        ("openai", "gpt-3.5-turbo", 0.3),
        ("anthropic", "claude-3-opus", 0.3),
        ("anthropic", "claude-3-sonnet", 0.3),
        ("google", "gemini-pro", 0.3)
    ]
    
    Returns:
        list: List of answers from different providers
        list: List of provider descriptions for each answer
    """
    if providers is None:
        providers = [
            ("openai", "gpt-4", 0.3),
            ("openai", "gpt-3.5-turbo", 0.3),
            ("anthropic", "claude-3-7-sonnet-20250219", 0.3),
            ("anthropic", "claude-3-5-sonnet-20241022", 0.3),
            ("google", "gemini-2.0-flash", 0.3)
        ]
    
    answers = []
    provider_info = []  # Track which provider/model gave which answer
    
    for provider, model, temp in providers:
        try:
            if provider == "openai" and openai_client:
                response = openai_client.chat.completions.create(
                    model=model,
                    messages=[{"role": "user", "content": question}],
                    temperature=temp,
                    max_tokens=64
                )
                answers.append(response.choices[0].message.content)
                provider_info.append(f"{provider}-{model} (temp={temp})")
                
            elif provider == "anthropic" and anthropic_client:
                response = anthropic_client.messages.create(
                    model=model,
                    max_tokens=64,
                    temperature=temp,
                    messages=[{"role": "user", "content": question}]
                )
                answers.append(response.content[0].text)
                provider_info.append(f"{provider}-{model} (temp={temp})")
                
            elif provider == "google" and GOOGLE_API_KEY:
                # model_obj = genai.GenerativeModel(model)
                response = google_client.models.generate_content(
                    model=model,
                    contents=question,
                    config=genai.types.GenerateContentConfig(
                        temperature=temp,
                        max_output_tokens=64
                    )
                )
                answers.append(response.text)
                provider_info.append(f"{provider}-{model} (temp={temp})")
                
        except Exception as e:
            print(f"Error with {provider} {model}: {str(e)}")
            continue
            
    return answers, provider_info

# Example usage
question = "Who discovered the fibonacci sequence"

# Default usage with all providers
answers, providers = get_multi_provider_ensemble(question)

# Print answers with provider information
print("\nEnsemble Responses:")
for ans, provider in zip(answers, providers):
    print(f"\n{provider}:")
    print(ans)

# Custom provider configuration example
custom_providers = [
    ("openai", "gpt-4o", 0.1),
    ("openai", "gpt-4o", 0.7),
    ("anthropic", "claude-3-7-sonnet-20250219", 0.3),
    ("google", "gemini-2.0-flash", 0.5)
]

# Get answers with custom configuration
custom_answers, custom_providers = get_multi_provider_ensemble(question, custom_providers)

# Evaluate with ENN
best, table = select_best(question, custom_answers)
print("\n===== ENSEMBLE EVALUATION =====")
print(f"Best answer (score={best[0]:+.3f}, p_factual={best[1]:.3f}, var={best[2]:.3f}):")
print(best[3])
print("\nAll answers ranked by score (score = p_factual - λ·var):")
for rank, (score, p_factual, var, ans) in enumerate(table, 1):
    print(f"\n{rank}. {custom_providers[rank-1]}:")
    print(f"   Score: {score:+.3f} | p_factual: {p_factual:.3f} | var: {var:.3f}")
    print(f"   Answer: {ans}")

Environment check:
OpenAI API key loaded: Yes
Anthropic API key loaded: Yes
Google API key loaded: Yes

Ensemble Responses:

openai-gpt-4 (temp=0.3):
The Fibonacci sequence was discovered by Leonardo of Pisa, who is also known as Fibonacci. He introduced it to the Western world in his 1202 book "Liber Abaci," but the sequence had been previously described in Indian mathematics.

openai-gpt-3.5-turbo (temp=0.3):
The Fibonacci sequence was discovered by Leonardo of Pisa, also known as Fibonacci, an Italian mathematician in the 13th century.

anthropic-claude-3-7-sonnet-20250219 (temp=0.3):
The Fibonacci sequence was first described by Leonardo of Pisa, who was also known as Fibonacci, in his book "Liber Abaci" published in 1202. However, Indian mathematicians were already aware of the sequence as early as the 6th century. Fibonacci introduced this sequence

anthropic-claude-3-5-sonnet-20241022 (temp=0.3):
The Fibonacci sequence was first described by Leonardo of Pisa, better known as Fib

In [32]:
from dotenv import load_dotenv
import os
from openai import OpenAI
from anthropic import Anthropic
import google.genai as genai

load_dotenv()

# Initialize API clients
openai_client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
anthropic_client = Anthropic(api_key=os.getenv('ANTHROPIC_API_KEY'))
google_client = genai.Client(api_key=GOOGLE_API_KEY)

def get_multi_model_ensemble(question, knowledge=None):
    """Get answers from 6 specified models with different configurations.
    
    Models:
    1. GPT-4 (o4-mini)
    2. GPT-4 (o1-mini)
    3. Claude 3 Sonnet
    4. Claude 3 Haiku
    5. Gemini Flash Lite
    6. Gemini Flash Preview
    """
    answers = []
    provider_info = []
    
    # Format the prompt with knowledge if provided
    prompt = f"Knowledge: {knowledge}\nQuestion: {question}" if knowledge else question
    
    try:
        # 1. GPT-4 (o4-mini)
        response = openai_client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.3,
            max_completion_tokens=64
        )
        answers.append(response.choices[0].message.content)
        provider_info.append("openai-gpt-4o")
        
        # 2. GPT-4 (o1-mini)
        response = openai_client.chat.completions.create(
            model="gpt-4.1",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.3,
            max_completion_tokens=64
        )
        answers.append(response.choices[0].message.content)
        provider_info.append("openai-o1-mini")
        
        # 3. Claude 3 Sonnet
        response = anthropic_client.messages.create(
            model="claude-3-7-sonnet-20250219",
            max_tokens=64,
            temperature=0.3,
            messages=[{"role": "user", "content": prompt}]
        )
        answers.append(response.content[0].text)
        provider_info.append("anthropic-claude-3-sonnet")
        
        # 4. Claude 3 Haiku
        response = anthropic_client.messages.create(
            model="claude-3-5-haiku-20241022",
            max_tokens=64,
            temperature=0.3,
            messages=[{"role": "user", "content": prompt}]
        )
        answers.append(response.content[0].text)
        provider_info.append("anthropic-claude-3-haiku")
        
        # 5. Gemini Flash Lite

        # FIX GEMINI CHAT COMPLETION SYNTAX

        # model = genai.GenerativeModel("gemini-2.0-flash-lite")
        response = google_client.models.generate_content(
        model="gemini-2.0-flash-lite",  # or "gemini-2.5-flash-preview-04-17"
        contents=prompt,
        config=genai.types.GenerateContentConfig(
            temperature=0.3,
            max_output_tokens=64
            )
        )
        answers.append(response.text)
        provider_info.append("google-gemini-flash-lite")
        
        # 6. Gemini Flash Preview
        # model = genai.GenerativeModel("gemini-2.5-flash-preview-04-17")
        response = google_client.models.generate_content(
            model="gemini-2.5-flash-preview-04-17",
            contents=question,
            config=genai.types.GenerateContentConfig(
                temperature=0.3,
                max_output_tokens=64
            )
        )
        answers.append(response.text)
        provider_info.append("google-gemini-flash-preview")
        
    except Exception as e:
        print(f"Error during API calls: {str(e)}")
        
    return answers, provider_info

# Test the ensemble with HaluEval dataset
def evaluate_on_halueval(num_samples=5):
    """Evaluate the ensemble on HaluEval dataset samples."""
    # Load HaluEval dataset
    from datasets import load_dataset
    
    # Load a few samples from each category
    results = []
    categories = ["qa", "dialogue", "summarization", "general"]
    
    for category in categories:
        dataset = load_dataset("pminervini/HaluEval", category, split="data")
        samples = dataset.select(range(min(num_samples, len(dataset))))
        
        for item in samples:
            # Get the question and knowledge
            if "question" in item:
                question = item["question"]
                knowledge = item.get("knowledge", "")
            elif "dialogue_history" in item:
                question = item["dialogue_history"]
                knowledge = item.get("knowledge", "")
            elif "document" in item:
                question = "Summarize this document"
                knowledge = item["document"]
            else:
                question = item["user_query"]
                knowledge = ""
            
            # Get ensemble answers
            answers, providers = get_multi_model_ensemble(question, knowledge)
            
            # Evaluate with ENN
            best, table = select_best(question, answers)
            
            # Store results
            results.append({
                "category": category,
                "question": question,
                "knowledge": knowledge,
                "answers": answers,
                "providers": providers,
                "best_answer": best,
                "all_scores": table
            })
            
            # Print results for this sample
            print(f"\n===== {category.upper()} SAMPLE =====")
            print(f"Question: {question}")
            print(f"Knowledge: {knowledge[:100]}...")
            print("\nModel responses and scores:")
            for (score, p_factual, var, ans), provider in zip(table, providers):
                print(f"\n{provider}:")
                print(f"Score: {score:+.3f} | p_factual: {p_factual:.3f} | var: {var:.3f}")
                print(f"Answer: {ans[:100]}...")
            
            print("\nBest Answer:")
            print(f"Score: {best[0]:+.3f} | p_factual: {best[1]:.3f} | var: {best[2]:.3f}")
            print(f"Answer: {best[3]}")
            
    return results

# Run evaluation
print("Starting HaluEval evaluation...")
results = evaluate_on_halueval(num_samples=100)  # Test with k samples per category
print("\nEvaluation complete!")

Starting HaluEval evaluation...

===== QA SAMPLE =====
Question: Which magazine was started first Arthur's Magazine or First for Women?
Knowledge: Arthur's Magazine (1844–1846) was an American literary periodical published in Philadelphia in the 1...

Model responses and scores:

openai-gpt-4o:
Score: +0.408 | p_factual: 0.410 | var: 0.005
Answer: Arthur's Magazine was started first. It was published in the 19th century (1844-1846), while First f...

openai-o1-mini:
Score: +0.394 | p_factual: 0.396 | var: 0.005
Answer: ...

anthropic-claude-3-sonnet:
Score: +0.339 | p_factual: 0.341 | var: 0.004
Answer: Based on the provided information, Arthur's Magazine was started first. Arthur's Magazine was publis...

anthropic-claude-3-haiku:
Score: +0.043 | p_factual: 0.045 | var: 0.005
Answer: Based on the given knowledge, Arthur's Magazine was started first. It was published from 1844 to 184...

google-gemini-flash-lite:
Score: +0.019 | p_factual: 0.021 | var: 0.003
Answer: Arthur's Magazine w

In [None]:
# Import required libraries for evaluation
from dotenv import load_dotenv
import os
from openai import OpenAI
from anthropic import Anthropic
import google.genai as genai
import time
import json
from tqdm.notebook import tqdm
import numpy as np

load_dotenv()

# Initialize API clients
openai_client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
anthropic_client = Anthropic(api_key=os.getenv('ANTHROPIC_API_KEY'))
google_client = genai.Client(api_key=GOOGLE_API_KEY)

def evaluate_full_halueval(max_samples_per_category=None, save_results=True, checkpoint_interval=10):
    """
    Evaluate the ensemble on the full HaluEval dataset with comprehensive metrics.
    
    Args:
        max_samples_per_category: Maximum samples to use per category (None for all)
        save_results: Whether to save results to disk
        checkpoint_interval: Save results every N samples
    """
    from datasets import load_dataset
    
    # Initialize results storage
    results = {
        "qa": [],
        "dialogue": [],
        "summarization": [],
        "general": []
    }
    
    # Track metrics
    metrics = {
        "qa": {"scores": [], "p_factual": [], "variance": [], "best_models": []},
        "dialogue": {"scores": [], "p_factual": [], "variance": [], "best_models": []},
        "summarization": {"scores": [], "p_factual": [], "variance": [], "best_models": []},
        "general": {"scores": [], "p_factual": [], "variance": [], "best_models": []}
    }
    
    # For comparing to "ground truth" (if available)
    ground_truth = {
        "qa": {"y_true": [], "y_pred": []},
        "dialogue": {"y_true": [], "y_pred": []},
        "summarization": {"y_true": [], "y_pred": []},
        "general": {"y_true": [], "y_pred": []}
    }
    
    # Track rate limits and timing
    rate_limit_counter = 0
    start_time = time.time()
    
    # Process by category
    for category in ["qa", "dialogue", "summarization", "general"]:
        print(f"\n===== Processing {category.upper()} category =====")
        
        # Load dataset
        try:
            dataset = load_dataset("pminervini/HaluEval", category, split="data")
            print(f"Loaded {len(dataset)} examples from {category} category")
            
            # Limit samples if specified
            if max_samples_per_category is not None:
                dataset = dataset.select(range(min(max_samples_per_category, len(dataset))))
                print(f"Using {len(dataset)} examples")
            
            # Process each example with progress bar
            for idx, item in enumerate(tqdm(dataset, desc=f"Processing {category}")):
                # Get the question and knowledge
                if "question" in item:
                    question = item["question"]
                    knowledge = item.get("knowledge", "")
                    # For ground truth comparison if available
                    right_answer = item.get("right_answer", "")
                    hallucinated_answer = item.get("hallucinated_answer", "")
                elif "dialogue_history" in item:
                    question = item["dialogue_history"]
                    knowledge = item.get("knowledge", "")
                    right_answer = item.get("right_response", "")
                    hallucinated_answer = item.get("hallucinated_response", "")
                elif "document" in item:
                    question = "Summarize this document"
                    knowledge = item["document"]
                    right_answer = item.get("right_summary", "")
                    hallucinated_answer = item.get("hallucinated_summary", "")
                else:
                    question = item["user_query"]
                    knowledge = ""
                    right_answer = item.get("chatgpt_response", "")
                    hallucinated_answer = ""
                
                # Rate limit management (to avoid API throttling)
                if rate_limit_counter >= 20:  # Reset after 20 API calls
                    time.sleep(10)  # Wait 10 seconds
                    rate_limit_counter = 0
                
                # Get ensemble answers
                answers, providers = get_multi_model_ensemble(question, knowledge)
                rate_limit_counter += 6  # 6 API calls made
                
                # If we have ground truth, record it
                if right_answer and hallucinated_answer:
                    # We have paired examples - could be used for true accuracy
                    ground_truth[category]["y_true"].append(0)  # 0 = not hallucinated
                    ground_truth[category]["y_true"].append(1)  # 1 = hallucinated
                
                # Evaluate with ENN
                best, table = select_best(question, answers)
                
                # Record which model had the best answer
                best_model_idx = [score for score, _, _, _ in table].index(best[0])
                best_model = providers[best_model_idx]
                
                # Store metrics
                metrics[category]["scores"].append(best[0])
                metrics[category]["p_factual"].append(best[1]) 
                metrics[category]["variance"].append(best[2])
                metrics[category]["best_models"].append(best_model)
                
                # Store full result
                results[category].append({
                    "id": idx,
                    "question": question,
                    "knowledge": knowledge,
                    "answers": answers,
                    "providers": providers,
                    "best_answer": best,
                    "all_scores": table
                })
                
                # Periodically save checkpoints
                if save_results and (idx + 1) % checkpoint_interval == 0:
                    checkpoint_file = f"halueval_{category}_checkpoint_{idx+1}.json"
                    with open(checkpoint_file, 'w') as f:
                        json.dump(results[category], f, indent=2)
                    print(f"Saved checkpoint for {category} at sample {idx+1}")
        
        except Exception as e:
            print(f"Error processing {category} category: {str(e)}")
        
        # Save category results
        if save_results:
            with open(f"halueval_{category}_results.json", 'w') as f:
                json.dump(results[category], f, indent=2)
            print(f"Saved results for {category} category")
    
    # Calculate and display overall metrics
    print("\n===== OVERALL EVALUATION RESULTS =====")
    
    # Aggregate metrics across all categories
    all_scores = []
    all_p_factual = []
    all_variance = []
    all_best_models = []
    
    for category in ["qa", "dialogue", "summarization", "general"]:
        cat_scores = metrics[category]["scores"]
        cat_p_factual = metrics[category]["p_factual"]
        cat_variance = metrics[category]["variance"]
        cat_best_models = metrics[category]["best_models"]
        
        if cat_scores:  # Only process if we have data
            # Category stats
            avg_score = sum(cat_scores) / len(cat_scores)
            avg_p_factual = sum(cat_p_factual) / len(cat_p_factual)
            avg_variance = sum(cat_variance) / len(cat_variance)
            
            # Model preference
            model_counts = {}
            for model in cat_best_models:
                model_counts[model] = model_counts.get(model, 0) + 1
            
            preferred_model = max(model_counts.items(), key=lambda x: x[1])
            
            print(f"\n{category.upper()} Category Stats:")
            print(f"  Samples processed: {len(cat_scores)}")
            print(f"  Average Score: {avg_score:.3f}")
            print(f"  Average p_factual: {avg_p_factual:.3f}")
            print(f"  Average Variance: {avg_variance:.3f}")
            print(f"  Preferred Model: {preferred_model[0]} ({preferred_model[1]} times, {preferred_model[1]/len(cat_best_models)*100:.1f}%)")
            
            # Extend to global lists
            all_scores.extend(cat_scores)
            all_p_factual.extend(cat_p_factual)
            all_variance.extend(cat_variance)
            all_best_models.extend(cat_best_models)
    
    # Global stats
    if all_scores:
        print("\nGLOBAL Stats:")
        print(f"  Total samples processed: {len(all_scores)}")
        print(f"  Average Score: {sum(all_scores)/len(all_scores):.3f}")
        print(f"  Average p_factual: {sum(all_p_factual)/len(all_p_factual):.3f}")
        print(f"  Average Variance: {sum(all_variance)/len(all_variance):.3f}")
        
        # Global model preference
        model_counts = {}
        for model in all_best_models:
            model_counts[model] = model_counts.get(model, 0) + 1
        
        # Sort models by frequency
        sorted_models = sorted(model_counts.items(), key=lambda x: x[1], reverse=True)
        
        print("\nModel Performance Ranking:")
        for rank, (model, count) in enumerate(sorted_models, 1):
            print(f"  {rank}. {model}: {count} wins ({count/len(all_best_models)*100:.1f}%)")
    
    # Calculate elapsed time
    elapsed_time = time.time() - start_time
    hours, remainder = divmod(elapsed_time, 3600)
    minutes, seconds = divmod(remainder, 60)
    print(f"\nTotal evaluation time: {int(hours)}h {int(minutes)}m {int(seconds)}s")
    
    # Save final aggregated results
    if save_results:
        with open("halueval_full_evaluation_results.json", 'w') as f:
            json.dump({
                "metrics": metrics,
                "ground_truth": ground_truth,
                "elapsed_time": elapsed_time
            }, f, indent=2)
        print("\nSaved full evaluation results")
    
    return results, metrics

# Run full evaluation with configurable sample size
# Set max_samples_per_category to a smaller number for a quicker test
# Use None to run on the entire dataset
print("Starting full HaluEval benchmark evaluation...")
# Adjust this number based on how much data you want to process
# For a small test: max_samples_per_category=10 (40 total samples)
# For medium test: max_samples_per_category=100 (400 total samples)
# For full evaluation: max_samples_per_category=None (all ~6,700 samples)
results, metrics = evaluate_full_halueval(max_samples_per_category=None, checkpoint_interval=5)
print("\nEvaluation complete!")