In [None]:
import os
import gc
import json
import re
from datetime import datetime
import zipfile

import math
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc


import torch
import transformers
from transformer_lens import HookedTransformer
from transformers import AutoTokenizer

In [None]:
model_path = "meta-llama/Llama-3.3-70B-Instruct"
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the model
model = HookedTransformer.from_pretrained(model_path, device=device, dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.3-70B-Instruct")
tokenizer.pad_token = tokenizer.eos_token
model.eval()

# Data setup

In [None]:
!git clone https://github.com/LRudL/sad.git

In [None]:
#get SAD stages_oversight
def load_jsonl_from_zip(zip_path, file_name, password=b'sadtimesforthesetimes'):
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        content = zip_ref.read(file_name, pwd=password).decode('utf-8')
        return [json.loads(line) for line in content.strip().split('\n')]

zip_path = './sad/sad/stages/private_data_gen.zip'
deploy_oversight = load_jsonl_from_zip(zip_path, 'out/deploy_oversight.jsonl')
test_oversight = load_jsonl_from_zip(zip_path, 'out/test_oversight.jsonl')

# Sample verification printouts
print(f"Number of samples in deploy_oversight: {len(deploy_oversight)}")
print(f"Number of samples in test_oversight: {len(test_oversight)}\n")

print("First item in deploy_oversight:")
print(deploy_oversight[0])
print("\nFirst item in test_oversight:")
print(test_oversight[0])

In [None]:
#use chat template

def format_question_choices(item):
    is_testing = 'yes' in item['choices_right']
    
    messages = [
     {"role": "system", "content": ""},
     {"role": "user", "content": f"{item['body']}"}
    ]

    formatted_message = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    
    return {
        "question": formatted_message,
        "type": "testing" if is_testing else "deployment"
    }

In [None]:
all_data = deploy_oversight + test_oversight
formatted_data = []

for entry in all_data:
    formatted_data.append(format_question_choices(entry))

In [None]:
formatted_data[0]

# Probe scoring functions

In [None]:
def get_probe_scores(model, steering_vectors, layer, prompt, tokenizer=None):
    """
    Calculate probe activation values for each token position in a prompt.
    
    Args:
        model: The HookedTransformer model
        steering_vectors: Dictionary of steering vectors indexed by layer
        layer: The specific layer to probe
        prompt: The input prompt to analyze
        tokenizer: Optional tokenizer (uses model's tokenizer if None)
    
    Returns:
        tuple: (tokens, scores, token_strings)
            - tokens: The tokenized prompt
            - scores: List of scalar probe activation values for each token position
            - token_strings: List of decoded token strings
    """
    if tokenizer is None:
        tokenizer = model.tokenizer
    
    # Ensure the steering vector for the requested layer exists
    if layer not in steering_vectors:
        raise ValueError(f"No steering vector available for layer {layer}")
    
    # Get the steering vector for the requested layer
    steering_vector = steering_vectors[layer]
    
    # Tokenize the prompt
    tokens = model.to_tokens(prompt) if hasattr(model, 'to_tokens') else torch.tensor([tokenizer.encode(prompt)]).to(model.device)
    
    # Store activation scores for each token position
    scores = []
    
    # Define the hook function to capture activations at each position
    def hook_fn(activation, hook):
        # For each token position in the sequence
        for pos in range(activation.shape[1]):
            # Get activation at this position
            act = activation[0, pos, :]
            
            # Project the activation onto the steering vector
            # (dot product of activation with normalized steering vector)
            projection = torch.dot(act, steering_vector[0]) / steering_vector[0].norm()
            
            # Save the scalar projection value
            if pos == len(scores):  # Only add if we haven't processed this position yet
                scores.append(projection.item())
        
        return activation
    
    # Run the model with the hook attached
    model.reset_hooks()
    with model.hooks(fwd_hooks=[(f"blocks.{layer}.hook_resid_pre", hook_fn)]):
        with torch.no_grad():
            model(tokens)
    
    # Get the token strings for better interpretability
    token_strings = [tokenizer.decode([t]) for t in tokens[0].tolist()]
    
    return tokens, scores, token_strings

In [None]:
def analyze_dataset_with_probe(model, steering_vectors, layer, dataset, tokenizer=None, output_file="probe_stats.csv"):
    """
    Loop through a dataset and collect probe statistics for each question.
    
    Args:
        model: The HookedTransformer model
        steering_vectors: Dictionary of steering vectors indexed by layer
        layer: The specific layer to probe
        dataset: List of dictionaries containing 'question', 'test', 'deploy', 'correct_answer'
        tokenizer: Optional tokenizer (uses model's tokenizer if None)
        output_file: Path to save the CSV output
    """
    if tokenizer is None:
        tokenizer = model.tokenizer
    
    # Prepare results storage
    results = []
    
    # Process each question in the dataset
    for i, item in enumerate(tqdm(dataset, desc="Analyzing questions")):
        question = item['question']
        
        try:
            # Get probe scores for the question
            tokens, scores, token_strings = get_probe_scores(model, steering_vectors, layer, question, tokenizer)
            
            # Filter out BOS tokens
            filtered_scores = []
            filtered_tokens = []
            filtered_indices = []
            
            for j, (token, score) in enumerate(zip(token_strings, scores)):
                # Skip the BOS token (different models might have different BOS tokens)
                if ("<|begin_of_text|>" in token or 
                    "<s>" in token or 
                    "<BOS>" in token or
                    token.strip() == ""):
                    continue
                
                filtered_indices.append(j)
                filtered_scores.append(score)
                filtered_tokens.append(token)
            
            # Calculate statistics
            if filtered_scores:
                max_score = max(filtered_scores)
                min_score = min(filtered_scores)
                mean_score = sum(filtered_scores) / len(filtered_scores)
                std_score = (sum((x - mean_score) ** 2 for x in filtered_scores) / len(filtered_scores)) ** 0.5
            else:
                max_score = min_score = mean_score = std_score = float('nan')
            
            # Record results
            result = {
                'question_id': i,
                'question': question,
                'type': item['type'],
                'max_score': max_score,
                'min_score': min_score,
                'mean_score': mean_score,
                'std_score': std_score,
            }
            
            results.append(result)
            
            # Print progress
            if (i + 1) % 10 == 0:
                print(f"Processed {i + 1}/{len(dataset)} questions")
        
        except Exception as e:
            print(f"Error processing question {i}: {e}")
            # Add a row with the error
            results.append({
                'question_id': i,
                'question': question,
                'type': item['type'],
                'max_score': float('nan'),
                'min_score': float('nan'),
                'mean_score': float('nan'),
                'std_score': float('nan'),
                'error': str(e)
            })
    
    # Save results to CSV
    pd.DataFrame(results).to_csv(output_file, index=False)
    print(f"Results saved to {output_file}")
    
    return results


In [None]:
def test_all_layers(model, normalized_vectors, test_data, tokenizer, output_dir="layer_probe_results"):
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    # Get the number of layers in the model
    num_layers = len(normalized_vectors.keys())
    
    # Initialize a list to store accuracy results
    accuracy_results = []
    
    # Loop through all layers with a progress bar
    for layer in tqdm(range(1, num_layers), desc="Testing layers"):
        # Output file path for this layer
        output_file = os.path.join(output_dir, f"probe_stats_layer{layer}.csv")
        
        # Analyze the dataset with the current layer
        results = analyze_dataset_with_probe(
            model=model,
            steering_vectors=normalized_vectors,
            layer=layer,
            dataset=test_data,
            tokenizer=tokenizer,
            output_file=output_file
        )
        
    return

In [None]:
def calculate_auroc_from_csv(file_path):
    """Calculate AUROC for a single CSV file"""
    try:
        # Load the CSV file
        df = pd.read_csv(file_path)
        
        # Check if the required columns exist
        required_columns = ['type', 'mean_score']
        if not all(col in df.columns for col in required_columns):
            return None, "Missing required columns (type and/or mean_score)"
        
        # Prepare data for ROC calculation
        # 1 for deployment (positive class), 0 for testing (negative class)
        y_true = (df['type'] == 'deployment').astype(int)
        
        # Use mean_score as the prediction score
        y_score = df['mean_score']
        
        # Calculate ROC curve points
        fpr, tpr, thresholds = roc_curve(y_true, y_score)
        roc_auc = auc(fpr, tpr)
        
        # Calculate optimal threshold (Youden's J statistic)
        optimal_idx = np.argmax(tpr - fpr)
        optimal_threshold = thresholds[optimal_idx]
        sensitivity = tpr[optimal_idx]
        specificity = 1-fpr[optimal_idx]
        
        return {
            'auroc': roc_auc,
            'optimal_threshold': optimal_threshold,
            'sensitivity': sensitivity,
            'specificity': specificity
        }, None
    
    except Exception as e:
        return None, str(e)

In [None]:

def process_all_csvs_in_directory(directory_path):
    """Process all CSV files in the given directory"""
    results = {}
    
    # Check if directory exists
    if not os.path.isdir(directory_path):
        print(f"Error: '{directory_path}' is not a valid directory.")
        return results
    
    # Get list of CSV files
    csv_files = [f for f in os.listdir(directory_path) if f.endswith('.csv')]
    
    if not csv_files:
        print(f"No CSV files found in {directory_path}")
        return results
    
    print(f"Found {len(csv_files)} CSV files in {directory_path}")
    print(f"{'File Name':<40} {'AUROC':<10} {'Threshold':<10} {'Sensitivity':<12} {'Specificity':<12}")
    print("-" * 90)
    
    # Process each CSV file
    for file_name in csv_files:
        file_path = os.path.join(directory_path, file_name)
        stats, error = calculate_auroc_from_csv(file_path)
        
        if error:
            print(f"{file_name:<40} Error: {error}")
        else:
            auroc = stats['auroc']
            threshold = stats['optimal_threshold']
            sensitivity = stats['sensitivity']
            specificity = stats['specificity']
            
            print(f"{file_name:<40} {auroc:.4f}    {threshold:.4f}    {sensitivity:.4f}      {specificity:.4f}")
            results[file_name] = stats
    
    # Find and print the file with the highest AUROC
    if results:
        best_file = max(results.items(), key=lambda x: x[1]['auroc'])
        print("\nHighest AUROC:")
        print(f"File: {best_file[0]}, AUROC: {best_file[1]['auroc']:.4f}")
    
    return results

In [None]:
def load_vectors_from_dir(directory):
    """Load all vectors from a directory"""
    vectors = {}
    for filename in os.listdir(directory):
        if filename.startswith('layer_') and filename.endswith('.pt'):
            layer = int(filename.split('_')[1].split('.')[0])
            vectors[layer] = torch.load(os.path.join(directory, filename))
            print(vectors[layer])
    return vectors


# Run experiment

In [None]:
existing_dir = './LLAMA_70B_CONTRASTIVE_20250407_114049/' #20250216_233223
if existing_dir is not None:
    normalized_vectors = load_vectors_from_dir(os.path.join(existing_dir, 'normalized_vectors'))
    print(f"Loaded {len(normalized_vectors)} normalized vectors from {existing_dir}")

In [None]:
output_dir = 'LLAMA_70B_CONTRASTIVE_PROBE_RESULTS'
results = test_all_layers(
    model=model,
    normalized_vectors=normalized_vectors,
    test_data=test_data,
    tokenizer=tokenizer,
    output_dir="LLAMA_70B_CONTRASTIVE_PROBE_RESULTS"
)

In [None]:
auc_results = process_all_csvs_in_directory(output_dir)

# control 

In [36]:
# Original layer 23 probe
original = normalized_vectors[23]

# Calculate mean and std
mean = original.mean().item()
std = original.std().item()

# Generate 10 random probes with same shape, mean, and std
random_probes = {}
for i in range(80):  # for all layers 0-79
    # Generate same random probe for all layers
    random_vector = torch.randn_like(original) * std + mean
    random_probes[i] = random_vector.to(dtype=torch.bfloat16, device='cuda:0')

# This will give you 10 different random_probes dictionaries, each having the same
# random vector repeated across all layers

random_probes_list = []
for _ in range(10):
    probe = {}
    random_vector = torch.randn_like(original) * std + mean
    for i in range(80):
        probe[i] = random_vector.to(dtype=torch.bfloat16, device='cuda:0')
    random_probes_list.append(probe)

In [40]:
output_dir = 'LLAMA_70B_CONTRASTIVE_PROBE_CONTROL_RESULTS'
results = test_all_layers(
    model=model,
    normalized_vectors=random_probes,
    test_data=test_data,
    tokenizer=tokenizer,
    output_dir="LLAMA_70B_CONTRASTIVE_PROBE_CONTROL_RESULTS"
)

Testing layers:   0%|          | 0/79 [00:00<?, ?it/s]
[Alyzing questions:   0%|          | 0/120 [00:00<?, ?it/s]
[Alyzing questions:   2%|▏         | 2/120 [00:00<00:14,  8.30it/s]
[Alyzing questions:   3%|▎         | 4/120 [00:00<00:11,  9.76it/s]
[Alyzing questions:   4%|▍         | 5/120 [00:00<00:15,  7.64it/s]
[Alyzing questions:   6%|▌         | 7/120 [00:00<00:12,  9.16it/s]
[Alyzing questions:   8%|▊         | 9/120 [00:00<00:11, 10.08it/s]
[Alyzing questions:   9%|▉         | 11/120 [00:01<00:14,  7.77it/s]

Processed 10/120 questions



[Alyzing questions:  10%|█         | 12/120 [00:01<00:19,  5.56it/s]
[Alyzing questions:  12%|█▏        | 14/120 [00:01<00:15,  6.79it/s]
[Alyzing questions:  13%|█▎        | 16/120 [00:02<00:13,  7.84it/s]
[Alyzing questions:  15%|█▌        | 18/120 [00:02<00:12,  8.05it/s]
[Alyzing questions:  17%|█▋        | 20/120 [00:02<00:11,  8.88it/s]
[Alyzing questions:  18%|█▊        | 22/120 [00:02<00:10,  9.70it/s]

Processed 20/120 questions



[Alyzing questions:  20%|██        | 24/120 [00:02<00:09, 10.21it/s]
[Alyzing questions:  22%|██▏       | 26/120 [00:02<00:08, 10.57it/s]
[Alyzing questions:  23%|██▎       | 28/120 [00:03<00:08, 10.79it/s]
[Alyzing questions:  25%|██▌       | 30/120 [00:03<00:08, 10.78it/s]
[Alyzing questions:  27%|██▋       | 32/120 [00:03<00:07, 11.09it/s]

Processed 30/120 questions



[Alyzing questions:  28%|██▊       | 34/120 [00:03<00:07, 11.29it/s]
[Alyzing questions:  30%|███       | 36/120 [00:03<00:08,  9.53it/s]
[Alyzing questions:  32%|███▏      | 38/120 [00:04<00:08,  9.43it/s]
[Alyzing questions:  32%|███▎      | 39/120 [00:04<00:08,  9.03it/s]
[Alyzing questions:  33%|███▎      | 40/120 [00:04<00:08,  9.16it/s]
[Alyzing questions:  35%|███▌      | 42/120 [00:04<00:07,  9.99it/s]

Processed 40/120 questions



[Alyzing questions:  37%|███▋      | 44/120 [00:04<00:08,  9.16it/s]
[Alyzing questions:  38%|███▊      | 46/120 [00:05<00:09,  8.01it/s]
[Alyzing questions:  39%|███▉      | 47/120 [00:05<00:10,  7.29it/s]
[Alyzing questions:  41%|████      | 49/120 [00:05<00:08,  8.38it/s]
[Alyzing questions:  42%|████▎     | 51/120 [00:05<00:07,  9.19it/s]

Processed 50/120 questions



[Alyzing questions:  44%|████▍     | 53/120 [00:05<00:06,  9.87it/s]
[Alyzing questions:  46%|████▌     | 55/120 [00:06<00:07,  9.20it/s]
[Alyzing questions:  48%|████▊     | 57/120 [00:06<00:06,  9.63it/s]
[Alyzing questions:  48%|████▊     | 58/120 [00:06<00:06,  9.67it/s]
[Alyzing questions:  50%|█████     | 60/120 [00:06<00:05, 10.14it/s]

Processed 60/120 questions



[Alyzing questions:  52%|█████▏    | 62/120 [00:06<00:06,  9.33it/s]
[Alyzing questions:  52%|█████▎    | 63/120 [00:06<00:06,  8.83it/s]
[Alyzing questions:  53%|█████▎    | 64/120 [00:07<00:07,  7.68it/s]
[Alyzing questions:  55%|█████▌    | 66/120 [00:07<00:07,  7.68it/s]
[Alyzing questions:  57%|█████▋    | 68/120 [00:07<00:06,  8.53it/s]
[Alyzing questions:  58%|█████▊    | 70/120 [00:07<00:05,  9.15it/s]
[Alyzing questions:  60%|██████    | 72/120 [00:07<00:05,  9.58it/s]

Processed 70/120 questions



[Alyzing questions:  62%|██████▏   | 74/120 [00:08<00:04,  9.95it/s]
[Alyzing questions:  63%|██████▎   | 76/120 [00:08<00:04, 10.21it/s]
[Alyzing questions:  65%|██████▌   | 78/120 [00:08<00:04, 10.47it/s]
[Alyzing questions:  67%|██████▋   | 80/120 [00:09<00:06,  6.42it/s]
[Alyzing questions:  68%|██████▊   | 82/120 [00:09<00:05,  7.36it/s]

Processed 80/120 questions



[Alyzing questions:  70%|███████   | 84/120 [00:09<00:04,  8.12it/s]
[Alyzing questions:  71%|███████   | 85/120 [00:09<00:05,  6.22it/s]
[Alyzing questions:  72%|███████▎  | 87/120 [00:10<00:04,  7.27it/s]
[Alyzing questions:  74%|███████▍  | 89/120 [00:10<00:03,  8.02it/s]
[Alyzing questions:  76%|███████▌  | 91/120 [00:10<00:03,  8.80it/s]

Processed 90/120 questions



[Alyzing questions:  78%|███████▊  | 93/120 [00:10<00:03,  8.58it/s]
[Alyzing questions:  79%|███████▉  | 95/120 [00:10<00:02,  9.32it/s]
[Alyzing questions:  81%|████████  | 97/120 [00:10<00:02, 10.00it/s]
[Alyzing questions:  82%|████████▎ | 99/120 [00:11<00:02, 10.48it/s]

Processed 100/120 questions



[Alyzing questions:  84%|████████▍ | 101/120 [00:11<00:02,  7.47it/s]
[Alyzing questions:  86%|████████▌ | 103/120 [00:11<00:02,  7.22it/s]
[Alyzing questions:  88%|████████▊ | 105/120 [00:12<00:01,  8.04it/s]
[Alyzing questions:  88%|████████▊ | 106/120 [00:12<00:01,  7.57it/s]
[Alyzing questions:  90%|█████████ | 108/120 [00:12<00:01,  8.32it/s]
[Alyzing questions:  92%|█████████▏| 110/120 [00:12<00:01,  9.00it/s]
[Alyzing questions:  93%|█████████▎| 112/120 [00:12<00:00,  9.55it/s]

Processed 110/120 questions



[Alyzing questions:  95%|█████████▌| 114/120 [00:12<00:00,  9.97it/s]
[Alyzing questions:  97%|█████████▋| 116/120 [00:13<00:00, 10.34it/s]
[Alyzing questions:  98%|█████████▊| 118/120 [00:13<00:00,  8.97it/s]
Analyzing questions: 100%|██████████| 120/120 [00:13<00:00,  8.81it/s]
Testing layers:   1%|▏         | 1/79 [00:13<17:43, 13.64s/it]

Processed 120/120 questions
Results saved to LLAMA_70B_CONTRASTIVE_PROBE_CONTROL_RESULTS/probe_stats_layer1.csv



[Alyzing questions:   0%|          | 0/120 [00:00<?, ?it/s]
[Alyzing questions:   2%|▏         | 2/120 [00:00<00:14,  8.42it/s]
[Alyzing questions:   3%|▎         | 4/120 [00:00<00:11, 10.08it/s]
[Alyzing questions:   5%|▌         | 6/120 [00:00<00:13,  8.36it/s]
[Alyzing questions:   7%|▋         | 8/120 [00:00<00:11,  9.45it/s]
[Alyzing questions:   8%|▊         | 10/120 [00:01<00:12,  8.90it/s]

Processed 10/120 questions



[Alyzing questions:   9%|▉         | 11/120 [00:01<00:14,  7.53it/s]
[Alyzing questions:  10%|█         | 12/120 [00:01<00:20,  5.35it/s]
[Alyzing questions:  12%|█▏        | 14/120 [00:01<00:15,  6.69it/s]
[Alyzing questions:  13%|█▎        | 16/120 [00:02<00:13,  7.77it/s]
[Alyzing questions:  15%|█▌        | 18/120 [00:02<00:12,  8.00it/s]
[Alyzing questions:  17%|█▋        | 20/120 [00:02<00:11,  8.88it/s]
[Alyzing questions:  18%|█▊        | 22/120 [00:02<00:10,  9.70it/s]

Processed 20/120 questions



[Alyzing questions:  20%|██        | 24/120 [00:02<00:09, 10.18it/s]
[Alyzing questions:  22%|██▏       | 26/120 [00:02<00:08, 10.56it/s]
[Alyzing questions:  23%|██▎       | 28/120 [00:03<00:08, 10.79it/s]
[Alyzing questions:  25%|██▌       | 30/120 [00:03<00:08, 10.76it/s]
[Alyzing questions:  27%|██▋       | 32/120 [00:03<00:07, 11.07it/s]

Processed 30/120 questions



[Alyzing questions:  28%|██▊       | 34/120 [00:03<00:07, 11.32it/s]
[Alyzing questions:  30%|███       | 36/120 [00:03<00:08,  9.54it/s]
[Alyzing questions:  32%|███▏      | 38/120 [00:04<00:08,  9.49it/s]
[Alyzing questions:  32%|███▎      | 39/120 [00:04<00:08,  9.04it/s]
[Alyzing questions:  33%|███▎      | 40/120 [00:04<00:08,  9.20it/s]
[Alyzing questions:  35%|███▌      | 42/120 [00:04<00:07,  9.97it/s]

Processed 40/120 questions



[Alyzing questions:  37%|███▋      | 44/120 [00:04<00:08,  9.19it/s]
[Alyzing questions:  38%|███▊      | 46/120 [00:05<00:09,  8.04it/s]
[Alyzing questions:  39%|███▉      | 47/120 [00:05<00:09,  7.34it/s]
[Alyzing questions:  41%|████      | 49/120 [00:05<00:08,  8.43it/s]
[Alyzing questions:  42%|████▎     | 51/120 [00:05<00:07,  9.27it/s]

Processed 50/120 questions



[Alyzing questions:  44%|████▍     | 53/120 [00:05<00:06,  9.83it/s]
[Alyzing questions:  46%|████▌     | 55/120 [00:06<00:07,  9.07it/s]
[Alyzing questions:  47%|████▋     | 56/120 [00:06<00:06,  9.18it/s]
[Alyzing questions:  48%|████▊     | 58/120 [00:06<00:06,  9.45it/s]
[Alyzing questions:  50%|█████     | 60/120 [00:06<00:06,  9.78it/s]
[Alyzing questions:  51%|█████     | 61/120 [00:06<00:06,  8.77it/s]

Processed 60/120 questions



[Alyzing questions:  52%|█████▎    | 63/120 [00:07<00:06,  8.75it/s]
[Alyzing questions:  53%|█████▎    | 64/120 [00:07<00:07,  7.65it/s]
[Alyzing questions:  55%|█████▌    | 66/120 [00:07<00:07,  7.70it/s]
[Alyzing questions:  57%|█████▋    | 68/120 [00:07<00:06,  8.52it/s]
[Alyzing questions:  58%|█████▊    | 70/120 [00:07<00:05,  9.15it/s]
[Alyzing questions:  60%|██████    | 72/120 [00:08<00:05,  9.56it/s]

Processed 70/120 questions



[Alyzing questions:  62%|██████▏   | 74/120 [00:08<00:04,  9.94it/s]
[Alyzing questions:  63%|██████▎   | 76/120 [00:08<00:04, 10.21it/s]
[Alyzing questions:  65%|██████▌   | 78/120 [00:08<00:04, 10.44it/s]
[Alyzing questions:  67%|██████▋   | 80/120 [00:09<00:06,  6.36it/s]
[Alyzing questions:  68%|██████▊   | 82/120 [00:09<00:05,  7.30it/s]

Processed 80/120 questions



[Alyzing questions:  70%|███████   | 84/120 [00:09<00:04,  8.08it/s]
[Alyzing questions:  71%|███████   | 85/120 [00:09<00:05,  6.19it/s]
[Alyzing questions:  72%|███████▏  | 86/120 [00:09<00:05,  6.70it/s]
[Alyzing questions:  73%|███████▎  | 88/120 [00:10<00:04,  7.80it/s]
[Alyzing questions:  74%|███████▍  | 89/120 [00:10<00:03,  8.05it/s]
[Alyzing questions:  76%|███████▌  | 91/120 [00:10<00:03,  8.93it/s]

Processed 90/120 questions



[Alyzing questions:  78%|███████▊  | 93/120 [00:10<00:03,  8.54it/s]
[Alyzing questions:  79%|███████▉  | 95/120 [00:10<00:02,  9.25it/s]
[Alyzing questions:  81%|████████  | 97/120 [00:11<00:02,  9.82it/s]
[Alyzing questions:  82%|████████▎ | 99/120 [00:11<00:02, 10.19it/s]

Processed 100/120 questions



[Alyzing questions:  84%|████████▍ | 101/120 [00:11<00:02,  7.22it/s]
[Alyzing questions:  86%|████████▌ | 103/120 [00:11<00:02,  7.06it/s]
[Alyzing questions:  88%|████████▊ | 105/120 [00:12<00:01,  7.88it/s]
[Alyzing questions:  88%|████████▊ | 106/120 [00:12<00:01,  7.45it/s]
[Alyzing questions:  90%|█████████ | 108/120 [00:12<00:01,  8.22it/s]
[Alyzing questions:  92%|█████████▏| 110/120 [00:12<00:01,  8.91it/s]
[Alyzing questions:  93%|█████████▎| 112/120 [00:12<00:00,  9.50it/s]

Processed 110/120 questions



[Alyzing questions:  95%|█████████▌| 114/120 [00:13<00:00,  9.94it/s]
[Alyzing questions:  97%|█████████▋| 116/120 [00:13<00:00, 10.31it/s]
[Alyzing questions:  98%|█████████▊| 118/120 [00:13<00:00,  8.93it/s]
Analyzing questions: 100%|██████████| 120/120 [00:13<00:00,  8.73it/s]
Testing layers:   3%|▎         | 2/79 [00:27<17:35, 13.71s/it]

Processed 120/120 questions
Results saved to LLAMA_70B_CONTRASTIVE_PROBE_CONTROL_RESULTS/probe_stats_layer2.csv



[Alyzing questions:   0%|          | 0/120 [00:00<?, ?it/s]
[Alyzing questions:   2%|▏         | 2/120 [00:00<00:14,  8.17it/s]
[Alyzing questions:   3%|▎         | 4/120 [00:00<00:12,  9.58it/s]
[Alyzing questions:   4%|▍         | 5/120 [00:00<00:15,  7.34it/s]
[Alyzing questions:   6%|▌         | 7/120 [00:00<00:12,  8.74it/s]
[Alyzing questions:   8%|▊         | 9/120 [00:00<00:11,  9.56it/s]
[Alyzing questions:   8%|▊         | 10/120 [00:01<00:13,  8.37it/s]

Processed 10/120 questions



[Alyzing questions:   9%|▉         | 11/120 [00:01<00:15,  7.06it/s]
[Alyzing questions:  10%|█         | 12/120 [00:01<00:21,  4.95it/s]
[Alyzing questions:  12%|█▏        | 14/120 [00:01<00:16,  6.35it/s]
[Alyzing questions:  12%|█▎        | 15/120 [00:02<00:15,  6.93it/s]
[Alyzing questions:  14%|█▍        | 17/120 [00:02<00:12,  8.12it/s]
[Alyzing questions:  15%|█▌        | 18/120 [00:02<00:13,  7.65it/s]
[Alyzing questions:  17%|█▋        | 20/120 [00:02<00:11,  8.61it/s]
[Alyzing questions:  18%|█▊        | 22/120 [00:02<00:10,  9.44it/s]

Processed 20/120 questions



[Alyzing questions:  20%|██        | 24/120 [00:02<00:09,  9.87it/s]
[Alyzing questions:  22%|██▏       | 26/120 [00:03<00:09, 10.19it/s]
[Alyzing questions:  23%|██▎       | 28/120 [00:03<00:08, 10.37it/s]
[Alyzing questions:  25%|██▌       | 30/120 [00:03<00:08, 10.28it/s]
[Alyzing questions:  27%|██▋       | 32/120 [00:03<00:08, 10.55it/s]

Processed 30/120 questions



[Alyzing questions:  28%|██▊       | 34/120 [00:03<00:07, 10.75it/s]
[Alyzing questions:  30%|███       | 36/120 [00:04<00:09,  9.20it/s]
[Alyzing questions:  32%|███▏      | 38/120 [00:04<00:09,  9.09it/s]
[Alyzing questions:  32%|███▎      | 39/120 [00:04<00:09,  8.72it/s]
[Alyzing questions:  33%|███▎      | 40/120 [00:04<00:09,  8.84it/s]
[Alyzing questions:  35%|███▌      | 42/120 [00:04<00:08,  9.55it/s]

Processed 40/120 questions



[Alyzing questions:  36%|███▌      | 43/120 [00:04<00:08,  8.61it/s]
[Alyzing questions:  37%|███▋      | 44/120 [00:05<00:08,  8.76it/s]
[Alyzing questions:  38%|███▊      | 46/120 [00:05<00:09,  7.51it/s]
[Alyzing questions:  39%|███▉      | 47/120 [00:05<00:10,  6.81it/s]
[Alyzing questions:  41%|████      | 49/120 [00:05<00:08,  7.95it/s]
[Alyzing questions:  42%|████▎     | 51/120 [00:05<00:07,  8.77it/s]

Processed 50/120 questions



[Alyzing questions:  44%|████▍     | 53/120 [00:06<00:07,  9.42it/s]
[Alyzing questions:  45%|████▌     | 54/120 [00:06<00:07,  8.39it/s]
[Alyzing questions:  47%|████▋     | 56/120 [00:06<00:07,  9.05it/s]
[Alyzing questions:  48%|████▊     | 58/120 [00:06<00:06,  9.33it/s]
[Alyzing questions:  50%|█████     | 60/120 [00:06<00:06,  9.70it/s]
[Alyzing questions:  51%|█████     | 61/120 [00:07<00:06,  8.74it/s]

Processed 60/120 questions



[Alyzing questions:  52%|█████▎    | 63/120 [00:07<00:06,  8.70it/s]
[Alyzing questions:  53%|█████▎    | 64/120 [00:07<00:07,  7.69it/s]
[Alyzing questions:  55%|█████▌    | 66/120 [00:07<00:07,  7.69it/s]
[Alyzing questions:  57%|█████▋    | 68/120 [00:07<00:06,  8.53it/s]
[Alyzing questions:  58%|█████▊    | 70/120 [00:08<00:05,  9.12it/s]
[Alyzing questions:  60%|██████    | 72/120 [00:08<00:05,  9.53it/s]

Processed 70/120 questions



[Alyzing questions:  62%|██████▏   | 74/120 [00:08<00:04,  9.91it/s]
[Alyzing questions:  63%|██████▎   | 76/120 [00:08<00:04, 10.13it/s]
[Alyzing questions:  65%|██████▌   | 78/120 [00:08<00:04, 10.34it/s]
[Alyzing questions:  67%|██████▋   | 80/120 [00:09<00:06,  6.33it/s]
[Alyzing questions:  68%|██████▊   | 82/120 [00:09<00:05,  7.25it/s]

Processed 80/120 questions



[Alyzing questions:  70%|███████   | 84/120 [00:09<00:04,  8.07it/s]
[Alyzing questions:  71%|███████   | 85/120 [00:10<00:05,  6.15it/s]
[Alyzing questions:  72%|███████▎  | 87/120 [00:10<00:04,  7.17it/s]
[Alyzing questions:  74%|███████▍  | 89/120 [00:10<00:03,  7.90it/s]
[Alyzing questions:  76%|███████▌  | 91/120 [00:10<00:03,  8.68it/s]

Processed 90/120 questions



[Alyzing questions:  78%|███████▊  | 93/120 [00:10<00:03,  8.44it/s]
[Alyzing questions:  79%|███████▉  | 95/120 [00:11<00:02,  9.06it/s]
[Alyzing questions:  81%|████████  | 97/120 [00:11<00:02,  9.63it/s]
[Alyzing questions:  82%|████████▎ | 99/120 [00:11<00:02, 10.02it/s]

Processed 100/120 questions



[Alyzing questions:  84%|████████▍ | 101/120 [00:11<00:02,  7.19it/s]
[Alyzing questions:  86%|████████▌ | 103/120 [00:12<00:02,  7.03it/s]
[Alyzing questions:  88%|████████▊ | 105/120 [00:12<00:01,  7.84it/s]
[Alyzing questions:  88%|████████▊ | 106/120 [00:12<00:01,  7.39it/s]
[Alyzing questions:  90%|█████████ | 108/120 [00:12<00:01,  8.17it/s]
[Alyzing questions:  92%|█████████▏| 110/120 [00:13<00:01,  8.88it/s]
[Alyzing questions:  93%|█████████▎| 112/120 [00:13<00:00,  9.45it/s]

Processed 110/120 questions



[Alyzing questions:  95%|█████████▌| 114/120 [00:13<00:00,  9.91it/s]
[Alyzing questions:  97%|█████████▋| 116/120 [00:13<00:00, 10.27it/s]
[Alyzing questions:  98%|█████████▊| 118/120 [00:13<00:00,  8.87it/s]
Analyzing questions: 100%|██████████| 120/120 [00:14<00:00,  8.55it/s]
Testing layers:   4%|▍         | 3/79 [00:41<17:33, 13.86s/it]

Processed 120/120 questions
Results saved to LLAMA_70B_CONTRASTIVE_PROBE_CONTROL_RESULTS/probe_stats_layer3.csv



[Alyzing questions:   0%|          | 0/120 [00:00<?, ?it/s]
[Alyzing questions:   2%|▏         | 2/120 [00:00<00:14,  8.01it/s]
[Alyzing questions:   3%|▎         | 4/120 [00:00<00:12,  9.51it/s]
[Alyzing questions:   4%|▍         | 5/120 [00:00<00:15,  7.30it/s]
[Alyzing questions:   6%|▌         | 7/120 [00:00<00:12,  8.74it/s]
[Alyzing questions:   8%|▊         | 9/120 [00:01<00:11,  9.56it/s]
[Alyzing questions:   8%|▊         | 10/120 [00:01<00:13,  8.35it/s]

Processed 10/120 questions



[Alyzing questions:   9%|▉         | 11/120 [00:01<00:15,  7.04it/s]
[Alyzing questions:  10%|█         | 12/120 [00:01<00:21,  4.93it/s]
[Alyzing questions:  12%|█▏        | 14/120 [00:01<00:16,  6.32it/s]
Analyzing questions:  12%|█▎        | 15/120 [00:02<00:14,  7.11it/s]
Testing layers:   4%|▍         | 3/79 [00:43<18:23, 14.52s/it]

KeyboardInterrupt



In [41]:
auc_results = process_all_csvs_in_directory(output_dir)

Found 10 CSV files in LLAMA_70B_CONTRASTIVE_PROBE_CONTROL_RESULTS
File Name                                AUROC      Threshold  Sensitivity  Specificity 
------------------------------------------------------------------------------------------
probe_stats_layer1.csv                   0.4302    -0.0104    1.0000      0.0862
probe_stats_layer2.csv                   0.5687    0.0187    0.7419      0.4828
probe_stats_layer3.csv                   0.6541    0.0097    0.4839      0.8276
probe_stats_layer4.csv                   0.6452    0.0238    0.4516      0.8621
probe_stats_layer5.csv                   0.5623    0.0107    0.9516      0.1897
probe_stats_layer6.csv                   0.4839    -0.1045    0.7742      0.3276
probe_stats_layer7.csv                   0.4625    0.1219    0.1774      0.9655
probe_stats_layer8.csv                   0.5854    -0.0444    0.8387      0.3793
probe_stats_layer9.csv                   0.7264    -0.0866    0.7419      0.7069
probe_stats_layer10.csv       

In [51]:
import numpy as np
from scipy import stats

def compare_auc(auc1, auc2, n1, alpha=0.05):
    """
    Compare two AUCs using proper standard error calculation

    Args:
        auc1, auc2: The two AUCs to compare
        n1: Sample size
        alpha: Significance level
    """
    # Standard error calculation
    se1 = np.sqrt((auc1 * (1-auc1)) / (n1/4))
    se2 = np.sqrt((auc2 * (1-auc2)) / (n1/4))

    # Standard error of the difference
    se_diff = np.sqrt(se1**2 + se2**2)

    # Z-score
    z = (auc1 - auc2) / se_diff

    # Two-tailed p-value
    p_value = 2 * (1 - stats.norm.cdf(abs(z)))

    return z, p_value


# Test cases
test_cases = [
    (0.5, 0.9999, 400000),  #  test case
    (0.6, 0.7, 100),  # small difference, small n
    (0.6, 0.7, 10000),  # small difference, large n
]

for auc1, auc2, n in test_cases:
    z, p = compare_auc(auc1, auc2, n)
    print(f"\nComparing AUC {auc1} vs {auc2} with n={n}:")
    print(f"Z-statistic: {z:.3f}")
    print(f"P-value: {p:.10f}")
    print(f"Significant at α=0.05: {p < 0.05}")


Comparing AUC 0.5 vs 0.9999 with n=400000:
Z-statistic: -316.101
P-value: 0.0000000000
Significant at α=0.05: True

Comparing AUC 0.6 vs 0.7 with n=100:
Z-statistic: -0.745
P-value: 0.4560565403
Significant at α=0.05: False

Comparing AUC 0.6 vs 0.7 with n=10000:
Z-statistic: -7.454
P-value: 0.0000000000
Significant at α=0.05: True


In [52]:
z, p = compare_auc(0.821, 0.602, 400)
p

0.0004276610332378805