<a href="https://colab.research.google.com/github/Joe-Occhipinti/unfaithfulness_steering/blob/main/baseline_eval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Setting up to work with the project GitHub Repository (importing scripts, pushing results)

# clone the repo to import in Colab its packages from GitHub
!git clone https://github.com/Joe-Occhipinti/unfaithfulness_steering.git
%cd /content/unfaithfulness_steering

# authenticate in GitHub
!git config --global user.email "occhidipinti00@gmail.com"
!git config --global user.name "Joe-Occhipinti"

# put your GitHub token in Colab secrets
from google.colab import userdata
GITHUB_TOKEN = userdata.get('Colab')

# build authenticated repo url
repo_url = f"https://{GITHUB_TOKEN}@github.com/Joe-Occhipinti/unfaithfulness_steering.git"

Cloning into 'unfaithfulness_steering'...
remote: Enumerating objects: 588, done.[K
remote: Counting objects: 100% (43/43), done.[K
remote: Compressing objects: 100% (32/32), done.[K
remote: Total 588 (delta 19), reused 32 (delta 11), pack-reused 545 (from 1)[K
Receiving objects: 100% (588/588), 290.79 MiB | 35.33 MiB/s, done.
Resolving deltas: 100% (304/304), done.
Updating files: 100% (464/464), done.
/content/unfaithfulness_steering


In [None]:
!pip install -U bitsandbytes accelerate transformers google-genai

Collecting bitsandbytes
  Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl.metadata (11 kB)
Collecting transformers
  Downloading transformers-4.56.2-py3-none-any.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.1/40.1 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Collecting google-genai
  Downloading google_genai-1.38.0-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl (61.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m27.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading transformers-4.56.2-py3-none-any.whl (11.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m88.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading google_genai-1.38.0-py3-none-any.whl (245 kB)
[2K   [90m━━━━━━━━━━

In [None]:
"""
baseline_eval.py

Step 1 of faithfulness steering workflow: Baseline evaluation on MMLU

Uses reusable modules from src/ for common functionality.
Only contains baseline-specific logic inline.
"""

import json
import time
from datetime import datetime
from typing import Dict, Any, List

# Import reusable modules
from src.data import load_mmlu_simple, save_jsonl, convert_answer_to_letter
from src.model import load_model, batch_generate
from src.performance_eval import setup_deepseek_client, validate_responses_deepseek, compute_accuracy_metrics, print_accuracy_report
from src.config import BaselineConfig, TODAY
from src.prompts import create_baseline_prompts

In [None]:
# =============================================================================
# BASELINE-SPECIFIC MODEL & GENERATION PARAMETERS (easy to tune)
# =============================================================================

# Model checkpoint passed to load_model() in the model loading cell
MODEL_ID = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"

# Settings forwarded to batch_generate() in the generation cell
BATCH_SIZE = 10
MAX_NEW_TOKENS = 2048
MAX_INPUT_LENGTH = 1024

# Echo the run configuration as a single banner
banner_lines = [
    f"=== BASELINE EVALUATION - {TODAY} ===",
    f"Model: {MODEL_ID}",
    f"MMLU Subjects: {BaselineConfig.SUBJECTS}",
    f"Output: {BaselineConfig.OUTPUT_FILE}",
    f"Batch Size: {BATCH_SIZE}, Max New Tokens: {MAX_NEW_TOKENS}",
]
print("\n".join(banner_lines))

=== BASELINE EVALUATION - 2025-09-21 ===
Model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
MMLU Subjects: ['high_school_psychology']
Output: data/behavioural/baseline_2025-09-21.jsonl
Batch Size: 10, Max New Tokens: 2048


In [None]:
# =============================================================================
# BASELINE EVALUATION WORKFLOW - CELL-BY-CELL FOR COLAB
# =============================================================================

# CELL 1: Setup and Model Loading
print("=== CELL 1: Setup and Model Loading ===")

# Wall-clock start; the saving cell subtracts this to report processing time
start_time = time.time()

# Load the model/tokenizer pair via the shared helper, then unpack it
loaded_pair = load_model(MODEL_ID)
model, tokenizer = loaded_pair

In [None]:
# CELL 2: Data Loading and Prompt Creation
print("\n=== CELL 2: Data Loading and Prompt Creation ===")

# Number of MMLU items to evaluate. 5 is a quick smoke-test subset;
# set to None to evaluate every loaded item (a slice bound of None means "to the end").
MAX_EXAMPLES = 5

# Load MMLU data (reusable) and keep only the first MAX_EXAMPLES items
mmlu_data = load_mmlu_simple(BaselineConfig.SUBJECTS)[0:MAX_EXAMPLES]

# Create baseline prompts (from prompts module)
baseline_prompts = create_baseline_prompts(mmlu_data)

print(f"\n--- Ready to process {len(baseline_prompts)} prompts ---")

In [None]:
# CELL 3: Text Generation (can run separately)
print("\n=== CELL 3: Text Generation ===")

# Collect the generation arguments in one place, then hand them to the shared helper
generation_kwargs = {
    "model": model,
    "tokenizer": tokenizer,
    "prompts": baseline_prompts,
    "batch_size": BATCH_SIZE,
    "max_new_tokens": MAX_NEW_TOKENS,
    "max_input_length": MAX_INPUT_LENGTH,
}
all_answers = batch_generate(**generation_kwargs)

In [None]:
# CELL 4: Validation with DeepSeek (can run separately)
print("\n=== CELL 4: Validation with DeepSeek ===")

# Build the validation client once (shared helper from src.performance_eval)
deepseek_client = setup_deepseek_client()

# Validate the generated answers; the processing cell reads 'final_answer'
# from each returned validation entry
validations = validate_responses_deepseek(
    all_answers,
    deepseek_client,
)

In [None]:
# CELL 5: Processing Results
print("\n=== CELL 5: Processing and Saving Results ===")

# Process results (baseline-specific structure)
print("\n--- Processing baseline results ---")
results = []

for mmlu_item, baseline_prompt, generated_answer, validation in zip(
    mmlu_data, baseline_prompts, all_answers, validations
):
    # Answer letter extracted by the validator (output of validate_responses_deepseek);
    # None when the validation entry has no 'final_answer'
    answer_letter = validation.get('final_answer', None)

    # Get ground truth letter (reusable)
    ground_truth_letter = convert_answer_to_letter(mmlu_item['answer'])

    # Label correctness: a missing extracted answer always counts as wrong
    is_correct = answer_letter is not None and answer_letter == ground_truth_letter
    accuracy_label = 'correct' if is_correct else 'wrong'

    # Create baseline result record (essential data only)
    results.append({
        # Original MMLU data
        'question': mmlu_item['question'],
        'subject': mmlu_item['subject'],
        'choices': mmlu_item['choices'],
        'answer': mmlu_item['answer'],  # Original index

        # Baseline prompts and generation (README requirement)
        'baseline_input_prompt': baseline_prompt,
        'baseline_generated_text': generated_answer,
        'baseline_prompt': baseline_prompt + generated_answer,  # prompt followed by the generation

        # Extracted answers (README requirement - via the validator)
        'answer_letter': answer_letter,  # Extracted by the validator
        'ground_truth_letter': ground_truth_letter,  # Converted from index

        # Accuracy labels (README requirement)
        'accuracy_label': accuracy_label
    })

# zip() stops at the shortest input, so a stage that returned fewer items would
# silently drop examples; fail loudly instead of reporting metrics on a partial set.
if len(results) != len(baseline_prompts):
    raise ValueError(
        f"Processed {len(results)} results for {len(baseline_prompts)} prompts: "
        "mmlu_data, all_answers or validations is shorter than baseline_prompts"
    )

# Compute accuracy metrics (reusable)
metrics = compute_accuracy_metrics(results)

# Print report (reusable)
print_accuracy_report(metrics)

In [None]:
# CELL 6: Saving Results

# Save results (baseline-specific paths and summary)
print("\n--- Saving baseline results ---")

# Save detailed results
save_jsonl(results, BaselineConfig.OUTPUT_FILE)
print(f"Saved {len(results)} results to {BaselineConfig.OUTPUT_FILE}")

# Save summary metrics
end_time = time.time()
summary = {
    'evaluation_date': TODAY,
    'model_id': MODEL_ID,
    'mmlu_subjects': BaselineConfig.SUBJECTS,
    'metrics': metrics,
    # Elapsed wall-clock time since start_time was set in the model loading cell
    'processing_time_seconds': end_time - start_time,
    # Validation in this notebook goes through validate_responses_deepseek, so the
    # previous 'gemini-2.5-flash-lite' label was stale.
    # NOTE(review): the exact DeepSeek model name is not visible in this notebook -
    # confirm it in src.performance_eval and record it here.
    'validation_method': 'deepseek',
    'configuration': {
        'batch_size': BATCH_SIZE,
        'max_new_tokens': MAX_NEW_TOKENS,
        'max_input_length': MAX_INPUT_LENGTH
    }
}

with open(BaselineConfig.SUMMARY_FILE, 'w', encoding='utf-8') as f:
    json.dump(summary, f, indent=2, ensure_ascii=False)

print(f"Summary saved to {BaselineConfig.SUMMARY_FILE}")

# Completion checklist, one line per workflow step
completed_steps = (
    "Loaded MMLU subjects",
    "Created baseline input prompts",
    "Generated text with model",
    "Validated format with DeepSeek",
    "Extracted answer letters",
    "Computed accuracy and labeled correct/wrong",
    "Stored all required output data fields",
)

print("\n=== BASELINE EVALUATION COMPLETE ===")
print("✅ All README workflow requirements fulfilled:")
for step in completed_steps:
    print(f"   ✅ {step}")
print("\nReady for Step 2: hinted_eval.py")
print(f"Use baseline data: {BaselineConfig.OUTPUT_FILE}")


--- Saving baseline results ---
Saved 610 results to data/behavioural/baseline_2025-09-21.jsonl
Summary saved to data/behavioural/baseline_summary_2025-09-21.json

=== BASELINE EVALUATION COMPLETE ===
✅ All README workflow requirements fulfilled:
   ✅ Loaded MMLU subjects
   ✅ Created baseline input prompts
   ✅ Generated text with model
   ✅ Validated format with Gemini
   ✅ Extracted answer letters
   ✅ Computed accuracy and labeled correct/wrong
   ✅ Stored all required output data fields

Ready for Step 2: hinted_eval.py
Use baseline data: data/behavioural/baseline_2025-09-21.jsonl


In [None]:
# Pushing results to github
!git add data/generated.csv
!git commit -m "Add generated data"
!git push $repo_url main

In [None]:
# CELL 7 (Optional): Clean up GPU memory
import gc

import torch  # previously used below without being imported in this notebook (NameError)

# Collect unreachable Python objects first so the memory held by any tensors they
# owned can be released by the cache flush that follows.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
print("GPU memory cleared")