# SemEval 2025 Unlearning Challenge - Evaluation

In [None]:
import os
import json
import subprocess
import pandas as pd
from pathlib import Path
from datasets import load_dataset
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm
import numpy as np
from datetime import datetime

## Configuration

Set up default paths and parameters for evaluation.

In [None]:
# --- Paths -----------------------------------------------------------------
DEFAULT_MODEL_PATH = "./models/best_model"  # Point this at the model to evaluate
DATA_PATH = "./data/"  # SemEval unlearning data (parquet files)
MIA_DATA_PATH = "./data/mia_data/"  # Membership-inference attack data (JSONL files)
PROCESSED_DATA_PATH = "./processed_data/"  # Destination for converted JSONL files
OUTPUT_DIR = "./evaluation_results"
MMLU_OUTPUT_DIR = "./mmlu_results"

# --- Evaluation parameters -------------------------------------------------
SEED = 42
BATCH_SIZE = 8  # Tune to the available GPU memory
MAX_NEW_TOKENS = 256

# Echo the most important settings so they are visible in the cell output.
print(
    "Configuration set up:",
    f"Model path: {DEFAULT_MODEL_PATH}",
    f"Data path: {DATA_PATH}",
    f"Output directory: {OUTPUT_DIR}",
    sep="\n",
)

# Report any JSONL files already sitting in DATA_PATH.
if not os.path.exists(DATA_PATH):
    print(f"Data directory not found: {DATA_PATH}")
else:
    jsonl_files = [name for name in os.listdir(DATA_PATH) if name.endswith('.jsonl')]
    print(f"Available JSONL files: {jsonl_files}")

In [None]:
# Optional shortcut: uncomment the block below to convert the parquet files
# immediately. NOTE: `parquet_ok` and `jsonl_ok` are assigned by the path
# verification cell further down, so that cell has to be run first.
# if parquet_ok and not jsonl_ok:
#     print("Converting parquet files to JSONL...")
#     convert_parquet_to_jsonl()
#     verify_converted_data()
#     print("Conversion completed!")
# else:
#     print("Parquet files not found or JSONL files already exist.")

print(
    "Ready for data conversion!",
    "Uncomment the code above or run convert_parquet_to_jsonl() to convert data.",
    sep="\n",
)

## Verify Data and Model Paths

In [None]:
def _report_paths(named_paths):
    """Print a ✓/✗ line per ``label -> path`` entry; return True if all exist."""
    all_exist = True
    for name, path in named_paths.items():
        if os.path.exists(path):
            print(f"✓ {name}: {path}")
        else:
            print(f"✗ {name}: {path} [NOT FOUND]")
            all_exist = False
    return all_exist


def verify_paths():
    """Check that the model, data and MIA paths exist and report what is missing.

    Three groups are checked and printed: the basic paths (model, data, MIA
    data), the four source parquet files under ``DATA_PATH`` and the two
    processed JSONL files under ``PROCESSED_DATA_PATH``. A short hint about
    the next step is printed afterwards.

    Returns:
        tuple[bool, bool, bool]: ``(all_good, parquet_available,
        jsonl_available)`` -- every basic path exists, all parquet files
        exist, both processed JSONL files exist.
    """
    checks = {
        "Model path": DEFAULT_MODEL_PATH,
        "Data path": DATA_PATH,
        "MIA data path": MIA_DATA_PATH,
        "MIA member data": os.path.join(MIA_DATA_PATH, "member.jsonl"),
        "MIA nonmember data": os.path.join(MIA_DATA_PATH, "nonmember.jsonl"),
    }

    # Source data as distributed (parquet).
    parquet_checks = {
        "Forget train parquet": os.path.join(DATA_PATH, "forget_train-00000-of-00001.parquet"),
        "Forget validation parquet": os.path.join(DATA_PATH, "forget_validation-00000-of-00001.parquet"),
        "Retain train parquet": os.path.join(DATA_PATH, "retain_train-00000-of-00001.parquet"),
        "Retain validation parquet": os.path.join(DATA_PATH, "retain_validation-00000-of-00001.parquet"),
    }

    # Converted files that the evaluation step is pointed at.
    jsonl_checks = {
        "Processed forget data": os.path.join(PROCESSED_DATA_PATH, "forget.jsonl"),
        "Processed retain data": os.path.join(PROCESSED_DATA_PATH, "retain.jsonl"),
    }

    print("=== BASIC PATH VERIFICATION ===")
    all_good = _report_paths(checks)

    # "\n" (not "\\n") so a blank line is printed instead of a literal backslash-n.
    print("\n=== PARQUET FILES (SOURCE DATA) ===")
    parquet_available = _report_paths(parquet_checks)

    print("\n=== PROCESSED JSONL FILES (FOR EVALUATION) ===")
    jsonl_available = _report_paths(jsonl_checks)

    # The JSONL files are what the evaluation consumes, so their presence
    # decides readiness even when the parquet sources are gone.
    if jsonl_available and parquet_available:
        print("\n✅ Both parquet and JSONL files available. Ready for evaluation!")
    elif jsonl_available:
        print("\n✅ Processed JSONL files available (source parquet files not found). Ready for evaluation!")
    elif parquet_available:
        print("\n💡 Parquet files found but JSONL files missing.")
        print("   Run convert_parquet_to_jsonl() to create evaluation-ready files.")
    else:
        print("\n❌ Source parquet files missing. Cannot proceed with evaluation.")

    return all_good, parquet_available, jsonl_available

# Create every directory this notebook writes to before anything uses it.
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
Path(MMLU_OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
Path(PROCESSED_DATA_PATH).mkdir(parents=True, exist_ok=True)

# These three flags are read by later cells (conversion / evaluation pipeline).
paths_ok, parquet_ok, jsonl_ok = verify_paths()
# "\n" (not "\\n") so the summary is preceded by a blank line rather than a
# literal backslash-n.
print("\nSummary:")
print(f"- Basic paths OK: {paths_ok}")
print(f"- Parquet files available: {parquet_ok}")
print(f"- JSONL files available: {jsonl_ok}")
print(f"- Ready for evaluation: {jsonl_ok}")

## MMLU Evaluation

Run MMLU evaluation to assess general knowledge retention.

In [None]:
def _extract_choice_index(generated):
    """Return the 0-based index of the first standalone A-D letter, or None.

    Only a letter delimited by word boundaries counts as an answer, so the
    "A" inside "ANSWER" in "The answer is C" is not mistaken for choice A.
    Matching is case-insensitive (the text is upper-cased first).
    """
    import re  # local import: keeps this cell independent of the import cell

    match = re.search(r"\b([A-D])\b", generated.upper())
    if match is None:
        return None
    return ord(match.group(1)) - ord("A")


def evaluate_mmlu_subject(model, tokenizer, subject, max_samples=None):
    """Evaluate *model* on the test split of one MMLU subject.

    Each question is rendered as ``Question: ...`` followed by lettered
    choices and ``Answer:``; the model greedily generates up to 5 new tokens
    and the first standalone A-D letter in that continuation is taken as the
    prediction.

    Args:
        model: Causal LM exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching *model*.
        subject: Dataset configuration name passed to ``load_dataset``.
        max_samples: If truthy, only the first ``max_samples`` examples are
            evaluated.

    Returns:
        tuple: ``(accuracy, correct, total)``; ``accuracy`` is 0 when no
        example was evaluated.
    """
    # The code reads "question", "choices" and "answer" from each example and
    # compares "answer" with a 0-based choice index.
    dataset = load_dataset("hendrycks/test", subject)["test"]

    if max_samples:
        dataset = dataset.select(range(min(max_samples, len(dataset))))

    correct = 0
    total = 0

    for example in tqdm(dataset, desc=f"Evaluating {subject}"):
        question = example["question"]
        choices = example["choices"]
        correct_answer = example["answer"]

        # Format the question: one "<letter>. <choice>" line per option.
        option_lines = "".join(
            f"{chr(65 + i)}. {choice}\n" for i, choice in enumerate(choices)
        )
        prompt = f"Question: {question}\n{option_lines}Answer:"

        # Tokenize and generate
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=5,
                do_sample=False,
                pad_token_id=tokenizer.eos_token_id
            )

        # Keep only the continuation: drop the prompt tokens from the output.
        prompt_length = len(inputs["input_ids"][0])
        generated = tokenizer.decode(
            outputs[0][prompt_length:], skip_special_tokens=True
        ).strip()

        predicted_answer = _extract_choice_index(generated)

        # A missing prediction (None) never equals the reference index.
        if predicted_answer == correct_answer:
            correct += 1
        total += 1

    accuracy = correct / total if total > 0 else 0
    return accuracy, correct, total

def run_mmlu_evaluation(model_path, subjects=None, max_samples_per_subject=None):
    """Run MMLU on the given subjects and save the results as a JSON file.

    Args:
        model_path: Path or identifier passed to ``from_pretrained`` for both
            the model and the tokenizer.
        subjects: Iterable of MMLU subject names; ``None`` selects
            ``MMLU_SUBJECTS``.
        max_samples_per_subject: Optional per-subject cap forwarded to
            ``evaluate_mmlu_subject``.

    Returns:
        tuple: ``(results_file, results)``. ``results`` maps each subject to
        ``{"accuracy", "correct", "total"}`` (or ``{"error": ...}`` when that
        subject failed) and also holds ``"average_acc"``, ``"total_correct"``
        and ``"total_questions"`` computed over the subjects that succeeded.
    """
    if subjects is None:
        # NOTE(review): MMLU_SUBJECTS is not defined in the visible part of
        # this notebook -- confirm it exists before calling without `subjects`.
        subjects = MMLU_SUBJECTS

    print(f"Loading model from {model_path}...")
    # NOTE: trust_remote_code=True executes code shipped with the checkpoint;
    # only use it with model paths you trust.
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True
    )
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    results = {}
    total_correct = 0
    total_questions = 0

    for subject in subjects:
        # Best effort: one failing subject is recorded and must not abort the run.
        try:
            print(f"\nEvaluating {subject}...")
            accuracy, correct, total = evaluate_mmlu_subject(
                model, tokenizer, subject, max_samples_per_subject
            )
            results[subject] = {
                "accuracy": accuracy,
                "correct": correct,
                "total": total
            }
            total_correct += correct
            total_questions += total
            print(f"{subject}: {accuracy:.3f} ({correct}/{total})")
        except Exception as e:
            print(f"Error evaluating {subject}: {e}")
            results[subject] = {"error": str(e)}

    overall_accuracy = total_correct / total_questions if total_questions > 0 else 0
    results["average_acc"] = overall_accuracy
    results["total_correct"] = total_correct
    results["total_questions"] = total_questions

    # Save results. Create the directory here so the function does not depend
    # on another cell having created it.
    os.makedirs(MMLU_OUTPUT_DIR, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    results_file = os.path.join(MMLU_OUTPUT_DIR, f"mmlu_results_{timestamp}.json")
    with open(results_file, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)

    print(f"\nOverall MMLU accuracy: {overall_accuracy:.3f} ({total_correct}/{total_questions})")
    print(f"Results saved to {results_file}")

    return results_file, results

# Small subject subset for a fast smoke test of the MMLU pipeline.
QUICK_SUBJECTS = [
    "abstract_algebra",
    "anatomy",
    "astronomy",
    "business_ethics",
    "clinical_knowledge",
]

print(
    "Ready to run MMLU evaluation!",
    f"Quick test subjects: {QUICK_SUBJECTS}",
    "Run the next cell to start evaluation.",
    sep="\n",
)

In [None]:
# Default: no MMLU results file. This is assigned BEFORE the options below so
# that uncommenting one of them is not overwritten at the end of the cell.
# It can also be set manually to the path of an existing results file.
mmlu_results_file = None

# Run MMLU evaluation (uncomment to run)
# Choose one of the following options:

# Option 1: Quick test (5 subjects, 50 samples each)
# mmlu_results_file, mmlu_results = run_mmlu_evaluation(
#     DEFAULT_MODEL_PATH, 
#     subjects=QUICK_SUBJECTS, 
#     max_samples_per_subject=50
# )

# Option 2: Full evaluation (all subjects)
# mmlu_results_file, mmlu_results = run_mmlu_evaluation(DEFAULT_MODEL_PATH)

# Option 3: Manual specification
# mmlu_results_file, mmlu_results = run_mmlu_evaluation(
#     DEFAULT_MODEL_PATH,
#     subjects=["high_school_mathematics", "college_mathematics", "machine_learning"],
#     max_samples_per_subject=100
# )

print("Uncomment one of the options above to run MMLU evaluation.")

## Official SemEval Evaluation

Run the official evaluation script with proper parameters.

In [None]:
def run_official_evaluation(model_path, data_path, mia_data_path=None, mmlu_file=None, 
                           max_new_tokens=256, batch_size=8, debug=False, use_local_script=False):
    """Run the evaluation script in a subprocess and return its parsed results.

    Args:
        model_path: Passed to the script as ``--checkpoint_path``.
        data_path: Passed to the script as ``--data_path``.
        mia_data_path: Added as ``--mia_data_path`` only if the path exists.
        mmlu_file: Added as ``--mmlu_metrics_file_path`` only if the file exists.
        max_new_tokens: Passed as ``--max_new_tokens``.
        batch_size: Passed as ``--batch_size``.
        debug: When true, ``--debug`` is appended.
        use_local_script: When true and ``data/evaluate_generations.py``
            exists, that script is used; otherwise ``evaluation.py``.

    Returns:
        The JSON content of ``<OUTPUT_DIR>/evaluation_results.jsonl`` on
        success; ``None`` if the script fails, times out (2 hours), the
        results file is missing, or any other error occurs.
    """
    import sys  # local import: only needed to locate the running interpreter

    # Choose which evaluation script to use
    local_script = os.path.join("data", "evaluate_generations.py")
    if use_local_script and os.path.exists(local_script):
        script_path = local_script
        print("Using local evaluation script from data/ folder")
    else:
        script_path = "evaluation.py"
        print("Using root evaluation script")

    # Use the interpreter running this notebook so the script sees the same
    # environment; a bare "python" may resolve to a different installation.
    interpreter = sys.executable or "python"

    # Prepare command
    cmd = [
        interpreter, script_path,
        "--data_path", data_path,
        "--checkpoint_path", model_path,
        "--output_dir", OUTPUT_DIR,
        "--max_new_tokens", str(max_new_tokens),
        "--batch_size", str(batch_size),
        "--seed", str(SEED)
    ]

    # Optional inputs are only forwarded when they actually exist on disk.
    if mia_data_path and os.path.exists(mia_data_path):
        cmd.extend(["--mia_data_path", mia_data_path])

    if mmlu_file and os.path.exists(mmlu_file):
        cmd.extend(["--mmlu_metrics_file_path", mmlu_file])

    if debug:
        cmd.append("--debug")

    print("Running official evaluation with command:")
    print(" ".join(cmd))
    # "\n" (not "\\n") so real blank lines are printed.
    print("\nThis may take a while...\n")

    # Run the evaluation
    try:
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=7200)  # 2 hour timeout

        if result.returncode != 0:
            print(f"✗ Evaluation failed with return code {result.returncode}")
            print("STDERR:")
            print(result.stderr)
            print("STDOUT:")
            print(result.stdout)
            return None

        print("✓ Evaluation completed successfully!")
        print("STDOUT:")
        print(result.stdout)

        # Load results (the file is parsed as one JSON document).
        results_file = os.path.join(OUTPUT_DIR, "evaluation_results.jsonl")
        if not os.path.exists(results_file):
            print(f"Results file not found: {results_file}")
            return None
        with open(results_file, "r") as f:
            return json.load(f)

    except subprocess.TimeoutExpired:
        print("✗ Evaluation timed out after 2 hours")
        return None
    except Exception as e:
        # Top-level boundary of a notebook helper: report and return None.
        print(f"✗ Evaluation failed with error: {e}")
        return None

# Status message shown once the function above has been defined.
print(
    "Official evaluation function ready!",
    "Note: The function will automatically use processed JSONL files for evaluation.",
    sep="\n",
)

In [None]:
# Run the official evaluation
# ("\n" is used throughout instead of "\\n" so blank lines are printed rather
# than a literal backslash-n.)
print("=== AUTOMATED EVALUATION PIPELINE ===\n")

# Step 1: Convert parquet to JSONL if needed
# NOTE(review): convert_parquet_to_jsonl() and verify_converted_data() are not
# defined in the visible part of this notebook -- confirm the cell defining
# them has been run before this one.
if parquet_ok and not jsonl_ok:
    print("Step 1: Converting parquet files to JSONL format...")
    convert_parquet_to_jsonl()
    print("\nStep 1.5: Verifying converted data...")
    data_verification_ok = verify_converted_data()
    if not data_verification_ok:
        print("❌ Data conversion failed. Cannot proceed with evaluation.")
    else:
        print("✅ Data conversion successful!")
        jsonl_ok = True
elif jsonl_ok:
    print("Step 1: JSONL files already exist, skipping conversion.")
    data_verification_ok = verify_converted_data()
else:
    print("❌ No parquet files found. Cannot convert data.")
    data_verification_ok = False

# Step 2: Run evaluation if data is ready
if jsonl_ok and data_verification_ok:
    print("\n" + "="*50)
    print("Step 2: Starting official SemEval 2025 evaluation...")
    print("="*50)

    eval_results = run_official_evaluation(
        model_path=DEFAULT_MODEL_PATH,
        data_path=PROCESSED_DATA_PATH,  # Use processed JSONL data
        mia_data_path=MIA_DATA_PATH,
        # None when the MMLU cell has not been run (name not defined yet).
        mmlu_file=globals().get('mmlu_results_file'),
        max_new_tokens=MAX_NEW_TOKENS,
        batch_size=BATCH_SIZE,
        debug=True,  # Set to False for less verbose output
        use_local_script=False  # Set to True to use data/evaluate_generations.py
    )

    if eval_results:
        print("\n" + "="*60)
        print("EVALUATION RESULTS SUMMARY")
        print("="*60)

        # Key metrics
        if 'aggregate-score' in eval_results:
            print(f"Final Aggregate Score: {eval_results['aggregate-score']:.4f}")

        if 'harmonic-mean-task-aggregate' in eval_results:
            print(f"Task Aggregate (Harmonic Mean): {eval_results['harmonic-mean-task-aggregate']:.4f}")

        if 'mmlu_average' in eval_results:
            print(f"MMLU Average: {eval_results['mmlu_average']:.4f}")

        if 'mia_loss_acc' in eval_results:
            print(f"MIA Loss Accuracy: {eval_results['mia_loss_acc']:.4f}")
            if 'mia_final_score' in eval_results:
                print(f"MIA Final Score: {eval_results['mia_final_score']:.4f}")

        # Task-specific results
        print("\nTask-specific Results:")
        for key in ['forget-set', 'retain-set']:
            if key in eval_results:
                print(f"\n{key.upper()}:")
                task_results = eval_results[key]
                if 'overall-regurgitation-score' in task_results:
                    print(f"  Overall Regurgitation: {task_results['overall-regurgitation-score']:.4f}")
                if 'overall-knowledge-score' in task_results:
                    print(f"  Overall Knowledge: {task_results['overall-knowledge-score']:.4f}")

                # Task-specific breakdowns
                for task_key, task_data in task_results.items():
                    if task_key.startswith('Task') and isinstance(task_data, dict):
                        print(f"  {task_key}:")
                        for metric, value in task_data.items():
                            # Non-numeric values are printed as-is instead of
                            # crashing the summary on the ".4f" format.
                            if isinstance(value, (int, float)):
                                print(f"    {metric}: {value:.4f}")
                            else:
                                print(f"    {metric}: {value}")

        # Step 3: Save detailed analysis
        print("\n" + "="*50)
        print("Step 3: Saving detailed analysis...")
        # save_evaluation_summary() is defined in the "Utility Functions" cell
        # below; guard the call so running the cells top-to-bottom does not
        # end a long evaluation with a NameError.
        if 'save_evaluation_summary' in globals():
            save_evaluation_summary(eval_results)
        else:
            print("⚠️ save_evaluation_summary() is not defined yet. Run the "
                  "'Utility Functions' cell, then call save_evaluation_summary(eval_results).")
        print("✅ Evaluation pipeline completed successfully!")

    else:
        print("❌ Evaluation failed. Check the error messages above.")

else:
    print("❌ Cannot run evaluation:")
    if not parquet_ok:
        print("  - Parquet source files not found")
    if not jsonl_ok:
        print("  - JSONL conversion failed")
    if not data_verification_ok:
        print("  - Data verification failed")

    print("\n💡 TROUBLESHOOTING TIPS:")
    print("1. Ensure the DEFAULT_MODEL_PATH points to your trained model")
    print("2. Check that parquet files exist in the data/ folder")
    print("3. Verify MIA data files exist in data/mia_data/")
    print("4. Run convert_parquet_to_jsonl() manually if conversion fails")

## Results Analysis

Analyze the evaluation results and print a detailed, human-readable report.

In [None]:
def analyze_evaluation_results(results):
    """Print a sectioned, human-readable interpretation of *results*.

    Sections (each printed only when its data is present, except for the
    section headers noted below): final score band, forgetting quality on
    the forget set, retention quality on the retain set, MMLU threshold
    check, membership-inference resistance and a per-task metric breakdown.
    The "UNLEARNING ANALYSIS", "KNOWLEDGE RETENTION" and "TASK-SPECIFIC
    BREAKDOWN" headers are always printed.

    Args:
        results: Mapping of metric names to values; a falsy value prints a
            notice and returns immediately.

    Returns:
        None.
    """
    if not results:
        print("No results to analyze")
        return

    print("DETAILED RESULTS ANALYSIS")
    print("=" * 50)

    # --- Overall performance: first band whose floor is reached wins -------
    if 'aggregate-score' in results:
        final_score = results['aggregate-score']
        print(f"\n🎯 FINAL SCORE: {final_score:.4f}")

        quality_bands = (
            (0.8, "   Status: Excellent performance! 🏆"),
            (0.6, "   Status: Good performance 👍"),
            (0.4, "   Status: Moderate performance ⚠️"),
        )
        for floor, message in quality_bands:
            if final_score >= floor:
                print(message)
                break
        else:
            print("   Status: Needs improvement 🔧")

    # --- Unlearning effectiveness: both forget scores must be low ----------
    print("\n📊 UNLEARNING ANALYSIS:")
    if 'forget-set' in results:
        forget_split = results['forget-set']
        forget_scores = (
            forget_split.get('overall-regurgitation-score', 0),
            forget_split.get('overall-knowledge-score', 0),
        )

        print(f"   Forget Regurgitation: {forget_scores[0]:.4f} (lower is better)")
        print(f"   Forget Knowledge: {forget_scores[1]:.4f} (lower is better)")

        if all(value < 0.3 for value in forget_scores):
            print("   ✅ Excellent forgetting - model successfully unlearned target information")
        elif all(value < 0.5 for value in forget_scores):
            print("   🟡 Good forgetting - reasonable unlearning performance")
        else:
            print("   ❌ Poor forgetting - model retains too much target information")

    # --- Knowledge retention: both retain scores must be high --------------
    print("\n🧠 KNOWLEDGE RETENTION:")
    if 'retain-set' in results:
        retain_split = results['retain-set']
        retain_scores = (
            retain_split.get('overall-regurgitation-score', 0),
            retain_split.get('overall-knowledge-score', 0),
        )

        print(f"   Retain Regurgitation: {retain_scores[0]:.4f} (higher is better)")
        print(f"   Retain Knowledge: {retain_scores[1]:.4f} (higher is better)")

        if all(value > 0.7 for value in retain_scores):
            print("   ✅ Excellent retention - model preserves important knowledge")
        elif all(value > 0.5 for value in retain_scores):
            print("   🟡 Good retention - acceptable knowledge preservation")
        else:
            print("   ❌ Poor retention - model lost important knowledge (catastrophic forgetting)")

    # --- General knowledge (MMLU) against the fixed 0.371 cut-off ----------
    if 'mmlu_average' in results:
        mmlu_value = results['mmlu_average']
        print(f"\n📚 GENERAL KNOWLEDGE (MMLU): {mmlu_value:.4f}")

        if mmlu_value >= 0.371:  # 75% of baseline threshold
            print("   ✅ Meets general knowledge threshold")
        else:
            print("   ❌ Below general knowledge threshold - may affect final ranking")

    # --- MIA resistance: judged by distance from the ideal accuracy 0.5 ----
    if 'mia_loss_acc' in results:
        attack_acc = results['mia_loss_acc']
        print(f"\n🛡️ MIA RESISTANCE: {attack_acc:.4f}")
        print(f"   Distance from ideal (0.5): {abs(attack_acc - 0.5):.4f}")

        if abs(attack_acc - 0.5) < 0.1:
            print("   ✅ Excellent MIA resistance - balanced unlearning")
        elif abs(attack_acc - 0.5) < 0.2:
            print("   🟡 Good MIA resistance")
        elif attack_acc > 0.7:
            print("   ❌ Poor MIA resistance - under-unlearning detected")
        else:
            print("   ❌ Poor MIA resistance - over-unlearning detected")

    # --- Per-task metrics for each split that is present -------------------
    print("\n📋 TASK-SPECIFIC BREAKDOWN:")
    for split_name in ('forget-set', 'retain-set'):
        if split_name in results:
            print(f"\n   {split_name.replace('-set', '').upper()} SET:")

            for task_name, task_metrics in results[split_name].items():
                is_task_entry = task_name.startswith('Task') and isinstance(task_metrics, dict)
                if not is_task_entry:
                    continue
                print(f"     {task_name}:")
                for metric_name, metric_value in task_metrics.items():
                    print(f"       {metric_name}: {metric_value:.4f}")

# Analyze only when an earlier cell produced a non-empty `eval_results`.
if not ('eval_results' in locals() and eval_results):
    print("No evaluation results available. Run the evaluation first.")
else:
    analyze_evaluation_results(eval_results)

## Utility Functions

Additional helper functions for evaluation and analysis.

In [None]:
def load_previous_results(results_file):
    """Return the JSON content of *results_file*, or None if it does not exist.

    A notice is printed when the file is missing.
    """
    if not os.path.exists(results_file):
        print(f"Results file not found: {results_file}")
        return None

    with open(results_file, 'r') as handle:
        return json.load(handle)

def compare_results(results1, results2, names=None):
    """Print a side-by-side comparison of the headline metrics of two runs.

    For every headline metric present as a number in both result mappings
    the two values are printed with four decimals plus the signed direction
    (↑ / ↓ / →) and magnitude of ``results2 - results1``. Metrics that are
    missing (shown as ``N/A``) or non-numeric are printed verbatim.

    Args:
        results1: Result mapping of the first run.
        results2: Result mapping of the second run.
        names: Two display names; defaults to ``["Model 1", "Model 2"]``.

    Returns:
        None.
    """
    if names is None:
        names = ["Model 1", "Model 2"]

    print(f"COMPARISON: {names[0]} vs {names[1]}")
    print("=" * 50)

    metrics = [
        ('aggregate-score', 'Final Score'),
        ('harmonic-mean-task-aggregate', 'Task Aggregate'),
        ('mmlu_average', 'MMLU Average'),
        ('mia_final_score', 'MIA Final Score'),
    ]

    def _is_number(value):
        # Integer scores (e.g. exactly 0 or 1) must be compared too; bool is
        # an int subclass but is not a meaningful score, so it is excluded.
        return isinstance(value, (int, float)) and not isinstance(value, bool)

    for key, label in metrics:
        val1 = results1.get(key, 'N/A')
        val2 = results2.get(key, 'N/A')

        if _is_number(val1) and _is_number(val2):
            diff = val2 - val1
            arrow = "↑" if diff > 0 else "↓" if diff < 0 else "→"
            print(f"{label:20}: {val1:.4f} vs {val2:.4f} ({arrow} {abs(diff):.4f})")
        else:
            print(f"{label:20}: {val1} vs {val2}")

def save_evaluation_summary(results, output_file=None):
    """Write a plain-text summary of *results* and print where it was saved.

    Args:
        results: Mapping of metric names to values; dict values are written
            as an indented sub-section, everything else as ``key: value``.
        output_file: Destination path. When ``None`` a timestamped
            ``evaluation_summary_<YYYYmmdd_HHMMSS>.txt`` in the current
            working directory is used.

    Returns:
        None.
    """
    if output_file is None:
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_file = f"evaluation_summary_{stamp}.txt"

    with open(output_file, 'w') as report:
        # Header: title, rule, generation time (each followed as in the
        # layout by one blank line where `end="\n\n"` is given).
        print("SemEval 2025 Unlearning Challenge - Evaluation Summary", file=report)
        print("=" * 60, end="\n\n", file=report)
        generated_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print(f"Generated: {generated_at}", end="\n\n", file=report)

        if 'aggregate-score' in results:
            print(f"Final Aggregate Score: {results['aggregate-score']:.4f}",
                  end="\n\n", file=report)

        # Every metric, with nested mappings expanded one level.
        print("Detailed Results:", file=report)
        print("-" * 30, file=report)
        for name, entry in results.items():
            if not isinstance(entry, dict):
                print(f"{name}: {entry}", file=report)
                continue
            print(f"\n{name}:", file=report)
            for inner_name, inner_value in entry.items():
                print(f"  {inner_name}: {inner_value}", file=report)

    print(f"Summary saved to {output_file}")

# Short index of the helpers defined in this cell.
print("\n".join([
    "Utility functions loaded!",
    "\nAvailable functions:",
    "- load_previous_results(file): Load previous evaluation results",
    "- compare_results(r1, r2, names): Compare two evaluation results",
    "- save_evaluation_summary(results, file): Save human-readable summary",
]))

## Example Usage

Here's how to use this notebook step by step:

In [None]:
# Usage guide, printed as one block; empty entries produce the blank lines.
workflow_overview = (
    "STEP-BY-STEP EVALUATION WORKFLOW:",
    "=" * 40,
    "1. Update DEFAULT_MODEL_PATH in the configuration cell",
    "2. Verify all data paths are correct",
    "3. (Optional) Run MMLU evaluation first",
    "4. Run the official SemEval evaluation",
    "5. Analyze results using the analysis functions",
    "",
    "QUICK START:",
    "1. Set DEFAULT_MODEL_PATH = '/path/to/your/model'",
    "2. Run all cells in order",
    "3. Uncomment the evaluation calls when ready",
    "",
    "FILES GENERATED:",
    f"- MMLU results: {MMLU_OUTPUT_DIR}/mmlu_results_*.json",
    f"- SemEval results: {OUTPUT_DIR}/evaluation_results.jsonl",
    "- CSV files with detailed predictions (if keep_files=True)",
    "",
    "Ready to start evaluation! 🚀",
)
print("\n".join(workflow_overview))