# MMLU Benchmark for OpenAI-Compatible APIs (Ollama, OpenAI, etc.)

This notebook runs the MMLU (Massive Multitask Language Understanding) benchmark against any OpenAI-compatible API, including Ollama, OpenAI, and other compatible services.

MMLU is a benchmark that tests knowledge across 57 subjects including mathematics, US history, computer science, law, and more.

## Configuration
- **API Endpoint**: Configure the base URL for your OpenAI-compatible API (e.g., Ollama, OpenAI)
- **Model Name**: Specify the model to use (e.g., "llama3.2:3b" for Ollama, "gpt-4" for OpenAI)
- **Number of Questions**: Select how many questions to test (per subject or total)
- **Subjects**: Choose specific subjects or test all
- **Multi-Server Support**: Load balance across multiple API instances

## 1. Install Requirements

In [1]:
!pip install -q requests datasets pandas tqdm numpy scikit-learn matplotlib seaborn

## 2. Configuration Settings

In [2]:
import requests
import json
import re
from datasets import load_dataset
import pandas as pd
from tqdm.auto import tqdm
import time
from typing import List, Dict, Tuple
import numpy as np
import os
from pathlib import Path

# API Configuration - Ollama (local)
API_BASE_URL = "http://127.0.0.1:11434"  # Ollama default
MODEL_NAME = "llama3:8b-instruct-q4_K_M"  # Model name (e.g., "llama3.2:3b")

# Multiple Servers Support (optional - for load balancing across multiple Ollama instances)
# Leave as single URL if you only have one server
API_SERVERS = [
    {"base_url": API_BASE_URL, "model": MODEL_NAME},
    # Add more servers here if needed:
    # {"base_url": "http://192.168.1.100:11434", "model": "llama3.2:3b"},
]

# Benchmark Configuration
NUM_QUESTIONS_PER_SUBJECT = None  # Set to None to run ALL questions, or a number to limit per subject
SELECTED_SUBJECTS = None  # List of subjects to test, or None for all subjects
TIMEOUT_SECONDS = 600  # Timeout for each question (10 minutes)

# Checkpoint Configuration
ENABLE_CHECKPOINTS = True  # Enable checkpoint/resume functionality
CHECKPOINT_INTERVAL = 500  # Save checkpoint after every N questions (reduced from 1 to prevent I/O overhead)

# Results Directory
RESULTS_DIR = Path("results")  # Results will be saved in Notebooks/results/
RESULTS_DIR.mkdir(exist_ok=True)

# Generation Settings (Ollama options)
GENERATION_SETTINGS = {
    "temperature": 0.0,  # Lower temperature for more deterministic answers
    "num_predict": 3,    # Limit tokens since we only need A/B/C/D
    "top_p": 0.95,
}

print("Configuration loaded successfully!")
print(f"API Base URL: {API_BASE_URL}")
print(f"Model: {MODEL_NAME}")
print(f"Number of servers: {len(API_SERVERS)}")
print(f"Questions per subject: {NUM_QUESTIONS_PER_SUBJECT if NUM_QUESTIONS_PER_SUBJECT else 'ALL'}")
print(f"Timeout per question: {TIMEOUT_SECONDS} seconds ({TIMEOUT_SECONDS/60:.1f} minutes)")
print(f"Results directory: {RESULTS_DIR.absolute()}")
print(f"Checkpoints: {'Enabled' if ENABLE_CHECKPOINTS else 'Disabled'} (interval: {CHECKPOINT_INTERVAL} questions)")


  from .autonotebook import tqdm as notebook_tqdm


Configuration loaded successfully!
API Base URL: http://127.0.0.1:11434
Model: llama3:8b-instruct-q4_K_M
Number of servers: 1
Questions per subject: ALL
Timeout per question: 600 seconds (10.0 minutes)
Results directory: c:\Users\user\NextCloud\nextcloud.nicojoerger.de\Documents\Schulen\Studium\Abschlussarbeit\Git\Embedded-CPU-LLM\Code\Notebooks\results
Checkpoints: Enabled (interval: 500 questions)


## 3. Helper Functions

In [3]:
def check_api_health(server_config: Dict) -> Dict:
    """Check if the Ollama API is accessible by making a simple request."""
    try:
        response = requests.get(
            f"{server_config['base_url']}/api/tags",
            timeout=10
        )
        response.raise_for_status()

        return {
            "status": "healthy",
            "model": server_config['model'],
            "base_url": server_config['base_url']
        }
    except Exception as e:
        raise ConnectionError(f"Failed to connect to API: {e}")


def generate_response(server_config: Dict, prompt: str, timeout: int = TIMEOUT_SECONDS) -> str:
    """Send a prompt to the Ollama API and get the generated response.
    
    Format matches the working HTTP file (Chat mit erweiterten Optionen).
    """
    try:
        # Build the request payload with Ollama format (matching working HTTP file)
        payload = {
            "model": server_config['model'],
            "messages": [
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            "stream": False,
            "options": GENERATION_SETTINGS
        }

        # Send request to Ollama API
        response = requests.post(
            f"{server_config['base_url']}/api/chat",
            json=payload,
            timeout=timeout,
            headers={"Content-Type": "application/json"}
        )
        response.raise_for_status()

        # Parse Ollama-style response
        result = response.json()

        # Validate response structure
        if 'message' not in result or 'content' not in result['message']:
            raise RuntimeError(f"Invalid response format: {result}")

        generated_text = result['message']['content']

        return generated_text.strip()

    except requests.Timeout:
        raise TimeoutError(f"Request timed out after {timeout} seconds")
    except requests.RequestException as e:
        raise RuntimeError(f"Request failed: {e}")
    except KeyError as e:
        raise RuntimeError(f"Response parsing failed - missing key {e}: {result if 'result' in locals() else 'No response'}")
    except Exception as e:
        raise RuntimeError(f"Generation failed: {e}")


def extract_answer(response: str) -> str:
    """Extract the answer choice (A, B, C, or D) from the model's response.
    
    STRICT matching to avoid false positives from words like 'Compute' or 'evAluate'.
    """
    # Clean the response
    response = response.strip()
    
    # First, try exact patterns that indicate a clear answer
    # Pattern 1: Single letter possibly with punctuation at START of response
    # Examples: "A", "A.", "A)", "A:", "B " etc.
    match = re.match(r'^([A-Da-d])[\.\)\:\s]?', response)
    if match:
        return match.group(1).upper()
    
    # Pattern 2: "Answer: A" or "The answer is B" etc.
    match = re.search(r'(?:answer|choice)(?:\s+is)?[\s:]+([A-Da-d])(?:[^a-zA-Z]|$)', response, re.IGNORECASE)
    if match:
        return match.group(1).upper()
    
    # Pattern 3: Single letter answer on its own line
    for line in response.split('\n'):
        line = line.strip()
        if re.match(r'^([A-Da-d])[\.\)\:]?$', line):
            return line[0].upper()
    
    # Pattern 4: Response is ONLY a single letter (after stripping whitespace)
    if len(response) == 1 and response.upper() in ['A', 'B', 'C', 'D']:
        return response.upper()
    
    # If none of the strict patterns match, return UNKNOWN
    # Do NOT try to find letters within words!
    return "UNKNOWN"


def format_mmlu_prompt(question: str, choices: List[str]) -> str:
    """Format a MMLU question with multiple choice options."""
    prompt = f"""Answer the following multiple choice question. Respond with only the letter (A, B, C, or D) of the correct answer.

Question: {question}

A) {choices[0]}
B) {choices[1]}
C) {choices[2]}
D) {choices[3]}

Answer:"""
    return prompt


def save_checkpoint(results_dict: Dict[int, Dict], checkpoint_file: Path):
    """Save checkpoint with question indices as keys."""
    checkpoint_data = {
        'results': results_dict,  # Dict with int keys (JSON converts to strings)
        'timestamp': time.time(),
        'num_processed': len(results_dict)
    }
    
    # Atomic save to prevent corruption
    temp_file = checkpoint_file.with_suffix('.tmp')
    with open(temp_file, 'w') as f:
        json.dump(checkpoint_data, f, indent=2)
    temp_file.replace(checkpoint_file)  # Atomic rename


def load_checkpoint(checkpoint_file: Path) -> Dict[int, Dict]:
    """Load checkpoint and return results_dict with int keys."""
    if not checkpoint_file.exists():
        return {}
    
    try:
        with open(checkpoint_file, 'r') as f:
            checkpoint_data = json.load(f)
        
        results_raw = checkpoint_data.get('results', {})
        
        # Convert JSON string keys back to int
        results_dict = {int(k): v for k, v in results_raw.items()}
        
        return results_dict
    except Exception as e:
        print(f"Warning: Failed to load checkpoint: {e}")
        return {}


def process_single_question(server_config: Dict, item: Dict, idx: int, timeout: int) -> Dict:
    """Process a single question using the Ollama API. Returns None on failure."""
    try:
        prompt = format_mmlu_prompt(item['question'], item['choices'])
        
        start_time = time.time()
        response = generate_response(server_config, prompt, timeout)
        elapsed_time = time.time() - start_time
        
        predicted_answer = extract_answer(response)
        correct_answer = chr(65 + item['answer'])
        
        return {
            'subject': item['subject'],
            'question': item['question'],
            'choices': item['choices'],
            'correct_answer': correct_answer,
            'predicted_answer': predicted_answer,
            'full_response': response,
            'is_correct': predicted_answer == correct_answer,
            'time_seconds': elapsed_time
        }
    except Exception as e:
        # Return None to trigger retry on different server
        return None


print("Helper functions loaded successfully!")

Helper functions loaded successfully!


## 4. Verify API Connection

In [4]:
# Check API health for all servers
print("Checking API servers...")
available_servers = []

for i, server_config in enumerate(API_SERVERS):
    try:
        health = check_api_health(server_config)
        print(f"✓ Server {i+1}: {health['status']} - Model: {health['model']} - URL: {health['base_url']}")
        available_servers.append(server_config)
    except Exception as e:
        print(f"✗ Server {i+1} ({server_config['base_url']}): UNAVAILABLE - {e}")

if not available_servers:
    raise RuntimeError("No servers available! Check your server URLs and ensure servers are running.")

# Update global list to only available servers
API_SERVERS = available_servers
print(f"\n{len(API_SERVERS)} server(s) ready for benchmark")

Checking API servers...
✓ Server 1: healthy - Model: llama3:8b-instruct-q4_K_M - URL: http://127.0.0.1:11434

1 server(s) ready for benchmark


In [5]:
# Check available models first
try:
    tags_response = requests.get(f"{API_BASE_URL}/api/tags", timeout=10)
    tags_response.raise_for_status()
    models_data = tags_response.json()
    
    print("Available models:")
    print("=" * 80)
    if 'models' in models_data:
        for model in models_data['models']:
            print(f"  - {model.get('name', 'unknown')}")
    else:
        print("No models found or unexpected response format")
        print(f"Response: {models_data}")
    
    print("\n" + "=" * 80)
    print(f"Configured model in notebook: {MODEL_NAME}")
    
    # Check if configured model exists
    if 'models' in models_data:
        model_names = [m.get('name', '') for m in models_data['models']]
        if MODEL_NAME in model_names:
            print(f"✓ Model '{MODEL_NAME}' is available")
        else:
            print(f"✗ Model '{MODEL_NAME}' NOT FOUND!")
            print(f"\nYou may need to update MODEL_NAME in cell 4 to one of the available models above.")
            
except Exception as e:
    print(f"Error checking models: {e}")

Available models:
  - llama3:8b-instruct-q4_K_M
  - llama3:8b-instruct-q4_0
  - olmo-3:32b
  - olmo-3:7b
  - nemotron-3-nano:30b
  - ministral-3:8b
  - ministral-3:3b
  - ministral-3:14b

Configured model in notebook: llama3:8b-instruct-q4_K_M
✓ Model 'llama3:8b-instruct-q4_K_M' is available


In [6]:
# DEBUG: Test the exact request format
import requests
import json

test_url = f"{API_BASE_URL}/api/chat"
test_payload = {
    "model": MODEL_NAME,
    "messages": [
        {
            "role": "user",
            "content": "Say only the letter A."
        }
    ],
    "stream": False,
    "options": {
        "temperature": 0.0,
        "num_predict": 5
    }
}

print(f"Testing request to: {test_url}")
print(f"Model: {MODEL_NAME}")
print(f"\nRequest payload:")
print(json.dumps(test_payload, indent=2))

try:
    response = requests.post(test_url, json=test_payload, timeout=30)
    print(f"\nResponse status: {response.status_code}")
    print(f"Response headers: {dict(response.headers)}")
    
    if response.status_code == 200:
        result = response.json()
        print(f"\nResponse structure: {list(result.keys())}")
        print(f"Full response:")
        print(json.dumps(result, indent=2))
    else:
        print(f"\nError response:")
        print(response.text)
except Exception as e:
    print(f"\nError: {type(e).__name__}: {e}")

Testing request to: http://127.0.0.1:11434/api/chat
Model: llama3:8b-instruct-q4_K_M

Request payload:
{
  "model": "llama3:8b-instruct-q4_K_M",
  "messages": [
    {
      "role": "user",
      "content": "Say only the letter A."
    }
  ],
  "stream": false,
  "options": {
    "temperature": 0.0,
    "num_predict": 5
  }
}

Response status: 200
Response headers: {'Content-Type': 'application/json; charset=utf-8', 'Date': 'Fri, 09 Jan 2026 01:33:19 GMT', 'Content-Length': '311'}

Response structure: ['model', 'created_at', 'message', 'done', 'done_reason', 'total_duration', 'load_duration', 'prompt_eval_count', 'prompt_eval_duration', 'eval_count', 'eval_duration']
Full response:
{
  "model": "llama3:8b-instruct-q4_K_M",
  "created_at": "2026-01-09T01:33:19.1709428Z",
  "message": {
    "role": "assistant",
    "content": "A"
  },
  "done": true,
  "done_reason": "stop",
  "total_duration": 16706875500,
  "load_duration": 16611528300,
  "prompt_eval_count": 16,
  "prompt_eval_duration

## 5. Load MMLU Dataset

In [7]:
print("Loading MMLU dataset from Hugging Face...")
print("This may take a few minutes on first run...")

# Load the MMLU dataset (using the 'test' split)
# The dataset is organized by subjects
dataset = load_dataset("cais/mmlu", "all", split="test")

print(f"\nDataset loaded successfully!")
print(f"Total questions: {len(dataset)}")

# Get all unique subjects
all_subjects = sorted(set(dataset['subject']))
print(f"Number of subjects: {len(all_subjects)}")
print(f"\nSubjects: {', '.join(all_subjects[:10])}..." if len(all_subjects) > 10 else f"\nSubjects: {', '.join(all_subjects)}")

Loading MMLU dataset from Hugging Face...
This may take a few minutes on first run...

Dataset loaded successfully!
Total questions: 14042
Number of subjects: 57

Subjects: abstract_algebra, anatomy, astronomy, business_ethics, clinical_knowledge, college_biology, college_chemistry, college_computer_science, college_mathematics, college_medicine...


## 6. Prepare Test Data

In [8]:
# Filter subjects if specified
subjects_to_test = SELECTED_SUBJECTS if SELECTED_SUBJECTS else all_subjects

# Prepare test questions
test_questions = []

for subject in subjects_to_test:
    # Filter dataset by subject
    subject_data = [item for item in dataset if item['subject'] == subject]
    
    # Limit number of questions if specified
    if NUM_QUESTIONS_PER_SUBJECT:
        subject_data = subject_data[:NUM_QUESTIONS_PER_SUBJECT]
    
    test_questions.extend(subject_data)

print(f"Prepared {len(test_questions)} test questions across {len(subjects_to_test)} subjects")
print(f"\nExample question:")
example = test_questions[0]
print(f"Subject: {example['subject']}")
print(f"Question: {example['question']}")
print(f"Choices: {example['choices']}")
print(f"Correct Answer: {chr(65 + example['answer'])}")

Prepared 14042 test questions across 57 subjects

Example question:
Subject: abstract_algebra
Question: Find the degree for the given field extension Q(sqrt(2), sqrt(3), sqrt(18)) over Q.
Choices: ['0', '4', '2', '6']
Correct Answer: B


## 7. Run Benchmark

In [9]:
import concurrent.futures
import threading
import queue
import datetime
import random

# Sanitize model name for use in filenames (replace : and / with -)
model_name_safe = MODEL_NAME.replace(':', '-').replace('/', '-')

# Load checkpoint
run_timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
checkpoint_file = RESULTS_DIR / f"checkpoint_ollama_{model_name_safe}_{run_timestamp}.json"
results_dict = {}

if ENABLE_CHECKPOINTS:
    checkpoint_files = sorted(RESULTS_DIR.glob(f"checkpoint_ollama_{model_name_safe}_*.json"), reverse=True)
    if checkpoint_files:
        latest_checkpoint = checkpoint_files[0]
        results_dict = load_checkpoint(latest_checkpoint)
        checkpoint_file = latest_checkpoint
        print(f"Found checkpoint: {latest_checkpoint.name}")
        print(f"Loaded {len(results_dict)} completed questions from checkpoint")

# Calculate subject_stats from loaded results (always recalculate, don't trust checkpoint)
subject_stats = {}
for result in results_dict.values():
    subject = result['subject']
    if subject not in subject_stats:
        subject_stats[subject] = {'correct': 0, 'total': 0, 'time': 0}
    subject_stats[subject]['total'] += 1
    subject_stats[subject]['time'] += result['time_seconds']
    if result['is_correct']:
        subject_stats[subject]['correct'] += 1

# Initialize shared state
state_lock = threading.Lock()              # Single lock for simplicity
question_queue = queue.Queue()             # Thread-safe by default
progress_queue = queue.Queue()             # For progress updates

# Fill queue with unprocessed questions
for idx in range(len(test_questions)):
    if idx not in results_dict:
        question_queue.put(idx)

remaining_count = question_queue.qsize()

print(f"\nStarting benchmark:")
print(f"  Total questions: {len(test_questions)}")
print(f"  Already completed: {len(results_dict)}")
print(f"  Remaining: {remaining_count}")
print(f"  Servers: {len(API_SERVERS)}")
print("=" * 80)
print()

# Worker function (runs on each thread - one per server)
def worker(server_config, worker_id):
    retry_delay = 5  # Exponential backoff for failures

    while True:
        try:
            idx = question_queue.get(timeout=1)
        except queue.Empty:
            return  # Exit when queue empty

        # CRITICAL: Check if already processed (race condition protection)
        with state_lock:
            if idx in results_dict:
                continue  # Another worker got it first, skip

        # Process question (no lock held during long operation!)
        result = process_single_question(
            server_config,
            test_questions[idx],
            idx,
            TIMEOUT_SECONDS
        )

        if result is not None:
            # Success - update all shared state atomically
            with state_lock:
                results_dict[idx] = result

                # Update subject stats
                subject = result['subject']
                if subject not in subject_stats:
                    subject_stats[subject] = {'correct': 0, 'total': 0, 'time': 0}
                subject_stats[subject]['total'] += 1
                subject_stats[subject]['time'] += result['time_seconds']
                if result['is_correct']:
                    subject_stats[subject]['correct'] += 1

                # Save checkpoint periodically
                if ENABLE_CHECKPOINTS and len(results_dict) % CHECKPOINT_INTERVAL == 0:
                    save_checkpoint(results_dict, checkpoint_file)

            # Send progress update (outside lock)
            progress_queue.put(('success', idx, result, worker_id))
            retry_delay = 5  # Reset delay on success

        else:
            # Failed - re-queue for retry on different server
            question_queue.put(idx)
            progress_queue.put(('retry', idx, server_config['base_url'], worker_id))

            # Exponential backoff with jitter to avoid thundering herd
            time.sleep(min(retry_delay, 60) + random.uniform(0, 2))
            retry_delay *= 1.5

# Progress display in separate thread (avoids tqdm threading issues)
# MINIMAL OUTPUT: Only print every 500 questions to prevent kernel death
PRINT_INTERVAL = 500  # Print detailed output every N questions

def display_progress():
    processed_count = len(results_dict)
    correct_count = sum(1 for r in results_dict.values() if r['is_correct'])

    with tqdm(total=len(test_questions), initial=processed_count, desc="MMLU Benchmark") as pbar:
        while True:
            try:
                event = progress_queue.get(timeout=0.1)
                event_type = event[0]

                if event_type == 'success':
                    _, idx, result, worker_id = event
                    pbar.update(1)
                    processed_count += 1
                    if result['is_correct']:
                        correct_count += 1

                    accuracy = (correct_count / processed_count) * 100
                    status = "✓" if result['is_correct'] else "✗"

                    # Only print detailed info every PRINT_INTERVAL questions
                    if processed_count % PRINT_INTERVAL == 0:
                        print(f"\n[Q{processed_count}/{len(test_questions)}] Progress Update:")
                        print(f"  Current Accuracy: {accuracy:.1f}% ({correct_count}/{processed_count})")
                        print(f"  Latest: {status} {result['subject']} - Answer: {result['predicted_answer']} (Correct: {result['correct_answer']})")
                        print(f"  Avg Time: {result['time_seconds']:.2f}s")

                elif event_type == 'retry':
                    _, idx, server_url, worker_id = event
                    print(f"\n[Q{idx+1}] ⚠ Retrying on different server (failed on Server {worker_id+1})")

                elif event_type == 'done':
                    break

            except queue.Empty:
                continue

# Start progress display thread
progress_thread = threading.Thread(target=display_progress, daemon=True)
progress_thread.start()

# Launch worker threads (one per server)
print(f"Starting {len(API_SERVERS)} worker thread(s)...\n")
with concurrent.futures.ThreadPoolExecutor(max_workers=len(API_SERVERS)) as executor:
    futures = [
        executor.submit(worker, server_config, i)
        for i, server_config in enumerate(API_SERVERS)
    ]

    # Wait for all workers to complete
    concurrent.futures.wait(futures)

# Signal progress thread to stop
progress_queue.put(('done',))
progress_thread.join(timeout=1)

# Final checkpoint save
if ENABLE_CHECKPOINTS:
    save_checkpoint(results_dict, checkpoint_file)

# Calculate final stats
total_questions = len(results_dict)
correct_count = sum(1 for r in results_dict.values() if r['is_correct'])
final_accuracy = (correct_count / total_questions) * 100 if total_questions > 0 else 0

print("\n" + "="*80)
print("Benchmark completed!")
print(f"Total: {total_questions} | Correct: {correct_count} | Accuracy: {final_accuracy:.2f}%")
print("="*80)


Starting benchmark:
  Total questions: 14042
  Already completed: 0
  Remaining: 14042
  Servers: 1

Starting 1 worker thread(s)...



MMLU Benchmark:   4%|▎         | 501/14042 [01:41<46:02,  4.90it/s]


[Q500/14042] Progress Update:
  Current Accuracy: 60.2% (301/500)
  Latest: ✓ clinical_knowledge - Answer: B (Correct: B)
  Avg Time: 0.20s


MMLU Benchmark:   7%|▋         | 1001/14042 [03:22<47:43,  4.55it/s]


[Q1000/14042] Progress Update:
  Current Accuracy: 62.5% (625/1000)
  Latest: ✓ college_computer_science - Answer: A (Correct: A)
  Avg Time: 0.23s


MMLU Benchmark:  11%|█         | 1500/14042 [05:06<47:15,  4.42it/s]


[Q1500/14042] Progress Update:
  Current Accuracy: 57.7% (865/1500)
  Latest: ✓ computer_security - Answer: C (Correct: C)
  Avg Time: 0.22s


MMLU Benchmark:  14%|█▍        | 2001/14042 [06:47<45:22,  4.42it/s]


[Q2000/14042] Progress Update:
  Current Accuracy: 57.0% (1139/2000)
  Latest: ✗ electrical_engineering - Answer: B (Correct: C)
  Avg Time: 0.24s


MMLU Benchmark:  18%|█▊        | 2501/14042 [08:28<41:37,  4.62it/s]


[Q2500/14042] Progress Update:
  Current Accuracy: 55.1% (1378/2500)
  Latest: ✗ formal_logic - Answer: UNKNOWN (Correct: B)
  Avg Time: 0.20s


MMLU Benchmark:  21%|██▏       | 3001/14042 [10:10<38:51,  4.74it/s]


[Q3000/14042] Progress Update:
  Current Accuracy: 56.3% (1688/3000)
  Latest: ✗ high_school_chemistry - Answer: A (Correct: B)
  Avg Time: 0.19s


MMLU Benchmark:  25%|██▍       | 3501/14042 [12:04<39:04,  4.50it/s]


[Q3500/14042] Progress Update:
  Current Accuracy: 57.1% (1997/3500)
  Latest: ✗ high_school_geography - Answer: A (Correct: B)
  Avg Time: 0.21s


MMLU Benchmark:  28%|██▊       | 4001/14042 [13:47<37:55,  4.41it/s]


[Q4000/14042] Progress Update:
  Current Accuracy: 59.3% (2373/4000)
  Latest: ✓ high_school_macroeconomics - Answer: B (Correct: B)
  Avg Time: 0.20s


MMLU Benchmark:  32%|███▏      | 4501/14042 [15:31<39:09,  4.06it/s]


[Q4500/14042] Progress Update:
  Current Accuracy: 57.1% (2568/4500)
  Latest: ✗ high_school_microeconomics - Answer: B (Correct: C)
  Avg Time: 0.23s


MMLU Benchmark:  36%|███▌      | 5001/14042 [17:15<34:28,  4.37it/s]


[Q5000/14042] Progress Update:
  Current Accuracy: 57.6% (2882/5000)
  Latest: ✓ high_school_psychology - Answer: C (Correct: C)
  Avg Time: 0.20s


MMLU Benchmark:  39%|███▉      | 5501/14042 [18:57<33:33,  4.24it/s]


[Q5500/14042] Progress Update:
  Current Accuracy: 59.2% (3255/5500)
  Latest: ✗ high_school_statistics - Answer: A (Correct: B)
  Avg Time: 0.20s


MMLU Benchmark:  43%|████▎     | 6000/14042 [20:59<39:46,  3.37it/s]


[Q6000/14042] Progress Update:
  Current Accuracy: 59.8% (3586/6000)
  Latest: ✓ high_school_world_history - Answer: C (Correct: C)
  Avg Time: 0.22s


MMLU Benchmark:  46%|████▋     | 6501/14042 [22:50<29:47,  4.22it/s]  


[Q6500/14042] Progress Update:
  Current Accuracy: 60.8% (3953/6500)
  Latest: ✗ international_law - Answer: C (Correct: B)
  Avg Time: 0.21s


MMLU Benchmark:  50%|████▉     | 7000/14042 [24:31<32:09,  3.65it/s]


[Q7000/14042] Progress Update:
  Current Accuracy: 61.2% (4287/7000)
  Latest: ✗ management - Answer: B (Correct: C)
  Avg Time: 0.21s


MMLU Benchmark:  53%|█████▎    | 7500/14042 [26:28<50:36,  2.15it/s]


[Q7500/14042] Progress Update:
  Current Accuracy: 62.5% (4688/7500)
  Latest: ✓ miscellaneous - Answer: A (Correct: A)
  Avg Time: 0.31s


MMLU Benchmark:  57%|█████▋    | 8000/14042 [28:12<29:42,  3.39it/s]


[Q8000/14042] Progress Update:
  Current Accuracy: 63.7% (5099/8000)
  Latest: ✗ miscellaneous - Answer: A (Correct: B)
  Avg Time: 0.21s


MMLU Benchmark:  61%|██████    | 8501/14042 [29:54<23:33,  3.92it/s]


[Q8500/14042] Progress Update:
  Current Accuracy: 64.0% (5440/8500)
  Latest: ✗ moral_disputes - Answer: C (Correct: B)
  Avg Time: 0.21s


MMLU Benchmark:  64%|██████▍   | 9000/14042 [31:39<23:53,  3.52it/s]


[Q9000/14042] Progress Update:
  Current Accuracy: 61.9% (5568/9000)
  Latest: ✗ moral_scenarios - Answer: B (Correct: D)
  Avg Time: 0.20s


MMLU Benchmark:  68%|██████▊   | 9500/14042 [33:23<22:47,  3.32it/s]


[Q9500/14042] Progress Update:
  Current Accuracy: 60.6% (5755/9500)
  Latest: ✓ nutrition - Answer: A (Correct: A)
  Avg Time: 0.23s


MMLU Benchmark:  71%|███████   | 10001/14042 [35:06<17:17,  3.90it/s]


[Q10000/14042] Progress Update:
  Current Accuracy: 61.0% (6100/10000)
  Latest: ✗ philosophy - Answer: B (Correct: A)
  Avg Time: 0.20s


MMLU Benchmark:  75%|███████▍  | 10500/14042 [37:07<18:33,  3.18it/s]


[Q10500/14042] Progress Update:
  Current Accuracy: 61.1% (6420/10500)
  Latest: ✗ professional_accounting - Answer: B (Correct: A)
  Avg Time: 0.20s


MMLU Benchmark:  78%|███████▊  | 11000/14042 [39:07<17:05,  2.96it/s]


[Q11000/14042] Progress Update:
  Current Accuracy: 60.4% (6639/11000)
  Latest: ✗ professional_law - Answer: A (Correct: C)
  Avg Time: 0.23s


MMLU Benchmark:  82%|████████▏ | 11500/14042 [41:09<14:35,  2.90it/s]


[Q11500/14042] Progress Update:
  Current Accuracy: 59.5% (6840/11500)
  Latest: ✗ professional_law - Answer: A (Correct: C)
  Avg Time: 0.22s


MMLU Benchmark:  85%|████████▌ | 12000/14042 [43:11<11:44,  2.90it/s]


[Q12000/14042] Progress Update:
  Current Accuracy: 58.7% (7043/12000)
  Latest: ✓ professional_law - Answer: B (Correct: B)
  Avg Time: 0.25s


MMLU Benchmark:  89%|████████▉ | 12500/14042 [45:06<08:31,  3.01it/s]


[Q12500/14042] Progress Update:
  Current Accuracy: 58.9% (7360/12500)
  Latest: ✗ professional_psychology - Answer: A (Correct: B)
  Avg Time: 0.23s


MMLU Benchmark:  93%|█████████▎| 13000/14042 [46:50<05:35,  3.11it/s]


[Q13000/14042] Progress Update:
  Current Accuracy: 59.1% (7683/13000)
  Latest: ✓ professional_psychology - Answer: A (Correct: A)
  Avg Time: 0.20s


MMLU Benchmark:  96%|█████████▌| 13501/14042 [48:37<02:40,  3.38it/s]


[Q13500/14042] Progress Update:
  Current Accuracy: 59.4% (8015/13500)
  Latest: ✓ sociology - Answer: A (Correct: A)
  Avg Time: 0.21s


MMLU Benchmark: 100%|█████████▉| 14000/14042 [50:18<00:13,  3.09it/s]


[Q14000/14042] Progress Update:
  Current Accuracy: 59.8% (8365/14000)
  Latest: ✓ world_religions - Answer: C (Correct: C)
  Avg Time: 0.19s


MMLU Benchmark: 100%|██████████| 14042/14042 [50:28<00:00,  4.64it/s]



Benchmark completed!
Total: 14042 | Correct: 8405 | Accuracy: 59.86%


## 8. Calculate and Display Results

In [10]:
# Convert results_dict to list for analysis
results = list(results_dict.values())

# Overall statistics
total_questions = len(results)
correct_answers = sum(1 for r in results if r['is_correct'])
overall_accuracy = (correct_answers / total_questions) * 100 if total_questions > 0 else 0
total_time = sum(r['time_seconds'] for r in results)
avg_time = total_time / total_questions if total_questions > 0 else 0

print("\n" + "="*80)
print("OVERALL RESULTS")
print("="*80)
print(f"Total Questions: {total_questions}")
print(f"Correct Answers: {correct_answers}")
print(f"Incorrect Answers: {total_questions - correct_answers}")
print(f"Overall Accuracy: {overall_accuracy:.2f}%")
print(f"Total Time: {total_time:.2f} seconds")
print(f"Average Time per Question: {avg_time:.2f} seconds")
print("="*80)

# Subject-wise results
print("\n" + "="*80)
print("SUBJECT-WISE RESULTS")
print("="*80)

subject_results = []
for subject, stats in sorted(subject_stats.items()):
    accuracy = (stats['correct'] / stats['total']) * 100 if stats['total'] > 0 else 0
    avg_time_subject = stats['time'] / stats['total'] if stats['total'] > 0 else 0
    subject_results.append({
        'Subject': subject,
        'Correct': stats['correct'],
        'Total': stats['total'],
        'Accuracy (%)': f"{accuracy:.2f}",
        'Avg Time (s)': f"{avg_time_subject:.2f}"
    })

df_subjects = pd.DataFrame(subject_results)
print(df_subjects.to_string(index=False))
print("="*80)


OVERALL RESULTS
Total Questions: 14042
Correct Answers: 8405
Incorrect Answers: 5637
Overall Accuracy: 59.86%
Total Time: 3020.16 seconds
Average Time per Question: 0.22 seconds

SUBJECT-WISE RESULTS
                            Subject  Correct  Total Accuracy (%) Avg Time (s)
                   abstract_algebra       29    100        29.00         0.21
                            anatomy       93    135        68.89         0.20
                          astronomy      102    152        67.11         0.20
                    business_ethics       67    100        67.00         0.21
                 clinical_knowledge      187    265        70.57         0.20
                    college_biology      104    144        72.22         0.20
                  college_chemistry       40    100        40.00         0.21
           college_computer_science       42    100        42.00         0.21
                college_mathematics       30    100        30.00         0.21
                   

## 9. Detailed Error Analysis

In [11]:
# Show some incorrect answers for analysis
# Convert results_dict to list if not already done
if isinstance(results_dict, dict):
    results = list(results_dict.values())

incorrect_results = [r for r in results if not r['is_correct']]

print("\n" + "="*80)
print(f"SAMPLE INCORRECT ANSWERS (showing up to 5 of {len(incorrect_results)})")
print("="*80)

for i, result in enumerate(incorrect_results[:5]):
    print(f"\n[{i+1}] Subject: {result['subject']}")
    print(f"Question: {result['question']}")
    print(f"Choices:")
    for idx, choice in enumerate(result['choices']):
        print(f"  {chr(65+idx)}) {choice}")
    print(f"Correct Answer: {result['correct_answer']}")
    print(f"Model Answer: {result['predicted_answer']}")
    print(f"Full Response: {result['full_response'][:200]}..." if len(result['full_response']) > 200 else f"Full Response: {result['full_response']}")
    print("-" * 80)


SAMPLE INCORRECT ANSWERS (showing up to 5 of 5637)

[1] Subject: abstract_algebra
Question: Find the degree for the given field extension Q(sqrt(2), sqrt(3), sqrt(18)) over Q.
Choices:
  A) 0
  B) 4
  C) 2
  D) 6
Correct Answer: B
Model Answer: C
Full Response: C
--------------------------------------------------------------------------------

[2] Subject: abstract_algebra
Question: Let p = (1, 2, 5, 4)(2, 3) in S_5 . Find the index of <p> in S_5.
Choices:
  A) 8
  B) 2
  C) 24
  D) 120
Correct Answer: C
Model Answer: B
Full Response: B
--------------------------------------------------------------------------------

[3] Subject: abstract_algebra
Question: Statement 1 | A factor group of a non-Abelian group is non-Abelian. Statement 2 | If K is a normal subgroup of H and H is a normal subgroup of G, then K is a normal subgroup of G.
Choices:
  A) True, True
  B) False, False
  C) True, False
  D) False, True
Correct Answer: B
Model Answer: UNKNOWN
Full Response: The correct answer
---

## 10. Save Results to File

In [12]:
# Convert results_dict to list if not already done
if isinstance(results_dict, dict):
    results = list(results_dict.values())

# Create results dataframe
df_results = pd.DataFrame(results)

# Use the same timestamp from the benchmark run
final_timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
results_file = RESULTS_DIR / f"mmlu_ollama_{model_name_safe}_results_{final_timestamp}.csv"
summary_file = RESULTS_DIR / f"mmlu_ollama_{model_name_safe}_summary_{final_timestamp}.txt"

# Save detailed results
df_results.to_csv(results_file, index=False)
print(f"Detailed results saved to: {results_file}")

# Save summary
with open(summary_file, 'w') as f:
    f.write("MMLU Benchmark Summary (Ollama)\n")
    f.write("=" * 80 + "\n\n")
    f.write(f"Date: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
    f.write(f"API Type: Ollama\n")
    f.write(f"Base URL: {API_BASE_URL}\n")
    f.write(f"Model: {MODEL_NAME}\n")
    f.write(f"Number of Servers: {len(API_SERVERS)}\n\n")
    
    f.write("Configuration:\n")
    f.write("-" * 80 + "\n")
    f.write(f"Questions per subject: {NUM_QUESTIONS_PER_SUBJECT if NUM_QUESTIONS_PER_SUBJECT else 'ALL'}\n")
    f.write(f"Timeout: {TIMEOUT_SECONDS}s\n")
    f.write(f"Temperature: {GENERATION_SETTINGS.get('temperature', 'N/A')}\n")
    f.write(f"Num Predict: {GENERATION_SETTINGS.get('num_predict', 'N/A')}\n\n")
    
    f.write("Overall Results:\n")
    f.write("-" * 80 + "\n")
    f.write(f"Total Questions: {total_questions}\n")
    f.write(f"Correct Answers: {correct_answers}\n")
    f.write(f"Accuracy: {overall_accuracy:.2f}%\n")
    f.write(f"Average Time: {avg_time:.2f} seconds\n\n")
    
    f.write("Subject-wise Results:\n")
    f.write("-" * 80 + "\n")
    f.write(df_subjects.to_string(index=False))
    f.write("\n")

print(f"Summary saved to: {summary_file}")

# Clean up checkpoint file if benchmark completed successfully
if ENABLE_CHECKPOINTS and checkpoint_file.exists():
    # Rename checkpoint to indicate completion
    completed_checkpoint = RESULTS_DIR / f"completed_{checkpoint_file.name}"
    checkpoint_file.rename(completed_checkpoint)
    print(f"Checkpoint marked as completed: {completed_checkpoint.name}")

print("\nBenchmark complete! All results saved to the 'results' folder.")
print(f"Results directory: {RESULTS_DIR.absolute()}")

Detailed results saved to: results\mmlu_ollama_llama3-8b-instruct-q4_K_M_results_20260109_032446.csv
Summary saved to: results\mmlu_ollama_llama3-8b-instruct-q4_K_M_summary_20260109_032446.txt
Checkpoint marked as completed: completed_checkpoint_ollama_llama3-8b-instruct-q4_K_M_20260109_023418.json

Benchmark complete! All results saved to the 'results' folder.
Results directory: c:\Users\user\NextCloud\nextcloud.nicojoerger.de\Documents\Schulen\Studium\Abschlussarbeit\Git\Embedded-CPU-LLM\Code\Notebooks\results


## 11. Visualize Results