# MMLU Benchmark for TinyChatEngine API

This notebook runs the MMLU (Massive Multitask Language Understanding) benchmark against the TinyChatEngine API.

MMLU is a benchmark that tests knowledge across 57 subjects including mathematics, US history, computer science, law, and more.

## Configuration
- **API Endpoint**: Configure the base URL for your API
- **Number of Questions**: Select how many questions to test (per subject or total)
- **Subjects**: Choose specific subjects or test all

## 1. Install Requirements

In [13]:
!pip install -q requests datasets pandas tqdm numpy scikit-learn matplotlib seaborn

## 2. Configuration Settings

In [2]:
import requests
import json
import re
from datasets import load_dataset
import pandas as pd
from tqdm.auto import tqdm
import time
from typing import List, Dict, Tuple
import numpy as np
import os
from pathlib import Path

# API Configuration - Multiple Servers Support
API_SERVER_URLS = [
    "http://192.168.132.210:8080",  # Change these to your API endpoints
   # "http://rp5ab16.lan:8080",
   # "http://rp5ab8.lan:8080",
]

# Benchmark Configuration
NUM_QUESTIONS_PER_SUBJECT = None  # Set to None to run ALL questions, or a number to limit per subject
SELECTED_SUBJECTS = None  # List of subjects to test, or None for all subjects
TIMEOUT_SECONDS = 600  # Timeout for each question (10 minutes)

# Checkpoint Configuration
ENABLE_CHECKPOINTS = True  # Enable checkpoint/resume functionality
CHECKPOINT_INTERVAL = 1  # Save checkpoint after every N questions (1 = after each question)

# Results Directory
RESULTS_DIR = Path("results")  # Results will be saved in Notebooks/results/
RESULTS_DIR.mkdir(exist_ok=True)

# Generation Settings (optional - customize model behavior)
GENERATION_SETTINGS = {
    "temperature": 0.0,  # Lower temperature for more deterministic answers
    "max_tokens": 3,     # Limit tokens since we only need A/B/C/D
    "top_p": 0.95,
}

print("Configuration loaded successfully!")
print(f"API Servers: {len(API_SERVER_URLS)}")
for i, url in enumerate(API_SERVER_URLS, 1):
    print(f"  {i}. {url}")
print(f"Questions per subject: {NUM_QUESTIONS_PER_SUBJECT if NUM_QUESTIONS_PER_SUBJECT else 'ALL'}")
print(f"Timeout per question: {TIMEOUT_SECONDS} seconds ({TIMEOUT_SECONDS/60:.1f} minutes)")
print(f"Results directory: {RESULTS_DIR.absolute()}")
print(f"Checkpoints: {'Enabled' if ENABLE_CHECKPOINTS else 'Disabled'} (interval: {CHECKPOINT_INTERVAL} questions)")

  from .autonotebook import tqdm as notebook_tqdm


Configuration loaded successfully!
API Servers: 1
  1. http://192.168.132.210:8080
Questions per subject: ALL
Timeout per question: 600 seconds (10.0 minutes)
Results directory: c:\Users\user\NextCloud\nextcloud.nicojoerger.de\Documents\Schulen\Studium\Abschlussarbeit\Git\Embedded-CPU-LLM\Code\Notebooks\results
Checkpoints: Enabled (interval: 1 questions)


## 3. Helper Functions

In [3]:
def check_api_health(server_url: str) -> Dict:
    """Check if the API is running and healthy."""
    try:
        response = requests.get(f"{server_url}/health", timeout=5)
        response.raise_for_status()
        return response.json()
    except Exception as e:
        raise ConnectionError(f"Failed to connect to API: {e}")


def reset_conversation(server_url: str) -> bool:
    """Reset the conversation state in the API."""
    try:
        response = requests.post(f"{server_url}/reset", timeout=10)
        response.raise_for_status()
        return True
    except Exception as e:
        print(f"Warning: Failed to reset conversation: {e}")
        return False


def configure_generation_settings(server_url: str, settings: Dict) -> bool:
    """Configure generation parameters."""
    try:
        response = requests.post(
            f"{server_url}/settings",
            json=settings,
            timeout=10
        )
        response.raise_for_status()
        return True
    except Exception as e:
        print(f"Warning: Failed to configure settings: {e}")
        return False


def parse_sse_stream(response_text: str) -> str:
    """Parse Server-Sent Events stream and extract generated text."""
    generated_text = ""
    error_details = []
    
    for line in response_text.strip().split('\n'):
        if line.startswith('data: '):
            data = line[6:]  # Remove 'data: ' prefix
            
            if data == '[DONE]':
                break
            elif data == '[ERROR]':
                # Capture any error details we've seen
                error_msg = "API returned error during generation"
                if error_details:
                    error_msg += f" (details: {'; '.join(error_details)})"
                raise RuntimeError(error_msg)
            
            try:
                event = json.loads(data)
                if 'token' in event:
                    generated_text += event['token']
                elif 'error' in event:
                    error_details.append(str(event['error']))
                    raise RuntimeError(f"API error: {event['error']}")
                # Capture any other error-related fields
                if 'message' in event and 'error' in str(event.get('message', '')).lower():
                    error_details.append(str(event['message']))
            except json.JSONDecodeError:
                # If we can't parse JSON, save the raw data for debugging
                if 'error' in data.lower() or 'fail' in data.lower():
                    error_details.append(f"Unparsed data: {data[:200]}")
                continue
    
    return generated_text.strip()


def generate_response(server_url: str, prompt: str, timeout: int = TIMEOUT_SECONDS) -> str:
    """Send a prompt to the API and get the generated response."""
    raw_response = None
    try:
        response = requests.post(
            f"{server_url}/chat",
            json={"prompt": prompt},
            timeout=timeout,
            stream=False
        )
        raw_response = response.text
        response.raise_for_status()
        
        # Parse SSE stream
        generated_text = parse_sse_stream(response.text)
        return generated_text
        
    except requests.Timeout:
        raise TimeoutError(f"Request timed out after {timeout} seconds")
    except requests.HTTPError as e:
        # Include response body for HTTP errors
        error_msg = f"HTTP {response.status_code}: {e}"
        if raw_response:
            error_msg += f" | Response: {raw_response[:500]}"
        raise RuntimeError(error_msg)
    except Exception as e:
        # For other errors, include raw response if available
        error_msg = str(e)
        if raw_response and 'API' in str(e):
            error_msg += f" | Raw response preview: {raw_response[:500]}"
        raise RuntimeError(error_msg)


def extract_answer(response: str) -> str:
    """Extract the answer choice (A, B, C, or D) from the model's response.
    
    STRICT matching to avoid false positives from words like 'Compute' or 'evAluate'.
    """
    # Clean the response
    response = response.strip()
    
    # First, try exact patterns that indicate a clear answer
    # Pattern 1: Single letter possibly with punctuation at START of response
    # Examples: "A", "A.", "A)", "A:", "B ", etc.
    match = re.match(r'^([A-Da-d])[\.\)\:\s]?', response)
    if match:
        return match.group(1).upper()
    
    # Pattern 2: "Answer: A" or "The answer is B" etc.
    match = re.search(r'(?:answer|choice)(?:\s+is)?[\s:]+([A-Da-d])(?:[^a-zA-Z]|$)', response, re.IGNORECASE)
    if match:
        return match.group(1).upper()
    
    # Pattern 3: Single letter answer on its own line
    for line in response.split('\n'):
        line = line.strip()
        if re.match(r'^([A-Da-d])[\.\)\:]?$', line):
            return line[0].upper()
    
    # Pattern 4: Response is ONLY a single letter (after stripping whitespace)
    if len(response) == 1 and response.upper() in ['A', 'B', 'C', 'D']:
        return response.upper()
    
    # If none of the strict patterns match, return UNKNOWN
    # Do NOT try to find letters within words!
    return "UNKNOWN"


def format_mmlu_prompt(question: str, choices: List[str]) -> str:
    """Format a MMLU question with multiple choice options."""
    prompt = f"""Answer the following multiple choice question. Respond with only the letter (A, B, C, or D) of the correct answer.

Question: {question}

A) {choices[0]}
B) {choices[1]}
C) {choices[2]}
D) {choices[3]}

Answer:"""
    return prompt


def save_checkpoint(results_dict: Dict[int, Dict], checkpoint_file: Path):
    """Save checkpoint with question indices as keys."""
    checkpoint_data = {
        'results': results_dict,  # Dict with int keys (JSON converts to strings)
        'timestamp': time.time(),
        'num_processed': len(results_dict)
    }
    
    # Atomic save to prevent corruption
    temp_file = checkpoint_file.with_suffix('.tmp')
    with open(temp_file, 'w') as f:
        json.dump(checkpoint_data, f, indent=2)
    temp_file.replace(checkpoint_file)  # Atomic rename


def load_checkpoint(checkpoint_file: Path) -> Dict[int, Dict]:
    """Load checkpoint and return results_dict with int keys."""
    if not checkpoint_file.exists():
        return {}
    
    try:
        with open(checkpoint_file, 'r') as f:
            checkpoint_data = json.load(f)
        
        results_raw = checkpoint_data.get('results', {})
        
        # Convert JSON string keys back to int
        results_dict = {int(k): v for k, v in results_raw.items()}
        
        return results_dict
    except Exception as e:
        print(f"Warning: Failed to load checkpoint: {e}")
        return {}


def process_single_question(server_url: str, item: Dict, idx: int, timeout: int) -> Dict:
    """Process a single question on a specific server. Returns dict with 'error' key on failure."""
    try:
        reset_conversation(server_url)
        configure_generation_settings(server_url, GENERATION_SETTINGS)
        
        prompt = format_mmlu_prompt(item['question'], item['choices'])
        
        start_time = time.time()
        response = generate_response(server_url, prompt, timeout)
        elapsed_time = time.time() - start_time
        
        predicted_answer = extract_answer(response)
        correct_answer = chr(65 + item['answer'])
        
        return {
            'subject': item['subject'],
            'question': item['question'],
            'choices': item['choices'],
            'correct_answer': correct_answer,
            'predicted_answer': predicted_answer,
            'full_response': response,
            'is_correct': predicted_answer == correct_answer,
            'time_seconds': elapsed_time
        }
    except Exception as e:
        # Return error info to trigger retry on different server
        return {
            'error': str(e),
            'error_type': type(e).__name__,
            'question': item['question'],
            'subject': item['subject'],
            'choices': item.get('choices', [])
        }


print("Helper functions loaded successfully!")

Helper functions loaded successfully!


## 4. Verify API Connection

In [4]:
# Check API health for all servers
print("Checking API servers...")
available_servers = []

for server_url in API_SERVER_URLS:
    try:
        health = check_api_health(server_url)
        print(f"✓ {server_url}: {health['status']} - {health['model']} ({health['format']})")
        available_servers.append(server_url)
        
        # Configure generation settings for each server
        if configure_generation_settings(server_url, GENERATION_SETTINGS):
            print(f"  Settings configured: temp={GENERATION_SETTINGS['temperature']}, max_tokens={GENERATION_SETTINGS['max_tokens']}")
    except Exception as e:
        print(f"✗ {server_url}: UNAVAILABLE - {e}")

if not available_servers:
    raise RuntimeError("No servers available! Check your server URLs and ensure servers are running.")

# Update global list to only available servers
API_SERVER_URLS = available_servers
print(f"\n{len(API_SERVER_URLS)} server(s) ready for benchmark")

Checking API servers...
✓ http://192.168.132.210:8080: healthy - LLaMA_3_8B_Instruct (INT4)
  Settings configured: temp=0.0, max_tokens=3

1 server(s) ready for benchmark


## 5. Load MMLU Dataset

In [5]:
print("Loading MMLU dataset from Hugging Face...")
print("This may take a few minutes on first run...")

# Load the MMLU dataset (using the 'test' split)
# The dataset is organized by subjects
dataset = load_dataset("cais/mmlu", "all", split="test")

print(f"\nDataset loaded successfully!")
print(f"Total questions: {len(dataset)}")

# Get all unique subjects
all_subjects = sorted(set(dataset['subject']))
print(f"Number of subjects: {len(all_subjects)}")
print(f"\nSubjects: {', '.join(all_subjects[:10])}..." if len(all_subjects) > 10 else f"\nSubjects: {', '.join(all_subjects)}")

Loading MMLU dataset from Hugging Face...
This may take a few minutes on first run...

Dataset loaded successfully!
Total questions: 14042
Number of subjects: 57

Subjects: abstract_algebra, anatomy, astronomy, business_ethics, clinical_knowledge, college_biology, college_chemistry, college_computer_science, college_mathematics, college_medicine...


## 6. Prepare Test Data

In [6]:
# Filter subjects if specified
subjects_to_test = SELECTED_SUBJECTS if SELECTED_SUBJECTS else all_subjects

# Prepare test questions
test_questions = []

for subject in subjects_to_test:
    # Filter dataset by subject
    subject_data = [item for item in dataset if item['subject'] == subject]
    
    # Limit number of questions if specified
    if NUM_QUESTIONS_PER_SUBJECT:
        subject_data = subject_data[:NUM_QUESTIONS_PER_SUBJECT]
    
    test_questions.extend(subject_data)

print(f"Prepared {len(test_questions)} test questions across {len(subjects_to_test)} subjects")
print(f"\nExample question:")
example = test_questions[0]
print(f"Subject: {example['subject']}")
print(f"Question: {example['question']}")
print(f"Choices: {example['choices']}")
print(f"Correct Answer: {chr(65 + example['answer'])}")

Prepared 14042 test questions across 57 subjects

Example question:
Subject: abstract_algebra
Question: Find the degree for the given field extension Q(sqrt(2), sqrt(3), sqrt(18)) over Q.
Choices: ['0', '4', '2', '6']
Correct Answer: B


## 7. Load Checkpoint (Optional - Skip API Benchmark if Complete)

In [7]:
import datetime

# Initialize results dictionary
results_dict = {}
checkpoint_file = None
run_timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

# Try to load existing checkpoint
if ENABLE_CHECKPOINTS:
    checkpoint_files = sorted(RESULTS_DIR.glob("checkpoint_*.json"), reverse=True)
    
    # Also look for completed checkpoints
    completed_files = sorted(RESULTS_DIR.glob("completed_checkpoint_*.json"), reverse=True)
    
    all_checkpoints = checkpoint_files + completed_files
    
    if all_checkpoints:
        # Sort by modification time to get the most recent
        all_checkpoints.sort(key=lambda x: x.stat().st_mtime, reverse=True)
        latest_checkpoint = all_checkpoints[0]
        
        results_dict = load_checkpoint(latest_checkpoint)
        checkpoint_file = latest_checkpoint
        
        print(f"✓ Loaded checkpoint: {latest_checkpoint.name}")
        print(f"  Completed questions: {len(results_dict)}")
        print(f"  Checkpoint created: {datetime.datetime.fromtimestamp(latest_checkpoint.stat().st_mtime).strftime('%Y-%m-%d %H:%M:%S')}")
        
        # Calculate some quick stats
        if results_dict:
            correct_count = sum(1 for r in results_dict.values() if r.get('is_correct', False))
            accuracy = (correct_count / len(results_dict)) * 100
            print(f"  Current accuracy: {accuracy:.2f}% ({correct_count}/{len(results_dict)})")
            
            # Check completion status
            remaining = len(test_questions) - len(results_dict)
            if remaining == 0:
                print(f"\n✓ Benchmark COMPLETE! All {len(test_questions)} questions processed.")
                print(f"  You can skip the next cell (Run Benchmark) and go directly to results analysis.")
            else:
                print(f"\n⚠ Benchmark INCOMPLETE: {remaining} questions remaining")
                print(f"  Run the next cell to continue from where you left off.")
        
    else:
        print("No existing checkpoint found. Starting fresh.")
        checkpoint_file = RESULTS_DIR / f"checkpoint_{run_timestamp}.json"
else:
    print("Checkpoints disabled. Starting fresh.")
    checkpoint_file = RESULTS_DIR / f"checkpoint_{run_timestamp}.json"

print(f"\nResults will be saved to: {checkpoint_file}")
print("=" * 80)

✓ Loaded checkpoint: checkpoint_20260101_224825.json
  Completed questions: 14030
  Checkpoint created: 2026-01-05 17:10:09
  Current accuracy: 46.82% (6569/14030)

⚠ Benchmark INCOMPLETE: 12 questions remaining
  Run the next cell to continue from where you left off.

Results will be saved to: results\checkpoint_20260101_224825.json


## 8. Run Benchmark (Skip if checkpoint is complete)

In [None]:
import concurrent.futures
import threading
import queue
import random

# Check if benchmark is already complete
remaining_questions = len(test_questions) - len(results_dict)

if remaining_questions == 0:
    print("=" * 80)
    print("✓ All questions already processed!")
    print(f"  Total: {len(results_dict)} questions")
    print(f"  You can skip this cell and proceed directly to results analysis.")
    print("=" * 80)
else:
    print(f"Starting benchmark for {remaining_questions} remaining questions...")
    print(f"  Already completed: {len(results_dict)}")
    print(f"  Total questions: {len(test_questions)}")
    print(f"  Servers: {len(API_SERVER_URLS)}")
    print("=" * 80)
    print()
    
    # Calculate subject_stats from loaded results
    subject_stats = {}
    for result in results_dict.values():
        subject = result['subject']
        if subject not in subject_stats:
            subject_stats[subject] = {'correct': 0, 'total': 0, 'time': 0}
        subject_stats[subject]['total'] += 1
        subject_stats[subject]['time'] += result['time_seconds']
        if result['is_correct']:
            subject_stats[subject]['correct'] += 1
    
    # Initialize shared state
    state_lock = threading.Lock()
    question_queue = queue.Queue()
    progress_queue = queue.Queue()
    
    # Fill queue with unprocessed questions only
    for idx in range(len(test_questions)):
        if idx not in results_dict:
            question_queue.put(idx)
    
    # Worker function
    def worker(server_url, worker_id):
        retry_delay = 5
        
        while True:
            try:
                idx = question_queue.get(timeout=1)
            except queue.Empty:
                return
            
            with state_lock:
                if idx in results_dict:
                    continue
            
            result = process_single_question(
                server_url,
                test_questions[idx],
                idx,
                TIMEOUT_SECONDS
            )
            
            if 'error' not in result:
                with state_lock:
                    results_dict[idx] = result
                    
                    subject = result['subject']
                    if subject not in subject_stats:
                        subject_stats[subject] = {'correct': 0, 'total': 0, 'time': 0}
                    subject_stats[subject]['total'] += 1
                    subject_stats[subject]['time'] += result['time_seconds']
                    if result['is_correct']:
                        subject_stats[subject]['correct'] += 1
                    
                    if ENABLE_CHECKPOINTS and len(results_dict) % CHECKPOINT_INTERVAL == 0:
                        save_checkpoint(results_dict, checkpoint_file)
                        time.sleep(1) 
                
                progress_queue.put(('success', idx, result, worker_id))
                retry_delay = 5
            else:
                question_queue.put(idx)
                progress_queue.put(('retry', idx, server_url, worker_id, result))
                time.sleep(min(retry_delay, 60) + random.uniform(0, 2))
                retry_delay *= 1.5
    
    # Progress display
    def display_progress():
        processed_count = len(results_dict)
        correct_count = sum(1 for r in results_dict.values() if r['is_correct'])
        
        with tqdm(total=len(test_questions), initial=processed_count, desc="MMLU Benchmark") as pbar:
            while True:
                try:
                    event = progress_queue.get(timeout=0.1)
                    event_type = event[0]
                    
                    if event_type == 'success':
                        _, idx, result, worker_id = event
                        pbar.update(1)
                        processed_count += 1
                        if result['is_correct']:
                            correct_count += 1
                        
                        accuracy = (correct_count / processed_count) * 100
                        status = "✓" if result['is_correct'] else "✗"
                        
                        print(f"\n[Q{idx+1}] {status} {result['subject']} (Server {worker_id+1})")
                        print(f"  Answer: {result['predicted_answer']} | "
                              f"Correct: {result['correct_answer']} | "
                              f"Time: {result['time_seconds']:.1f}s | "
                              f"Accuracy: {accuracy:.1f}%")
                    
                    elif event_type == 'retry':
                        _, idx, server_url, worker_id, error_result = event
                        
                        print(f"\n[Q{idx+1}] ⚠ Retrying on different server (failed on Server {worker_id+1})")
                        print(f"  Subject: {error_result.get('subject', 'Unknown')}")
                        print(f"  Question: {error_result.get('question', 'Unknown')}")
                        
                        choices = error_result.get('choices', [])
                        if choices:
                            print(f"  Choices:")
                            for i, choice in enumerate(choices):
                                print(f"    {chr(65+i)}) {choice}")
                        
                        print(f"  Error Type: {error_result.get('error_type', 'Unknown')}")
                        print(f"  Error Details: {error_result.get('error', 'Unknown error')}")
                    
                    elif event_type == 'done':
                        break
                
                except queue.Empty:
                    continue
    
    # Start progress thread
    progress_thread = threading.Thread(target=display_progress, daemon=True)
    progress_thread.start()
    
    # Launch workers
    print(f"Starting {len(API_SERVER_URLS)} worker thread(s)...\n")
    with concurrent.futures.ThreadPoolExecutor(max_workers=len(API_SERVER_URLS)) as executor:
        futures = [
            executor.submit(worker, server_url, i)
            for i, server_url in enumerate(API_SERVER_URLS)
        ]
        concurrent.futures.wait(futures)
    
    # Stop progress thread
    progress_queue.put(('done',))
    progress_thread.join(timeout=1)
    
    # Final checkpoint save
    if ENABLE_CHECKPOINTS:
        save_checkpoint(results_dict, checkpoint_file)
    
    # Calculate final stats
    total_questions_completed = len(results_dict)
    correct_count = sum(1 for r in results_dict.values() if r['is_correct'])
    final_accuracy = (correct_count / total_questions_completed) * 100 if total_questions_completed > 0 else 0
    
    print("\n" + "="*80)
    print("Benchmark completed!")
    print(f"Total: {total_questions_completed} | Correct: {correct_count} | Accuracy: {final_accuracy:.2f}%")
    print("="*80)

Starting benchmark for 8 remaining questions...
  Already completed: 14034
  Total questions: 14042
  Servers: 1

Starting 1 worker thread(s)...



MMLU Benchmark: 100%|█████████▉| 14034/14042 [00:00<?, ?it/s]


[Q2272] ⚠ Retrying on different server (failed on Server 1)
  Subject: elementary_mathematics
  Question: Ty has 12 weeks to read a certain number of books. He will spend 2 weeks reading each book. Which expression can Ty use to find the total number of books he will read?
  Choices:
    A) 12 – 2
    B) 12 + 12
    C) 12 – 2 – 2 – 2
    D) 12 – 2 – 2 – 2 – 2 – 2 – 2
  Error Type: RuntimeError
  Error Details: API returned error during generation (details: Unparsed data: data: {"error":"[json.exception.type_error.316] incomplete UTF-8 string; last byte: 0xC3"}) | Raw response preview: data: {"position":0,"token":"12"}

data: data: {"error":"[json.exception.type_error.316] incomplete UTF-8 string; last byte: 0xC3"}

data: [ERROR]



[Q2289] ⚠ Retrying on different server (failed on Server 1)
  Subject: elementary_mathematics
  Question: Malik has 240 CDs. He sorted them into 12 equal groups. Which expression shows how to find the number of CDs in each group?
  Choices:
    A) 240 ÷ 12


MMLU Benchmark: 100%|█████████▉| 14035/14042 [01:25<09:58, 85.53s/it]


[Q14038] ✓ world_religions (Server 1)
  Answer: A | Correct: A | Time: 11.0s | Accuracy: 46.8%


MMLU Benchmark: 100%|█████████▉| 14036/14042 [01:38<04:18, 43.07s/it]


[Q14039] ✗ world_religions (Server 1)
  Answer: UNKNOWN | Correct: C | Time: 11.8s | Accuracy: 46.8%


MMLU Benchmark: 100%|█████████▉| 14037/14042 [01:51<02:26, 29.27s/it]


[Q14040] ✗ world_religions (Server 1)
  Answer: UNKNOWN | Correct: B | Time: 11.3s | Accuracy: 46.8%


MMLU Benchmark: 100%|█████████▉| 14038/14042 [02:04<01:31, 22.82s/it]


[Q14041] ✗ world_religions (Server 1)
  Answer: A | Correct: B | Time: 11.4s | Accuracy: 46.8%


MMLU Benchmark: 100%|█████████▉| 14039/14042 [02:18<00:58, 19.50s/it]


[Q14042] ✓ world_religions (Server 1)
  Answer: A | Correct: A | Time: 12.0s | Accuracy: 46.8%

[Q2272] ⚠ Retrying on different server (failed on Server 1)
  Subject: elementary_mathematics
  Question: Ty has 12 weeks to read a certain number of books. He will spend 2 weeks reading each book. Which expression can Ty use to find the total number of books he will read?
  Choices:
    A) 12 – 2
    B) 12 + 12
    C) 12 – 2 – 2 – 2
    D) 12 – 2 – 2 – 2 – 2 – 2 – 2
  Error Type: RuntimeError
  Error Details: API returned error during generation (details: Unparsed data: data: {"error":"[json.exception.type_error.316] incomplete UTF-8 string; last byte: 0xC3"}) | Raw response preview: data: {"position":0,"token":"12"}

data: data: {"error":"[json.exception.type_error.316] incomplete UTF-8 string; last byte: 0xC3"}

data: [ERROR]



[Q2289] ⚠ Retrying on different server (failed on Server 1)
  Subject: elementary_mathematics
  Question: Malik has 240 CDs. He sorted them into 12 equal groups.

## 9. Calculate and Display Results

In [8]:
# Calculate subject_stats from results_dict (works whether data came from checkpoint or API)
subject_stats = {}
for result in results_dict.values():
    subject = result['subject']
    if subject not in subject_stats:
        subject_stats[subject] = {'correct': 0, 'total': 0, 'time': 0}
    subject_stats[subject]['total'] += 1
    subject_stats[subject]['time'] += result['time_seconds']
    if result['is_correct']:
        subject_stats[subject]['correct'] += 1

# Convert results_dict to list for analysis
results = list(results_dict.values())

# Overall statistics
total_questions = len(results)
correct_answers = sum(1 for r in results if r['is_correct'])
overall_accuracy = (correct_answers / total_questions) * 100 if total_questions > 0 else 0
total_time = sum(r['time_seconds'] for r in results)
avg_time = total_time / total_questions if total_questions > 0 else 0

print("\n" + "="*80)
print("OVERALL RESULTS")
print("="*80)
print(f"Total Questions: {total_questions}")
print(f"Correct Answers: {correct_answers}")
print(f"Incorrect Answers: {total_questions - correct_answers}")
print(f"Overall Accuracy: {overall_accuracy:.2f}%")
print(f"Total Time: {total_time:.2f} seconds")
print(f"Average Time per Question: {avg_time:.2f} seconds")
print("="*80)

# Subject-wise results
print("\n" + "="*80)
print("SUBJECT-WISE RESULTS")
print("="*80)

subject_results = []
for subject, stats in sorted(subject_stats.items()):
    accuracy = (stats['correct'] / stats['total']) * 100 if stats['total'] > 0 else 0
    avg_time_subject = stats['time'] / stats['total'] if stats['total'] > 0 else 0
    subject_results.append({
        'Subject': subject,
        'Correct': stats['correct'],
        'Total': stats['total'],
        'Accuracy (%)': f"{accuracy:.2f}",
        'Avg Time (s)': f"{avg_time_subject:.2f}"
    })

df_subjects = pd.DataFrame(subject_results)
print(df_subjects.to_string(index=False))
print("="*80)


OVERALL RESULTS
Total Questions: 14030
Correct Answers: 6569
Incorrect Answers: 7461
Overall Accuracy: 46.82%
Total Time: 231800.30 seconds
Average Time per Question: 16.52 seconds

SUBJECT-WISE RESULTS
                            Subject  Correct  Total Accuracy (%) Avg Time (s)
                   abstract_algebra       22    100        22.00        14.34
                            anatomy       66    135        48.89        14.08
                          astronomy       88    152        57.89        16.12
                    business_ethics       54    100        54.00        17.23
                 clinical_knowledge      144    265        54.34        15.02
                    college_biology       74    144        51.39        25.31
                  college_chemistry       33    100        33.00        23.03
           college_computer_science       38    100        38.00        21.19
                college_mathematics       27    100        27.00        13.98
                

## 10. Detailed Error Analysis

In [9]:
# Show some incorrect answers for analysis
# Convert results_dict to list if not already done
if isinstance(results_dict, dict):
    results = list(results_dict.values())

incorrect_results = [r for r in results if not r['is_correct']]

print("\n" + "="*80)
print(f"SAMPLE INCORRECT ANSWERS (showing up to 5 of {len(incorrect_results)})")
print("="*80)

for i, result in enumerate(incorrect_results[:5]):
    print(f"\n[{i+1}] Subject: {result['subject']}")
    print(f"Question: {result['question']}")
    print(f"Choices:")
    for idx, choice in enumerate(result['choices']):
        print(f"  {chr(65+idx)}) {choice}")
    print(f"Correct Answer: {result['correct_answer']}")
    print(f"Model Answer: {result['predicted_answer']}")
    print(f"Full Response: {result['full_response'][:200]}..." if len(result['full_response']) > 200 else f"Full Response: {result['full_response']}")
    print("-" * 80)


SAMPLE INCORRECT ANSWERS (showing up to 5 of 7461)

[1] Subject: abstract_algebra
Question: Find the degree for the given field extension Q(sqrt(2), sqrt(3), sqrt(18)) over Q.
Choices:
  A) 0
  B) 4
  C) 2
  D) 6
Correct Answer: B
Model Answer: A
Full Response: A)
--------------------------------------------------------------------------------

[2] Subject: abstract_algebra
Question: Let p = (1, 2, 5, 4)(2, 3) in S_5 . Find the index of <p> in S_5.
Choices:
  A) 8
  B) 2
  C) 24
  D) 120
Correct Answer: C
Model Answer: UNKNOWN
Full Response: 8

Explanation
--------------------------------------------------------------------------------

[3] Subject: abstract_algebra
Question: Find all zeros in the indicated finite field of the given polynomial with coefficients in that field. x^5 + 3x^3 + x^2 + 2x in Z_5
Choices:
  A) 0
  B) 1
  C) 0,1
  D) 0,4
Correct Answer: D
Model Answer: UNKNOWN
Full Response: 0,4
--------------------------------------------------------------------------------

[

## 11. Save Results to File

In [10]:
# Convert results_dict to list if not already done
if isinstance(results_dict, dict):
    results = list(results_dict.values())

# Create results dataframe
df_results = pd.DataFrame(results)

# Use the same timestamp from the benchmark run
final_timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
results_file = RESULTS_DIR / f"mmlu_results_{final_timestamp}.csv"
summary_file = RESULTS_DIR / f"mmlu_summary_{final_timestamp}.txt"

# Save detailed results
df_results.to_csv(results_file, index=False)
print(f"Detailed results saved to: {results_file}")

# Save summary
with open(summary_file, 'w') as f:
    f.write("MMLU Benchmark Summary\n")
    f.write("=" * 80 + "\n\n")
    f.write(f"Date: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
    f.write(f"API Servers: {', '.join(API_SERVER_URLS)}\n")
    f.write(f"Checkpoint: {checkpoint_file.name if checkpoint_file else 'N/A'}\n\n")
    
    f.write("Configuration:\n")
    f.write("-" * 80 + "\n")
    f.write(f"Questions per subject: {NUM_QUESTIONS_PER_SUBJECT if NUM_QUESTIONS_PER_SUBJECT else 'ALL'}\n")
    f.write(f"Timeout: {TIMEOUT_SECONDS}s\n")
    f.write(f"Temperature: {GENERATION_SETTINGS.get('temperature', 'N/A')}\n")
    f.write(f"Max Tokens: {GENERATION_SETTINGS.get('max_tokens', 'N/A')}\n\n")
    
    f.write("Overall Results:\n")
    f.write("-" * 80 + "\n")
    f.write(f"Total Questions: {total_questions}\n")
    f.write(f"Correct Answers: {correct_answers}\n")
    f.write(f"Accuracy: {overall_accuracy:.2f}%\n")
    f.write(f"Average Time: {avg_time:.2f} seconds\n\n")
    
    f.write("Subject-wise Results:\n")
    f.write("-" * 80 + "\n")
    f.write(df_subjects.to_string(index=False))
    f.write("\n")

print(f"Summary saved to: {summary_file}")

# Clean up checkpoint file if benchmark completed successfully
if ENABLE_CHECKPOINTS and checkpoint_file and checkpoint_file.exists():
    # Only rename if it's not already a completed checkpoint
    if not checkpoint_file.name.startswith("completed_"):
        completed_checkpoint = RESULTS_DIR / f"completed_{checkpoint_file.name}"
        checkpoint_file.rename(completed_checkpoint)
        print(f"Checkpoint marked as completed: {completed_checkpoint.name}")

print("\nAll results saved to the 'results' folder.")
print(f"Results directory: {RESULTS_DIR.absolute()}")

Detailed results saved to: results\mmlu_results_20260105_185339.csv
Summary saved to: results\mmlu_summary_20260105_185339.txt
Checkpoint marked as completed: completed_checkpoint_20260101_224825.json

All results saved to the 'results' folder.
Results directory: c:\Users\user\NextCloud\nextcloud.nicojoerger.de\Documents\Schulen\Studium\Abschlussarbeit\Git\Embedded-CPU-LLM\Code\Notebooks\results


## 12. Visualize Results