## Setup and Imports

In [1]:
!pip install -q datasets

In [2]:
import re

import pandas as pd
import torch
from datasets import load_dataset
from tqdm.notebook import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM

## Model Loading

In [3]:
def load_model_and_tokenizer(model_name):
    """Load a causal language model and its tokenizer for batched GPU generation.

    Args:
        model_name: Model identifier passed to ``from_pretrained`` for both the
            tokenizer and the model.

    Returns:
        A tuple ``(model, tokenizer, device)`` where ``model`` is in eval mode,
        ``tokenizer`` pads on the left and is guaranteed to have a pad token,
        and ``device`` is ``torch.device("cuda:0")``.

    Raises:
        RuntimeError: If CUDA is not available.
    """
    if not torch.cuda.is_available():
        raise RuntimeError("No GPU available")

    n_gpus = torch.cuda.device_count()
    print(f"Found {n_gpus} GPUs")

    # Left padding keeps each prompt adjacent to the tokens generated after it
    # when several prompts of different lengths are batched together.
    tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")

    # Batched tokenization with padding=True needs a pad token. Some checkpoints
    # ship without one, so reuse EOS rather than failing at evaluation time.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,  # Half precision (settings chosen for T4 GPUs)
        device_map="auto",          # Let the library place layers across the visible GPUs
        max_memory={i: "12GiB" for i in range(n_gpus)},  # Per-GPU cap, leaving headroom on a 16GB card
    )
    model.eval()

    # Inputs are sent to the first GPU.
    device = torch.device("cuda:0")

    return model, tokenizer, device

## Data Loading

In [16]:
def load_mmlu_data(subjects=None, languages=('ru', 'en')):
    """Load the MMLU test split for the given subjects and languages.

    Args:
        subjects: Iterable of MMLU subject names (a single name may be given as
            a string). ``None`` selects the full built-in subject list.
        languages: Iterable of language codes (a single code may be given as a
            string). ``'ru'`` loads ``NLPCoreTeam/mmlu_ru``; any other value
            loads the English ``cais/mmlu`` dataset.

    Returns:
        A single DataFrame with the rows of every successfully loaded
        subject/language pair, plus ``subject`` and ``language`` columns.
        Pairs that fail to load are reported with ``print`` and skipped.

    Raises:
        ValueError: If no subject/language pair could be loaded at all.
    """
    if subjects is None:
        subjects = [
            'abstract_algebra', 'anatomy', 'astronomy', 'business_ethics',
            'clinical_knowledge', 'college_biology', 'college_chemistry',
            'college_computer_science', 'college_mathematics', 'college_medicine',
            'college_physics', 'computer_security', 'conceptual_physics',
            'econometrics', 'electrical_engineering', 'elementary_mathematics',
            'formal_logic', 'global_facts', 'high_school_biology',
            'high_school_chemistry', 'high_school_computer_science',
            'high_school_european_history', 'high_school_geography',
            'high_school_government_and_politics', 'high_school_macroeconomics',
            'high_school_mathematics', 'high_school_microeconomics',
            'high_school_physics', 'high_school_psychology',
            'high_school_statistics', 'high_school_us_history',
            'high_school_world_history', 'human_aging', 'human_sexuality',
            'international_law', 'jurisprudence', 'logical_fallacies',
            'machine_learning', 'management', 'marketing', 'medical_genetics',
            'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition',
            'philosophy', 'prehistory', 'professional_accounting',
            'professional_law', 'professional_medicine',
            'professional_psychology', 'public_relations', 'security_studies',
            'sociology', 'us_foreign_policy', 'virology', 'world_religions',
        ]
    elif isinstance(subjects, str):
        # A bare string would otherwise be iterated character by character.
        subjects = [subjects]

    if isinstance(languages, str):
        languages = [languages]

    dfs = []
    for subject in subjects:
        for lang in languages:
            try:
                if lang == 'ru':
                    dataset = load_dataset("NLPCoreTeam/mmlu_ru", subject, split="test")
                else:
                    dataset = load_dataset("cais/mmlu", subject, split="test")
                df = dataset.to_pandas()
                df['subject'] = subject
                df['language'] = lang
                dfs.append(df)
            except Exception as e:
                # Best effort: report the failure and keep loading the remaining pairs.
                print(f"Error loading {subject} in {lang}: {e}")

    if not dfs:
        # pd.concat([]) raises an opaque "No objects to concatenate"; explain instead.
        raise ValueError(
            f"No MMLU data could be loaded for subjects={list(subjects)!r}, "
            f"languages={list(languages)!r}"
        )

    return pd.concat(dfs, ignore_index=True)

## Evaluation Functions

In [17]:
def format_prompt(row):
    """Render one question row as a multiple-choice prompt in the row's language.

    Rows whose ``language`` is ``'ru'`` get a Russian prompt built from
    ``question_ru`` / ``choices_ru`` when those keys exist (otherwise from
    ``question`` / ``choices``); every other row gets an English prompt built
    from ``question`` / ``choices``. Options are labelled A, B, C, ... in order.
    """
    if row['language'] == 'ru':
        question = row['question_ru'] if 'question_ru' in row else row['question']
        options = row['choices_ru'] if 'choices_ru' in row else row['choices']
        intro = "Ответьте на вопрос, выбрав правильный вариант (A, B, C или D)."
        question_line = f"Вопрос: {question}"
        options_title = "Варианты ответа:"
        closing = "\nОтвет (укажите только букву A, B, C или D):"
    else:
        question = row['question']
        options = row['choices']
        intro = "Answer the question by selecting the correct option (A, B, C, or D)."
        question_line = f"Question: {question}"
        options_title = "Options:"
        closing = "\nAnswer (provide only the letter A, B, C, or D):"

    # One "<letter>. <text>" line per option, lettered from 'A' upwards.
    option_lines = [
        f"{chr(ord('A') + position)}. {option}\n"
        for position, option in enumerate(options)
    ]

    return f"{intro}\n\n{question_line}\n\n{options_title}\n" + "".join(option_lines) + closing

def _extract_answer_letter(text):
    """Return the answer letter (A-D) stated in a model response, or 'X' if none is found."""
    # Prefer a standalone capital letter, so letters inside ordinary words
    # (e.g. the "a" in "answer") are not mistaken for a choice.
    match = re.search(r'\b([A-D])\b', text)
    if match is None:
        # Tolerate a standalone lowercase letter such as a bare "b".
        match = re.search(r'\b([a-d])\b', text)
    return match.group(1).upper() if match else 'X'


def evaluate_model(model, tokenizer, df, device, debug_samples=-1, batch_size=4):
    """Run greedy generation over every row of ``df`` and score the predicted letter.

    Args:
        model: Causal LM exposing ``generate``.
        tokenizer: Tokenizer matching ``model``. It should pad on the left so
            that all prompts in a batch end at the same position.
        df: DataFrame with the columns used by ``format_prompt`` plus
            ``answer``, ``subject`` and ``language``.
        device: Device the tokenized batch is moved to.
        debug_samples: Number of leading samples (counted over the whole
            dataset) for which debug details are printed. Non-positive values
            disable debug output.
        batch_size: Number of prompts generated together.

    Returns:
        DataFrame with one row per evaluated sample and the columns
        ``subject``, ``language``, ``question``, ``correct_answer``,
        ``predicted_answer``, ``full_response`` and ``correct``. Batches whose
        generation raised are reported with ``print`` and are absent from the
        result.
    """
    results = []

    for start in tqdm(range(0, len(df), batch_size)):
        batch_df = df.iloc[start:start + batch_size]
        rows = [row for _, row in batch_df.iterrows()]
        prompts = [format_prompt(row) for row in rows]

        inputs = tokenizer(prompts,
                           return_tensors="pt",
                           padding=True,
                           truncation=True).to(device)

        with torch.no_grad():
            try:
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=10,
                    do_sample=False,
                    pad_token_id=tokenizer.pad_token_id,
                    eos_token_id=tokenizer.eos_token_id,
                    use_cache=True
                )
            except Exception as e:
                print("Error during model generation:", e)
                continue

        # generate() returns prompt tokens followed by the new tokens. Slice on
        # token positions rather than on the decoded string: decoding does not
        # always reproduce the prompt text exactly (truncation, whitespace
        # cleanup), which would shift a character-based cut.
        prompt_len = inputs["input_ids"].shape[1]

        for offset, row in enumerate(rows):
            generated = tokenizer.decode(
                outputs[offset][prompt_len:], skip_special_tokens=True
            ).strip()

            # Dataset answers may be stored as an option index; normalise to a letter.
            correct_answer = row['answer']
            if str(correct_answer).isdigit():
                correct_answer = chr(65 + int(correct_answer))
            correct_answer = str(correct_answer).upper()

            pred = _extract_answer_letter(generated)

            # Pick fields by the row's language (as format_prompt does), so English
            # rows in a mixed frame do not pick up the empty *_ru columns.
            is_ru = row['language'] == 'ru'
            question = row['question_ru'] if is_ru and 'question_ru' in row else row['question']

            # Position within the whole dataset, so that debug output covers only
            # the first `debug_samples` samples instead of repeating every batch.
            sample_no = start + offset
            if sample_no < debug_samples:
                choices = row['choices_ru'] if is_ru and 'choices_ru' in row else row['choices']
                print(f"\nDebug Sample {sample_no + 1} ({row['language']}):")
                print(f"Question: {question}")
                print(f"Full Response: {generated}")
                print(f"Extracted Prediction: {pred}")
                print(f"Correct Answer: {correct_answer}")
                print(f"Choices: {choices}")

            results.append({
                'subject': row['subject'],
                'language': row['language'],
                'question': question,
                'correct_answer': correct_answer,
                'predicted_answer': pred,
                'full_response': generated,
                'correct': pred == correct_answer
            })

    return pd.DataFrame(results)


## Running Evaluation

In [6]:
# Run each numbered step below in its own cell.

# 1. Load model
# Model identifier to evaluate; another checkpoint can be substituted here,
# e.g. "Qwen/Qwen2.5-1.5B-Instruct".
model_name = "Qwen/Qwen2.5-3B-Instruct"
# Raises RuntimeError when no GPU is available.
model, tokenizer, device = load_model_and_tokenizer(model_name)

Found 2 GPUs


tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/661 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.6k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

In [18]:
# 2. Load data
# English and Russian test splits are loaded into separate frames,
# each covering the default subject list.
eval_df = load_mmlu_data(languages=['en'])
eval_df_ru = load_mmlu_data(languages=['ru'])

test-00000-of-00001.parquet:   0%|          | 0.00/20.1k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/5.28k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/3.50k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/135 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/14 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

test-00000-of-00001.parquet:   0%|          | 0.00/28.3k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/6.05k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/4.94k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/152 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/16 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

test-00000-of-00001.parquet:   0%|          | 0.00/21.6k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/5.09k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/4.96k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

test-00000-of-00001.parquet:   0%|          | 0.00/40.5k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/7.48k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/3.67k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/265 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/29 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

test-00000-of-00001.parquet:   0%|          | 0.00/31.8k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/6.90k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/4.27k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/144 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/16 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

test-00000-of-00001.parquet:   0%|          | 0.00/17.9k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/4.87k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/4.04k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/8 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

test-00000-of-00001.parquet:   0%|          | 0.00/28.1k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/6.25k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/6.81k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

test-00000-of-00001.parquet:   0%|          | 0.00/42.5k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/8.99k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/4.84k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/173 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/22 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

test-00000-of-00001.parquet:   0%|          | 0.00/19.1k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/6.67k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/4.33k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

test-00000-of-00001.parquet:   0%|          | 0.00/25.0k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/5.98k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/3.96k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/235 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/26 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

test-00000-of-00001.parquet:   0%|          | 0.00/24.5k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/7.02k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/4.54k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/114 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/12 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

test-00000-of-00001.parquet:   0%|          | 0.00/17.6k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/5.08k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/4.08k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/145 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/16 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

test-00000-of-00001.parquet:   0%|          | 0.00/41.1k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/9.38k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/4.55k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/378 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/41 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

test-00000-of-00001.parquet:   0%|          | 0.00/21.5k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/6.56k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/4.81k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/126 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/14 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

test-00000-of-00001.parquet:   0%|          | 0.00/11.5k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/3.58k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

test-00000-of-00001.parquet:   0%|          | 0.00/62.7k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/4.94k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/310 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/32 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

test-00000-of-00001.parquet:   0%|          | 0.00/33.3k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/8.31k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/4.16k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/203 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/22 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

test-00000-of-00001.parquet:   0%|          | 0.00/27.3k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/5.28k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/6.54k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/9 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

test-00000-of-00001.parquet:   0%|          | 0.00/142k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/31.6k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/22.2k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/165 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/18 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

test-00000-of-00001.parquet:   0%|          | 0.00/28.2k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/6.16k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/3.93k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/198 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/22 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

test-00000-of-00001.parquet:   0%|          | 0.00/40.2k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/8.27k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/4.47k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/193 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/21 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

test-00000-of-00001.parquet:   0%|          | 0.00/54.8k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/9.89k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/4.04k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/390 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/43 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

test-00000-of-00001.parquet:   0%|          | 0.00/33.7k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/6.99k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/4.50k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/270 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/29 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

test-00000-of-00001.parquet:   0%|          | 0.00/38.8k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/7.22k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/3.83k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/238 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/26 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

test-00000-of-00001.parquet:   0%|          | 0.00/33.0k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/7.96k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/4.57k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/151 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/17 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

test-00000-of-00001.parquet:   0%|          | 0.00/92.8k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/15.2k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/545 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/60 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

test-00000-of-00001.parquet:   0%|          | 0.00/58.0k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/10.9k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/6.07k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/216 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/23 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

test-00000-of-00001.parquet:   0%|          | 0.00/155k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/27.3k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/17.8k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/204 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/22 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

test-00000-of-00001.parquet:   0%|          | 0.00/202k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/38.5k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/10.2k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/237 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/26 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

test-00000-of-00001.parquet:   0%|          | 0.00/31.2k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/6.28k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/3.67k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/223 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/23 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

test-00000-of-00001.parquet:   0%|          | 0.00/23.2k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/5.26k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/4.08k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/131 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/12 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

test-00000-of-00001.parquet:   0%|          | 0.00/29.5k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/7.12k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/4.96k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/121 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

test-00000-of-00001.parquet:   0%|          | 0.00/23.3k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/6.21k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/4.05k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/108 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

test-00000-of-00001.parquet:   0%|          | 0.00/23.0k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/6.52k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/4.12k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/163 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/18 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

test-00000-of-00001.parquet:   0%|          | 0.00/14.7k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/4.50k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/3.61k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/103 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

test-00000-of-00001.parquet:   0%|          | 0.00/37.3k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/8.21k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/4.28k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/234 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/25 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

test-00000-of-00001.parquet:   0%|          | 0.00/16.4k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/5.63k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/3.77k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

test-00000-of-00001.parquet:   0%|          | 0.00/98.6k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/3.37k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/783 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/86 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

test-00000-of-00001.parquet:   0%|          | 0.00/60.9k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/4.41k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/346 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/38 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

test-00000-of-00001.parquet:   0%|          | 0.00/89.8k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/14.9k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/5.14k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/895 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

test-00000-of-00001.parquet:   0%|          | 0.00/55.0k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/9.02k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/4.99k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/306 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/33 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

test-00000-of-00001.parquet:   0%|          | 0.00/48.6k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/9.15k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/311 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/34 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

test-00000-of-00001.parquet:   0%|          | 0.00/54.3k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/9.89k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/4.62k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/324 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/35 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

test-00000-of-00001.parquet:   0%|          | 0.00/69.5k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/12.9k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/4.89k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/282 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/31 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/116k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/15.1k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/1534 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/170 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

test-00000-of-00001.parquet:   0%|          | 0.00/125k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/19.9k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/8.45k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/272 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/31 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

test-00000-of-00001.parquet:   0%|          | 0.00/133k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/22.1k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/4.69k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/612 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/69 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.6k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/6.45k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/4.43k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/110 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/12 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

test-00000-of-00001.parquet:   0%|          | 0.00/114k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/18.7k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/7.49k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/245 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/27 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

test-00000-of-00001.parquet:   0%|          | 0.00/43.9k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/8.36k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/4.21k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/201 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/22 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

test-00000-of-00001.parquet:   0%|          | 0.00/19.5k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/5.27k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/4.22k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

test-00000-of-00001.parquet:   0%|          | 0.00/27.3k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/7.05k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/3.87k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/166 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/18 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

test-00000-of-00001.parquet:   0%|          | 0.00/18.9k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/4.94k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/3.30k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/171 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/19 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

0000.parquet:   0%|          | 0.00/5.93k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/9.67k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/46.8k [00:00<?, ?B/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/14 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/135 [00:00<?, ? examples/s]

0000.parquet:   0%|          | 0.00/9.15k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/12.7k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/68.2k [00:00<?, ?B/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/16 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/152 [00:00<?, ? examples/s]

0000.parquet:   0%|          | 0.00/10.1k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/50.8k [00:00<?, ?B/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/11 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

0000.parquet:   0%|          | 0.00/6.16k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/15.1k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/99.1k [00:00<?, ?B/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/29 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/265 [00:00<?, ? examples/s]

0000.parquet:   0%|          | 0.00/9.01k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/14.8k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/78.1k [00:00<?, ?B/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/16 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/144 [00:00<?, ? examples/s]

0000.parquet:   0%|          | 0.00/7.47k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/9.41k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/40.2k [00:00<?, ?B/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/8 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

0000.parquet:   0%|          | 0.00/12.5k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/14.9k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/73.3k [00:00<?, ?B/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/11 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

0000.parquet:   0%|          | 0.00/9.48k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/20.3k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/109k [00:00<?, ?B/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/22 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/173 [00:00<?, ? examples/s]

0000.parquet:   0%|          | 0.00/6.96k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/13.4k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/44.8k [00:00<?, ?B/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/11 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

0000.parquet:   0%|          | 0.00/6.47k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/59.6k [00:00<?, ?B/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/26 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/235 [00:00<?, ? examples/s]

0000.parquet:   0%|          | 0.00/8.32k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/58.8k [00:00<?, ?B/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/12 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/114 [00:00<?, ? examples/s]

0000.parquet:   0%|          | 0.00/7.48k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/10.1k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/41.5k [00:00<?, ?B/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/16 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/145 [00:00<?, ? examples/s]

0000.parquet:   0%|          | 0.00/8.97k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/19.2k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/96.9k [00:00<?, ?B/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/41 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/378 [00:00<?, ? examples/s]

0000.parquet:   0%|          | 0.00/8.41k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/13.3k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/14 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/126 [00:00<?, ? examples/s]

0000.parquet:   0%|          | 0.00/6.68k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/7.63k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/26.1k [00:00<?, ?B/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/10 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

0000.parquet:   0%|          | 0.00/10.3k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/23.5k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/160k [00:00<?, ?B/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/32 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/310 [00:00<?, ? examples/s]

0000.parquet:   0%|          | 0.00/6.80k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/16.6k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/79.3k [00:00<?, ?B/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/22 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/203 [00:00<?, ? examples/s]

0000.parquet:   0%|          | 0.00/12.0k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/69.3k [00:00<?, ?B/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/9 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

0000.parquet:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/74.7k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/374k [00:00<?, ?B/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/18 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/165 [00:00<?, ? examples/s]

0000.parquet:   0%|          | 0.00/7.63k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/67.1k [00:00<?, ?B/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/22 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/198 [00:00<?, ? examples/s]

0000.parquet:   0%|          | 0.00/8.17k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/17.3k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/102k [00:00<?, ?B/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/21 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/193 [00:00<?, ? examples/s]

0000.parquet:   0%|          | 0.00/7.46k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/22.3k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/145k [00:00<?, ?B/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/43 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/390 [00:00<?, ? examples/s]

0000.parquet:   0%|          | 0.00/7.59k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/16.5k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/78.4k [00:00<?, ?B/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/29 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/270 [00:00<?, ? examples/s]

0000.parquet:   0%|          | 0.00/6.76k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/100k [00:00<?, ?B/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/26 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/238 [00:00<?, ? examples/s]

0000.parquet:   0%|          | 0.00/8.92k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/17.6k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/81.5k [00:00<?, ?B/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/17 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/151 [00:00<?, ? examples/s]

0000.parquet:   0%|          | 0.00/9.86k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/34.4k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/234k [00:00<?, ?B/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/60 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/545 [00:00<?, ? examples/s]

0000.parquet:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/23.5k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/150k [00:00<?, ?B/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/23 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/216 [00:00<?, ? examples/s]

0000.parquet:   0%|          | 0.00/48.0k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/69.0k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/414k [00:00<?, ?B/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/22 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/204 [00:00<?, ? examples/s]

0000.parquet:   0%|          | 0.00/24.7k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/97.8k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/536k [00:00<?, ?B/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/26 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/237 [00:00<?, ? examples/s]

0000.parquet:   0%|          | 0.00/7.04k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/12.5k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/76.3k [00:00<?, ?B/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/23 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/223 [00:00<?, ? examples/s]

0000.parquet:   0%|          | 0.00/7.65k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/10.1k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/54.3k [00:00<?, ?B/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/12 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/131 [00:00<?, ? examples/s]

0000.parquet:   0%|          | 0.00/10.1k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/15.5k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/74.2k [00:00<?, ?B/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/13 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/121 [00:00<?, ? examples/s]

0000.parquet:   0%|          | 0.00/7.98k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/13.6k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/56.0k [00:00<?, ?B/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/11 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/108 [00:00<?, ? examples/s]

0000.parquet:   0%|          | 0.00/8.50k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/56.9k [00:00<?, ?B/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/18 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/163 [00:00<?, ? examples/s]

0000.parquet:   0%|          | 0.00/6.07k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/8.40k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/11 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/103 [00:00<?, ? examples/s]

0000.parquet:   0%|          | 0.00/8.73k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/18.6k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/92.7k [00:00<?, ?B/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/25 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/234 [00:00<?, ? examples/s]

0000.parquet:   0%|          | 0.00/7.04k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/36.3k [00:00<?, ?B/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/11 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

0000.parquet:   0%|          | 0.00/5.50k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/29.0k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/243k [00:00<?, ?B/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/86 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/783 [00:00<?, ? examples/s]

0000.parquet:   0%|          | 0.00/9.16k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/24.1k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/155k [00:00<?, ?B/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/38 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/346 [00:00<?, ? examples/s]

0000.parquet:   0%|          | 0.00/10.9k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/36.8k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/238k [00:00<?, ?B/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/895 [00:00<?, ? examples/s]

0000.parquet:   0%|          | 0.00/10.0k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/19.7k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/137k [00:00<?, ?B/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/33 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/306 [00:00<?, ? examples/s]

0000.parquet:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/19.8k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/119k [00:00<?, ?B/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/34 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/311 [00:00<?, ? examples/s]

0000.parquet:   0%|          | 0.00/9.08k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/21.8k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/136k [00:00<?, ?B/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/35 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/324 [00:00<?, ? examples/s]

0000.parquet:   0%|          | 0.00/12.1k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/30.4k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/181k [00:00<?, ?B/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/31 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/282 [00:00<?, ? examples/s]

0000.parquet:   0%|          | 0.00/38.2k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/307k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/2.70M [00:00<?, ?B/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/170 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1534 [00:00<?, ? examples/s]

0000.parquet:   0%|          | 0.00/18.3k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/48.6k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/325k [00:00<?, ?B/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/31 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/272 [00:00<?, ? examples/s]

0000.parquet:   0%|          | 0.00/8.82k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/55.5k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/337k [00:00<?, ?B/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/69 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/612 [00:00<?, ? examples/s]

0000.parquet:   0%|          | 0.00/9.26k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/16.9k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/50.8k [00:00<?, ?B/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/12 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/110 [00:00<?, ? examples/s]

0000.parquet:   0%|          | 0.00/16.4k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/43.7k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/299k [00:00<?, ?B/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/27 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/245 [00:00<?, ? examples/s]

0000.parquet:   0%|          | 0.00/7.79k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/17.8k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/109k [00:00<?, ?B/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/22 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/201 [00:00<?, ? examples/s]

0000.parquet:   0%|          | 0.00/7.90k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/10.3k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/46.6k [00:00<?, ?B/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/11 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

0000.parquet:   0%|          | 0.00/7.25k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/16.3k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/66.1k [00:00<?, ?B/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/18 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/166 [00:00<?, ? examples/s]

0000.parquet:   0%|          | 0.00/5.22k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/9.02k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/41.9k [00:00<?, ?B/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/19 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/171 [00:00<?, ? examples/s]

In [None]:
# 3. Run evaluation
# Both frames are scored with the same model, tokenizer and device. The
# English frame runs first and its result is bound before the Russian run
# starts, so a failure in the second run does not discard the first result.
def _run_eval(frame):
    """Evaluate `frame` with the shared model, tokenizer and device."""
    return evaluate_model(model, tokenizer, frame, device)

results_df = _run_eval(eval_df)
results_df_ru = _run_eval(eval_df_ru)

  0%|          | 0/3511 [00:00<?, ?it/s]



In [None]:
# 4. Calculate and display results for English
# Overall accuracy is the mean of the 'correct' column; the per-subject
# figure is the same mean taken within each 'subject' group.
accuracy = results_df['correct'].mean()
by_subject_en = results_df.groupby('subject')
subject_accuracy = by_subject_en['correct'].mean()

# A single print with a newline separator emits the same text as three
# consecutive prints would.
print(
    f"Overall accuracy: {accuracy:.2%}",
    "\nAccuracy by subject:",
    subject_accuracy,
    sep="\n",
)

In [None]:
# 4. Calculate and display results for Russian
# NOTE: this cell rebinds `accuracy` and `subject_accuracy`, so after it runs
# those names hold the Russian figures and the English values are gone.
accuracy = results_df_ru['correct'].mean()
by_subject_ru = results_df_ru.groupby('subject')
subject_accuracy = by_subject_ru['correct'].mean()

# A single print with a newline separator emits the same text as three
# consecutive prints would.
print(
    f"Overall accuracy: {accuracy:.2%}",
    "\nAccuracy by subject:",
    subject_accuracy,
    sep="\n",
)

In [None]:
# 5. Save results
# Replace '/' in the model name with '_' so the name forms a single file name
# rather than a nested path.
# NOTE(review): only `results_df` is written here; `results_df_ru` is not
# saved in the visible part of this cell — confirm it is persisted elsewhere.
safe_model_name = model_name.replace('/', '_')
output_path = f"mmlu_results_{safe_model_name}.csv"
results_df.to_csv(output_path, index=False)