In [1]:
import re

import torch
from datasets import load_dataset
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM

In [3]:
def construct_prompt(example, lang="en"):
    """
    Build the multiple-choice prompt text for a single dataset example.

    The question and choices are read from language-specific keys
    ("question_ru"/"choices_ru" when `lang` is "ru" in any letter case,
    "question_en"/"choices_en" otherwise). If the language-specific choices
    key is absent, the first of "choices", "options", "possible_answers",
    "answers" present in the example is used. If the language-specific
    question key is absent, "question" is used, and an empty string if that
    is missing too.

    Args:
        example: Mapping holding the question text and an iterable of choices.
        lang: Language code selecting the language-specific keys.

    Returns:
        A string of the form "Question: ...", "Choices:", one lettered line
        per choice (A., B., ...), and a trailing "Answer:".

    Raises:
        KeyError: If no choices key can be found in the example.
    """
    if lang.lower() == "ru":
        question_key, choices_key = "question_ru", "choices_ru"
    else:
        question_key, choices_key = "question_en", "choices_en"

    if choices_key not in example:
        # No language-specific choices: take the first generic key that exists.
        generic_keys = ("choices", "options", "possible_answers", "answers")
        choices_key = next((key for key in generic_keys if key in example), None)
        if choices_key is None:
            raise KeyError("Example must contain a valid choices key.")

    question_text = example.get(question_key, example.get("question", ""))
    header = f"Question: {question_text}\nChoices:\n"
    # Choice positions map onto consecutive capital letters: 0 -> A, 1 -> B, ...
    choice_lines = "".join(
        f"{chr(ord('A') + position)}. {choice}\n"
        for position, choice in enumerate(example[choices_key])
    )
    return header + choice_lines + "Answer:"

In [4]:
# Matches a standalone capital answer letter: "B", "B." and "(B)" match,
# while the "A" inside a word such as "Answer" does not.
_ANSWER_LETTER_RE = re.compile(r"\b([A-G])\b")


def _extract_answer_letter(text):
    """Return the first standalone capital letter A-G in `text`, or "" if none."""
    match = _ANSWER_LETTER_RE.search(text)
    return match.group(1) if match else ""


def evaluate_dataset(dataset, model, tokenizer, device, lang="ru"):
    """
    Compute the multiple-choice accuracy (in percent) of `model` on `dataset`.

    For every example a prompt is built with `construct_prompt`, up to 10 new
    tokens are generated greedily, and the first standalone capital letter
    A-G in the completion is taken as the prediction. The prediction is
    compared with the example's "answer" field, which may be an integer
    index (0 -> "A", 1 -> "B", ...) or a letter string.

    Args:
        dataset: Sized iterable of examples; each must contain "answer" plus
            the keys `construct_prompt` needs.
        model: Object exposing `generate(**inputs, ...)`.
        tokenizer: Callable that tokenizes a prompt and exposes `decode`.
        device: Device the tokenized inputs are moved to.
        lang: Language passed to `construct_prompt`; defaults to "ru", the
            value that was previously hard-coded.

    Returns:
        Accuracy as a float percentage in [0, 100]; 0.0 for an empty dataset.
    """
    total = len(dataset)
    if total == 0:
        # Nothing to score; avoid dividing by zero below.
        return 0.0

    correct = 0
    for example in tqdm(dataset, desc="Evaluating"):
        prompt = construct_prompt(example, lang=lang)
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        # Greedy decoding: no sampling, so no temperature is passed.
        outputs = model.generate(
            **inputs,
            max_new_tokens=10,
            do_sample=False,
        )
        # Decode only the newly generated tokens. Slicing the decoded string
        # by len(prompt) is unreliable because decoding the prompt tokens is
        # not guaranteed to reproduce the prompt text character for character.
        # NOTE(review): assumes generate() returns prompt tokens followed by
        # the completion (decoder-only model) - confirm for other model types.
        prompt_len = inputs["input_ids"].shape[1]
        generated_text = tokenizer.decode(
            outputs[0][prompt_len:], skip_special_tokens=True
        ).strip()
        answer_pred = _extract_answer_letter(generated_text)

        ground_truth = example["answer"]
        if isinstance(ground_truth, int):
            # Integer labels are choice indices: 0 -> "A", 1 -> "B", ...
            ground_truth = chr(65 + ground_truth)
        else:
            ground_truth = ground_truth.strip().upper()

        if answer_pred == ground_truth:
            correct += 1

    return correct / total * 100

In [8]:
def main():
    """
    Evaluate Qwen/Qwen2.5-7B-Instruct on the NLPCoreTeam/mmlu_ru test splits.

    Loads the tokenizer and model, runs `evaluate_dataset` on each listed
    task, prints the accuracy per task, and finally prints the unweighted
    mean of the per-task accuracies. A task that raises any exception is
    reported and left out of the results.
    """
    # Tasks to evaluate, in evaluation order.
    # NOTE(review): 'auxiliary_train' is not a subject name like the others -
    # confirm it exposes a "test" split; if not, it is reported as an error below.
    subjects = [
        'abstract_algebra', 'anatomy', 'astronomy', 'auxiliary_train',
        'business_ethics', 'clinical_knowledge', 'college_biology',
        'college_chemistry', 'college_computer_science', 'college_mathematics',
        'college_medicine', 'college_physics', 'computer_security',
        'conceptual_physics', 'econometrics', 'electrical_engineering',
        'elementary_mathematics', 'formal_logic', 'global_facts',
        'high_school_biology', 'high_school_chemistry',
        'high_school_computer_science', 'high_school_european_history',
        'high_school_geography', 'high_school_government_and_politics',
        'high_school_macroeconomics', 'high_school_mathematics',
        'high_school_microeconomics', 'high_school_physics',
        'high_school_psychology', 'high_school_statistics',
        'high_school_us_history', 'high_school_world_history', 'human_aging',
        'human_sexuality', 'international_law', 'jurisprudence',
        'logical_fallacies', 'machine_learning', 'management', 'marketing',
        'medical_genetics', 'miscellaneous', 'moral_disputes',
        'moral_scenarios', 'nutrition', 'philosophy', 'prehistory',
        'professional_accounting', 'professional_law', 'professional_medicine',
        'professional_psychology', 'public_relations', 'security_studies',
        'sociology', 'us_foreign_policy', 'virology', 'world_religions',
    ]

    checkpoint = "Qwen/Qwen2.5-7B-Instruct"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    # fp16 weights for efficiency; device_map="auto" leaves placement on the
    # available GPUs (e.g. 2 T4 GPUs in Kaggle) to the library.
    model = AutoModelForCausalLM.from_pretrained(
        checkpoint,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    model.eval()

    # Device the tokenized prompts are moved to.
    device = "cuda" if torch.cuda.is_available() else "cpu"

    accuracy_by_task = {}
    for subject in subjects:
        try:
            test_split = load_dataset("NLPCoreTeam/mmlu_ru", subject, split="test")
            print(f"Evaluating on NLPCoreTeam/mmlu_ru dataset ({subject})")
            score = evaluate_dataset(test_split, model, tokenizer, device)
            print(f"Accuracy on {subject}: {score:.2f}%")
            accuracy_by_task[subject] = score
        except Exception as e:
            # Best effort: report the failing task and continue with the rest.
            print(f"Error evaluating {subject}: {e}")

    print("\nResults:")
    for subject, score in accuracy_by_task.items():
        print(f"{subject}: {score:.2f}%")

    if not accuracy_by_task:
        print("No evaluation results.")
        return

    # Unweighted mean: every task counts equally regardless of its size.
    mean_accuracy = sum(accuracy_by_task.values()) / len(accuracy_by_task)
    print(f"\nAverage accuracy: {mean_accuracy:.2f}%")

In [None]:
# Run the full evaluation only when this code executes as the top-level
# module, not when it is imported.
if __name__ == "__main__":
    main()

tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/27.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]