<a href="https://colab.research.google.com/github/Lyra-Lab/LLM-Research/blob/main/experiments/evaluate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q datasets

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/484.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m481.3/484.9 kB[0m [31m28.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m484.9/484.9 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/143.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/194.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# %%
import torch
import pandas as pd
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm.notebook import tqdm


In [3]:

# %% Model loading
def load_model_and_tokenizer(model_name):
    """Load model and tokenizer with multi-GPU support"""
    # Check GPU availability
    if not torch.cuda.is_available():
        raise RuntimeError("No GPU available")

    n_gpus = torch.cuda.device_count()
    print(f"Found {n_gpus} GPUs")

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")

    # Load model with optimal settings for T4 GPUs
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,  # Use float16 for T4 GPUs
        device_map="auto",         # Automatically handle multi-GPU
        max_memory={i: "12GiB" for i in range(n_gpus)},  # T4 has 16GB but leave some headroom
    )
    model.eval()

    # Get device (first GPU)
    device = torch.device("cuda:0")

    return model, tokenizer, device


In [12]:

# %% Data loading
def load_mmlu_data(subjects=None, split="val", language="ru"):
    """Load MMLU_RU data for specified subjects and language."""

    if subjects is None:
        subjects = [
            'abstract_algebra',
            'college_computer_science',
            'college_mathematics',
            'formal_logic',
            'machine_learning',
            'college_physics',
            'high_school_statistics',
            'electrical_engineering',
            'computer_security'
        ]

    dfs = []
    for subject in subjects:
        try:
            dataset = load_dataset("NLPCoreTeam/mmlu_ru", subject, split=split)
            df = dataset.to_pandas()

            # Map integer answers to corresponding string labels
            int2str = dataset.features['answer'].int2str
            df['answer'] = df['answer'].map(int2str)

            # Insert subject column
            df.insert(0, 'subject', subject)

            # Keep only the selected language's question and choices
            lang_suffix = "_ru" if language == "ru" else "_en"
            df = df.rename(columns={
                f'question{lang_suffix}': 'question',
                f'choices{lang_suffix}': 'choices'
            })[['subject', 'question', 'choices', 'answer']]

            dfs.append(df)
        except Exception as e:
            print(f"Error loading {subject} ({language}): {e}")

    return pd.concat(dfs, ignore_index=True)


In [20]:

# %% Evaluation
# Format prompt
PROMPTS = {
    "ru": {
        "template": (
            "Ответьте на вопрос, выбрав правильный вариант (A, B, C или D).\n"
            "Вопрос: {question}\n"
            "Варианты ответа:\n"
            "{options}\n"
            "Ответ (укажите только букву A, B, C или D):"
        ),
        "question_key": "question",
        "choices_key": "choices"
    },
    "en": {
        "template": (
            "Answer the question by selecting the correct option (A, B, C, or D).\n"
            "Question: {question}\n"
            "Options:\n"
            "{options}\n"
            "Answer (provide only the letter A, B, C, or D):"
        ),
        "question_key": "question",
        "choices_key": "choices"
    }
}

def format_prompt(row):
    """Format a prompt for a question row using its language settings."""
    lang = row.get("language", "ru")
    config = PROMPTS.get(lang, PROMPTS["ru"])
    question = row[config["question_key"]]
    choices = row[config["choices_key"]]
    options = "\n".join(f"{chr(65 + i)}. {choice}" for i, choice in enumerate(choices))
    return config["template"].format(question=question, options=options)

# Main evaluation function
def evaluate_model(model, tokenizer, df, device, debug_samples=-1, batch_size=4):
    """Evaluate model on the dataset with debugging information and batch processing."""
    results = []

    for i in tqdm(range(0, len(df), batch_size)):
        batch_df = df.iloc[i : i + batch_size]
        # Generate prompts for the batch
        prompts = [format_prompt(row) for _, row in batch_df.iterrows()]

        # Tokenize batch and move to device
        inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True).to(device)

        with torch.no_grad():
            try:
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=10,
                    do_sample=False,
                    pad_token_id=tokenizer.pad_token_id,
                    eos_token_id=tokenizer.eos_token_id,
                    use_cache=True
                )
            except Exception as e:
                print("Error during model generation:", e)
                continue

        # Process each sample in the batch
        for idx, (prompt, (_, row)) in enumerate(zip(prompts, batch_df.iterrows())):
            response = tokenizer.decode(outputs[idx], skip_special_tokens=True)
            # Remove prompt text from response to get only the generated part
            generated = response[len(prompt):].strip()

            # Get and process the correct answer
            correct_answer = row["answer"]
            if str(correct_answer).isdigit():
                correct_answer = chr(65 + int(correct_answer))
            correct_answer = str(correct_answer).upper()

            # Extract the first valid answer (A, B, C, or D) from the generated text
            pred = next((c for c in generated if c.upper() in "ABCD"), "X").upper()

            # Use default keys, assuming Russian data if specific keys are missing
            question_text = row.get("question_ru", row.get("question"))
            choices = row.get("choices_ru", row.get("choices"))
            language = row.get("language", "ru")

            if idx < debug_samples:
                print(f"\nDebug Sample {idx + 1} ({language}):")
                print(f"Question: {question_text}")
                print(f"Full Response: {generated}")
                print(f"Extracted Prediction: {pred}")
                print(f"Correct Answer: {correct_answer}")
                print(f"Choices: {choices}")

            results.append({
                "subject": row["subject"],
                "language": language,
                "question": question_text,
                "correct_answer": correct_answer,
                "predicted_answer": pred,
                "full_response": generated,
                "correct": pred == correct_answer,
            })

    return pd.DataFrame(results)


In [9]:
# 1. Load model
model_name = "Qwen/Qwen2.5-1.5B-Instruct"  # Use the 1.5B variant
model, tokenizer, device = load_model_and_tokenizer(model_name)

Found 1 GPUs


In [40]:
# 2. Load data
eval_df_ru = load_mmlu_data(split="val", language='ru')

In [41]:
eval_df_ru.describe()

Unnamed: 0,subject,question,choices,answer
count,119,119,119,119
unique,9,119,119,4
top,high_school_statistics,"Циклическая подгруппа Z_24, порожденная 18, им...","[4, 8, 12, 6]",D
freq,23,1,1,38


In [42]:
# 3. Run evaluation
results_df_ru = evaluate_model(model, tokenizer, eval_df_ru, device)

  0%|          | 0/30 [00:00<?, ?it/s]



In [43]:
# 4. Calculate and display results for english
accuracy = results_df_ru['correct'].mean()
subject_accuracy = results_df_ru.groupby('subject')['correct'].mean()

In [44]:
# 5. Save Russian results
results_df_ru.to_csv(f"/content/mmlu_results_{model_name.replace('/', '_')}_ru.csv", index=False)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119 entries, 0 to 118
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   subject           119 non-null    object
 1   language          119 non-null    object
 2   question          119 non-null    object
 3   correct_answer    119 non-null    object
 4   predicted_answer  119 non-null    object
 5   full_response     119 non-null    object
 6   correct           119 non-null    bool  
dtypes: bool(1), object(6)
memory usage: 5.8+ KB
None
            subject language  \
0  abstract_algebra       ru   
1  abstract_algebra       ru   
2  abstract_algebra       ru   
3  abstract_algebra       ru   
4  abstract_algebra       ru   

                                            question correct_answer  \
0  Циклическая подгруппа Z_24, порожденная 18, им...              A   
1          Найдите порядок факторной группы Z_6/<3>.              B   
2  Утверждение 1 | Пер

correct,False,True
subject,Unnamed: 1_level_1,Unnamed: 2_level_1
abstract_algebra,8,3
college_computer_science,7,4
college_mathematics,5,6
college_physics,7,4
computer_security,6,5
electrical_engineering,10,6
formal_logic,10,4
high_school_statistics,10,13
machine_learning,8,3
