<a href="https://colab.research.google.com/github/Lyra-Lab/LLM-Research/blob/main/experiments/qwen_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install -q datasets

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/480.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/179.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/143.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

# Base model evaluation
This is the evaluation of the model before any fine tunning has occured

In [9]:
# %% [code]
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from tqdm import tqdm

# Load the Qwen/Qwen2.5-3B-Instruct model and tokenizer
model_name = "Qwen/Qwen2.5-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,  # adjust based on your hardware
    device_map="auto"
)
model.eval()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 2048)
    (layers): ModuleList(
      (0-35): 36 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=True)
          (k_proj): Linear(in_features=2048, out_features=256, bias=True)
          (v_proj): Linear(in_features=2048, out_features=256, bias=True)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=2048, out_features=11008, bias=False)
          (up_proj): Linear(in_features=2048, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((2048,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((2048,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((2048,), eps=1e-06)
    (rotary_emb):

In [30]:
# %% [code]
def construct_prompt(example):
    """
    Constructs a prompt from a dataset example.
    Handles cases where 'choices' key is not present, using 'options' instead.
    If neither is present, tries to use 'possible_answers'.
    **Also handles the case where the key is 'answers' instead of 'choices', 'options', or 'possible_answers'.**
    Assumes the following keys in example:
      - "question": the question text.
      - "choices" or "options" or "possible_answers" **or "answers"**: a list of answer choices.
      - "answer": the correct answer letter (e.g., "A").
    """
    # Check for all possible keys
    choices_key = None
    for key in ["choices", "options", "possible_answers", "answers"]:
        if key in example:
            choices_key = key
            break

    # Raise KeyError if none of the expected keys are found
    if choices_key is None:
        raise KeyError("Example must contain 'choices', 'options', 'possible_answers', or 'answers' key.")

    # Create the prompt with the question and enumerated choices.
    prompt = f"Question: {example['question']}\nChoices:\n"
    for idx, choice in enumerate(example[choices_key]):
        # Convert index 0,1,2... to A, B, C, etc.
        letter = chr(65 + idx)
        prompt += f"{letter}. {choice}\n"
    prompt += "Answer:"  # model completion should output the answer letter.
    return prompt

def evaluate_dataset(dataset, model, tokenizer):
    """
    Iterates over the dataset, generates the model output for each prompt,
    extracts the predicted answer (first capital letter found), and compares
    it to the ground truth to compute accuracy.
    """
    correct = 0
    total = len(dataset)
    for example in tqdm(dataset, desc="Evaluating"):
        prompt = construct_prompt(example)
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        # Generate a short output; use deterministic generation for evaluation.
        outputs = model.generate(
            **inputs,
            max_new_tokens=10,
            temperature=0.0,
            do_sample=False
        )
        # Decode the generated tokens.
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        # Extract the text generated after the prompt.
        generated_text = response[len(prompt):].strip()
        # Find the first capital letter (A, B, C, …) as predicted answer.
        answer_pred = ""
        for char in generated_text:
            if char.upper() in ["A", "B", "C", "D", "E", "F", "G"]:
                answer_pred = char.upper()
                break
        # Compare with the ground truth answer.
        # Convert the ground truth answer to a letter if it's an integer.
        ground_truth_answer = example["answer"]
        if isinstance(ground_truth_answer, int):
            ground_truth_answer = chr(65 + ground_truth_answer)  # Convert 0 to A, 1 to B, etc.
        else:
            ground_truth_answer = ground_truth_answer.strip().upper()

        if answer_pred == ground_truth_answer:
            correct += 1
    accuracy = correct / total * 100
    return accuracy

In [34]:
# %% [code]
# Download and load datasets from Hugging Face
# Adjust the split as necessary – here we use "test" split assuming evaluation examples are stored there.
# Specify a config name, e.g., 'abstract_algebra'
dataset_mmlu = load_dataset("cais/mmlu", "abstract_algebra", split="test")
dataset_mmlu_ru = load_dataset("NLPCoreTeam/mmlu_ru", "abstract_algebra", split="test")

In [32]:
# %% [code]
# English Evaluation (cais/mmlu)
print("Evaluating on cais/mmlu dataset (English)")
accuracy_mmlu = evaluate_dataset(dataset_mmlu, model, tokenizer)
print(f"Accuracy on cais/mmlu (English): {accuracy_mmlu:.2f}%")

Evaluating on cais/mmlu dataset (English)


Evaluating: 100%|██████████| 100/100 [00:49<00:00,  2.01it/s]

Accuracy on cais/mmlu (English): 43.00%





In [33]:
# %% [code]
# Russian Evaluation (Vikhrmodels/mmlupro-ru)
print("\nEvaluating on Vikhrmodels/mmlupro-ru dataset (Russian)")
accuracy_mmlu_ru = evaluate_dataset(dataset_mmlu_ru, model, tokenizer)
print(f"Accuracy on Vikhrmodels/mmlupro-ru (Russian): {accuracy_mmlu_ru:.2f}%")


Evaluating on Vikhrmodels/mmlupro-ru dataset (Russian)


Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]


KeyError: "Example must contain 'choices', 'options', 'possible_answers', or 'answers' key."