In [1]:
import pandas as pd
import torch
from peft import PeftModel
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def load_lora_model(
    adapter_name: str,
    model_name: str = "meta-llama/Llama-2-7b-chat-hf",
) -> tuple[AutoModelForCausalLM, AutoTokenizer]:
    """Load a 4-bit quantized base model, attach a LoRA adapter, and return it with its tokenizer.

    Args:
        adapter_name: Identifier or path of the PEFT adapter passed to
            ``PeftModel.from_pretrained``.
        model_name: Identifier of the base causal LM; also used to load the tokenizer.

    Returns:
        A ``(model, tokenizer)`` pair where ``model`` is the adapter-wrapped base model.
    """
    # NF4 4-bit weights, fp16 compute, double quantization disabled.
    quantization = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=False,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
    )

    base_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        low_cpu_mem_usage=True,
        quantization_config=quantization,
        attn_implementation="flash_attention_2",
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Fall back to the EOS token when the tokenizer ships without a pad token.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.eos_token_id

    # Wrap the quantized base model with the adapter weights.
    lora_model = PeftModel.from_pretrained(base_model, adapter_name)

    return lora_model, tokenizer

In [3]:
def tokenize_queries(tokenizer: AutoTokenizer, query: str, max_length: int = 1024) -> torch.Tensor:
    """Wrap a question in the multiple-choice chat prompt and tokenize it.

    Args:
        tokenizer: Tokenizer exposing ``apply_chat_template``.
        query: The question text, used as the user turn.
        max_length: Accepted for interface compatibility; this function does not
            currently use it (no truncation is applied).

    Returns:
        Whatever ``tokenizer.apply_chat_template`` returns for the two-turn
        conversation with ``return_tensors="pt"`` and the generation prompt appended.
    """
    system_prompt = "You answer multiple choice questions with the correct letter answer. Your answer should be in this format: '{Letter}.{Answer}'"

    system_turn = {"role": "system", "content": system_prompt}
    user_turn = {"role": "user", "content": query}

    return tokenizer.apply_chat_template(
        [system_turn, user_turn],
        add_generation_prompt=True,
        return_tensors="pt",
    )

In [4]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the adapter-wrapped model and its tokenizer from the local checkpoint.
model, tokenizer = load_lora_model(adapter_name="checkpoints/checkpoint-36000")

Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.15s/it]


In [5]:
# Load the evaluation data
eval_data = pd.read_csv("data/test.csv")

# Tokenize the queries. Only the "query" column is needed, so map over that
# Series directly instead of materialising every row with apply(axis=1).
eval_data["input_ids"] = eval_data["query"].map(lambda query: tokenize_queries(tokenizer, query))

# Drop unnecessary columns (reassign instead of mutating the frame in place)
eval_data = eval_data.drop(columns="query")

eval_data.head()

Unnamed: 0,response,input_ids
0,A.Civic Park is north of the administrative se...,"[[tensor(1), tensor(518), tensor(25580), tenso..."
1,"A.20-year-old accountant, 20-year-old salesper...","[[tensor(1), tensor(518), tensor(25580), tenso..."
2,B No Shouwu,"[[tensor(1), tensor(518), tensor(25580), tenso..."
3,D.If you do not participate in the English tut...,"[[tensor(1), tensor(518), tensor(25580), tenso..."
4,D.Ding.,"[[tensor(1), tensor(518), tensor(25580), tenso..."


In [6]:
# Evaluate multiple-choice accuracy: generate a completion per example and
# compare its first letter with the first letter of the reference response.
VALID_ANSWERS = ("A", "B", "C", "D")
MAX_NEW_TOKENS = 256

# generate() may sample depending on the model's generation config; seed the
# RNG so repeated runs of this cell give the same numbers.
torch.manual_seed(0)

correct = 0
total = 0
skipped = 0  # predictions that are empty or not one of VALID_ANSWERS

# Iterate over the column values directly so the loop does not depend on the
# frame having a default 0..n-1 index.
for input_ids, response in tqdm(
    zip(eval_data["input_ids"], eval_data["response"]), total=len(eval_data)
):
    input_ids = input_ids.to(device)
    correct_answer = response.strip()[:1]

    # Generate the response
    with torch.no_grad():
        generated = model.generate(
            input_ids=input_ids,
            # Single unpadded prompt: every position is a real token. Passing the
            # mask and pad id explicitly avoids relying on generate() inferring them.
            attention_mask=torch.ones_like(input_ids),
            pad_token_id=tokenizer.pad_token_id,
            max_new_tokens=MAX_NEW_TOKENS,
        )

    # Decode only the newly generated tokens (everything after the prompt)
    completion = tokenizer.decode(generated[0][input_ids.shape[-1]:], skip_special_tokens=True)

    # Extract the predicted answer; slicing (not indexing) keeps an empty
    # completion from raising IndexError, and strip() ignores leading whitespace.
    predicted_answer = completion.strip()[:1]

    # Skip if the predicted answer is not a valid answer
    if predicted_answer not in VALID_ANSWERS:
        skipped += 1
        continue

    # Check if the predicted answer is correct
    correct += int(predicted_answer == correct_answer)
    total += 1

# Accuracy is computed over valid predictions only; the skipped count is
# reported alongside so the denominator is not misread.
if total:
    print(f"Accuracy: {correct}/{total} ({correct / total * 100:.2f}%)")
else:
    print("Accuracy: n/a (no valid predictions)")
print(f"Skipped (invalid prediction): {skipped}/{len(eval_data)}")

  1%|          | 6/650 [00:14<28:43,  2.68s/it]