In [None]:
# !pip install -U bitsandbytes transformers accelerate captum

In [None]:
import os
import sys

path = os.getcwd()
while True:
    if 'utils.py' in os.listdir(path):
        if path not in sys.path:
            sys.path.append(path)
        break
    new_path = os.path.dirname(path)
    if new_path == path:
        print("utils.py not found in any parent folder.")
        break
    path = new_path

import utils
import pandas as pd
from torch.utils.data import DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
utils.hf_login("HF_TOKEN")

In [4]:
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4"
)

In [None]:
model_id = 'Qwen/Qwen2.5-7B-Instruct'

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=quantization_config,
    attn_implementation="eager"
    )

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [6]:
from google.colab import files
uploaded = files.upload()
qnli_test = pd.read_csv('qnli_val.csv')

qnli_test["label"] = qnli_test["label"].map({0: "entailment", 1: "not_entailment"})

Saving qnli_val.csv to qnli_val.csv


In [7]:
qnli_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5463 entries, 0 to 5462
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   question  5463 non-null   object
 1   sentence  5463 non-null   object
 2   label     5463 non-null   object
 3   idx       5463 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 170.8+ KB


In [8]:
qnli_test.head()

Unnamed: 0,question,sentence,label,idx
0,What came into force after the new constitutio...,"As of that day, the new constitution heralding...",entailment,0
1,What is the first major city in the stream of ...,The most important tributaries in this area ar...,not_entailment,1
2,What is the minimum required if you want to te...,In most provinces a second Bachelor's Degree s...,not_entailment,2
3,How was Temüjin kept imprisoned by the Tayichi...,The Tayichi'ud enslaved Temüjin (reportedly wi...,entailment,3
4,"What did Herr Gott, dich loben wir become know...","He paraphrased the Te Deum as ""Herr Gott, dich...",not_entailment,4


In [None]:
# Find the max_length for tokenization to avoid wasting computing.
utils.find_max_length(qnli_test, tokenizer=tokenizer, dataset_type='qnli')

In [None]:
prompt_lengths, _ = get_lengths(qnli_test)
df = pd.DataFrame(prompt_lengths, columns=["length"])
print("Number of examples that have over 256 tokens:",(df["length"] > 256).sum())

Since we have only 1 examples that goes over a length of 256, we are going to use 256 as the default for `max_length` and miss some information in order to offload computing.

In [None]:
# Define dataset and create a dataloader.
dataset_test = utils.MyDataset(dataframe=qnli_test,
                               tokenizer=tokenizer,
                               dataset_type='qnli',
                               prompt_max_length=117,
                               label_max_length=3)

batch_size = 16 # Change batch size according to GPU
dataloader = DataLoader(dataset_test, batch_size=batch_size, shuffle=False) 

In [None]:
# Create checkpoint
checkpoint_path = utils.create_checkpoint_path(model_id=model_id, name='qnli')

Checkpoint directory: drive/MyDrive/eval_checkpoints


In [None]:
# # Test
# predictions, gold_labels = utils.test_run(model=model,
#                                           dataloader=dataloader,
#                                           tokenizer=tokenizer,
#                                           dataset_type='qnli')

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


entailment
not_entailment
not_entailment
not_entailment


In [None]:
from tqdm import tqdm

# Load checkpoint if it exists
predicted_labels, gold_labels, no_answer, start_batch = utils.load_checkpoint(checkpoint_path=checkpoint_path)


with torch.no_grad():
    for i, batch in enumerate(tqdm(dataloader, desc="Evaluating", unit="batch")):
        # Continue from last checkpoint
        if i < start_batch:
            continue

        input_ids_batch = batch["input_ids"].to(model.device) # Move to GPU
        attention_mask_batch = batch["attention_mask"].to(model.device) # Move to GPU
        gold_labels_batch = batch["labels"] # Keep to CPU

        # Get outputs
        outputs = model.generate(input_ids=input_ids_batch, attention_mask=attention_mask_batch, max_new_tokens=6)
        outputs_decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)

        predicted_labels_batch = utils.get_predictions(outputs_decoded=outputs_decoded, no_answer=no_answer, dataset_type='qnli')
        predicted_labels.extend(predicted_labels_batch)
        gold_labels.extend(gold_labels_batch)

        # Save checkpoint
        if i % 50 == 0 or i == len(dataloader) - 1:
            torch.save({"predicted_labels": predicted_labels,
                        "gold_labels": gold_labels,
                        "no_answer": no_answer,
                        "batch_no": i+1}, checkpoint_path)

            print(f"Checkpoint saved: {i+1}, {checkpoint_path}")

In [None]:
predicted_labels, gold_labels, no_answer, _ = utils.load_checkpoint(checkpoint_path)
print(f"The model was unable to give an answer to {no_answer} out of {len(predicted_labels)} questions.")

In [None]:
# Calculate evaluation metrics
utils.evaluate_metrics(predicted_labels=predicted_labels, gold_labels=gold_labels)

Accuracy: 0.6132
