In [None]:
import numpy as np
import torch
import tqdm
from torch.utils.data import DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer, default_data_collator
from datasets import load_dataset
from utils import *
from huggingface_hub import login

# Login to HF CLI
login()


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
print(torch.version.cuda)  # CUDA version PyTorch was built with
print(torch.version.__version__)  # PyTorch version
print(torch.cuda.get_device_name(0))  # Your GPU model
print(torch.cuda.get_device_capability(0))  # Compute capability

12.9
2.8.0+cu129
NVIDIA GeForce RTX 5070 Ti
(12, 0)


In [3]:
dataset = load_dataset("squad")

dataset["train"] = dataset["train"].shuffle(seed=101).select(range(5))
dataset["validation"] = dataset["validation"].shuffle(seed=101).select(range(1))

# Initialize tokenizer first
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

def preprocess_and_tokenize(example):
    input_text = "Context: " + example["context"] + " Question: " + example["question"] + " Answer: "
    target_text = example["answers"]["text"][0] if len(example["answers"]["text"]) > 0 else ""
    full_text = input_text + target_text
    
    max_input_length = 512 
    max_target_length = 512
    
    tokenized = tokenizer(
        full_text,
        max_length=max_input_length + max_target_length,
        truncation=True,
        padding="max_length"
    )

    input_tokens = tokenizer(
        input_text,
        max_length=max_input_length,
        truncation=True,
        padding=False,
        add_special_tokens=False
    )
    input_length = len(input_tokens["input_ids"])

    labels = tokenized["input_ids"].copy()
    labels[:input_length] = [-100] * input_length  # mask input
    labels = [(l if l != tokenizer.pad_token_id else -100) for l in labels]

    tokenized["labels"] = labels
    return tokenized



In [4]:
text = dataset["train"][0]
print("Input:")
print(text)

tokenized = preprocess_and_tokenize(text)
print("Input tokenized")
print(tokenized)

Input:
{'id': '57306625396df919000960e6', 'title': 'Translation', 'context': 'Many non-transparent-translation theories draw on concepts from German Romanticism, the most obvious influence being the German theologian and philosopher Friedrich Schleiermacher. In his seminal lecture "On the Different Methods of Translation" (1813) he distinguished between translation methods that move "the writer toward [the reader]", i.e., transparency, and those that move the "reader toward [the author]", i.e., an extreme fidelity to the foreignness of the source text. Schleiermacher favored the latter approach; he was motivated, however, not so much by a desire to embrace the foreign, as by a nationalist desire to oppose France\'s cultural domination and to promote German literature.', 'question': 'When did Schleiermacher publish his lecture "On the Different Methods of Translation"?', 'answers': {'text': ['1813'], 'answer_start': [247]}}
Input tokenized
{'input_ids': [128000, 2014, 25, 9176, 2536, 75

In [5]:
processed_dataset = dataset["train"].map(
    preprocess_and_tokenize,
    batched=False,
    remove_columns=dataset["train"].column_names,
)

train_loader = DataLoader(
    processed_dataset,
    batch_size=1,
    shuffle=True,
    collate_fn=default_data_collator,
)


In [None]:
def preprocess_and_tokenize_eval(example):
    # Prompt only (no gold answer appended to input_ids)
    input_text = "Context: " + example["context"] + " Question: " + example["question"] + " Answer: "
    target_text = example["answers"]["text"][0] if len(example["answers"]["text"]) > 0 else ""
    
    max_input_length = 512 
    max_target_length = 512

    # Tokenize prompt only for inputs
    tokenized_inputs = tokenizer(
        input_text,
        max_length=max_input_length,
        truncation=True,
        padding="max_length"
    )

    # Tokenize full_text (with answer) just to build labels
    tokenized_full = tokenizer(
        input_text + target_text,
        max_length=max_input_length + max_target_length,
        truncation=True,
        padding="max_length"
    )

    # Mask out the input part, keep only the answer portion for labels
    labels = tokenized_full["input_ids"].copy()
    input_length = len(tokenizer(input_text, add_special_tokens=False)["input_ids"])
    labels[:input_length] = [-100] * input_length
    labels = [(l if l != tokenizer.pad_token_id else -100) for l in labels]

    tokenized_inputs["labels"] = labels
    
    return tokenized_inputs

eval_dataset = dataset["validation"].map(
    preprocess_and_tokenize_eval, 
    remove_columns=dataset["validation"].column_names)


val_loader = DataLoader(
    eval_dataset,
    batch_size=1,
    collate_fn=default_data_collator,
)

In [7]:
batch = next(iter(train_loader))

print("Batch input_ids:", batch["input_ids"].shape)
print("Decoded input:", tokenizer.decode(batch["input_ids"][0], skip_special_tokens=True))

print("Batch labels:", batch["labels"].shape)
print("Decoded labels:", tokenizer.decode(
    [tok for tok in batch["labels"][0].tolist() if tok != -100],
    skip_special_tokens=True
))

print("Batch  Format:")
print(f"input_ids: {batch['input_ids']}")
print(f"attention mask: {batch['attention_mask']}")
print(f"labels: {batch['labels']}")



Batch input_ids: torch.Size([1, 1024])
Decoded input: Context: As the universe evolves in time, more and more of its energy becomes trapped in irreversible states (i.e., as heat or other kinds of increases in disorder). This has been referred to as the inevitable thermodynamic heat death of the universe. In this heat death the energy of the universe does not change, but the fraction of energy which is available to do work through a heat engine, or be transformed to other usable forms of energy (through the use of generators attached to heat engines), grows less and less. Question: In this heat death of energy, what does not change? Answer: energy of the universe
Batch labels: torch.Size([1, 1024])
Decoded labels:  energy of the universe
Batch  Format:
input_ids: tensor([[128000,   2014,     25,  ..., 128009, 128009, 128009]])
attention mask: tensor([[1, 1, 1,  ..., 0, 0, 0]])
labels: tensor([[-100, -100, -100,  ..., -100, -100, -100]])


In [None]:
def evaluate_model(model, val_loader, device, tokenizer, max_gen_length=50, show_samples=5, seed=42):
    model.eval()
    preds, refs = [], []

    for batch in tqdm(val_loader, desc="Evaluating"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        with torch.no_grad():
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_new_tokens=max_gen_length,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
                do_sample=False,
            )

        for i in range(input_ids.shape[0]):
            # Slice generated tokens after the prompt
            generated_ids = outputs[0, input_ids.shape[1]:]
            generated_ids = generated_ids[generated_ids != tokenizer.pad_token_id]
            if len(generated_ids) > 0 and generated_ids[-1] == tokenizer.eos_token_id:
                generated_ids = generated_ids[:-1]
            pred = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

            # Decode gold answer from labels
            label_ids = labels[i]
            answer_ids = [lid.item() for lid in label_ids if lid.item() != -100]
            ref = tokenizer.decode(answer_ids, skip_special_tokens=True).strip()
            
            # Cut off everything before "Answer: "
            if "Answer:" in pred:
                pred = pred.split("Answer:", 1)[1].strip()

            preds.append(pred)
            refs.append(ref)

    # --- Metrics ---
    exact_match = np.mean([1 if p.lower() == r.lower() else 0 for p, r in zip(preds, refs)])
    contains_acc = np.mean([1 if r.lower() in p.lower() or p.lower() in r.lower() else 0 for p, r in zip(preds, refs)])

    f1_scores = []
    for p, r in zip(preds, refs):
        ptoks, rtoks = p.lower().split(), r.lower().split()
        common = set(ptoks) & set(rtoks)
        num_common = sum(min(ptoks.count(t), rtoks.count(t)) for t in common)
        if num_common == 0:
            f1_scores.append(0.0)
        else:
            prec = num_common / len(ptoks)
            rec = num_common / len(rtoks)
            f1_scores.append(2 * prec * rec / (prec + rec))
    f1 = np.mean(f1_scores)

    print(f"\nContains Accuracy: {contains_acc:.4f}")
    print(f"Exact Match Accuracy: {exact_match:.4f}")
    print(f"F1 Score: {f1:.4f}")

    # Sample outputs
    if show_samples > 0:
        print("\nSample predictions:\n")
        random.seed(seed)
        for i in random.sample(range(len(preds)), min(show_samples, len(preds))):
            print("=" * 80)
            print(f"Gold Answer: {refs[i]}")
            print(f"Predicted Answer: {preds[i]}")

    return {"contains_accuracy": contains_acc, "exact_match_accuracy": exact_match, "f1": f1}


In [9]:
model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", device_map="cuda:0")
model = model.to(torch.float16)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")

Loading checkpoint shards: 100%|██████████| 4/4 [00:07<00:00,  1.84s/it]


Device: cuda


In [13]:
evaluate_model(model, val_loader, device, tokenizer, max_gen_length=100)

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating: 100%|██████████| 1/1 [01:45<00:00, 105.84s/it]


Contains Accuracy: 1.0000
Exact Match Accuracy: 0.0000
F1 Score: 0.0000

Sample predictions:

Gold Answer: Sicily
Predicted Answer: The Norman-Arab architectural style was found in the Kingdom of Sicily, which was a region in southern





{'contains_accuracy': np.float64(1.0),
 'exact_match_accuracy': np.float64(0.0),
 'f1': np.float64(0.0)}