In [None]:
import numpy as np
import torch
import tqdm
from torch.utils.data import DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer, default_data_collator
from datasets import load_dataset
from utils import *
from huggingface_hub import login

# Authenticate this session with the Hugging Face Hub before any download.
# NOTE(review): called with no arguments, so the token is presumably entered
# interactively or picked up from a cached login — confirm for headless runs.
# Never paste a token into this cell.
login()


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load SQuAD, then replace each split with a small shuffled sample:
# 2500 training rows and one tenth of that for validation. The fixed
# shuffle seed keeps the selected rows the same from run to run.
dataset = load_dataset("squad")

dataset["train"] = (
    dataset["train"]
    .shuffle(seed=101)
    .select(range(2500))
)
dataset["validation"] = (
    dataset["validation"]
    .shuffle(seed=101)
    .select(range(2500 // 10))
)

# The tokenizer has to exist before preprocess_and_tokenize is called,
# because that function reads it as a module-level name.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")

# Reuse the end-of-sequence token for padding (both the string and its id).
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

def preprocess_and_tokenize(example):
    """Turn one SQuAD record into a fixed-length causal-LM training example.

    The prompt is ``"Context: <context> Question: <question> Answer: "`` and
    the target is the first reference answer (an empty string when the record
    has no answers). Prompt and target are tokenized separately and then
    concatenated, so the position where the target starts is known exactly
    and the loss mask cannot drift from the real prompt/target boundary.

    Relies on the module-level ``tokenizer`` (with ``pad_token_id`` set).

    Args:
        example: Mapping with string fields ``context`` and ``question`` and
            an ``answers`` mapping whose ``text`` entry is a list of strings.

    Returns:
        dict with ``input_ids``, ``attention_mask`` and ``labels``; each is a
        list of ``max_input_length + max_target_length`` ints, padded on the
        right. ``labels`` holds -100 at every prompt and padding position and
        the target token id at every target position.
    """
    max_input_length = 512
    max_target_length = 512
    # Only the combined budget is enforced: the concatenated sequence is cut
    # on the right at this length, so a long prompt keeps its question.
    total_length = max_input_length + max_target_length

    input_text = "Context: " + example["context"] + " Question: " + example["question"] + " Answer: "
    answer_texts = example["answers"]["text"]
    target_text = answer_texts[0] if len(answer_texts) > 0 else ""

    # Prompt: special tokens enabled, so whatever the tokenizer prepends
    # (e.g. a BOS id) is counted as part of the masked prompt.
    prompt_ids = list(
        tokenizer(
            input_text,
            max_length=total_length,
            truncation=True,
            padding=False,
        )["input_ids"]
    )

    # Target: no special tokens, otherwise a second BOS would land in the
    # middle of the sequence. It receives whatever room the prompt left.
    target_ids = list(
        tokenizer(
            target_text,
            max_length=total_length,
            truncation=True,
            padding=False,
            add_special_tokens=False,
        )["input_ids"]
    )
    target_ids = target_ids[: total_length - len(prompt_ids)]

    input_ids = prompt_ids + target_ids
    # -100 positions are skipped by the loss, so only target tokens are learned.
    labels = [-100] * len(prompt_ids) + target_ids

    # Pad by construction instead of testing ``id == pad_token_id``: the pad id
    # may be shared with a real token (here it is aliased to EOS), and a value
    # test cannot tell genuine tokens from padding.
    pad_count = total_length - len(input_ids)
    attention_mask = [1] * len(input_ids) + [0] * pad_count
    input_ids = input_ids + [tokenizer.pad_token_id] * pad_count
    labels = labels + [-100] * pad_count

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }



In [3]:
# Sanity check on one record: print the raw training example, then the
# result of running it through preprocess_and_tokenize.
text = dataset["train"][0]
print("Input:", text, sep="\n")

tokenized = preprocess_and_tokenize(text)
print("Input tokenized", tokenized, sep="\n")

Input:
{'id': '57306625396df919000960e6', 'title': 'Translation', 'context': 'Many non-transparent-translation theories draw on concepts from German Romanticism, the most obvious influence being the German theologian and philosopher Friedrich Schleiermacher. In his seminal lecture "On the Different Methods of Translation" (1813) he distinguished between translation methods that move "the writer toward [the reader]", i.e., transparency, and those that move the "reader toward [the author]", i.e., an extreme fidelity to the foreignness of the source text. Schleiermacher favored the latter approach; he was motivated, however, not so much by a desire to embrace the foreign, as by a nationalist desire to oppose France\'s cultural domination and to promote German literature.', 'question': 'When did Schleiermacher publish his lecture "On the Different Methods of Translation"?', 'answers': {'text': ['1813'], 'answer_start': [247]}}
Input tokenized
{'input_ids': [128000, 2014, 25, 9176, 2536, 75

In [6]:
# Run preprocess_and_tokenize over every split one example at a time and
# drop the raw SQuAD columns listed on the train split.
raw_columns = dataset["train"].column_names
processed_dataset = dataset.map(
    preprocess_and_tokenize,
    remove_columns=raw_columns,
    batched=False,
)

# One example per batch; only the training loader shuffles.
train_loader = DataLoader(
    processed_dataset["train"], batch_size=1, shuffle=True, collate_fn=default_data_collator
)
val_loader = DataLoader(
    processed_dataset["validation"], batch_size=1, collate_fn=default_data_collator
)

In [8]:
# Pull one batch from the training loader and print, for its first item,
# the tensor shapes plus the decoded inputs and decoded unmasked labels.
batch = next(iter(train_loader))

first_input_ids = batch["input_ids"][0]
print("Batch input_ids:", batch["input_ids"].shape)
print("Decoded input:", tokenizer.decode(first_input_ids, skip_special_tokens=True))

# -100 is the masked-label marker and is not a token id, so it is filtered
# out before decoding.
first_labels = batch["labels"][0]
kept_label_ids = first_labels[first_labels != -100].tolist()
print("Batch labels:", batch["labels"].shape)
print("Decoded labels:", tokenizer.decode(kept_label_ids, skip_special_tokens=True))


Batch input_ids: torch.Size([1, 1024])
Decoded input: Context: A dearth of field observations limit our knowledge, but intraspecific conflicts are known to sometimes result in injury or death. The screamers (Anhimidae), some jacanas (Jacana, Hydrophasianus), the spur-winged goose (Plectropterus), the torrent duck (Merganetta) and nine species of lapwing (Vanellus) use a sharp spur on the wing as a weapon. The steamer ducks (Tachyeres), geese and swans (Anserinae), the solitaire (Pezophaps), sheathbills (Chionis), some guans (Crax) and stone curlews (Burhinus) use a bony knob on the alular metacarpal to punch and hammer opponents. The jacanas Actophilornis and Irediparra have an expanded, blade-like radius. The extinct Xenicibis was unique in having an elongate forelimb and massive hand which likely functioned in combat or defence as a jointed club or flail. Swans, for instance, may strike with the bony spurs and bite when defending eggs or young. Question: What do stone curlews use to 