<a href="https://colab.research.google.com/github/MirudulaShri260302/LLM_Data/blob/main/Lab_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
import torch
import random


In [None]:
# Load the "train" split of the "wikitext" dataset (configuration
# "wikitext-2-raw-v1") and report how many rows it contains.
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
print(f"Number of lines in dataset: {len(dataset)}")


In [None]:
# Load the pretrained "gpt2" tokenizer.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
# Register "<PARA>" as an additional special token; group_texts appends its id
# after every tokenized row.
# NOTE(review): a model paired with this tokenizer presumably needs its
# embedding table resized to len(tokenizer) after this call — confirm.
tokenizer.add_special_tokens({"additional_special_tokens": ["<PARA>"]})
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# Look up the id of "<PARA>"; this must run after the token is registered above.
para_id = tokenizer.convert_tokens_to_ids("<PARA>")



In [None]:
def tokenize_function(examples):
    """Tokenize the "text" column of a batch with the module-level tokenizer.

    Args:
        examples: Batch mapping that contains a "text" entry.

    Returns:
        Whatever the tokenizer returns for those texts, requested without a
        special-tokens mask.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, return_special_tokens_mask=False)
    return encoded


In [None]:
# Tokenize the whole dataset in batches and drop the raw "text" column so that
# only the columns produced by tokenize_function remain.
tokenized_ds = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"]
)

# Sanity check: show (up to) the first 20 token ids of the first row.
print("First 20 input IDs:", tokenized_ds[0]["input_ids"][:20])




In [None]:
# Default length (in tokens) of every training block produced by group_texts.
block_size = 128


def group_texts(examples, chunk_size=None, para_token_id=None):
    """Concatenate tokenized rows and cut them into fixed-length blocks.

    Each row of the batch is followed by one separator token (with attention
    mask value 1). The rows are then joined into a single long sequence, which
    is split into consecutive blocks. Trailing tokens that do not fill a
    complete block are dropped.

    Args:
        examples: Batch mapping with "input_ids" and "attention_mask", each a
            list of per-row lists.
        chunk_size: Length of every output block. When None, the module-level
            ``block_size`` is used (looked up at call time).
        para_token_id: Id of the separator appended after each row. When None,
            the module-level ``para_id`` is used (looked up at call time).

    Returns:
        Dict with "input_ids" and "attention_mask"; each is a list of blocks
        holding exactly ``chunk_size`` items.

    Raises:
        ValueError: If the effective block length is not positive.
    """
    size = block_size if chunk_size is None else chunk_size
    if size <= 0:
        raise ValueError(f"block size must be positive, got {size}")
    sep_id = para_id if para_token_id is None else para_token_id

    flat_ids = []
    flat_mask = []
    # NOTE: a separator is appended after every row, including empty ones, so
    # consecutive empty rows yield consecutive separator tokens.
    for ids, mask in zip(examples["input_ids"], examples["attention_mask"]):
        flat_ids.extend(ids)
        flat_ids.append(sep_id)
        flat_mask.extend(mask)
        flat_mask.append(1)

    # Largest multiple of the block length that fits; the remainder is dropped.
    usable = (len(flat_ids) // size) * size
    starts = range(0, usable, size)

    return {
        "input_ids": [flat_ids[i:i + size] for i in starts],
        "attention_mask": [flat_mask[i:i + size] for i in starts],
    }


In [None]:
# Regroup the tokenized rows into fixed-length blocks, handing group_texts
# batches of 1000 rows at a time; tokens left over at the end of each batch
# that do not fill a whole block are discarded by group_texts.
lm_ds = tokenized_ds.map(group_texts, batched=True, batch_size=1000)



In [None]:
def collate_fn(batch):
    """Stack the "input_ids" of a list of examples into model-ready tensors.

    Args:
        batch: Sequence of examples, each exposing an "input_ids" list.

    Returns:
        Dict with "input_ids" (a torch.long tensor built from the examples'
        ids) and "labels" (an independent copy of that tensor).
    """
    rows = [item["input_ids"] for item in batch]
    token_ids = torch.tensor(rows, dtype=torch.long)
    # Labels are a separate copy so later in-place edits to one tensor do not
    # affect the other.
    targets = token_ids.clone()
    return {"input_ids": token_ids, "labels": targets}

# Serve the grouped dataset in shuffled batches of 8 examples, assembled into
# tensors by collate_fn.
train_loader = DataLoader(lm_ds, batch_size=8, shuffle=True, collate_fn=collate_fn)


In [None]:
# Inspect only the first batch: print the shape of its "input_ids" tensor and
# the first 300 characters of the decoded first sequence (special tokens are
# kept in the text), then stop iterating.
for batch in train_loader:
    print(batch["input_ids"].shape)

    decoded = tokenizer.decode(batch["input_ids"][0], skip_special_tokens=False)
    print(decoded[:300])
    break
