<a href="https://colab.research.google.com/github/MirudulaShri260302/LLM_Data/blob/main/Lab2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
Streaming Language Modeling Data Pipeline with Hugging Face Datasets
--------------------------------------------------------------------
Goal:
    Demonstrate how to build a *true streaming* LM pipeline that:
    - Processes data without loading the entire dataset into RAM.
    - Tokenizes on the fly.
    - Concatenates text and chunks into fixed-length blocks for LM training.
    - Produces batches ready for training in PyTorch.

Key Teaching Points:
    1. Streaming allows us to work with web-scale corpora.
    2. We still can do grouping/chunking in a rolling fashion.
    3. This approach mimics real-world pipelines for large-scale LM training.
"""
# Echo the pipeline description; this relies on the leading string literal
# above having been bound to __doc__.
print(__doc__)

In [None]:
# !pip install datasets transformers torch

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer
from torch.utils.data import IterableDataset, DataLoader
import torch

In [None]:
# ============================================================
# 1. Load the dataset in STREAMING mode
# ============================================================
# streaming=True is requested so the split is iterated lazily rather than
# loaded in full before use.
stream_dataset = load_dataset(
    path="wikitext", name="wikitext-2-raw-v1", split="train", streaming=True
)

In [None]:
# ============================================================
# 2. Initialize GPT-2 tokenizer
# ============================================================
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Keep the end-of-sequence id handy; the grouping step uses it both as the
# separator between examples and as the padding id.
eos_id = tokenizer.eos_token_id

# GPT-2 normally has no pad token, so EOS doubles as the pad token.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# ============================================================
# 3. Tokenization step (no padding, no truncation)
# ============================================================
def tokenize_function(examples):
    """Tokenize the "text" column of a batch with the notebook-level tokenizer.

    No padding or truncation arguments are passed, so the tokenizer's
    defaults apply; fixed-length blocks are built later in the pipeline.
    """
    texts = examples["text"]
    return tokenizer(texts)


# batched=True hands the function a batch of examples per call.
tokenized_stream = stream_dataset.map(function=tokenize_function, batched=True)

In [None]:
# ============================================================
# 4. Rolling buffer with EOS insertion + padding final block
# ============================================================
block_size = 128


def group_texts_streaming(dataset_iter, block_size, eos_id):
    """Concatenate tokenized examples and yield fixed-length blocks.

    Every example's ``input_ids`` are appended to a rolling buffer, followed
    by one ``eos_id`` separator. Whenever the buffer holds at least
    ``block_size`` tokens, full blocks are yielded. Once the input is
    exhausted, any leftover tokens are right-padded with ``eos_id`` and
    yielded as a final block whose attention mask is 0 on the padding.

    Args:
        dataset_iter: Iterable of mappings, each with an ``"input_ids"``
            sequence of token ids.
        block_size: Number of tokens per yielded block; must be >= 1.
        eos_id: Token id used as the example separator and as padding.

    Yields:
        dict with ``"input_ids"`` and ``"attention_mask"``, both lists of
        length ``block_size``.

    Raises:
        ValueError: If ``block_size`` is smaller than 1. Because this is a
            generator, the error surfaces on the first iteration step.
    """
    # A non-positive block size would make the chunking loop spin forever
    # (the buffer is always "long enough" and never shrinks).
    if block_size < 1:
        raise ValueError(f"block_size must be >= 1, got {block_size!r}")

    buffer = []

    for example in dataset_iter:
        # Add tokens for this example, then the separator.
        buffer.extend(example["input_ids"])
        buffer.append(eos_id)

        # Emit every complete block by offset, then trim the buffer once.
        # Re-slicing the tail after each block would copy the remaining
        # tokens repeatedly, which is quadratic for long examples.
        full_end = len(buffer) - len(buffer) % block_size
        for start in range(0, full_end, block_size):
            yield {
                "input_ids": buffer[start:start + block_size],
                # Fresh list per block so consumers never share mask objects.
                "attention_mask": [1] * block_size,
            }
        del buffer[:full_end]

    # Input exhausted: pad the leftover tokens instead of dropping them.
    if buffer:
        pad_length = block_size - len(buffer)
        yield {
            "input_ids": buffer + [eos_id] * pad_length,
            "attention_mask": [1] * len(buffer) + [0] * pad_length,
        }


In [None]:
# ============================================================
# 5. IterableDataset wrapper
# ============================================================
class StreamingLMIterableDataset(IterableDataset):
    """Iterable dataset that yields fixed-length LM blocks from a token stream.

    Iteration is delegated to ``group_texts_streaming``, so each item is a
    dict with ``"input_ids"`` and ``"attention_mask"`` lists.

    Args:
        hf_iterable_dataset: Iterable of tokenized examples, each exposing
            ``"input_ids"``.
        block_size: Number of tokens per yielded block.
        eos_id: Token id used as the separator between examples and as
            padding. When ``None`` (the default), the notebook-level
            ``eos_id`` is looked up at iteration time, which matches the
            previous behaviour of this class.

    NOTE(review): nothing here shards the stream per DataLoader worker.
    With ``num_workers > 0`` each worker would presumably iterate the whole
    stream and produce duplicates; confirm before enabling workers.
    """

    def __init__(self, hf_iterable_dataset, block_size, eos_id=None):
        super().__init__()
        self.dataset = hf_iterable_dataset
        self.block_size = block_size
        self.eos_id = eos_id

    def __iter__(self):
        # Prefer the explicitly configured id; fall back to the global only
        # for callers that still construct the dataset without one.
        separator_id = self.eos_id if self.eos_id is not None else eos_id
        return group_texts_streaming(self.dataset, self.block_size, separator_id)


# Wrap the tokenized stream so it yields block_size-token blocks.
grouped_iterable_dataset = StreamingLMIterableDataset(
    hf_iterable_dataset=tokenized_stream,
    block_size=block_size,
)

In [None]:
# ============================================================
# 6. Collate function
# ============================================================
def collate_fn(batch):
    """Stack equally sized examples into LM training tensors.

    Args:
        batch: List of dicts, each with ``"input_ids"`` and
            ``"attention_mask"`` lists of the same length.

    Returns:
        dict of ``torch.long`` tensors:
            ``"input_ids"`` and ``"attention_mask"`` stacked along a new
            first dimension, and ``"labels"``, a copy of ``input_ids`` in
            which every position with attention mask 0 is set to -100.
    """
    # -100 is the default ignore_index of PyTorch's cross-entropy loss.
    ignore_index = -100

    input_ids = torch.tensor([ex["input_ids"] for ex in batch], dtype=torch.long)
    attention_mask = torch.tensor([ex["attention_mask"] for ex in batch], dtype=torch.long)

    # For LM training, labels = input_ids, except on padding. The padding
    # reuses the EOS id, so it can only be told apart via the attention
    # mask; without this the loss would also be computed on padded slots.
    labels = input_ids.clone()
    labels[attention_mask == 0] = ignore_index

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }

In [None]:
# ============================================================
# 7. DataLoader for streaming data
# ============================================================
# Batches of 8 blocks, assembled by collate_fn.
train_loader = DataLoader(
    dataset=grouped_iterable_dataset,
    collate_fn=collate_fn,
    batch_size=8,
)


# ============================================================
# 8. Iterate over a few batches
# ============================================================
print("Sample streaming batches (with EOS + padded final block):")
# Pairing with range(3) stops after three batches (indices 0..2) without
# requesting a fourth one from the loader.
for i, batch in zip(range(3), train_loader):
    print(f"Batch {i} -> input_ids shape: {batch['input_ids'].shape}")




In [None]:
print("Sample streaming batches (with EOS + padded final block):")
# Preview the first three batches (indices 0..2); range(3) comes first in
# the zip so no fourth batch is pulled from the loader.
for i, batch in zip(range(3), train_loader):
    print(f"\nBatch {i} -> input_ids shape: {batch['input_ids'].shape}")

    # Decode only the first sequence of the batch, keeping special tokens
    # visible so EOS separators and padding show up in the text.
    seq = batch["input_ids"][0].tolist()
    decoded = tokenizer.decode(seq, skip_special_tokens=False)

    print("Decoded text sample:")
    print(decoded)
    print("-" * 80)
