In [1]:
from transformers import LlamaConfig

# Define the LLaMA-like configuration
config = LlamaConfig(
    vocab_size=32000,  # Size of the tokenizer vocabulary
    hidden_size=1024,  # Dimension of model embeddings
    intermediate_size=4096,  # Size of the feedforward layer
    num_hidden_layers=12,  # Number of transformer layers
    num_attention_heads=16,  # Number of attention heads
    max_position_embeddings=2048,  # Maximum sequence length
    rms_norm_eps=1e-5,  # Epsilon for RMSNorm
    use_cache=False,  # Disable cache for training
    initializer_range=0.02,  # Standard deviation for weight initialization
)

In [2]:
from transformers import LlamaForCausalLM

# Initialize the model from the configuration
model = LlamaForCausalLM(config)

In [3]:
from datasets import load_dataset, Dataset

streamed_dataset = load_dataset("EleutherAI/the_pile_deduplicated", split="train", streaming=True)

# Take the first 5000 rows
dataset = list(streamed_dataset.take(5000))
dataset = Dataset.from_list(dataset)

print(dataset)

Resolving data files:   0%|          | 0/1650 [00:00<?, ?it/s]

Dataset({
    features: ['text'],
    num_rows: 5000
})


In [4]:
from transformers import AutoTokenizer

# Load the tokenizer from the Meta LLaMA Hugging Face repo
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
tokenizer.pad_token = tokenizer.eos_token 

# Test the tokenizer
print(tokenizer.encode("This is a test!"))

[128000, 2028, 374, 264, 1296, 0]


In [15]:
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding=True, return_tensors="pt")

tokenized_datasets = dataset.map(tokenize_function, batched=False)
#remove_columns=['text'])

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [16]:
from transformers import DataCollatorWithPadding

# Data collator for dynamic padding
# data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

def collate_fn(batch):
    # Tokenize input text with padding and truncation
    inputs = tokenizer(
        [item['text'] for item in batch],  # Extracting the text for tokenization
        padding=True,  # Pad to the longest sequence in the batch
        truncation=True,  # Truncate sequences to the max length
        return_tensors="pt",  # Return PyTorch tensors
    )
    
    # You may also want to shift labels for causal language modeling
    inputs["labels"] = inputs["input_ids"].clone()  # Labels for causal LM are same as input
    return inputs

# DataLoader
from torch.utils.data import DataLoader

train_dataloader = DataLoader(tokenized_datasets, batch_size=8, collate_fn=collate_fn)


In [17]:
from transformers import AdamW, get_scheduler

# Optimizer
optimizer = AdamW(model.parameters(), lr=3e-4)

# Learning rate scheduler
num_training_steps = 1000 * 1
lr_scheduler = get_scheduler(
    "linear", optimizer=optimizer, num_warmup_steps=500, num_training_steps=num_training_steps
)

In [19]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

num_epochs = 3
# optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
# lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)

for epoch in range(num_epochs):
    model.train()
    for batch in train_dataloader:
        # Prepare input and target tensors
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = input_ids.clone()  # Shifted labels for causal LM

        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

    print(f"Epoch {epoch} completed. Loss: {loss.item()}")

IndexError: index out of range in self