**Load Tokenized Dataset or Download the base dataset (not tokenized) from hf**

In [2]:
from datasets import load_from_disk
from pathlib import Path

#Initialize a bool to check if the tokenized dataset exists in disk or if it has to be tokenized 
tokenized_dataset_exists = False

dataset_path = Path("./data/tokenized_dataset")

if dataset_path.exists():
    tokenized_dataset = load_from_disk(str(dataset_path))
    tokenized_dataset_exists = True
    print("Dataset loaded successfully!")
else:
    print("Dataset does not exist at the specified path.")

Dataset loaded successfully!


**If the tokenized dataset doesnt exist on disk we download it and tokenize it**

In [3]:
from datasets import load_dataset

if tokenized_dataset_exists == False:
    # Load the dataset and specify the cache directory
    whole_dataset = load_dataset("wikimedia/wikipedia", "20231101.es", cache_dir="./data")
    ds = ds["train"].train_test_split(test_size=0.98, seed=42) #We split the dataset to make it smaller and 
    #try different training parameters faster
else:
    print("Skipping the download of whole dataset from Huggingface")

Skipping the download of whole dataset from Huggingface


## We load the tokenizer from disk or create the tokenizer if it doesnt exist

Function to load the text from the dataset to tokenizer

In [4]:
def get_training_corpus():
    for sample in ds["train"]:
        yield sample["text"]  # Extract text

**Load tokenizer // Create tokenizer**

In [5]:
from transformers import PreTrainedTokenizerFast
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from datasets import load_from_disk
from pathlib import Path

#Initialize a bool to check if the tokenized dataset exists in disk or if it has to be tokenized 
tokenized_dataset_exists = False

tokenizer_path = Path("./data/scratch_tokenizer.json")

if tokenizer_path.exists():
    #load tokenizer if its saved on disk
    hf_tokenizer = PreTrainedTokenizerFast(tokenizer_file=str(tokenizer_path), 
                                       unk_token="[UNK]", 
                                       pad_token="[PAD]", 
                                       mask_token="[MASK]")
    # Save it in the Hugging Face format
    hf_tokenizer.save_pretrained("custom_tokenizer")
    print("Tokenizer loaded! 🎉")
else:
    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    trainer = BpeTrainer(special_tokens=["[UNK]", "[PAD]", "[MASK]"], vocab_size=30_000)
    tokenizer.pre_tokenizer = Whitespace()
    
    
    
    
    tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)
    tokenizer.save("./data/scratch_tokenizer.json")
    print("Tokenizer training complete! 🎉")


# Test with Transformers API
print(hf_tokenizer.encode("Hello world!"))

Tokenizer loaded! 🎉
[42, 8735, 21776, 3]


**Tokenize the dataset if not in disk**

At the beginning we downloaded the whole dataset in the case the tokenized one didnt exist
but we havent tokenized it

In [6]:
# Function to tokenize text in 128-token chunks
def tokenize_function(examples):
    # Tokenize text and split into chunks of max length 128
    tokenized_text = hf_tokenizer(
        examples["text"], 
        truncation=True, 
        padding="max_length", 
        max_length=128
    )
    return tokenized_text
    
if not dataset_path.exists():
    # Apply tokenization to dataset
    tokenized_dataset = ds["train"].map(tokenize_function, batched=True)
    tokenized_dataset.save_to_disk(str(dataset_path))
    print(f"Dataset tokenized and saved to disk on path {string(dataset_path)}")

## Creating a data loader

In [7]:
import torch
from torch.utils.data import DataLoader

# Convert dataset to PyTorch format
tokenized_text = torch.tensor(tokenized_dataset["input_ids"])
tokenized_text.shape

torch.Size([36823, 128])

In [8]:

# Shift left to create targets
input_data = tokenized_text[:, :-1]  # Remove LAST token in each sequence
target_data = tokenized_text[:, 1:]  # Remove FIRST token in each sequence

print("Input Data Shape:", input_data.shape)  # Should be (6455, 127)
print("Target Data Shape:", target_data.shape)  # Should be (6455, 127)

Input Data Shape: torch.Size([36823, 127])
Target Data Shape: torch.Size([36823, 127])


In [9]:
from torch.utils.data import DataLoader, TensorDataset

batch_size = 64
dataset = TensorDataset(input_data, target_data)
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True,  num_workers=4, pin_memory=True)


In [10]:
##Both pytorch and HF tokenizer will want to use many cpu cores, and will result in error
## Since we already tokenized the whole corpus we can disable the parallelism of the hf tokenizer
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [11]:
# Check batch shapes
for batch in data_loader:
    inputs, targets = batch
    print("Batch input shape:", inputs.shape)
    print("Batch target shape:", targets.shape)
    break  # St

Batch input shape: torch.Size([64, 127])
Batch target shape: torch.Size([64, 127])


## Attention Model

**Positional encodings**


In [12]:
import torch
a = torch.tensor([[1,2,3],[4,5,6]])
b = torch.tensor( [2, 2, 2])
c = a* b
c

tensor([[ 2,  4,  6],
        [ 8, 10, 12]])

In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [14]:
import torch
import math

def get_positional_encodings(max_seq_length, d_model, device):
    p_e = torch.zeros(max_seq_length, d_model, device = device)  # Shape (L, D)
    position = torch.arange(0, max_seq_length, dtype=torch.float32).unsqueeze(1)  # Shape (L, 1)
    div_term = torch.exp(torch.arange(0, d_model, 2, dtype=torch.float32) * (-math.log(10000.0) / d_model))

    p_e[:, 0::2] = torch.sin(position * div_term)
    p_e[:, 1::2] = torch.cos(position * div_term)
    
    return p_e  # Shape (L, D)

# Test Code
batch_size = 4
seq_length = 127  # This corresponds to max_seq_length in the function
d_model = 200


# Generate positional encodings for the given sequence length and model dimension
pos_enc = get_positional_encodings(seq_length, d_model, device)  # Shape (L, D)
print(pos_enc.unsqueeze(0).shape)
# Expanding for batch size
pos_enc_batch = pos_enc.unsqueeze(0).expand(batch_size, -1, -1)  # Shape (B, L, D)

# Print the shape to verify
print("Positional Encoding Shape:", pos_enc_batch.shape)  # Expected: (4, 10, 16)


torch.Size([1, 127, 200])
Positional Encoding Shape: torch.Size([4, 127, 200])


**Attention Head**

In [15]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class SelfAttentionHead(nn.Module):
    def __init__(self, embedding_dim):
        """
        Initializes a self-attention head.
        
        Args:
            embedding_dim (int): The dimension of the input embeddings (d_model).
        """
        super(SelfAttentionHead, self).__init__()
        # Define the linear projection layers for queries, keys, and values
        self.q_layer = nn.Linear(embedding_dim, embedding_dim)
        self.k_layer = nn.Linear(embedding_dim, embedding_dim)
        self.v_layer = nn.Linear(embedding_dim, embedding_dim)
        self.layer_norm = nn.LayerNorm(embedding_dim)

        ## FFN - Feed Forward Network
        self.first_linear = nn.Linear(embedding_dim, 4 * embedding_dim) ## Amplifies the att output 
        
        self.relu_layer = nn.ReLU() #Relu between them

        self.second_linear = nn.Linear( 4 * embedding_dim, embedding_dim) ## Reduces the att output back to embedding size

    def forward(self, x):
        """
        Perform the self-attention operation.
        
        Args:
            x (Tensor): Input tensor of shape (batch_size, seq_length, embedding_dim)
        
        Returns:
            att_value (Tensor): The attention output of shape (batch_size, seq_length, embedding_dim)
        """
        # Compute queries, keys, and values
        Q = self.q_layer(x)  # (batch_size, seq_length, embedding_dim)
        K = self.k_layer(x)  # (batch_size, seq_length, embedding_dim)
        V = self.v_layer(x)  # (batch_size, seq_length, embedding_dim)
        
        # Compute the dot-product attention scores and scale them
        scores = torch.matmul(Q, K.transpose(-2, -1))  # (batch_size, seq_length, seq_length)
        scores = scores / math.sqrt(Q.size(-1))
        mask = torch.triu(torch.ones(x.size(1), x.size(1), device=x.device), diagonal=1).bool()

        # Expand mask dimensions for batch (or rely on broadcasting)
        scores = scores.masked_fill(mask, float('-inf'))
        
        # Apply softmax to obtain attention weights
        att_weights = F.softmax(scores, dim=-1)  # (batch_size, seq_length, seq_length)
        
        # Multiply attention weights by the values to get the final output
        att_value = torch.matmul(att_weights, V)  # (batch_size, seq_length, embedding_dim)

        #Normalization
        normalized_attention_values = self.layer_norm(att_value)

        #FFW
        amplified_attention = self.first_linear(normalized_attention_values) #First forward layer
        
        amplified_attention = self.relu_layer(amplified_attention) #Relu
        
        attention_logits = self.second_linear(amplified_attention) #Second forward layer
        return attention_logits

# Example usage:
if __name__ == "__main__":
    # Hyperparameters
    batch_size = 4
    seq_length = 10
    embedding_dim = 64

    # Dummy input tensor
    x_test = torch.randn(batch_size, seq_length, embedding_dim)
    
    # Create the self-attention head instance
    attention_head = SelfAttentionHead(embedding_dim)
    
    # Run the forward pass
    output = attention_head(x_test)
    print("Attention output shape:", output.shape)


Attention output shape: torch.Size([4, 10, 64])


In [16]:
import torch
import torch.nn as nn

class Transformers(nn.Module):
    def __init__(self, max_seq_length, vocab_size, embedding_dim):
        super(Transformers, self).__init__()
        
        self.pos_enc = get_positional_encodings(max_seq_length-1, d_model, device).unsqueeze(0)  # Shape (1, L, D)

        # 🔹 First layer: Embedding
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        self.attention_head = SelfAttentionHead(embedding_dim)

        self.logits = nn.Linear(embedding_dim, vocab_size)
        
    def forward(self, x):
        # x: [batch_size, sequence_length]
        
        embedded = self.embedding(x)  # 🎭 Convert indices to embeddings
        pos_enc = self.pos_enc # pos_enc is (1, L, embedding_dim)
        pos_enc = pos_enc[:, :embedded.size(1), :]  # slice to the input length
        x = embedded + pos_enc

        attention_value = self.attention_head(x)

        logits = self.logits(attention_value)
        return logits


## Hyperparameters

In [17]:
from torch import optim 
import torch.amp as amp  # For Automatic Mixed Precision (AMP)

batch_size = 32

vocab_size = 30000   # Number of words in vocabulary

#d_model
embedding_dim = 200  # from 100 to 200, for instance

max_seq_lenght = 128

model = Transformers(max_seq_lenght, vocab_size, embedding_dim)
model.to(device)

criterion = nn.CrossEntropyLoss()
lr =0.003
optimizer = optim.Adam(model.parameters(), lr=lr)
num_epochs = 10
scaler = amp.GradScaler()  # GradScaler for AMP

**Testing the transformer model witn an input of shape (B, L, D)**

In [18]:

# Generate random input of shape (B, L, D)
input_tensor = torch.randint(0, vocab_size, (batch_size, max_seq_lenght-1)).long().to(device)
# Forward pass
output = model(input_tensor)

# Assertions to validate correct output
print("Input shape:", input_tensor.shape)  # Expected: (B, L, D)
print(f"Expected Shape: ({batch_size}, {max_seq_lenght-1}, {vocab_size})")
print(f"Actual Shape: {output.shape}")

assert output.shape == (batch_size, max_seq_lenght-1, vocab_size), "❌ Mismatch in output shape!"
print("✅ Transformer test passed!")

Input shape: torch.Size([32, 127])
Expected Shape: (32, 127, 30000)
Actual Shape: torch.Size([32, 127, 30000])
✅ Transformer test passed!


## Use dataloader batches for smaller inputs for memory

In [19]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device


device(type='cuda')

# Train the model



In [28]:
import time

num_epochs = 5
for epoch in range(num_epochs):
    epoch_start = time.time()
    total_data_loading_time = 0.0
    total_gpu_compute_time = 0.0

    model.train()
    total_loss = 0.0

    # Initialize batch_start to measure data loading for the first batch.
    batch_start = time.time()

    for batch in data_loader:
        # Once batch is fetched, measure data loading time:
        batch_loaded_time = time.time()
        total_data_loading_time += (batch_loaded_time - batch_start)
        
        # Unpack and send to device:
        batch_inputs, batch_targets = batch
        batch_inputs = batch_inputs.to(device).long()
        batch_targets = batch_targets.to(device)

        optimizer.zero_grad()

        # Start GPU compute timing:
        gpu_start = time.time()
        with amp.autocast(device_type="cuda", dtype=torch.float16):
            outputs = model(batch_inputs)
            loss = criterion(outputs.view(-1, vocab_size), batch_targets.view(-1))
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        gpu_end = time.time()
        total_gpu_compute_time += (gpu_end - gpu_start)

        total_loss += loss.item()
        
        # Prepare for next iteration: record time after optimizer step.
        batch_start = time.time()

    epoch_end = time.time()
    epoch_duration = epoch_end - epoch_start
    avg_loss = total_loss / len(data_loader)

    print(f"Epoch {epoch+1} Summary:")
    print(f"  Total Epoch Time     : {epoch_duration:.2f} s")
    print(f"  Total Data Loading   : {total_data_loading_time:.2f} s")
    print(f"  Total GPU Compute    : {total_gpu_compute_time:.2f} s")
    print(f"  Average Loss         : {avg_loss:.4f}\n")

print("Training complete.")

Epoch 1 Summary:
  Total Epoch Time     : 37.98 s
  Total Data Loading   : 0.14 s
  Total GPU Compute    : 34.74 s
  Average Loss         : 3.6463

Epoch 2 Summary:
  Total Epoch Time     : 38.76 s
  Total Data Loading   : 0.14 s
  Total GPU Compute    : 35.43 s
  Average Loss         : 3.5808

Epoch 3 Summary:
  Total Epoch Time     : 38.03 s
  Total Data Loading   : 0.14 s
  Total GPU Compute    : 34.78 s
  Average Loss         : 3.5238

Epoch 4 Summary:
  Total Epoch Time     : 37.11 s
  Total Data Loading   : 0.13 s
  Total GPU Compute    : 33.94 s
  Average Loss         : 3.4707

Epoch 5 Summary:
  Total Epoch Time     : 37.16 s
  Total Data Loading   : 0.13 s
  Total GPU Compute    : 33.98 s
  Average Loss         : 3.4239

Training complete.


In [21]:
import torch

def generate_text(model, tokenizer, prompt, num_words=50, device="cuda"):
    """
    Generate text using the  model.

    Args:
    - model: 
    - tokenizer: Tokenizer with `encode()` and `decode()` methods.
    - prompt: Seed text to start generation.
    - num_words: Number of words to generate.
    - device: "cuda" or "cpu".

    Returns:
    - Generated text (string).
    """
    model.eval()  # ✅ Set model to evaluation mode
    model.to(device)

    # 👇 Tokenize the input prompt
    input_ids = torch.tensor(tokenizer.encode(prompt), dtype=torch.long).unsqueeze(0).to(device)

    for _ in range(num_words):
        with torch.no_grad():
            logits = model(input_ids)  # 🔍 Forward pass
            next_token_logits = logits[:, -1, :]  # Take last token's output

            # 🎲 Sample the next word (greedy or probabilistic)
            next_token = torch.argmax(next_token_logits, dim=-1).unsqueeze(0)  # Greedy decoding

            # Append to input sequence
            input_ids = torch.cat([input_ids, next_token], dim=1)

    # 🔄 Convert token IDs back to text
    generated_text = tokenizer.decode(input_ids.squeeze().tolist())

    return generated_text

In [32]:

# Generate text
prompt_text = "Argentina es un pais muy maravilloso,"
generated_story = generate_text(model, hf_tokenizer, prompt_text, num_words=30)
print(generated_story)

Argentina es un pais muy maravillos o , que se encuentra en la ciudad de San Juan , en la provincia de Buenos Aires , Argentina . Fue uno de los más importantes de la ciudad de Trujillo
