**Load Tokenized Dataset or Download the base dataset (not tokenized) from hf**

In [5]:
from datasets import load_from_disk
from pathlib import Path

#Initialize a bool to check if the tokenized dataset exists in disk or if it has to be tokenized 
tokenized_dataset_exists = False

dataset_path = Path("./data/tokenized_dataset")

if dataset_path.exists():
    tokenized_dataset = load_from_disk(str(dataset_path))
    tokenized_dataset_exists = True
    print("Dataset loaded successfully!")
else:
    print("Dataset does not exist at the specified path.")

Dataset loaded successfully!


**If the tokenized dataset doesnt exist on disk we download it and tokenize it**

In [7]:
from datasets import load_dataset

if tokenized_dataset_exists == False:
    # Load the dataset and specify the cache directory
    whole_dataset = load_dataset("wikimedia/wikipedia", "20231101.es", cache_dir="./data")
    ds = ds["train"].train_test_split(test_size=0.98, seed=42) #We split the dataset to make it smaller and 
    #try different training parameters faster
else:
    print("Skipping the download of whole dataset from Huggingface")

Skipping the download of whole dataset from Huggingface


## We load the tokenizer from disk or create the tokenizer if it doesnt exist

Function to load the text from the dataset to tokenizer

In [2]:
def get_training_corpus():
    for sample in ds["train"]:
        yield sample["text"]  # Extract text

**Load tokenizer // Create tokenizer**

In [8]:
from transformers import PreTrainedTokenizerFast
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from datasets import load_from_disk
from pathlib import Path

#Initialize a bool to check if the tokenized dataset exists in disk or if it has to be tokenized 
tokenized_dataset_exists = False

tokenizer_path = Path("./data/scratch_tokenizer.json")

if tokenizer_path.exists():
    #load tokenizer if its saved on disk
    hf_tokenizer = PreTrainedTokenizerFast(tokenizer_file=str(tokenizer_path), 
                                       unk_token="[UNK]", 
                                       pad_token="[PAD]", 
                                       mask_token="[MASK]")
    # Save it in the Hugging Face format
    hf_tokenizer.save_pretrained("custom_tokenizer")
    print("Tokenizer loaded! 🎉")
else:
    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    trainer = BpeTrainer(special_tokens=["[UNK]", "[PAD]", "[MASK]"], vocab_size=30_000)
    tokenizer.pre_tokenizer = Whitespace()
    
    
    
    
    tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)
    tokenizer.save("./data/scratch_tokenizer.json")
    print("Tokenizer training complete! 🎉")


# Test with Transformers API
print(hf_tokenizer.encode("Hello world!"))

Tokenizer loaded! 🎉
[42, 8735, 21776, 3]


**Tokenize the dataset if not in disk**

At the beginning we downloaded the whole dataset in the case the tokenized one didnt exist
but we havent tokenized it

In [16]:
# Function to tokenize text in 128-token chunks
def tokenize_function(examples):
    # Tokenize text and split into chunks of max length 128
    tokenized_text = hf_tokenizer(
        examples["text"], 
        truncation=True, 
        padding="max_length", 
        max_length=128
    )
    return tokenized_text
    
if not dataset_path.exists():
    # Apply tokenization to dataset
    tokenized_dataset = ds["train"].map(tokenize_function, batched=True)
    tokenized_dataset.save_to_disk(str(dataset_path))
    print(f"Dataset tokenized and saved to disk on path {string(dataset_path)}")

## Creating a data loader

In [17]:
import torch
from torch.utils.data import DataLoader

# Convert dataset to PyTorch format
tokenized_text = torch.tensor(tokenized_dataset["input_ids"])
tokenized_text.shape

torch.Size([36823, 128])

In [18]:

# Shift left to create targets
input_data = tokenized_text[:, :-1]  # Remove LAST token in each sequence
target_data = tokenized_text[:, 1:]  # Remove FIRST token in each sequence

print("Input Data Shape:", input_data.shape)  # Should be (6455, 127)
print("Target Data Shape:", target_data.shape)  # Should be (6455, 127)

Input Data Shape: torch.Size([36823, 127])
Target Data Shape: torch.Size([36823, 127])


In [19]:
from torch.utils.data import DataLoader, TensorDataset

batch_size = 64
dataset = TensorDataset(input_data, target_data)
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True,  num_workers=4, pin_memory=True)


In [21]:
##Both pytorch and HF tokenizer will want to use many cpu cores, and will result in error
## Since we already tokenized the whole corpus we can disable the parallelism of the hf tokenizer
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [22]:
# Check batch shapes
for batch in data_loader:
    inputs, targets = batch
    print("Batch input shape:", inputs.shape)
    print("Batch target shape:", targets.shape)
    break  # St

Batch input shape: torch.Size([64, 127])
Batch target shape: torch.Size([64, 127])


## LSTM Model

In [23]:
import torch
import torch.nn as nn

class SimpleRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, output_dim):
        super(SimpleRNN, self).__init__()
        
        # 🔹 First layer: Embedding
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        # 🔁 LSTM Layers (or use nn.RNN / nn.GRU)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        
        # ⚡ Output layer (for classification, etc.)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, x):
        # x: [batch_size, sequence_length]
        
        embedded = self.embedding(x)  # 🎭 Convert indices to embeddings
        output, hidden = self.rnn(embedded)  # 🔁 Recurrent processing
        logits = self.fc(output)  # ⚡ Final classification
        
        return logits


## Hyperparameters

In [25]:
from torch import optim 
import torch.amp as amp  # For Automatic Mixed Precision (AMP)

vocab_size = 30000   # Number of words in vocabulary
embedding_dim = 200  # from 100 to 200, for instance
hidden_dim = 256     # from 128 to 256  # Size of RNN hidden state
output_dim = vocab_size      # 2 for Binary classification (Positive/Negative) or vocab_size for next token generation
num_layers = 8

model = SimpleRNN(vocab_size, embedding_dim, hidden_dim, num_layers, output_dim)


criterion = nn.CrossEntropyLoss()
lr =0.003
optimizer = optim.Adam(model.parameters(), lr=lr)
num_epochs = 10
scaler = amp.GradScaler()  # GradScaler for AMP

## Use dataloader batches for smaller inputs for memory

In [26]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

# Train the model



In [33]:
import time

model.to(device)
num_epochs = 5
for epoch in range(num_epochs):
    epoch_start = time.time()
    total_data_loading_time = 0.0
    total_gpu_compute_time = 0.0

    model.train()
    total_loss = 0.0

    # Initialize batch_start to measure data loading for the first batch.
    batch_start = time.time()

    for batch in data_loader:
        # Once batch is fetched, measure data loading time:
        batch_loaded_time = time.time()
        total_data_loading_time += (batch_loaded_time - batch_start)
        
        # Unpack and send to device:
        batch_inputs, batch_targets = batch
        batch_inputs = batch_inputs.to(device).long()
        batch_targets = batch_targets.to(device)

        optimizer.zero_grad()

        # Start GPU compute timing:
        gpu_start = time.time()
        with amp.autocast(device_type="cuda", dtype=torch.float16):
            outputs = model(batch_inputs)
            loss = criterion(outputs.view(-1, vocab_size), batch_targets.view(-1))
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        gpu_end = time.time()
        total_gpu_compute_time += (gpu_end - gpu_start)

        total_loss += loss.item()
        
        # Prepare for next iteration: record time after optimizer step.
        batch_start = time.time()

    epoch_end = time.time()
    epoch_duration = epoch_end - epoch_start
    avg_loss = total_loss / len(data_loader)

    print(f"Epoch {epoch+1} Summary:")
    print(f"  Total Epoch Time     : {epoch_duration:.2f} s")
    print(f"  Total Data Loading   : {total_data_loading_time:.2f} s")
    print(f"  Total GPU Compute    : {total_gpu_compute_time:.2f} s")
    print(f"  Average Loss         : {avg_loss:.4f}\n")

print("Training complete.")

Epoch 1 Summary:
  Total Epoch Time     : 52.84 s
  Total Data Loading   : 0.11 s
  Total GPU Compute    : 48.48 s
  Average Loss         : 4.2657

Epoch 2 Summary:
  Total Epoch Time     : 52.83 s
  Total Data Loading   : 0.11 s
  Total GPU Compute    : 48.48 s
  Average Loss         : 4.2098

Epoch 3 Summary:
  Total Epoch Time     : 52.97 s
  Total Data Loading   : 0.11 s
  Total GPU Compute    : 48.62 s
  Average Loss         : 4.1595

Epoch 4 Summary:
  Total Epoch Time     : 52.88 s
  Total Data Loading   : 0.11 s
  Total GPU Compute    : 48.54 s
  Average Loss         : 4.1119

Epoch 5 Summary:
  Total Epoch Time     : 53.08 s
  Total Data Loading   : 0.11 s
  Total GPU Compute    : 48.71 s
  Average Loss         : 4.0734

Training complete.


In [28]:
import torch

def generate_text(model, tokenizer, prompt, num_words=50, device="cuda"):
    """
    Generate text using the  model.

    Args:
    - model: 
    - tokenizer: Tokenizer with `encode()` and `decode()` methods.
    - prompt: Seed text to start generation.
    - num_words: Number of words to generate.
    - device: "cuda" or "cpu".

    Returns:
    - Generated text (string).
    """
    model.eval()  # ✅ Set model to evaluation mode
    model.to(device)

    # 👇 Tokenize the input prompt
    input_ids = torch.tensor(tokenizer.encode(prompt), dtype=torch.long).unsqueeze(0).to(device)

    for _ in range(num_words):
        with torch.no_grad():
            logits = model(input_ids)  # 🔍 Forward pass
            next_token_logits = logits[:, -1, :]  # Take last token's output

            # 🎲 Sample the next word (greedy or probabilistic)
            next_token = torch.argmax(next_token_logits, dim=-1).unsqueeze(0)  # Greedy decoding

            # Append to input sequence
            input_ids = torch.cat([input_ids, next_token], dim=1)

    # 🔄 Convert token IDs back to text
    generated_text = tokenizer.decode(input_ids.squeeze().tolist())

    return generated_text

In [36]:

# Generate text
prompt_text = "La ciudad de"
generated_story = generate_text(model, hf_tokenizer, prompt_text, num_words=30)
print(generated_story)

La ciudad de la República de la Universidad de la Universidad de Buenos Aires fue el órgano de fútbol de la República de la República de la República de la República de la
