### Transformers modelo de lenguaje

- Contruir un modelo de lenguaje a partir del texto del Señor de Los
Anillos, con una red
- Transformer
siguiendo el ejemplo de los perceptrones y las redes recurrentes visto
en clase. Comparar los resultados entre los ejemplos de clase y los
modelos construidos. Puntos a considerar.
- Cómo evaluar la calidad de un modelo de lenguaje.
- Cuál es la influencia de la tokenización y del encoding en la calidad del
modelo.
- Qué otros factores influyen.
- Tiempo de aprendizaje de los diferentes modelos.

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import re
import math
import time
import torch.nn.functional as F

### Importamos el texto

#### Preprocesado de texto

In [5]:
# Preprocess text function
def preprocess_text(text, sequence_length):
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text).lower()
    tokens = text.split()

    sequences = []
    for i in range(len(tokens) - sequence_length):
        seq = tokens[i:i + sequence_length + 1]
        sequences.append(seq)

    return sequences

# Build vocabulary function
def build_vocab(sequences):
    all_tokens = [token for seq in sequences for token in seq]
    token_counts = Counter(all_tokens)
    vocab = {token: idx for idx, (token, _) in enumerate(token_counts.items(), 1)}
    vocab['<PAD>'] = 0  # Add padding token
    return vocab

# Convert sequences to indices function
def sequences_to_indices(sequences, vocab):
    return [[vocab[token] for token in seq] for seq in sequences]

# Read text file function
def read_text_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    return text



In [6]:
# Parameters
vocab_size = 10000
embedding_dim = 100
nhead = 4
num_encoder_layers = 2
dim_feedforward = 512
learning_rate = 0.001
batch_size = 64
num_epochs = 10
sequence_length = 10
file_path = "/content/LTR.txt"


# Read and preprocess the text file
text = read_text_file(file_path)
sequences = preprocess_text(text, sequence_length)
vocab = build_vocab(sequences)
indexed_sequences = sequences_to_indices(sequences, vocab)

# Adjust vocab_size according to the actual vocabulary size
vocab_size = len(vocab)

Construimos la estructura del tranformers, una muy similar a la del ejemplo de clase

In [7]:
# Define the dataset class
class LanguageDataset(Dataset):
    def __init__(self, sequences):
        self.sequences = sequences

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return torch.tensor(self.sequences[idx], dtype=torch.long)

# Positional Encoding class
class PositionalEncoding(nn.Module):
    def __init__(self, embedding_dim, max_len=5000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, embedding_dim)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embedding_dim, 2).float() * (-math.log(10000.0) / embedding_dim))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x.transpose(0, 1)  # Transformer expects (sequence_length, batch_size, embedding_dim)
        x = x + self.pe[:x.size(0), :]
        return x.transpose(0, 1)  # Revert to (batch_size, sequence_length, embedding_dim)

# Transformer Language Model class
class TransformerLanguageModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, nhead, num_encoder_layers, dim_feedforward, max_seq_length):
        super(TransformerLanguageModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.positional_encoding = PositionalEncoding(embedding_dim, max_seq_length)
        encoder_layers = nn.TransformerEncoderLayer(embedding_dim, nhead, dim_feedforward)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_encoder_layers)
        self.fc = nn.Linear(embedding_dim, vocab_size)
        self.embedding_dim = embedding_dim

    def forward(self, x, src_key_padding_mask=None):
        x = self.embedding(x) * math.sqrt(self.embedding_dim)
        x = self.positional_encoding(x)
        x = x.transpose(0, 1)  # Transformer expects (sequence_length, batch_size, embedding_dim)
        output = self.transformer_encoder(x, src_key_padding_mask=src_key_padding_mask)
        output = output.transpose(0, 1)
        output = self.fc(output)
        return output

# Training function
def train_model(model, data_loader, criterion, optimizer, num_epochs, device):
    model.to(device)
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        for batch in data_loader:
            batch = batch.to(device)
            optimizer.zero_grad()
            inputs = batch[:, :-1]
            targets = batch[:, 1:].contiguous().view(-1)

            src_key_padding_mask = (inputs == 0)  # Correct mask shape (batch_size, sequence_length)
            outputs = model(inputs, src_key_padding_mask=src_key_padding_mask)
            outputs = outputs.view(-1, vocab_size)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(data_loader)
        print(f'Epoch {epoch+1}, Average Loss: {avg_loss:.4f}')

Creamos el dataset y el data loader e inicializar el modelo, la loss function y el optimizador (en nuestro caso usamos Adam)

In [8]:
# Create the dataset and data loader
dataset = LanguageDataset(indexed_sequences)
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Instantiate the model, loss function, and optimizer
model_example = TransformerLanguageModel(vocab_size, embedding_dim, nhead, num_encoder_layers, dim_feedforward, sequence_length + 1)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model_example.parameters(), lr=learning_rate)



Entrenamos el modelo

In [9]:
# Determine the device to be used (CPU or GPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [10]:
# Train the model
start_time = time.time()

train_model(model_example, data_loader, criterion, optimizer, num_epochs, device=device)

end_time = time.time()
elapsed_time = end_time - start_time

print(f'Tiempo de entrenamiento del modelo de ejemplo: {elapsed_time:.2f} segundos')

Epoch 1, Average Loss: 3.5260
Epoch 2, Average Loss: 2.3452
Epoch 3, Average Loss: 1.6005
Epoch 4, Average Loss: 1.1472
Epoch 5, Average Loss: 0.9172
Epoch 6, Average Loss: 0.8076
Epoch 7, Average Loss: 0.7467
Epoch 8, Average Loss: 0.7052
Epoch 9, Average Loss: 0.6757
Epoch 10, Average Loss: 0.6518
Tiempo de entrenamiento del modelo de ejemplo: 621.12 segundos


In [11]:
print(elapsed_time/60)

10.352009479204813


### Evaluación del modelo:

In [12]:
def calculate_perplexity(model, data_loader, criterion, device):
    total_loss = 0.0
    total_tokens = 0

    model.eval()
    with torch.no_grad():
        for batch in data_loader:
            batch = batch.to(device)
            inputs, targets = batch[:, :-1], batch[:, 1:]
            outputs = model(inputs)
            loss = criterion(outputs.view(-1, outputs.size(2)), targets.contiguous().view(-1))
            total_loss += loss.item()
            total_tokens += targets.numel()

    avg_loss = total_loss / len(data_loader)
    perplexity = torch.exp(torch.tensor(avg_loss))
    return perplexity.item()

In [13]:
# Calcular la perplejidad
perplexity = calculate_perplexity(model_example, data_loader, criterion, device='cuda')
print(f'Perplejidad del modelo: {perplexity:.4f}')

Perplejidad del modelo: 1.6749


#### Segundo modelo: modificamos parametros

In [16]:
# Parameters
vocab_size = 10000
embedding_dim = 100
nhead = 4
num_encoder_layers = 2
dim_feedforward = 512
learning_rate = 0.0005
batch_size = 64
num_epochs = 20
sequence_length = 10
file_path = "/content/LTR.txt"


# Read and preprocess the text file
text = read_text_file(file_path)
sequences = preprocess_text(text, sequence_length)
vocab = build_vocab(sequences)
indexed_sequences = sequences_to_indices(sequences, vocab)

# Adjust vocab_size according to the actual vocabulary size
vocab_size = len(vocab)

In [17]:
# Create the dataset and data loader
dataset = LanguageDataset(indexed_sequences)
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)


# Instantiate the model, loss function, and optimizer
model_2 = TransformerLanguageModel(vocab_size, embedding_dim, nhead, num_encoder_layers, dim_feedforward, sequence_length + 1)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model_2.parameters(), lr=learning_rate)



In [18]:
# Train the model
start_time = time.time()

train_model(model_2, data_loader, criterion, optimizer, num_epochs, device=device)

end_time = time.time()
elapsed_time = end_time - start_time

print(f'Tiempo de entrenamiento del modelo de ejemplo: {elapsed_time:.2f} segundos')

Epoch 1, Average Loss: 3.9949
Epoch 2, Average Loss: 2.7025
Epoch 3, Average Loss: 1.9150
Epoch 4, Average Loss: 1.2985
Epoch 5, Average Loss: 0.9965
Epoch 6, Average Loss: 0.8504
Epoch 7, Average Loss: 0.7727
Epoch 8, Average Loss: 0.7255
Epoch 9, Average Loss: 0.6924
Epoch 10, Average Loss: 0.6681
Epoch 11, Average Loss: 0.6484
Epoch 12, Average Loss: 0.6334
Epoch 13, Average Loss: 0.6204
Epoch 14, Average Loss: 0.6088
Epoch 15, Average Loss: 0.5993
Epoch 16, Average Loss: 0.5907
Epoch 17, Average Loss: 0.5833
Epoch 18, Average Loss: 0.5762
Epoch 19, Average Loss: 0.5698
Epoch 20, Average Loss: 0.5647
Tiempo de entrenamiento del modelo de ejemplo: 1232.86 segundos


In [19]:
print(elapsed_time/60)

20.547633866469067


In [20]:
# Calcular la perplejidad
perplexity = calculate_perplexity(model_2, data_loader, criterion, device='cuda')
print(f'Perplejidad del modelo: {perplexity:.4f}')

Perplejidad del modelo: 1.6264


#### Tercer modelo: mínima complejidad posible

In [21]:
# Parameters
vocab_size = 10000
embedding_dim = 50
nhead = 2
num_encoder_layers = 1
dim_feedforward = 256
learning_rate = 0.001
batch_size = 32
num_epochs = 10
sequence_length = 10
file_path = "/content/LTR.txt"


# Read and preprocess the text file
text = read_text_file(file_path)
sequences = preprocess_text(text, sequence_length)
vocab = build_vocab(sequences)
indexed_sequences = sequences_to_indices(sequences, vocab)

# Adjust vocab_size according to the actual vocabulary size
vocab_size = len(vocab)

In [22]:
# Create the dataset and data loader
dataset = LanguageDataset(indexed_sequences)
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Instantiate the model, loss function, and optimizer
model_3 = TransformerLanguageModel(vocab_size, embedding_dim, nhead, num_encoder_layers, dim_feedforward, sequence_length + 1)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model_3.parameters(), lr=learning_rate)

In [23]:
# Train the model
start_time = time.time()

train_model(model_3, data_loader, criterion, optimizer, num_epochs, device=device)

end_time = time.time()
elapsed_time = end_time - start_time

print(f'Tiempo de entrenamiento del modelo de ejemplo: {elapsed_time/60:.2f} minutos')

Epoch 1, Average Loss: 5.2373
Epoch 2, Average Loss: 4.7035
Epoch 3, Average Loss: 4.4938
Epoch 4, Average Loss: 4.3526
Epoch 5, Average Loss: 4.2415
Epoch 6, Average Loss: 4.1525
Epoch 7, Average Loss: 4.0709
Epoch 8, Average Loss: 4.0046
Epoch 9, Average Loss: 3.9433
Epoch 10, Average Loss: 3.8891
Tiempo de entrenamiento del modelo de ejemplo: 10.85 minutos


In [24]:
print(elapsed_time/60)

10.853380926450093


In [25]:
# Calcular la perplejidad
perplexity = calculate_perplexity(model_3, data_loader, criterion, device='cuda')
print(f'Perplejidad del modelo: {perplexity:.4f}')

Perplejidad del modelo: 38.5607


#### Modelo 4: aumento de complejidad

In [27]:
# Parameters
vocab_size = 10000
embedding_dim = 200
nhead = 8
num_encoder_layers = 3
dim_feedforward = 1024
learning_rate = 0.001
batch_size = 128
num_epochs = 10
sequence_length = 20
file_path = "/content/LTR.txt"


# Read and preprocess the text file
text = read_text_file(file_path)
sequences = preprocess_text(text, sequence_length)
vocab = build_vocab(sequences)
indexed_sequences = sequences_to_indices(sequences, vocab)

# Adjust vocab_size according to the actual vocabulary size
vocab_size = len(vocab)

In [28]:
# Create the dataset and data loader
dataset = LanguageDataset(indexed_sequences)
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Instantiate the model, loss function, and optimizer
model_4 = TransformerLanguageModel(vocab_size, embedding_dim, nhead, num_encoder_layers, dim_feedforward, sequence_length + 1)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model_4.parameters(), lr=learning_rate)

In [29]:
# Determine the device to be used (CPU or GPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [30]:
# Train the model
start_time = time.time()

train_model(model_4, data_loader, criterion, optimizer, num_epochs, device=device)

end_time = time.time()
elapsed_time = end_time - start_time

print(f'Tiempo de entrenamiento del modelo de ejemplo: {elapsed_time/60:.2f} minutos')

Epoch 1, Average Loss: 2.9976
Epoch 2, Average Loss: 1.9959
Epoch 3, Average Loss: 1.8112
Epoch 4, Average Loss: 1.7004
Epoch 5, Average Loss: 1.4342
Epoch 6, Average Loss: 1.0289
Epoch 7, Average Loss: 0.7864
Epoch 8, Average Loss: 0.6338
Epoch 9, Average Loss: 0.5302
Epoch 10, Average Loss: 0.4612
Tiempo de entrenamiento del modelo de ejemplo: 23.44 minutos


In [31]:
# Calcular la perplejidad
perplexity = calculate_perplexity(model_4, data_loader, criterion, device='cuda')
print(f'Perplejidad del modelo: {perplexity:.4f}')

Perplejidad del modelo: 1.3132
