# 1

Desarrollamos la implementación de nuestra dataset. Utilizaremos las bibliotecas de toch y torchtext.

In [2]:
import torch
from torch.utils.data import DataLoader, Dataset
import spacy
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence

# Cargar spacy tokenizers
spacy_de = spacy.load('de_core_news_sm')
spacy_en = spacy.load('en_core_web_sm')

# Tokenizers
def tokenize_de(text):
    return [tok.text for tok in spacy_de.tokenizer(text)]

def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

# Leer archivos
def read_lines(filepath):
    with open(filepath, encoding='utf-8') as f:
        return [line.strip() for line in f]

train_src_lines = read_lines('train.lc.norm.tok.de')
train_trg_lines = read_lines('train.lc.norm.tok.en')
val_src_lines = read_lines('val.lc.norm.tok.de')
val_trg_lines = read_lines('val.lc.norm.tok.en')
test_src_lines = read_lines('test_2017_mscoco.lc.norm.tok.de')
test_trg_lines = read_lines('test_2017_mscoco.lc.norm.tok.en')

# Crear vocabularios
def yield_tokens(data_lines, tokenizer):
    for line in data_lines:
        yield tokenizer(line)

vocab_de = build_vocab_from_iterator(yield_tokens(train_src_lines, tokenize_de), specials=["<unk>", "<pad>", "<bos>", "<eos>"])
vocab_de.set_default_index(vocab_de["<unk>"])

vocab_en = build_vocab_from_iterator(yield_tokens(train_trg_lines, tokenize_en), specials=["<unk>", "<pad>", "<bos>", "<eos>"])
vocab_en.set_default_index(vocab_en["<unk>"])

# Dataset personalizado
class TranslationDataset(Dataset):
    def __init__(self, src_lines, trg_lines, src_vocab, trg_vocab, src_tokenizer, trg_tokenizer):
        self.src_lines = src_lines
        self.trg_lines = trg_lines
        self.src_vocab = src_vocab
        self.trg_vocab = trg_vocab
        self.src_tokenizer = src_tokenizer
        self.trg_tokenizer = trg_tokenizer

    def __len__(self):
        return len(self.src_lines)

    def __getitem__(self, idx):
        src_line = self.src_lines[idx]
        trg_line = self.trg_lines[idx]
        src_tokens = [self.src_vocab[token] for token in self.src_tokenizer(src_line)]
        trg_tokens = [self.trg_vocab[token] for token in self.trg_tokenizer(trg_line)]
        return torch.tensor(src_tokens), torch.tensor(trg_tokens)

train_dataset = TranslationDataset(train_src_lines, train_trg_lines, vocab_de, vocab_en, tokenize_de, tokenize_en)
val_dataset = TranslationDataset(val_src_lines, val_trg_lines, vocab_de, vocab_en, tokenize_de, tokenize_en)
test_dataset = TranslationDataset(test_src_lines, test_trg_lines, vocab_de, vocab_en, tokenize_de, tokenize_en)

def collate_fn(batch):
    src_batch, trg_batch = zip(*batch)
    src_batch = pad_sequence(src_batch, padding_value=vocab_de["<pad>"])
    trg_batch = pad_sequence(trg_batch, padding_value=vocab_en["<pad>"])
    return src_batch, trg_batch

train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=64, shuffle=False, collate_fn=collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=False, collate_fn=collate_fn)


  from .autonotebook import tqdm as notebook_tqdm


Desarrollamos nuestro modelo básico de Seq2Seq con LSTM.

In [3]:
import torch.nn as nn

class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        embedded = self.dropout(self.embedding(src))
        outputs, (hidden, cell) = self.rnn(embedded)
        return hidden, cell

In [4]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.output_dim = output_dim
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout)
        self.fc_out = nn.Linear(hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell):
        input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input))
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        prediction = self.fc_out(output.squeeze(0))
        return prediction, hidden, cell

In [5]:
import random

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device, trg_vocab_size):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        self.trg_vocab_size = trg_vocab_size

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        trg_len = trg.shape[0]
        batch_size = trg.shape[1]

        outputs = torch.zeros(trg_len, batch_size, self.trg_vocab_size).to(self.device)

        hidden, cell = self.encoder(src)
        input = trg[0, :]

        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(input, hidden, cell)
            outputs[t] = output
            top1 = output.argmax(1)
            input = trg[t] if random.random() < teacher_forcing_ratio else top1

        return outputs

In [10]:
print(len(val_dataloader))

127


Por último se entrena y evalúa el modelo para revisar cual es el rendimiento del mismo. Se definen los hiperparámetros a utilizar.

In [6]:
import torch.optim as optim


# Inicializar pesos del modelo
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.normal_(param.data, mean=0, std=0.01)

INPUT_DIM = len(vocab_de)
OUTPUT_DIM = len(vocab_en)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Seq2Seq(enc, dec, device, OUTPUT_DIM).to(device)
model.apply(init_weights)

optimizer = optim.Adam(model.parameters())
TRG_PAD_IDX = vocab_en["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)

# Función de entrenamiento
def train(model, iterator, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0

    for i, (src, trg) in enumerate(iterator):
        src = src.to(device)
        trg = trg.to(device)

        optimizer.zero_grad()
        #print(trg)
        output = model(src, trg)
        output_dim = output.shape[-1]
        #print(i)

        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)

        loss = criterion(output, trg)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()

    return epoch_loss / len(iterator)

# Función de evaluación
def evaluate(model, iterator, criterion):
    model.eval()
    epoch_loss = 0

    with torch.no_grad():
        for i, (src, trg) in enumerate(iterator):
            src = src.to(device)
            trg = trg.to(device)

            output = model(src, trg, 0)
            output_dim = output.shape[-1]

            output = output[1:].view(-1, output_dim)
            trg = trg[1:].view(-1)

            loss = criterion(output, trg)
            epoch_loss += loss.item()

    return epoch_loss / len(iterator)

# Entrenamiento del modelo
N_EPOCHS = 3
CLIP = 1

for epoch in range(N_EPOCHS):
    train_loss = train(model, train_dataloader, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, val_dataloader, criterion)

    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f}')


Epoch: 01
	Train Loss: 5.633
	 Val. Loss: 5.486
Epoch: 02
	Train Loss: 4.958
	 Val. Loss: 5.337
Epoch: 03
	Train Loss: 4.587
	 Val. Loss: 4.992


El modelo está aprendiendo de los datos de entrenamiento y mejorando su rendimiento, como se evidencia por la disminución constante de la pérdida de entrenamiento. Sin embargo, la diferencia entre la pérdida de entrenamiento y validación sugiere que el modelo podría estar comenzando a sobreajustarse a los datos de entrenamiento por lo que podría ser necesario realizar algunas modificaciones adicionales.

# 2

Aplicamos dropout a los pesos recurrentes durante el entrenamiento. Esto introduce variabilidad en los pesos durante el entrenamiento, ayudando a prevenir el sobreajuste.

In [20]:
import random

class WeightDrop(torch.nn.Module):
    def __init__(self, module, weights, dropout=0, variational=False):
        super(WeightDrop, self).__init__()
        self.module = module
        self.weights = weights
        self.dropout = dropout
        self.variational = variational
        self._setup()

    def widget_name(self, name):
        return name.replace('.', '_')

    def _setup(self):
        if isinstance(self.module, torch.nn.RNNBase):
            self.module.flatten_parameters = self._do_nothing

        for name_w in self.weights:
            print('Applying weight drop of {} to {}'.format(self.dropout, name_w))
            w = getattr(self.module, name_w)
            self.register_parameter(self.widget_name(name_w), torch.nn.Parameter(w.data))
            self.module._parameters[name_w] = F.dropout(w, p=self.dropout, training=False)

    def _do_nothing(self):
        pass

    def forward(self, *args):
        for name_w in self.weights:
            raw_w = getattr(self, self.widget_name(name_w))
            self.module._parameters[name_w] = F.dropout(raw_w, p=self.dropout, training=self.training)

        return self.module.forward(*args)

class LockedDropout(torch.nn.Module):
    def __init__(self, dropout):
        super(LockedDropout, self).__init__()
        self.dropout = dropout

    def forward(self, x):
        if not self.training or not self.dropout:
            return x
        m = x.data.new(1, x.size(1), x.size(2)).bernoulli_(1 - self.dropout)
        mask = torch.autograd.Variable(m, requires_grad=False) / (1 - self.dropout)
        mask = mask.expand_as(x)
        return mask * x


In [21]:
class EmbeddingDropout(nn.Module):
    def __init__(self, emb, dropout):
        super(EmbeddingDropout, self).__init__()
        self.emb = emb
        self.dropout = dropout

    def forward(self, words, scale=None):
        if self.dropout:
            mask = self.emb.weight.data.new().resize_((self.emb.weight.size(0), 1)).bernoulli_(1 - self.dropout).expand_as(self.emb.weight) / (1 - self.dropout)
            masked_embed_weight = mask * self.emb.weight
        else:
            masked_embed_weight = self.emb.weight

        padding_idx = self.emb.padding_idx
        if padding_idx is None:
            padding_idx = -1

        return F.embedding(words, masked_embed_weight, padding_idx, self.emb.max_norm, self.emb.norm_type,
                           self.emb.scale_grad_by_freq, self.emb.sparse)


Modificamos el Decoder y Encoder con las clases anteriormente creadas.

In [22]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.emb_dropout = EmbeddingDropout(self.embedding, dropout)
        self.rnn = WeightDrop(nn.LSTM(emb_dim, hid_dim, n_layers), ['weight_hh_l0'], dropout=dropout)
        self.dropout = LockedDropout(dropout)

    def forward(self, src):
        embedded = self.emb_dropout(src)
        embedded = self.dropout(embedded)
        outputs, (hidden, cell) = self.rnn(embedded)
        return hidden, cell

class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.output_dim = output_dim
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.emb_dropout = EmbeddingDropout(self.embedding, dropout)
        self.rnn = WeightDrop(nn.LSTM(emb_dim, hid_dim, n_layers), ['weight_hh_l0'], dropout=dropout)
        self.fc_out = nn.Linear(hid_dim, output_dim)
        self.dropout = LockedDropout(dropout)

    def forward(self, input, hidden, cell):
        input = input.unsqueeze(0)
        embedded = self.emb_dropout(input)
        embedded = self.dropout(embedded)
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        output = self.dropout(output)
        prediction = self.fc_out(output.squeeze(0))
        return prediction, hidden, cell


In [23]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device, trg_vocab_size):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        self.trg_vocab_size = trg_vocab_size

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        trg_len = trg.shape[0]
        batch_size = trg.shape[1]

        outputs = torch.zeros(trg_len, batch_size, self.trg_vocab_size).to(self.device)

        hidden, cell = self.encoder(src)
        input = trg[0, :]

        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(input, hidden, cell)
            outputs[t] = output
            top1 = output.argmax(1)
            input = trg[t] if random.random() < teacher_forcing_ratio else top1

        return outputs


In [32]:
# Configuración del modelo
INPUT_DIM = len(vocab_de)
OUTPUT_DIM = len(vocab_en)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Seq2Seq(enc, dec, device, OUTPUT_DIM).to(device)

# Inicialización de pesos y optimizador
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.normal_(param.data, mean=0, std=0.01)

model.apply(init_weights)
optimizer = torch.optim.Adam(model.parameters())
TRG_PAD_IDX = vocab_en["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)


Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0


In [23]:
# Función de entrenamiento
def train(model, iterator, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0

    for i, (src, trg) in enumerate(iterator):
        src = src.to(model.device)
        trg = trg.to(model.device)

        optimizer.zero_grad()
        output = model(src, trg)
        output_dim = output.shape[-1]

        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)

        loss = criterion(output, trg)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(iterator)

# Función de evaluación
def evaluate(model, iterator, criterion):
    model.eval()
    epoch_loss = 0

    with torch.no_grad():
        for i, (src, trg) in enumerate(iterator):
            src = src.to(model.device)
            trg = trg.to(model.device)

            output = model(src, trg, 0)  # No teacher forcing
            output_dim = output.shape[-1]

            output = output[1:].view(-1, output_dim)
            trg = trg[1:].view(-1)

            loss = criterion(output, trg)

            epoch_loss += loss.item()

    return epoch_loss / len(iterator)

In [34]:
optimizer = torch.optim.Adam(model.parameters())
TRG_PAD_IDX = vocab_en["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)

# Parámetros del entrenamiento
N_EPOCHS = 3
CLIP = 1

# Entrenamiento del modelo
for epoch in range(N_EPOCHS):
    train_loss = train(model, train_dataloader, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, val_dataloader, criterion)
    
    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f}')

Epoch: 01
	Train Loss: 5.697
	 Val. Loss: 5.546
Epoch: 02
	Train Loss: 5.326
	 Val. Loss: 5.460
Epoch: 03
	Train Loss: 5.153
	 Val. Loss: 5.373


Al haber includo las técnicas de regularización avanzadas el rendimiento del modelo ha dejado de tener tanta diferencia entre la perdida de entrenamiento y validación, aunque más épocas tal vez se notaría más la diferencia. Sin embargo, las pérdidas siguen siendo altas, lo que sugiere que se podría seguir mejorando el modelo con ajustes adicionales en los hiperparámetros, más épocas de entrenamiento, o incorporando técnicas adicionales como ajuste de hiperparámetros, tuning de la tasa de aprendizaje, etc.

# 3

Se implementan las distintas técnicas de decodificación:

In [48]:
def greedy_decode(model, src, max_len, trg_vocab):
    model.eval()
    src = src.unsqueeze(1).to(model.device)
    
    with torch.no_grad():
        hidden, cell = model.encoder(src)
    
    trg_indexes = [trg_vocab["<bos>"]]
    
    for _ in range(max_len):
        trg_tensor = torch.LongTensor([trg_indexes[-1]]).to(model.device)
        
        with torch.no_grad():
            output, hidden, cell = model.decoder(trg_tensor, hidden, cell)
        
        pred_token = output.argmax(1).item()
        trg_indexes.append(pred_token)
        
        if pred_token == trg_vocab["<eos>"]:
            break
    
    trg_tokens = [trg_vocab.lookup_token(i) for i in trg_indexes]
    return trg_tokens


In [10]:
import heapq

def beam_search_decode(model, src, max_len, trg_vocab, beam_width=3):
    model.eval()
    src = src.unsqueeze(1).to(model.device)
    
    with torch.no_grad():
        hidden, cell = model.encoder(src)
    
    sequences = [[list(), 0.0, hidden, cell]]
    
    for _ in range(max_len):
        all_candidates = list()
        
        for seq in sequences:
            trg_indexes, score, hidden, cell = seq
            #print(trg_indexes)
            if isinstance(trg_indexes, list) and trg_indexes:  # Check if list and not empty
                trg_tensor = torch.LongTensor([trg_indexes[-1]]).to(model.device)
            else:
                trg_tensor = torch.LongTensor([trg_vocab["<bos>"]]).to(model.device)
            
            with torch.no_grad():
                output, hidden, cell = model.decoder(trg_tensor, hidden, cell)
            
            log_probs, indexes = torch.topk(output, beam_width)
            
            for i in range(beam_width):
                candidate = [trg_indexes + [indexes[0][i].item()], score - log_probs[0][i].item(), hidden, cell]
                all_candidates.append(candidate)
        
        ordered = sorted(all_candidates, key=lambda tup: tup[1])
        sequences = ordered[:beam_width]
    
    best_seq = sequences[0][0]
    trg_tokens = [trg_vocab.lookup_token(i) for i in best_seq]
    return trg_tokens


In [80]:
def sample_with_temperature(output, temperature=1.0):
    output = output.div(temperature).exp()
    probs = output / output.sum()
    return torch.multinomial(probs, 1)

def temperature_decode(model, src, max_len, trg_vocab, temperature=1.0):
    model.eval()
    src = src.unsqueeze(1).to(model.device)
    
    with torch.no_grad():
        hidden, cell = model.encoder(src)
    
    trg_indexes = [trg_vocab["<bos>"]]
    
    for _ in range(max_len):
        trg_tensor = torch.LongTensor([trg_indexes[-1]]).to(model.device)
        
        with torch.no_grad():
            output, hidden, cell = model.decoder(trg_tensor, hidden, cell)
        
        pred_token = sample_with_temperature(output, temperature).item()
        trg_indexes.append(pred_token)
        
        if pred_token == trg_vocab["<eos>"]:
            break
    
    trg_tokens = [trg_vocab.lookup_token(i) for i in trg_indexes]
    return trg_tokens

Implementamos las métricas siguentes de BLEU y ROUGE:

In [12]:
import sacrebleu
from rouge_score import rouge_scorer

def calculate_bleu(references, hypotheses):
    bleu = sacrebleu.corpus_bleu(hypotheses, [references])
    return bleu.score

def calculate_rouge(references, hypotheses):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
    scores = [scorer.score(ref, hyp) for ref, hyp in zip(references, hypotheses)]
    avg_scores = {
        'rouge1': sum([score['rouge1'].fmeasure for score in scores]) / len(scores),
        'rougeL': sum([score['rougeL'].fmeasure for score in scores]) / len(scores)
    }
    return avg_scores


In [43]:
vocab_en = build_vocab_from_iterator(yield_tokens(train_trg_lines, tokenize_en), specials=["<unk>", "<pad>", "<bos>", "<eos>"])
vocab_en.set_default_index(vocab_en["<unk>"])

In [45]:
print(vocab_en.get_stoi)

<bound method Vocab.get_stoi of Vocab()>


In [81]:
def evaluate_decoding_techniques(model, dataloader, trg_vocab, max_len=50):
    references = []
    greedy_hypotheses = []
    beam_hypotheses = []
    temp_hypotheses = []

    for i, (src, trg) in enumerate(dataloader):
        src = src.to(model.device)
        trg = trg.to(model.device)

        for j in range(src.shape[1]):
            src_sentence = src[:, j]
            trg_sentence = trg[:, j]

            trg_tokens = [trg_vocab.lookup_token(t.item()) for t in trg_sentence if t.item() != trg_vocab["<pad>"]]

            greedy_tokens = greedy_decode(model, src_sentence, max_len, trg_vocab)
            beam_tokens = beam_search_decode(model, src_sentence, max_len, trg_vocab)
            temp_tokens = temperature_decode(model, src_sentence, max_len, trg_vocab)

            references.append(" ".join(trg_tokens))
            greedy_hypotheses.append(" ".join(greedy_tokens))
            beam_hypotheses.append(" ".join(beam_tokens))
            temp_hypotheses.append(" ".join(temp_tokens))

    greedy_bleu = calculate_bleu(references, greedy_hypotheses)
    beam_bleu = calculate_bleu(references, beam_hypotheses)
    temp_bleu = calculate_bleu(references, temp_hypotheses)

    greedy_rouge = calculate_rouge(references, greedy_hypotheses)
    beam_rouge = calculate_rouge(references, beam_hypotheses)
    temp_rouge = calculate_rouge(references, temp_hypotheses)

    print(f'Greedy BLEU: {greedy_bleu:.2f}')
    print(f'Beam BLEU: {beam_bleu:.2f}')
    print(f'Temperature BLEU: {temp_bleu:.2f}')

    print(f'Greedy ROUGE: {greedy_rouge}')
    print(f'Beam ROUGE: {beam_rouge}')
    print(f'Temperature ROUGE: {temp_rouge}')

# Evaluar las técnicas de decodificación
evaluate_decoding_techniques(model, val_dataloader, vocab_en)





Greedy BLEU: 0.44
Beam BLEU: 0.46
Temperature BLEU: 0.07
Greedy ROUGE: {'rouge1': 0.2544927358449203, 'rougeL': 0.20900279860924104}
Beam ROUGE: {'rouge1': 0.2624116613066208, 'rougeL': 0.2167682496065397}
Temperature ROUGE: {'rouge1': 0.13167839824372748, 'rougeL': 0.10450850810410503}


El score BLEU mide la precisión de las n-gramas generadas con respecto a las secuencias de referencia. Beam search tiene el mejor rendimiento, seguido de cerca por greedy decode. La decodificación con temperatura muestra un rendimiento significativamente menor, indicando que la introducción de aleatoriedad reduce la precisión de las secuencias generadas.

El score ROUGE mide la coincidencia entre las n-gramas, las subsecuencias más largas y las subsecuencias más largas a nivel de cadena entre las secuencias generadas y las de referencia. Al igual que con el score BLEU, beam search tiene el mejor rendimiento, seguido de greedy decode y, por último, temperature decode.

Beam search generalmente proporciona la mejor calidad, mientras que greedy decode ofrece una solución rápida y temperature decode introduce variabilidad a expensas de la precisión. 

# 4

In [82]:
from itertools import product

# Definir los rangos de los hiperparámetros
embedding_dims = [128, 256]
hidden_dims = [256, 512]
num_layers = [1, 2]
dropouts = [0.3, 0.5]
batch_sizes = [32, 64]

# Guardar los resultados
results = []

# Función para configurar y entrenar el modelo con diferentes hiperparámetros
def train_and_evaluate(hyperparams):
    emb_dim, hid_dim, n_layers, dropout, batch_size = hyperparams
    
    enc = Encoder(INPUT_DIM, emb_dim, hid_dim, n_layers, dropout)
    dec = Decoder(OUTPUT_DIM, emb_dim, hid_dim, n_layers, dropout)
    
    model = Seq2Seq(enc, dec, device, OUTPUT_DIM).to(device)
    model.apply(init_weights)
    optimizer = torch.optim.Adam(model.parameters())
    
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
    
    # Entrenar el modelo (solo una época para simplificar el análisis)
    train(model, train_dataloader, optimizer, criterion, clip=1)
    val_loss = evaluate(model, val_dataloader, criterion)
    
    # Evaluar las técnicas de decodificación
    evaluate_decoding_techniques(model, val_dataloader, vocab_en)
    
    # Guardar resultados
    results.append({
        'embedding_dim': emb_dim,
        'hidden_dim': hid_dim,
        'num_layers': n_layers,
        'dropout': dropout,
        'batch_size': batch_size,
        'val_loss': val_loss
    })

# Realizar el análisis de sensibilidad
for hyperparams in product(embedding_dims, hidden_dims, num_layers, dropouts, batch_sizes):
    train_and_evaluate(hyperparams)

# Imprimir resultados
for result in results:
    print(result)


Applying weight drop of 0.3 to weight_hh_l0
Applying weight drop of 0.3 to weight_hh_l0


  return self._grad


Greedy BLEU: 0.14
Beam BLEU: 0.27
Temperature BLEU: 0.08
Greedy ROUGE: {'rouge1': 0.24734419092672183, 'rougeL': 0.20571105071489754}
Beam ROUGE: {'rouge1': 0.270202782070649, 'rougeL': 0.22884978223320318}
Temperature ROUGE: {'rouge1': 0.1373714662399592, 'rougeL': 0.11101316875089207}
Applying weight drop of 0.3 to weight_hh_l0
Applying weight drop of 0.3 to weight_hh_l0




Greedy BLEU: 0.14
Beam BLEU: 0.14
Temperature BLEU: 0.07
Greedy ROUGE: {'rouge1': 0.22624507437280142, 'rougeL': 0.18686108858160277}
Beam ROUGE: {'rouge1': 0.2510856756866568, 'rougeL': 0.2195044121563164}
Temperature ROUGE: {'rouge1': 0.13806642588016182, 'rougeL': 0.11070909283074125}
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0




Greedy BLEU: 0.14
Beam BLEU: 0.16
Temperature BLEU: 0.08
Greedy ROUGE: {'rouge1': 0.22182103494482416, 'rougeL': 0.18239795609713927}
Beam ROUGE: {'rouge1': 0.2466915113323418, 'rougeL': 0.21331089233549624}
Temperature ROUGE: {'rouge1': 0.13770613348488994, 'rougeL': 0.11031500644752061}
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0




Greedy BLEU: 0.14
Beam BLEU: 0.13
Temperature BLEU: 0.08
Greedy ROUGE: {'rouge1': 0.22735097719094075, 'rougeL': 0.18566667676327572}
Beam ROUGE: {'rouge1': 0.2407458558545967, 'rougeL': 0.20082626215950394}
Temperature ROUGE: {'rouge1': 0.13571611440338216, 'rougeL': 0.10761591260965397}
Applying weight drop of 0.3 to weight_hh_l0
Applying weight drop of 0.3 to weight_hh_l0




Greedy BLEU: 0.14
Beam BLEU: 0.15
Temperature BLEU: 0.08
Greedy ROUGE: {'rouge1': 0.2466439378945665, 'rougeL': 0.20388282173727013}
Beam ROUGE: {'rouge1': 0.27265961549116524, 'rougeL': 0.23027999048287248}
Temperature ROUGE: {'rouge1': 0.139229339495111, 'rougeL': 0.11161046271493091}
Applying weight drop of 0.3 to weight_hh_l0
Applying weight drop of 0.3 to weight_hh_l0




Greedy BLEU: 0.14
Beam BLEU: 0.15
Temperature BLEU: 0.07
Greedy ROUGE: {'rouge1': 0.23881062986635038, 'rougeL': 0.1951413033339314}
Beam ROUGE: {'rouge1': 0.2537082561114447, 'rougeL': 0.22358477516382716}
Temperature ROUGE: {'rouge1': 0.13718136134957618, 'rougeL': 0.10896370748153646}
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0




Greedy BLEU: 0.14
Beam BLEU: 0.18
Temperature BLEU: 0.10
Greedy ROUGE: {'rouge1': 0.2319384318071061, 'rougeL': 0.1898735853787355}
Beam ROUGE: {'rouge1': 0.2554198730490275, 'rougeL': 0.22073280832550057}
Temperature ROUGE: {'rouge1': 0.1412757108588697, 'rougeL': 0.11222803917415182}
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0




Greedy BLEU: 0.14
Beam BLEU: 0.14
Temperature BLEU: 0.07
Greedy ROUGE: {'rouge1': 0.22546887077629843, 'rougeL': 0.18418413718378773}
Beam ROUGE: {'rouge1': 0.24094352383413953, 'rougeL': 0.19701987551894234}
Temperature ROUGE: {'rouge1': 0.14053761265736403, 'rougeL': 0.11155846187619765}
Applying weight drop of 0.3 to weight_hh_l0
Applying weight drop of 0.3 to weight_hh_l0




Greedy BLEU: 0.14
Beam BLEU: 0.28
Temperature BLEU: 0.09
Greedy ROUGE: {'rouge1': 0.24078878899858983, 'rougeL': 0.197844882789754}
Beam ROUGE: {'rouge1': 0.22441515558204675, 'rougeL': 0.18978106583211568}
Temperature ROUGE: {'rouge1': 0.12782609690340974, 'rougeL': 0.1016711088733482}
Applying weight drop of 0.3 to weight_hh_l0
Applying weight drop of 0.3 to weight_hh_l0




Greedy BLEU: 0.14
Beam BLEU: 0.20
Temperature BLEU: 0.10
Greedy ROUGE: {'rouge1': 0.24555077498307346, 'rougeL': 0.20566528143116874}
Beam ROUGE: {'rouge1': 0.25682612755804113, 'rougeL': 0.21599149365412063}
Temperature ROUGE: {'rouge1': 0.13879401743640246, 'rougeL': 0.11026822283585773}
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0




Greedy BLEU: 0.14
Beam BLEU: 0.19
Temperature BLEU: 0.07
Greedy ROUGE: {'rouge1': 0.23464629343013307, 'rougeL': 0.19189083484264252}
Beam ROUGE: {'rouge1': 0.2500940040622987, 'rougeL': 0.21029239045570836}
Temperature ROUGE: {'rouge1': 0.1282949291729916, 'rougeL': 0.1012497101825366}
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0




Greedy BLEU: 0.14
Beam BLEU: 0.12
Temperature BLEU: 0.07
Greedy ROUGE: {'rouge1': 0.22914945786321297, 'rougeL': 0.18699409298224384}
Beam ROUGE: {'rouge1': 0.24518435666732474, 'rougeL': 0.21069882628153344}
Temperature ROUGE: {'rouge1': 0.13781104888154716, 'rougeL': 0.1085053807911344}
Applying weight drop of 0.3 to weight_hh_l0
Applying weight drop of 0.3 to weight_hh_l0




Greedy BLEU: 0.14
Beam BLEU: 0.36
Temperature BLEU: 0.10
Greedy ROUGE: {'rouge1': 0.21958235065978524, 'rougeL': 0.18055287359330932}
Beam ROUGE: {'rouge1': 0.2374585815480389, 'rougeL': 0.1979793624425039}
Temperature ROUGE: {'rouge1': 0.1413143629009935, 'rougeL': 0.11198661423541753}
Applying weight drop of 0.3 to weight_hh_l0
Applying weight drop of 0.3 to weight_hh_l0




Greedy BLEU: 0.15
Beam BLEU: 0.26
Temperature BLEU: 0.07
Greedy ROUGE: {'rouge1': 0.24549560571756127, 'rougeL': 0.20191728430915018}
Beam ROUGE: {'rouge1': 0.2597261755568578, 'rougeL': 0.2200487497930008}
Temperature ROUGE: {'rouge1': 0.13453603321684146, 'rougeL': 0.10570239430471873}
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0




Greedy BLEU: 0.14
Beam BLEU: 0.12
Temperature BLEU: 0.08
Greedy ROUGE: {'rouge1': 0.22896225915703902, 'rougeL': 0.18849483024705047}
Beam ROUGE: {'rouge1': 0.25069995664799316, 'rougeL': 0.2124776879764076}
Temperature ROUGE: {'rouge1': 0.1394000146848649, 'rougeL': 0.11075548982625649}
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0




Greedy BLEU: 0.14
Beam BLEU: 0.13
Temperature BLEU: 0.07
Greedy ROUGE: {'rouge1': 0.23103419257456856, 'rougeL': 0.19213161094751913}
Beam ROUGE: {'rouge1': 0.25134724366343536, 'rougeL': 0.21307316107916532}
Temperature ROUGE: {'rouge1': 0.13948373870943107, 'rougeL': 0.11033441483971253}
Applying weight drop of 0.3 to weight_hh_l0
Applying weight drop of 0.3 to weight_hh_l0




Greedy BLEU: 0.14
Beam BLEU: 0.24
Temperature BLEU: 0.08
Greedy ROUGE: {'rouge1': 0.23648003466439615, 'rougeL': 0.1936543273978418}
Beam ROUGE: {'rouge1': 0.25232580440666846, 'rougeL': 0.21037739110060394}
Temperature ROUGE: {'rouge1': 0.13868248861345267, 'rougeL': 0.11085479079138656}
Applying weight drop of 0.3 to weight_hh_l0
Applying weight drop of 0.3 to weight_hh_l0




Greedy BLEU: 0.14
Beam BLEU: 0.13
Temperature BLEU: 0.09
Greedy ROUGE: {'rouge1': 0.2334777334488538, 'rougeL': 0.19069465854154247}
Beam ROUGE: {'rouge1': 0.24925525025277098, 'rougeL': 0.21354694522422815}
Temperature ROUGE: {'rouge1': 0.14019342010506525, 'rougeL': 0.11115313966748565}
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0




Greedy BLEU: 0.14
Beam BLEU: 0.14
Temperature BLEU: 0.06
Greedy ROUGE: {'rouge1': 0.23797277862158936, 'rougeL': 0.19439572512922643}
Beam ROUGE: {'rouge1': 0.2488513004970552, 'rougeL': 0.21097361223204963}
Temperature ROUGE: {'rouge1': 0.1332689706229152, 'rougeL': 0.10526854659486821}
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0




Greedy BLEU: 0.14
Beam BLEU: 0.11
Temperature BLEU: 0.09
Greedy ROUGE: {'rouge1': 0.22943109043721696, 'rougeL': 0.18743760859622446}
Beam ROUGE: {'rouge1': 0.2456054626012538, 'rougeL': 0.20674597914253312}
Temperature ROUGE: {'rouge1': 0.14003633219176673, 'rougeL': 0.11113983982406579}
Applying weight drop of 0.3 to weight_hh_l0
Applying weight drop of 0.3 to weight_hh_l0




Greedy BLEU: 0.14
Beam BLEU: 0.30
Temperature BLEU: 0.10
Greedy ROUGE: {'rouge1': 0.24084423596212318, 'rougeL': 0.19470322534248835}
Beam ROUGE: {'rouge1': 0.2606270397337378, 'rougeL': 0.21489673712610222}
Temperature ROUGE: {'rouge1': 0.1323470202574598, 'rougeL': 0.10442971660196934}
Applying weight drop of 0.3 to weight_hh_l0
Applying weight drop of 0.3 to weight_hh_l0




Greedy BLEU: 0.14
Beam BLEU: 0.13
Temperature BLEU: 0.07
Greedy ROUGE: {'rouge1': 0.24280003879280537, 'rougeL': 0.19845597621109573}
Beam ROUGE: {'rouge1': 0.25797047302974224, 'rougeL': 0.21616418839679913}
Temperature ROUGE: {'rouge1': 0.1407234595226274, 'rougeL': 0.11110056506279516}
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0




Greedy BLEU: 0.14
Beam BLEU: 0.14
Temperature BLEU: 0.05
Greedy ROUGE: {'rouge1': 0.2208635514360087, 'rougeL': 0.18020401395795813}
Beam ROUGE: {'rouge1': 0.239072521023288, 'rougeL': 0.20626079640187195}
Temperature ROUGE: {'rouge1': 0.13596632951531085, 'rougeL': 0.10695614287330124}
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0




Greedy BLEU: 0.14
Beam BLEU: 0.14
Temperature BLEU: 0.07
Greedy ROUGE: {'rouge1': 0.22428081442207998, 'rougeL': 0.18310149158938505}
Beam ROUGE: {'rouge1': 0.24922468553385269, 'rougeL': 0.21255394598888058}
Temperature ROUGE: {'rouge1': 0.1407957739159444, 'rougeL': 0.11181970389622355}
Applying weight drop of 0.3 to weight_hh_l0
Applying weight drop of 0.3 to weight_hh_l0




Greedy BLEU: 0.20
Beam BLEU: 0.27
Temperature BLEU: 0.14
Greedy ROUGE: {'rouge1': 0.24088425770120375, 'rougeL': 0.20205537693958786}
Beam ROUGE: {'rouge1': 0.2345134974091616, 'rougeL': 0.20159986714945802}
Temperature ROUGE: {'rouge1': 0.13209680994500334, 'rougeL': 0.10481495394001598}
Applying weight drop of 0.3 to weight_hh_l0
Applying weight drop of 0.3 to weight_hh_l0




Greedy BLEU: 0.14
Beam BLEU: 0.20
Temperature BLEU: 0.09
Greedy ROUGE: {'rouge1': 0.2445191892927183, 'rougeL': 0.20008658744805255}
Beam ROUGE: {'rouge1': 0.25286721368107157, 'rougeL': 0.21267412239197486}
Temperature ROUGE: {'rouge1': 0.13636036766321838, 'rougeL': 0.10904131593596027}
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0




Greedy BLEU: 0.14
Beam BLEU: 0.14
Temperature BLEU: 0.08
Greedy ROUGE: {'rouge1': 0.22687122779999397, 'rougeL': 0.18536619217057956}
Beam ROUGE: {'rouge1': 0.23401143184056525, 'rougeL': 0.1963842840844149}
Temperature ROUGE: {'rouge1': 0.13485722893394214, 'rougeL': 0.10649972023123801}
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0




Greedy BLEU: 0.14
Beam BLEU: 0.17
Temperature BLEU: 0.07
Greedy ROUGE: {'rouge1': 0.229063428455179, 'rougeL': 0.18703419581117023}
Beam ROUGE: {'rouge1': 0.2498425290859321, 'rougeL': 0.2132559822032217}
Temperature ROUGE: {'rouge1': 0.1371246226119922, 'rougeL': 0.10700236576532737}
Applying weight drop of 0.3 to weight_hh_l0
Applying weight drop of 0.3 to weight_hh_l0




Greedy BLEU: 0.14
Beam BLEU: 0.15
Temperature BLEU: 0.08
Greedy ROUGE: {'rouge1': 0.23589703134739143, 'rougeL': 0.19553013602461922}
Beam ROUGE: {'rouge1': 0.24899859075376624, 'rougeL': 0.20587108775412}
Temperature ROUGE: {'rouge1': 0.13931781086258288, 'rougeL': 0.11000010300004494}
Applying weight drop of 0.3 to weight_hh_l0
Applying weight drop of 0.3 to weight_hh_l0




Greedy BLEU: 0.14
Beam BLEU: 0.13
Temperature BLEU: 0.07
Greedy ROUGE: {'rouge1': 0.22810498167881163, 'rougeL': 0.18613185170826013}
Beam ROUGE: {'rouge1': 0.2375173920782681, 'rougeL': 0.19501079272898914}
Temperature ROUGE: {'rouge1': 0.1312268633399786, 'rougeL': 0.10475516432109458}
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0




Greedy BLEU: 0.14
Beam BLEU: 0.14
Temperature BLEU: 0.10
Greedy ROUGE: {'rouge1': 0.22515808658978714, 'rougeL': 0.18954773656122417}
Beam ROUGE: {'rouge1': 0.24811967622850747, 'rougeL': 0.20947554807482627}
Temperature ROUGE: {'rouge1': 0.13739789670211636, 'rougeL': 0.10753475792002361}
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0




Greedy BLEU: 0.14
Beam BLEU: 0.13
Temperature BLEU: 0.07
Greedy ROUGE: {'rouge1': 0.22775100614744054, 'rougeL': 0.1859535411612792}
Beam ROUGE: {'rouge1': 0.24342908535747465, 'rougeL': 0.20344739454023517}
Temperature ROUGE: {'rouge1': 0.13594409849519512, 'rougeL': 0.10676429911950709}
{'embedding_dim': 128, 'hidden_dim': 256, 'num_layers': 1, 'dropout': 0.3, 'batch_size': 32, 'val_loss': 5.492411881685257}
{'embedding_dim': 128, 'hidden_dim': 256, 'num_layers': 1, 'dropout': 0.3, 'batch_size': 64, 'val_loss': 5.551296889781952}
{'embedding_dim': 128, 'hidden_dim': 256, 'num_layers': 1, 'dropout': 0.5, 'batch_size': 32, 'val_loss': 5.5386562794446945}
{'embedding_dim': 128, 'hidden_dim': 256, 'num_layers': 1, 'dropout': 0.5, 'batch_size': 64, 'val_loss': 5.561449289321899}
{'embedding_dim': 128, 'hidden_dim': 256, 'num_layers': 2, 'dropout': 0.3, 'batch_size': 32, 'val_loss': 5.505394339561462}
{'embedding_dim': 128, 'hidden_dim': 256, 'num_layers': 2, 'dropout': 0.3, 'batch_size': 

En base al análisis realizado podemos observar que el parámetro que genera más impacto en el rendimiento del modelo es el del número de layers, que parece otorgar una mejor validación a lo largo de las distintas evaluaciones

In [13]:
import language_tool_python

# Inicializar la herramienta de revisión gramatical
tool = language_tool_python.LanguageTool('en-US')

# Función para identificar errores gramaticales
def log_grammar_errors(text):
    matches = tool.check(text)
    errors = [match.ruleIssueType for match in matches]
    return errors

# Función para evaluar y loggear errores usando beam search
def evaluate_and_log_errors_with_beam_search(model, dataloader, trg_vocab, max_len=50, beam_width=3):
    references = []
    beam_hypotheses = []

    for i, (src, trg) in enumerate(dataloader):
        src = src.to(model.device)
        trg = trg.to(model.device)

        for j in range(src.shape[1]):
            src_sentence = src[:, j]
            trg_sentence = trg[:, j]

            trg_tokens = [trg_vocab.lookup_token(t.item()) for t in trg_sentence if t.item() != trg_vocab["<pad>"]]

            beam_tokens = beam_search_decode(model, src_sentence, max_len, trg_vocab, beam_width)

            references.append(" ".join(trg_tokens))
            beam_hypotheses.append(" ".join(beam_tokens))

    print(len(beam_hypotheses))

    beam_bleu = calculate_bleu(references, beam_hypotheses)
    beam_rouge = calculate_rouge(references, beam_hypotheses)

    print(f'Beam BLEU: {beam_bleu:.2f}')
    print(f'Beam ROUGE: {beam_rouge}')

    # Loggear errores gramaticales
    for i, hyp in enumerate(beam_hypotheses):
        errors = log_grammar_errors(hyp)
        print(f'Translation {i+1}: {hyp}')
        print(f'Errors: {errors}')
        print('---')

# Evaluar y loggear errores
evaluate_and_log_errors_with_beam_search(model, val_dataloader, vocab_en)


1014


That's 100 lines that end in a tokenized period ('.')
It looks like you forgot to detokenize your test data, which may hurt your score.
If you insist your data is detokenized, or don't care, you can suppress this message with the `force` parameter.


Beam BLEU: 0.78
Beam ROUGE: {'rouge1': 0.3147058688479438, 'rougeL': 0.26606960120520834}
Translation 1: group of people people are a on a street . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
Errors: ['typographical', 'duplication', 'misspelling', 'whitespace', 'whitespace']
---
Translation 2: sitting on front of a man in a a . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
Errors: ['typographical', 'duplication', 'whitespace', 'whitespace']
---
Translation 3: and a man is sitting on a table . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
Errors: ['typographical', 'whitespace', 'whitespace']
---
Translation 4: man is a a in a a in a a . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
Errors: ['typographical', 'duplication', 'duplication', 'duplication', 'whitespace', 'whitespace']
---
Translation 5: man in a blue shirt and blue shirt is on a a in a a . 

Los resultados parecen indicar que el modelo puede estar atascado en un bucle durante la generación.
Estos errores afectan gravemente la fluidez y la coherencia de las traducciones. El problema de puntuación también refiere tal vez algún problema con el decodificador.
Errores de "misspelling" y "grammar" indican deficiencias en la comprensión del idioma objetivo por parte del modelo.   

# 5

Colocamos los hiperparámetros a buscar.

In [16]:
from itertools import product

embedding_sizes = [256, 512]
hidden_sizes = [512, 1024]
dropout_rates = [0.3, 0.5]
num_layers = [1, 2]

hyperparameter_combinations = list(product(embedding_sizes, hidden_sizes, dropout_rates, num_layers))

In [18]:
import math

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [19]:
import time

def train_and_evaluate(model, train_dataloader, val_dataloader, optimizer, criterion, n_epochs, clip):
    best_val_loss = float('inf')
    for epoch in range(n_epochs):
        start_time = time.time()
        train_loss = train(model, train_dataloader, optimizer, criterion, clip)
        val_loss = evaluate(model, val_dataloader, criterion)
        end_time = time.time()

        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), 'best-model.pt')

        print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
        print(f'\t Val. Loss: {val_loss:.3f} |  Val. PPL: {math.exp(val_loss):7.3f}')

    return best_val_loss

Realizamos la implementación del grid_Search.

In [26]:
def grid_search(hyperparameter_combinations, train_dataloader, val_dataloader, vocab_de, vocab_en):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    n_epochs = 3
    clip = 1
    results = []

    for embedding_size, hidden_size, dropout_rate, num_layer in hyperparameter_combinations:
        print(f'Training with emb_size={embedding_size}, hid_size={hidden_size}, dropout={dropout_rate}, layers={num_layer}')

        # Configuración del modelo
        enc = Encoder(len(vocab_de), embedding_size, hidden_size, num_layer, dropout_rate)
        dec = Decoder(len(vocab_en), embedding_size, hidden_size, num_layer, dropout_rate)
        model = Seq2Seq(enc, dec, device, len(vocab_en)).to(device)

        # Inicialización de pesos y optimizador
        model.apply(init_weights)
        optimizer = torch.optim.Adam(model.parameters())
        criterion = nn.CrossEntropyLoss(ignore_index=vocab_en["<pad>"])

        # Entrenar y evaluar
        best_val_loss = train_and_evaluate(model, train_dataloader, val_dataloader, optimizer, criterion, n_epochs, clip)
        results.append((embedding_size, hidden_size, dropout_rate, num_layer, best_val_loss))

    return results


In [27]:
results = grid_search(hyperparameter_combinations, train_dataloader, val_dataloader, vocab_de, vocab_en)

# Mostrar resultados
for result in results:
    emb_size, hid_size, dropout, layers, val_loss = result
    print(f'Emb_size: {emb_size}, Hid_size: {hid_size}, Dropout: {dropout}, Layers: {layers}, Val_loss: {val_loss}')

Training with emb_size=256, hid_size=512, dropout=0.3, layers=1
Applying weight drop of 0.3 to weight_hh_l0
Applying weight drop of 0.3 to weight_hh_l0
Epoch: 01 | Time: 13m 3s
	Train Loss: 5.542 | Train PPL: 255.234
	 Val. Loss: 5.544 |  Val. PPL: 255.618
Epoch: 02 | Time: 12m 44s
	Train Loss: 5.138 | Train PPL: 170.427
	 Val. Loss: 5.508 |  Val. PPL: 246.625
Epoch: 03 | Time: 12m 45s
	Train Loss: 5.026 | Train PPL: 152.294
	 Val. Loss: 5.529 |  Val. PPL: 251.966
Training with emb_size=256, hid_size=512, dropout=0.3, layers=2
Applying weight drop of 0.3 to weight_hh_l0
Applying weight drop of 0.3 to weight_hh_l0
Epoch: 01 | Time: 15m 37s
	Train Loss: 5.700 | Train PPL: 298.884
	 Val. Loss: 5.547 |  Val. PPL: 256.500
Epoch: 02 | Time: 15m 40s
	Train Loss: 5.210 | Train PPL: 183.028
	 Val. Loss: 5.502 |  Val. PPL: 245.296
Epoch: 03 | Time: 15m 39s
	Train Loss: 5.037 | Train PPL: 153.964
	 Val. Loss: 5.486 |  Val. PPL: 241.389
Training with emb_size=256, hid_size=512, dropout=0.5, layers

El análisis indica que una configuración con embedding_size de 512, hidden_size de 1024, dropout de 0.3 y 2 capas es la combinación óptima en el espacio de búsqueda evaluado. Esta configuración proporciona la mejor pérdida de validación, sugiriendo una mejor calidad de las traducciones producidas por el modelo.

# 6

Implementando el mecanismo de atención.

In [28]:
class Attention(nn.Module):
    def __init__(self, hid_dim):
        super().__init__()
        self.attn = nn.Linear(hid_dim * 2, hid_dim)
        self.v = nn.Linear(hid_dim, 1, bias=False)
    
    def forward(self, hidden, encoder_outputs):

        src_len = encoder_outputs.shape[0]
        
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))
        
        attention = self.v(energy).squeeze(2)
        
        return F.softmax(attention, dim=1)

In [29]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.emb_dropout = EmbeddingDropout(self.embedding, dropout)
        self.rnn = WeightDrop(nn.LSTM(emb_dim, hid_dim, n_layers), ['weight_hh_l0'], dropout=dropout)
        self.dropout = LockedDropout(dropout)
    
    def forward(self, src):
        embedded = self.emb_dropout(src)
        embedded = self.dropout(embedded)
        
        outputs, (hidden, cell) = self.rnn(embedded)
        
        return outputs, hidden, cell

In [30]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout, attention):
        super().__init__()
        
        self.output_dim = output_dim
        self.attention = attention
        
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.emb_dropout = EmbeddingDropout(self.embedding, dropout)
        self.rnn = WeightDrop(nn.LSTM(emb_dim + hid_dim, hid_dim, n_layers), ['weight_hh_l0'], dropout=dropout)
        self.fc_out = nn.Linear(emb_dim + hid_dim * 2, output_dim)
        self.dropout = LockedDropout(dropout)
    
    def forward(self, input, hidden, cell, encoder_outputs):
        
        input = input.unsqueeze(0)
        
        embedded = self.emb_dropout(input)
        embedded = self.dropout(embedded)
        
        a = self.attention(hidden[-1], encoder_outputs)
        a = a.unsqueeze(1)
        
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        
        weighted = torch.bmm(a, encoder_outputs)
        weighted = weighted.permute(1, 0, 2)
        
        rnn_input = torch.cat((embedded, weighted), dim=2)
        
        output, (hidden, cell) = self.rnn(rnn_input, (hidden, cell))
        
        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        weighted = weighted.squeeze(0)
        
        prediction = self.fc_out(torch.cat((output, weighted, embedded), dim=1))
        
        return prediction, hidden, cell

In [31]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
    
    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        trg_len = trg.shape[0]
        batch_size = trg.shape[1]
        trg_vocab_size = self.decoder.output_dim
        
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
        
        encoder_outputs, hidden, cell = self.encoder(src)
        
        input = trg[0, :]
        
        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(input, hidden, cell, encoder_outputs)
            outputs[t] = output
            top1 = output.argmax(1)
            input = trg[t] if random.random() < teacher_forcing_ratio else top1
        
        return outputs

In [32]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [33]:
def train(model, dataloader, optimizer, criterion, clip):
    model.train()
    
    epoch_loss = 0
    
    for i, (src, trg) in enumerate(dataloader):
        src = src.to(model.device)
        trg = trg.to(model.device)
        
        optimizer.zero_grad()
        
        output = model(src, trg)
        
        output_dim = output.shape[-1]
        
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)
        
        loss = criterion(output, trg)
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
    
    return epoch_loss / len(dataloader)


In [34]:
def evaluate(model, dataloader, criterion):
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
        for i, (src, trg) in enumerate(dataloader):
            src = src.to(model.device)
            trg = trg.to(model.device)
            
            output = model(src, trg, 0)  # Turn off teacher forcing
            
            output_dim = output.shape[-1]
            
            output = output[1:].view(-1, output_dim)
            trg = trg[1:].view(-1)
            
            loss = criterion(output, trg)
            
            epoch_loss += loss.item()
    
    return epoch_loss / len(dataloader)

Desarrollando la evaluación del modelo.

In [35]:

ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

attn = Attention(HID_DIM)
enc = Encoder(len(vocab_de), ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(len(vocab_en), DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT, attn)
model = Seq2Seq(enc, dec, device).to(device)


model.apply(init_weights)
optimizer = torch.optim.Adam(model.parameters())
TRG_PAD_IDX = vocab_en["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)

# Función de entrenamiento y evaluación
def train_and_evaluate(model, train_dataloader, val_dataloader, optimizer, criterion, n_epochs, clip):
    best_val_loss = float('inf')
    for epoch in range(n_epochs):
        start_time = time.time()
        train_loss = train(model, train_dataloader, optimizer, criterion, clip)
        val_loss = evaluate(model, val_dataloader, criterion)
        end_time = time.time()

        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), 'best-model.pt')

        print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
        print(f'\t Val. Loss: {val_loss:.3f} |  Val. PPL: {math.exp(val_loss):7.3f}')

    return best_val_loss

# Entrenamiento y evaluación del modelo con atención
n_epochs = 10
clip = 1
train_and_evaluate(model, train_dataloader, val_dataloader, optimizer, criterion, n_epochs, clip)

Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0
Epoch: 01 | Time: 26m 6s
	Train Loss: 5.471 | Train PPL: 237.704
	 Val. Loss: 5.411 |  Val. PPL: 223.896
Epoch: 02 | Time: 27m 19s
	Train Loss: 5.037 | Train PPL: 154.010
	 Val. Loss: 5.291 |  Val. PPL: 198.580
Epoch: 03 | Time: 27m 0s
	Train Loss: 4.760 | Train PPL: 116.720
	 Val. Loss: 5.041 |  Val. PPL: 154.635
Epoch: 04 | Time: 27m 12s
	Train Loss: 4.486 | Train PPL:  88.778
	 Val. Loss: 4.841 |  Val. PPL: 126.583
Epoch: 05 | Time: 27m 11s
	Train Loss: 4.257 | Train PPL:  70.578
	 Val. Loss: 4.723 |  Val. PPL: 112.450
Epoch: 06 | Time: 27m 9s
	Train Loss: 4.068 | Train PPL:  58.459
	 Val. Loss: 4.603 |  Val. PPL:  99.778
Epoch: 07 | Time: 27m 28s
	Train Loss: 3.894 | Train PPL:  49.112
	 Val. Loss: 4.530 |  Val. PPL:  92.729
Epoch: 08 | Time: 28m 6s
	Train Loss: 3.776 | Train PPL:  43.660
	 Val. Loss: 4.518 |  Val. PPL:  91.610
Epoch: 09 | Time: 25m 41s
	Train Loss: 3.648 | Train PPL:  38.389
	 

4.413198292255402

Se puede apreciar que a lo largo de las épocas el modelo consigue una mejora significativa. La implementación del mecanismo de atención ha tenido un impacto positivo significativo en el rendimiento del modelo Seq2Seq mejorando la capacidad del modelo para manejar secuencias largas. La reducción en las pérdidas y perplejidades a lo largo de las épocas indica que el modelo está aprendiendo de manera efectiva y generalizando bien a los datos no vistos.