Primero importamos las librerías necesarias.

In [32]:
# Si importar Field da problemas entonces hacer un downgrade de torchtext a la versión 0.6:
# !pip install torchtext==0.6.0

import os
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn import functional as F
from torchtext.data import Field, TabularDataset, BucketIterator
import spacy # librería para NLP que funciona como tokenizador

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

Cargamos el dataset con las frases en inglés y español. El dataset se obtuvo de https://www.kaggle.com/datasets/lonnieqin/englishspanish-translation-dataset.
Separamos el primer 90% de las filas (en el orden del archivo, sin barajar) para entrenamiento y el 10% restante para probar el modelo. No separamos un conjunto de validación ya que no vamos a afinar los hiperparámetros.

In [10]:
# Download the English/Spanish sentence pairs into a DataFrame.
url = 'https://raw.githubusercontent.com/Nico7102/eng-esp-transformer-pytorch/main/data.csv'
data = pd.read_csv(url)

# Positional 90/10 split: the first n rows are used for training, the rest for testing.
# NOTE(review): rows are split in file order without shuffling; if data.csv is
# ordered (e.g. by sentence length) the two subsets will not be comparable — confirm.
n = int(len(data) * 0.9)
train_data, test_data = data.iloc[:n], data.iloc[n:]

In [11]:
# Make sure the output folder exists. Only the "already exists" case is benign;
# any other OSError (permissions, missing parent, ...) now propagates instead of
# being misreported as "carpeta data ya existe".
try:
    os.mkdir('data')
    print('carpeta data creada')
except FileExistsError:
    print('carpeta data ya existe')

# Persist both splits as CSV files (without the DataFrame index column).
train_data.to_csv("data/train.csv", index=False)
test_data.to_csv("data/test.csv", index=False)
print('guardado en data')

carpeta data ya existe
guardado en data


Cargamos los tokenizadores de spaCy (uno para inglés y otro para español) y construimos los vocabularios para los textos en ambos idiomas.

In [12]:
# spaCy pipelines; only their `.tokenizer` component is used below.
spacy_en = spacy.load("en_core_web_sm")
spacy_es = spacy.load("es_core_news_sm")


def _split_keeping_whitespace(nlp, text):
    """Tokenize `text` with `nlp.tokenizer` and return each token's `text_with_ws`."""
    return [piece.text_with_ws for piece in nlp.tokenizer(text)]


def tokenize_en(text):
    """Return the `text_with_ws` string of every token `spacy_en` finds in `text`."""
    return _split_keeping_whitespace(spacy_en, text)


def tokenize_es(text):
    """Return the `text_with_ws` string of every token `spacy_es` finds in `text`."""
    return _split_keeping_whitespace(spacy_es, text)

# One Field per language; both wrap every sentence in <sos> ... <eos>.
english = Field(init_token="<sos>", eos_token="<eos>", tokenize=tokenize_en)
spanish = Field(init_token="<sos>", eos_token="<eos>", tokenize=tokenize_es)

# Maps each CSV column to (attribute name on the example, Field that processes it);
# the examples are later accessed as `.src` / `.tgt`.
fields = {'english': ('src', english), 'spanish':('tgt', spanish)}

# Load the two CSV files from the 'data' folder as torchtext datasets.
train_data, test_data = TabularDataset.splits(
    fields=fields,
    format='csv',
    path='data',
    train='train.csv',
    test='test.csv',
)

# Vocabularies are built from the training split only, with max_size=10000.
for field in (english, spanish):
    field.build_vocab(train_data, max_size=10000, min_freq=1)

In [76]:
class Transformer(nn.Module):
    """Sequence-to-sequence translation model built around `nn.Transformer`.

    Token embeddings plus learned positional embeddings are fed to
    `nn.Transformer`; a final linear layer maps the decoder output to
    target-vocabulary logits. All tensors are sequence-first: `(seq_len, batch)`.

    Args:
        src_tokenizer: callable turning a source string into a list of tokens
            (used by `translate`).
        d_model: embedding / model width.
        src_vocab_size: number of rows of the source embedding table.
        tgt_vocab_size: number of rows of the target embedding table and size
            of the output logits.
        src_pad_idx: source-vocabulary index of the padding token.
        nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, dropout:
            forwarded to `nn.Transformer`; `dropout` is also applied to the
            summed embeddings.
        window_size: number of positions in the learned positional embeddings,
            i.e. the longest sequence the model can process.
        device: device on which masks and position indices are created.
    """

    def __init__(
        self,
        src_tokenizer,
        d_model,
        src_vocab_size,
        tgt_vocab_size,
        src_pad_idx,
        nhead,
        num_encoder_layers,
        num_decoder_layers,
        dim_feedforward,
        dropout,
        window_size,
        device
    ):
        super(Transformer, self).__init__()
        self.device = device

        self.tokenize = src_tokenizer

        # Learned (not sinusoidal) positional encodings, one table per side.
        self.src_embedding = nn.Embedding(src_vocab_size, d_model)
        self.src_pos_encoding = nn.Embedding(window_size, d_model)
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.tgt_pos_encoding = nn.Embedding(window_size, d_model)

        self.transformer = nn.Transformer(
            d_model,
            nhead,
            num_encoder_layers,
            num_decoder_layers,
            dim_feedforward,
            dropout,
        )

        self.fc_out = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.src_pad_idx = src_pad_idx

    def make_src_mask(self, src):
        """Return a `(batch, src_len)` boolean mask that is True at padding positions.

        `src` is `(src_len, batch)`; it is transposed because the mask is passed
        to `nn.Transformer` as `src_key_padding_mask`.
        """
        src_mask = src.transpose(0, 1) == self.src_pad_idx
        return src_mask.to(self.device)

    def forward(self, src, tgt):
        """Compute target-vocabulary logits for every target position.

        Args:
            src: `(src_len, batch)` tensor of source token indices.
            tgt: `(tgt_len, batch)` tensor of target token indices.

        Returns:
            Tensor of shape `(tgt_len, batch, tgt_vocab_size)`.
        """
        src_seq_length, batch_size = src.shape
        tgt_seq_length, batch_size = tgt.shape

        # Position indices 0..len-1, repeated for every batch column.
        src_positions = (
            torch.arange(src_seq_length, device=self.device)
            .unsqueeze(1)
            .expand(src_seq_length, batch_size)
        )
        tgt_positions = (
            torch.arange(tgt_seq_length, device=self.device)
            .unsqueeze(1)
            .expand(tgt_seq_length, batch_size)
        )

        embed_src = self.dropout(self.src_embedding(src) + self.src_pos_encoding(src_positions))
        embed_tgt = self.dropout(self.tgt_embedding(tgt) + self.tgt_pos_encoding(tgt_positions))

        src_padding_mask = self.make_src_mask(src)
        # Causal mask: each target position may only attend to earlier ones.
        tgt_mask = self.transformer.generate_square_subsequent_mask(tgt_seq_length).to(self.device)

        out = self.transformer(
            embed_src,
            embed_tgt,
            src_key_padding_mask=src_padding_mask,
            tgt_mask=tgt_mask,
        )
        return self.fc_out(out)

    def translate(self, sentence, max_len=100):
        """Generate a translation of `sentence`, yielding one target token at a time.

        This is a generator. Tokens are *sampled* from the softmax distribution
        (not greedy argmax), so the output can differ between calls. Decoding
        runs in eval mode (dropout disabled) and the previous train/eval mode is
        restored when the generator finishes.

        Relies on the module-level `english` and `spanish` fields for the
        special tokens and the vocabularies.

        Args:
            sentence: a source string (tokenized with `self.tokenize`) or an
                iterable of already-tokenized source tokens.
            max_len: maximum number of decoding steps; capped at the size of the
                target positional-embedding table.

        Yields:
            Target-vocabulary token strings, skipping `<sos>`/`<pad>` and
            stopping at `<eos>`.

        Returns:
            Via `StopIteration.value`, the concatenation of every generated
            target token, special tokens included.

        Raises:
            ValueError: if the source (with `<sos>`/`<eos>`) is longer than the
                source positional-embedding table.
        """
        if isinstance(sentence, str):
            tokens = self.tokenize(sentence)
        else:
            tokens = list(sentence)

        tokens = [english.init_token, *tokens, english.eos_token]

        # Positions beyond the learned positional table cannot be embedded.
        src_window = self.src_pos_encoding.num_embeddings
        if len(tokens) > src_window:
            raise ValueError(
                f"La oración tiene {len(tokens)} tokens (incluyendo <sos>/<eos>), "
                f"pero el modelo admite como máximo {src_window}."
            )
        steps = min(max_len, self.tgt_pos_encoding.num_embeddings)

        text_to_indices = [english.vocab.stoi[token] for token in tokens]
        src_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(self.device)

        # Special-token indices must come from the *target* vocabulary.
        sos_idx = spanish.vocab.stoi["<sos>"]
        pad_idx = spanish.vocab.stoi["<pad>"]
        eos_idx = spanish.vocab.stoi["<eos>"]

        outputs = [sos_idx]

        was_training = self.training
        self.eval()
        try:
            for _ in range(steps):
                tgt_tensor = torch.LongTensor(outputs).unsqueeze(1).to(self.device)

                with torch.no_grad():
                    logits = self(src_tensor, tgt_tensor)

                # logits is (tgt_len, batch=1, vocab): keep only the last time step.
                probs = F.softmax(logits[-1, 0, :], dim=-1)
                guess = torch.multinomial(probs, num_samples=1).item()

                outputs.append(guess)

                if guess == sos_idx or guess == pad_idx:
                    continue
                if guess == eos_idx:
                    break

                yield spanish.vocab.itos[guess]
        finally:
            self.train(was_training)

        return ''.join([spanish.vocab.itos[idx] for idx in outputs])

In [88]:
# Hyperparameters

# Checkpoint switches: restore a saved checkpoint before training, and write a
# checkpoint at the start of every epoch.
load_model = True
save_model = True

# Optimization settings (Adam learning rate, BucketIterator batch size).
num_epochs = 30
learning_rate = 3e-4
batch_size = 32

# Model dimensions. Vocabulary sizes and the padding index are read from the
# vocabularies built above.
d_model = 512
src_vocab_size = len(english.vocab)
tgt_vocab_size = len(spanish.vocab)
src_pad_idx = english.vocab.stoi["<pad>"]
nhead = 8
num_encoder_layers = 3
num_decoder_layers = 3
# NOTE(review): this value is passed to nn.Transformer as `dim_feedforward`, the
# hidden width of each feed-forward sub-layer (nn.Transformer's default is 2048).
# 4 looks like an expansion factor was intended (e.g. 4 * d_model) — confirm.
# Changing it makes previously saved checkpoints incompatible.
dim_feedforward = 4
dropout = 0.1
# Size of the learned positional-embedding tables, i.e. the longest sequence
# the model can process.
window_size = 100

In [89]:
# Fix the PyTorch seed so weight initialisation is repeatable.
torch.manual_seed(42)

# Batches group examples by source length (sort_key) and are sorted within each batch.
train_iterator, test_iterator = BucketIterator.splits(
    (train_data, test_data),
    batch_size=batch_size,
    sort_within_batch=True,
    sort_key=lambda x: len(x.src),
    device=device,
)

# Keyword arguments keep the long positional list from silently getting out of order.
model = Transformer(
    src_tokenizer=tokenize_en,
    d_model=d_model,
    src_vocab_size=src_vocab_size,
    tgt_vocab_size=tgt_vocab_size,
    src_pad_idx=src_pad_idx,
    nhead=nhead,
    num_encoder_layers=num_encoder_layers,
    num_decoder_layers=num_decoder_layers,
    dim_feedforward=dim_feedforward,
    dropout=dropout,
    window_size=window_size,
    device=device,
).to(device)

optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Cut the learning rate by 10x when the monitored loss stops improving for 10 epochs.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, factor=0.1, patience=10, verbose=True
)

# The loss is computed over Spanish target tokens, so the ignored padding index
# must come from the Spanish vocabulary (previously read from the English one).
pad_idx = spanish.vocab.stoi["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

In [90]:
def save_checkpoint(epoch, path="eng-esp-model.tar"):
    """Write the model/optimizer state to `path`, tagged with the epoch to resume from."""
    checkpoint = {
        "epoch":epoch,
        "state_dict": model.state_dict(),
        "optimizer": optimizer.state_dict(),
    }
    print("Guardando modelo...")
    torch.save(checkpoint, path)


if load_model:
    # map_location lets a checkpoint saved on one device (e.g. GPU) be restored
    # on another (e.g. a CPU-only machine).
    # NOTE(review): this loads "eng-esp-trained.tar" while checkpoints are saved
    # to "eng-esp-model.tar" — confirm the two different file names are intentional.
    checkpoint = torch.load("eng-esp-trained.tar", map_location=device)
    model.load_state_dict(checkpoint["state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer"])
    loaded_epoch = checkpoint["epoch"]
else:
    loaded_epoch = 0

# Fixed sentence translated at the start of every epoch to eyeball progress.
sentence = "Tom pressed his ear against the wall to see if he could hear what his parents were discussing in the next room."

for epoch in range(loaded_epoch, num_epochs):
    print(f"[Epoch {epoch} / {num_epochs}]")

    # Saved *before* training this epoch, so resuming restarts at this epoch.
    if save_model:
        save_checkpoint(epoch)

    model.eval()
    print("Oración traducida:")
    for token in model.translate(sentence):
        print(token, end='')
    print('')
    model.train()

    losses = []

    for batch in train_iterator:
        inp_data = batch.src.to(device)
        target = batch.tgt.to(device)

        optimizer.zero_grad()

        # Teacher forcing: the decoder sees tokens [0..T-2] and predicts [1..T-1].
        output = model(inp_data, target[:-1, :])

        # Flatten (tgt_len, batch, vocab) -> (tgt_len * batch, vocab) for the loss.
        output = output.reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        loss = criterion(output, target)
        losses.append(loss.item())
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optimizer.step()

    # Guard against an empty iterator (would otherwise divide by zero).
    if losses:
        mean_loss = sum(losses) / len(losses)
        scheduler.step(mean_loss)

# The in-loop checkpoint is written before each epoch's training, so without this
# final save the weights produced by the last epoch would be lost.
if save_model and loaded_epoch < num_epochs:
    save_checkpoint(num_epochs)

[Epoch 5 / 30]
Guardando modelo...
Oración traducida:
Tom soñó su oído contra el muro de ver con usted.
[Epoch 6 / 30]
Guardando modelo...
Oración traducida:
Tom pasó su oído contra que saliera si alguien pudiera ver la regla.
[Epoch 7 / 30]
Guardando modelo...
Oración traducida:
Tom cogió su oído contra la pared de ver si ha leído si vendría .
[Epoch 8 / 30]
Guardando modelo...
Oración traducida:
Tom puso su oído contra la pared de entrar.
[Epoch 9 / 30]
Guardando modelo...
Oración traducida:
Tom <unk>sus oído contra la pared de estar al oír si se viene.
[Epoch 10 / 30]
Guardando modelo...
Oración traducida:
Tom respondió su oído contra la pared de ver si podía ver a alguien podría podía terrible podría cienciasi podía última costumbre que la <unk>algosi podía trajo ver si podía <unk>?
[Epoch 11 / 30]
Guardando modelo...
Oración traducida:
Tom cogió su oído contra la pared a ver si pudiera que podría conducir.
[Epoch 12 / 30]
Guardando modelo...
Oración traducida:
Tom puso su oído con

In [97]:
sentence = "He is tall and strong."

# The training loop leaves the model in train mode; switch to eval so dropout
# is disabled while translating.
model.eval()
for token in model.translate(sentence):
    print(token, end='')

Él mide alto y fuerte.