In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import re
import random
import time
import os

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

%matplotlib inline

In [15]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
os.chdir("/home/kmuenala/nlp")

In [4]:
SOS_token = 0
EOS_token = 1

class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [5]:
import torch
import torch.nn as nn

class TransformerEncoder(nn.Module):
    def __init__(self, input_size, d_model, nhead, num_layers):
        super(TransformerEncoder, self).__init__()
        self.embedding = nn.Embedding(input_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model)
        encoder_layers = nn.TransformerEncoderLayer(d_model, nhead)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers)
        self.d_model = d_model

    def forward(self, src):
        src = self.embedding(src) * math.sqrt(self.d_model)
        src = self.pos_encoder(src)
        memory = self.transformer_encoder(src)
        return memory

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [6]:
class TransformerDecoder(nn.Module):
    def __init__(self, output_size, d_model, nhead, num_layers):
        super(TransformerDecoder, self).__init__()
        self.embedding = nn.Embedding(output_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model)
        decoder_layers = nn.TransformerDecoderLayer(d_model, nhead)
        self.transformer_decoder = nn.TransformerDecoder(decoder_layers, num_layers)
        self.fc_out = nn.Linear(d_model, output_size)
        self.d_model = d_model

    def forward(self, tgt, memory):
        tgt = self.embedding(tgt) * math.sqrt(self.d_model)
        tgt = self.pos_encoder(tgt)
        output = self.transformer_decoder(tgt, memory)
        output = self.fc_out(output)
        return output

In [22]:
import torch
import torch.nn as nn
from torch import optim
import random
import math

# Definir los hiperparámetros
INPUT_SIZE = input_lang.n_words  # Número de palabras en el vocabulario del idioma fuente
OUTPUT_SIZE = output_lang.n_words  # Número de palabras en el vocabulario del idioma destino
D_MODEL = 512  # Dimensión de los embeddings y del modelo
NHEAD = 8  # Número de cabezas de atención
NUM_LAYERS = 6  # Número de capas de codificador/decodificador
BATCH_SIZE = 64
MAX_LENGTH = 10  # Longitud máxima de las oraciones
LEARNING_RATE = 0.0005
EPOCHS = 10

# Inicializar Encoder y Decoder basados en Transformer
encoder = TransformerEncoder(INPUT_SIZE, D_MODEL, NHEAD, NUM_LAYERS).to(device)
decoder = TransformerDecoder(OUTPUT_SIZE, D_MODEL, NHEAD, NUM_LAYERS).to(device)

# Definir optimizadores
encoder_optimizer = optim.Adam(encoder.parameters(), lr=LEARNING_RATE)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=LEARNING_RATE)

# Definir la función de pérdida (Cross Entropy)
criterion = nn.CrossEntropyLoss()

# Función de entrenamiento
def train_step(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length):
    encoder.train()
    decoder.train()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    loss = 0

    # Paso de codificación (encoder)
    encoder_outputs = encoder(input_tensor)

    # Inicializar el primer token para el decodificador (comienza con SOS)
    decoder_input = torch.tensor([[SOS_token]], device=device)

    for t in range(target_length):
        decoder_output = decoder(decoder_input, encoder_outputs)
        topv, topi = decoder_output.topk(1)  # Obtener la palabra con la mayor probabilidad
        decoder_input = topi.squeeze().detach()  # Actualizamos el input del decodificador

        # Calcular la pérdida
        loss += criterion(decoder_output, target_tensor[t].unsqueeze(0))

    # Backpropagation
    loss.backward()

    # Actualizar los pesos de encoder y decoder
    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

# Función para entrenar el modelo por múltiples épocas
def train(encoder, decoder, n_epochs, print_every=1000):
    for epoch in range(1, n_epochs + 1):
        total_loss = 0

        # Recorrer el dataset y entrenar
        for i, (input_tensor, target_tensor) in enumerate(data_loader):
            loss = train_step(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, MAX_LENGTH)
            total_loss += loss

            if i % print_every == 0:
                print(f"Epoch {epoch} - Step {i} - Loss: {loss}")

        print(f"Epoch {epoch} - Average Loss: {total_loss / len(data_loader)}")

# Función de evaluación
def evaluate(encoder, decoder, sentence, input_lang, output_lang, max_length):
    with torch.no_grad():
        encoder.eval()
        decoder.eval()

        # Codificar la oración de entrada
        input_tensor = tensorFromSentence(input_lang, sentence)
        encoder_outputs = encoder(input_tensor)

        decoder_input = torch.tensor([[SOS_token]], device=device)
        decoded_words = []

        for _ in range(max_length):
            decoder_output = decoder(decoder_input, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            if topi.item() == EOS_token:
                break
            else:
                decoded_words.append(output_lang.index2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        return ' '.join(decoded_words)

# Entrenar el modelo
train(encoder, decoder, EPOCHS)

TypeError: tensorFromSentence() takes 2 positional arguments but 3 were given

In [None]:
def evaluate(encoder, decoder, sentence, input_lang, output_lang):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, decoder_hidden, decoder_attn = decoder(encoder_outputs, encoder_hidden)

        _, topi = decoder_outputs.topk(1)
        decoded_ids = topi.squeeze()

        decoded_words = []
        for idx in decoded_ids:
            if idx.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            decoded_words.append(output_lang.index2word[idx.item()])
    return decoded_words, decoder_attn

In [None]:
def evaluateRandomly(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(pairs)
        print('>', pair[0])
        print('=', pair[1])
        output_words, _ = evaluate(encoder, decoder, pair[0], input_lang, output_lang)
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

In [8]:
MAX_LENGTH = 10

def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z!?]+", r" ", s)
    return s.strip()

def filterPair(p):
    try:
        return len(p[0].split(' ')) < MAX_LENGTH and \
            len(p[1].split(' ')) < MAX_LENGTH #and \
#            p[0].startswith(eng_prefixes)
    except:
        print(p)

def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]



In [9]:
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]

def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(1, -1)

def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

In [10]:
def prepareData(lang1, lang2, file):
    text = open(file, encoding='utf-8').read().split('\n')
    pairs = [[normalizeString(s) for s in l.split('\t')][:2] for l in text ]
    pairs = [pair for pair in pairs if len(pair) == 2]

    input_lang = Lang(lang1)
    output_lang = Lang(lang2)

    pairs = filterPairs(pairs)

    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs

In [None]:
path = "./models/"
encoder = torch.load(path+"translate_sp_en_encoder.pt")
decoder = torch.load(path+"translate_sp_en_decoder.pt")

In [12]:
file = 'data/spa.txt'
input_lang, output_lang, pairs = prepareData('eng', 'spa', file)

Counted words:
eng 12105
spa 23411


In [18]:
from torch.utils.data import Dataset, DataLoader

class TranslationDataset(Dataset):
    def __init__(self, pairs, input_lang, output_lang, max_length=10):
        self.pairs = pairs
        self.input_lang = input_lang
        self.output_lang = output_lang
        self.max_length = max_length

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        input_sentence, target_sentence = self.pairs[idx]

        # Convertir las oraciones en tensores
        input_tensor = tensorFromSentence(self.input_lang, input_sentence, self.max_length)
        target_tensor = tensorFromSentence(self.output_lang, target_sentence, self.max_length)

        def indexesFromSentence(lang, sentence):
            return [lang.word2index[word] for word in sentence.split(' ')]

        def tensorFromSentence(lang, sentence):
            indexes = indexesFromSentence(lang, sentence)
            indexes.append(EOS_token)
            return torch.tensor(indexes, dtype=torch.long, device=device).view(1, -1)

        def tensorsFromPair(pair):
            input_tensor = tensorFromSentence(input_lang, pair[0])
            target_tensor = tensorFromSentence(output_lang, pair[1])
            return (input_tensor, target_tensor)

        return input_tensor, target_tensor


In [21]:
# Crear un dataset
dataset = TranslationDataset(pairs, input_lang, output_lang, max_length=MAX_LENGTH)

# Crear un DataLoader
data_loader = DataLoader(dataset, batch_size=64, shuffle=True)


In [None]:
encoder.eval()
decoder.eval()
evaluateRandomly(encoder, decoder)

> that wasn t my fault
= eso no fue mi culpa
< no fue mi culpa mia <EOS>

> please smile
= sonreid
< favor le puso gasolina en las armas <EOS>

> she s at a meeting
= ella esta en una reunion
< ella es un aliado en la reunion <EOS>

> you learn something new every day
= cada dia aprendes algo nuevo
< aprendiste un nuevo dia de ingles <EOS>

> this machine can print sixty pages a minute
= este aparato puede imprimir sesenta paginas por minuto
< este aparato puede imprimir sesenta paginas por minuto <EOS>

> is this price acceptable ?
= es aceptable el precio ?
< es aceptable el precio de la mia ? <EOS>

> why are you wearing my coat ?
= por que estas usando mi abrigo ?
< por que lleva usted mi abrigo ? <EOS>

> tom was just as scared as mary was
= tom estaba tan asustado como lo estaba mary
< tom estaba tan asustado como lo estaba mary <EOS>

> this was his one and only hope
= esta era su unica esperanza
< era su unica parque solo tiene una necesidad urgente <EOS>

> did you already do 

In [None]:
evaluate(encoder, decoder, 'she is my sister', input_lang, output_lang)

(['ella', 'es', 'mi', 'hermana', 'mayor', '<EOS>'], None)

In [None]:
evaluate(encoder, decoder, 'i am cleaning my house', input_lang, output_lang)

(['estoy', 'contento', 'de', 'mi', 'casa', 'es', 'verde', '<EOS>'], None)

In [None]:
evaluate(encoder, decoder, 'when is homework due ?', input_lang, output_lang)

(['cuando', 'puedo', 'hacer', 'los', 'documentos', '?', '<EOS>'], None)

In [None]:
evaluate(encoder, decoder, 'i m scared', input_lang, output_lang)

(['tengo', 'miedo', 'de', 'tener', 'miedo', '<EOS>'], None)

In [None]:
evaluate(encoder, decoder, 'what is my name ?', input_lang, output_lang)

(['como', 'se', 'llama', 'mi', '?', '<EOS>'], None)

In [None]:
evaluate(encoder, decoder, 'what is your name ?', input_lang, output_lang)

(['cual', 'es', 'tu', 'nombre', '?', '<EOS>'], None)

In [None]:
evaluate(encoder, decoder, 'what is her name ?', input_lang, output_lang)

(['como', 'se', 'llama', 'tu', 'nombre', '?', '<EOS>'], None)

In [None]:
evaluate(encoder, decoder, 'what is his name ?', input_lang, output_lang)

(['como', 'se', 'llama', 'tu', 'nombre', '?', '<EOS>'], None)

In [None]:
evaluate(encoder, decoder, 'many years later', input_lang, output_lang)

(['muchos', 'anos', 'mas', 'lejos', 'mucho', '<EOS>'], None)

In [None]:
evaluate(encoder, decoder, 'i m taking an english class', input_lang, output_lang)

(['estoy', 'bebiendo', 'un', 'vaso', 'de', 'ingles', '<EOS>'], None)

In [None]:
evaluate(encoder, decoder, 'i m studying in an english class', input_lang, output_lang)

(['estoy', 'estudiando', 'ingles', 'en', 'el', 'estudio', '<EOS>'], None)

In [None]:
evaluate(encoder, decoder, 'they play soccer', input_lang, output_lang)

(['juegan', 'al', 'futbol', 'al', 'futbol', '<EOS>'], None)

In [None]:
play v. -> jugar
play n. -> actividad de entretenimiento

In [None]:
evaluate(encoder, decoder, 'my brother never showed up in my party', input_lang, output_lang)

(['mi', 'hermano', 'nunca', 'se', 'fue', 'en', 'mi', 'fiesta', '<EOS>'], None)

In [None]:
path = "./models/"
encoder_attn = torch.load(path+"translate_sp_en_attn_encoder.pt")
decoder_attn = torch.load(path+"translate_sp_en_attn_decoder.pt")

In [None]:
evaluate(encoder_attn, decoder_attn, 'she is my sister', input_lang, output_lang)[0]

['ella', 'es', 'mi', 'hermana', 'llamado', '<EOS>']