In [1]:
import random
import numpy as np
import pandas as pd
import re
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm
from nltk.tokenize import RegexpTokenizer

In [2]:
punct_regex = RegexpTokenizer(r'\w+')

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [16]:
class Vocabulary(object):
    def __init__(self, token_to_idx=None):
 
        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx = token_to_idx
 
        self._idx_to_token = {idx: token 
                              for token, idx in self._token_to_idx.items()}
 
    def add_token(self, token):
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index
 
    def lookup_token(self, token):
        return self._token_to_idx[token]
 
    def lookup_index(self, index):
        if index not in self._idx_to_token:
            raise KeyError("the index (%d) is not in the Vocabulary" % index)
        return self._idx_to_token[index]
 
    def __str__(self):
        return "<Vocabulary(size=%d)>" % len(self)
 
    def __len__(self):
        return len(self._token_to_idx)

    
class SequenceVocabulary(Vocabulary):
    def __init__(self, token_to_idx=None, unk_token="<UNK>",
                 mask_token="<MASK>", begin_seq_token="<BEGIN>",
                 end_seq_token="<END>"):
 
        super(SequenceVocabulary, self).__init__(token_to_idx)
 
        self._mask_token = mask_token
        self._unk_token = unk_token
        self._begin_seq_token = begin_seq_token
        self._end_seq_token = end_seq_token
 
        self.mask_index = self.add_token(self._mask_token)
        self.unk_index = self.add_token(self._unk_token)
        self.begin_seq_index = self.add_token(self._begin_seq_token)
        self.end_seq_index = self.add_token(self._end_seq_token)
 
    def lookup_token(self, token):
        if self.unk_index >= 0:
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]

        
class NMTVectorizer(object):
    def __init__(self, source_vocab, target_vocab, max_source_length, max_target_length):
        self.source_vocab = source_vocab
        self.target_vocab = target_vocab
        
        self.max_source_length = max_source_length
        self.max_target_length = max_target_length        
        
    @classmethod
    def from_dataframe(cls, bitext_df):
        source_vocab = SequenceVocabulary()
        target_vocab = SequenceVocabulary()
        
        max_source_length = 0
        max_target_length = 0
 
        for _, row in bitext_df.iterrows():
            source_tokens = punct_regex.tokenize(row["source_language"])
            if len(source_tokens) > max_source_length:
                max_source_length = len(source_tokens)
            for token in source_tokens:
                source_vocab.add_token(token)
            
            target_tokens = punct_regex.tokenize(row["target_language"])
            if len(target_tokens) > max_target_length:
                max_target_length = len(target_tokens)
            for token in target_tokens:
                target_vocab.add_token(token)
            
        return cls(source_vocab, target_vocab, max_source_length, max_target_length)
    
    def get_vector(self, text, source=True, target=False):
        if source:
            vocab = self.source_vocab
            max_seq_len = self.max_source_length
        else:
            vocab = self.target_vocab
            max_seq_len = self.max_target_length
            
        vector = np.zeros(max_seq_len + 2, dtype=np.int64)
        
        vector[0] = vocab.lookup_token(vocab._begin_seq_token)
        
        for i in range(len(text)):
            vector[i+1] = vocab.lookup_token(text[i])
        
        vector[len(text) + 1] = vocab.lookup_token(vocab._end_seq_token)
        
        return vector
    
    def vectorize(self, source_text, target_text):
        source_text = punct_regex.tokenize(source_text)
        target_text = punct_regex.tokenize(target_text)
        
        source_vector = self.get_vector(source_text)
        target_vector = self.get_vector(target_text, False, True)
        
        return source_vector, target_vector
    
class NMTDataset(Dataset):
    def __init__(self, text_df, vectorizer):
        self.text_df = text_df
        self._vectorizer = vectorizer
 
        self.train_df = self.text_df[self.text_df.split=='train']
        self.train_size = len(self.train_df)
 
        self.val_df = self.text_df[self.text_df.split=='val']
        self.validation_size = len(self.val_df)
 
        self.test_df = self.text_df[self.text_df.split=='test']
        self.test_size = len(self.test_df)
 
        self._lookup_dict = {'train': (self.train_df, self.train_size),
                             'val': (self.val_df, self.validation_size),
                             'test': (self.test_df, self.test_size)}
 
        self.set_split('train')
 
    @classmethod
    def load_dataset_and_make_vectorizer(cls, dataset_csv):
        text_df = pd.read_csv(dataset_csv)
        train_subset = text_df[text_df.split=='train']
        return cls(text_df, NMTVectorizer.from_dataframe(train_subset))
 
    def set_split(self, split="train"):
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]
    
    def get_vectorizer(self):
        return self._vectorizer
 
    def __len__(self):
        return self._target_size
 
    def __getitem__(self, index):
        row = self._target_df.iloc[index]
 
        source_vector, target_vector = self._vectorizer.vectorize(row.source_language, row.target_language)
        
        return {"x_source": source_vector,
               "y_target": target_vector}

In [17]:
def get_batch_loader(dataset, batch_size):
    data_indices = np.arange(dataset._target_size)    
    np.random.shuffle(data_indices)
    
    data = []
    for index in data_indices:
        data.append(dataset[index])
        
    data_loader = DataLoader(data, batch_size=batch_size)
        
    return data_loader

In [21]:
dataset = NMTDataset.load_dataset_and_make_vectorizer("../../PyTorchNLPBook/data/nmt/simplest_eng_fra.csv")
vectorizer = dataset.get_vectorizer()
padding_index = vectorizer.source_vocab.mask_index

EMBEDDING_SIZE = 256
RNN_HIDDEN_SIZE = 64
NUM_EPOCHS = 100
LEARNING_RATE = 0.001
PRINT_EVERY = 10
BATCH_SIZE = 64
SOURCE_VOCAB_SIZE = len(vectorizer.source_vocab._token_to_idx)
TARGET_VOCAB_SIZE = len(vectorizer.target_vocab._token_to_idx)

dataset.set_split("train")
train_loader = get_batch_loader(dataset, BATCH_SIZE)

dataset.set_split("val")
val_loader = get_batch_loader(dataset, BATCH_SIZE)

In [20]:
class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, p):
        super(Encoder, self).__init__()
        self.dropout = nn.Dropout(p)
        
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.GRU(embedding_size, hidden_size, bidirectional = True)
        self.fc_hidden = nn.Linear(hidden_size * 2, hidden_size)
    
    def forward(self, x):
        x_embed = self.dropout(self.embedding(x))
        
        encoder_output, hidden = self.rnn(x_embed)
        
        hidden = self.fc_hidden(torch.cat((hidden[0:1], hidden[1:2]) , dim = 2))
        
        return encoder_output, hidden
    

class Decoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, output_size, p):
        super(Decoder, self).__init__()
        
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.GRU(hidden_size * 2 + embedding_size, hidden_size)
        
        self.energy = nn.Linear(hidden_size * 3, 1)
        self.fc = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(p)
        self.softmax = nn.Softmax(dim = 0)
        self.relu = nn.ReLU()
        
    def forward(self, x, encoder_states, hidden):
        x = x.unsqueeze(0)
        
        x_embed = self.dropout(self.embedding(x))
        
        seq_len = encoder_states.shape[0]
        
        h_reshaped = hidden.repeat(seq_len, 1, 1)
        
        energy = self.relu(self.energy(torch.cat((h_reshaped, encoder_states), dim=2)))
        
        attention = self.softmax(energy)
        
        context_vector = torch.einsum("snk,snl->knl", attention, encoder_states)

        rnn_input = torch.cat((context_vector, x_embed), dim=2)

        outputs, hidden = self.rnn(rnn_input, hidden)

        predictions = self.fc(outputs).squeeze(0)

        return predictions, hidden


class Seq2Seq(nn.Module):
    def __init__(self, source_vocab_size, 
                target_vocab_size,
                target_output_size,
                embedding_size,
                rnn_hidden_size,
                dropout_p):
        super(Seq2Seq, self).__init__()
        
        self.encoder = Encoder(source_vocab_size,
                               embedding_size,
                               rnn_hidden_size, 
                               dropout_p).to(device)
        
        self.decoder = Decoder(target_vocab_size,
                               embedding_size, 
                               rnn_hidden_size,
                               target_output_size, 
                               dropout_p).to(device)
        
    def forward(self, source, target):
        encoder_states, hidden = self.encoder(source)
        
        target_len = target.shape[0]
        batch_size = target.shape[1]
        target_vocab_len = TARGET_VOCAB_SIZE
        
        outputs = torch.zeros(target_len, batch_size, target_vocab_len).to(device)
        x = target[0]
        
        for i in range(1, target_len):
            out, hidden = self.decoder(x, encoder_states, hidden)
            
            outputs[i] = out
            
            best_guess = out.argmax(1)
            
            if random.random() < 0.5:
                x = target[i]
            else:
                x = best_guess
        
        return outputs
    
# model = Seq2Seq(SOURCE_VOCAB_SIZE, 
#                 TARGET_VOCAB_SIZE, 
#                 TARGET_VOCAB_SIZE, 
#                 EMBEDDING_SIZE, 
#                 RNN_HIDDEN_SIZE,
#                 0.1).to(device)
# print(vectorizer.max_source_length)
# print(vectorizer.max_target_length)
# with torch.no_grad():
#     for _, batch in enumerate(train_loader):
#         print(batch['x_source'].shape)
#         print(batch['y_target'].shape)
#         source = batch['x_source'].permute(1, 0)
#         target = batch['y_target'].permute(1, 0)
        
#         print(source.shape)
#         print(source)
        
#         print(source[:,0])
#         y_pred = model(source.to(device), target.to(device))
                
#         y_pred = y_pred[1:].reshape(-1, y_pred.shape[2])
#         # y_target = target[1:].reshape(-1)
        
        # loss = loss_func(y_pred.to(device), y_target.to(device))
        
        # print("loss:", loss)
#         break

#         print(target.shape)

# print(len(train_loader))

In [22]:
model = Seq2Seq(SOURCE_VOCAB_SIZE, 
                TARGET_VOCAB_SIZE, 
                TARGET_VOCAB_SIZE, 
                EMBEDDING_SIZE, 
                RNN_HIDDEN_SIZE,
                0.1).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
loss_func = nn.CrossEntropyLoss(ignore_index=padding_index)

In [23]:
for epoch in tqdm(range(NUM_EPOCHS), position = 0, leave=True):
    
    running_loss = 0.0
    model.train()
    
    for batch_index, batch in enumerate(train_loader):
        optimizer.zero_grad()
        
        source = batch['x_source'].permute(1, 0)
        target = batch['y_target'].permute(1, 0)

        y_pred = model(source.to(device), target.to(device))
        y_pred = y_pred[1:].reshape(-1, y_pred.shape[2])
        trg = target[1:].reshape(-1)
        
        loss = loss_func(y_pred.to(device), trg.to(device))
        loss.backward()

        optimizer.step()
        running_loss += (loss.item() - running_loss) / (batch_index + 1)
        
    val_running_loss = 0.0
    model.eval()
        
    for batch_index, batch in enumerate(val_loader):
        
        source = batch['x_source'].permute(1, 0)
        target = batch['y_target'].permute(1, 0)

        y_pred = model(source.to(device), target.to(device))
        y_pred = y_pred[1:].reshape(-1, y_pred.shape[2])
        trg = target[1:].reshape(-1)
        
        loss = loss_func(y_pred.to(device), trg.to(device))

        val_running_loss += (loss.item() - val_running_loss) / (batch_index + 1)
        
    if epoch == 0 or (epoch+1) % PRINT_EVERY == 0:
        print(f"Epoch: {epoch + 1} / {NUM_EPOCHS}. Train Loss: {running_loss}. Validation Loss: {val_running_loss}")

  0%|          | 0/100 [00:00<?, ?it/s]

Epoch: 1 / 100. Train Loss: 5.160566490013282. Validation Loss: 4.343281284455331
Epoch: 10 / 100. Train Loss: 2.3812982644234517. Validation Loss: 3.079926052401143
Epoch: 20 / 100. Train Loss: 1.4448584484887295. Validation Loss: 3.0030476047146712
Epoch: 30 / 100. Train Loss: 1.0277860352209398. Validation Loss: 3.0656183073597574
Epoch: 40 / 100. Train Loss: 0.8199761922542864. Validation Loss: 3.227133081805321
Epoch: 50 / 100. Train Loss: 0.6647504586856685. Validation Loss: 3.307442818918536
Epoch: 60 / 100. Train Loss: 0.5624445570515587. Validation Loss: 3.407812718422182
Epoch: 70 / 100. Train Loss: 0.4639524603848691. Validation Loss: 3.4625304898908067
Epoch: 80 / 100. Train Loss: 0.43850858353234673. Validation Loss: 3.498381322430026
Epoch: 90 / 100. Train Loss: 0.3841516832996914. Validation Loss: 3.6148998814244426
Epoch: 100 / 100. Train Loss: 0.3451737315504702. Validation Loss: 3.713799230514035


In [29]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"([.,!?])", r" \1 ", text)
    text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text)
    return text

def translate_sentence(source_sentence, target_sentence, classifier, vectorizer, decision_threshold=0.5):
    source_sentence = preprocess_text(source_sentence)
    target_sentence = preprocess_text(target_sentence)
    
    source_vector, target_vector = vectorizer.vectorize(source_sentence, target_sentence)

    source_tensor = torch.tensor(source_vector).unsqueeze(0).permute(1, 0)
    target_tensor = torch.tensor(target_vector).unsqueeze(0).permute(1, 0)
    
    with torch.no_grad():
        result = classifier(source_tensor, target_tensor)
        result = result[1:].reshape(-1, result.shape[2])

    indices = torch.argmax(result, 1)
    word_arr = []
    for i in range(indices.shape[0]):
        index = indices[i].item()
        if index > 3:
            word_arr.append(vectorizer.target_vocab._idx_to_token[index])

    return ' '.join(word_arr)

source_sentence = "you are to do as i tell you"
target_sentence = "vous devez faire comme je vous dis"

model = model.cpu()
prediction = translate_sentence(source_sentence, target_sentence, model, dataset.get_vectorizer())

print("Source:", source_sentence)
print("Target:", target_sentence)
print("Prediction:", prediction)

Source: you are to do as i tell you
Target: vous devez faire comme je vous dis
Prediction: vous devez faire comme je te dis
