In [1]:
import random
import numpy as np
import pandas as pd
import re
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm
from nltk.tokenize import RegexpTokenizer

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

punct_regex = RegexpTokenizer(r'\w+')

cuda


In [3]:
class Vocabulary(object):
    def __init__(self, token_to_idx=None):
 
        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx = token_to_idx
 
        self._idx_to_token = {idx: token 
                              for token, idx in self._token_to_idx.items()}
 
    def add_token(self, token):
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index
 
    def lookup_token(self, token):
        return self._token_to_idx[token]
 
    def lookup_index(self, index):
        if index not in self._idx_to_token:
            raise KeyError("the index (%d) is not in the Vocabulary" % index)
        return self._idx_to_token[index]
 
    def __str__(self):
        return "<Vocabulary(size=%d)>" % len(self)
 
    def __len__(self):
        return len(self._token_to_idx)

    
class SequenceVocabulary(Vocabulary):
    def __init__(self, token_to_idx=None, unk_token="<UNK>",
                 mask_token="<MASK>", begin_seq_token="<BEGIN>",
                 end_seq_token="<END>"):
 
        super(SequenceVocabulary, self).__init__(token_to_idx)
 
        self._mask_token = mask_token
        self._unk_token = unk_token
        self._begin_seq_token = begin_seq_token
        self._end_seq_token = end_seq_token
 
        self.mask_index = self.add_token(self._mask_token)
        self.unk_index = self.add_token(self._unk_token)
        self.begin_seq_index = self.add_token(self._begin_seq_token)
        self.end_seq_index = self.add_token(self._end_seq_token)
 
    def lookup_token(self, token):
        if self.unk_index >= 0:
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]

        
class NMTVectorizer(object):
    def __init__(self, source_vocab, target_vocab, max_source_length, max_target_length):
        self.source_vocab = source_vocab
        self.target_vocab = target_vocab
        
        self.max_source_length = max_source_length
        self.max_target_length = max_target_length        
        
    @classmethod
    def from_dataframe(cls, bitext_df):
        source_vocab = SequenceVocabulary()
        target_vocab = SequenceVocabulary()
        
        max_source_length = 0
        max_target_length = 0
 
        for _, row in bitext_df.iterrows():
            source_tokens = punct_regex.tokenize(row["source_language"])
            if len(source_tokens) > max_source_length:
                max_source_length = len(source_tokens)
            for token in source_tokens:
                source_vocab.add_token(token)
            
            target_tokens = punct_regex.tokenize(row["target_language"])
            if len(target_tokens) > max_target_length:
                max_target_length = len(target_tokens)
            for token in target_tokens:
                target_vocab.add_token(token)
            
        return cls(source_vocab, target_vocab, max_source_length, max_target_length)
    
    def get_vector(self, text, source=True, target=False):
        if source:
            vocab = self.source_vocab
            max_seq_len = self.max_source_length
        else:
            vocab = self.target_vocab
            max_seq_len = self.max_target_length
            
        vector = np.zeros(max_seq_len + 2, dtype=np.int64)
        
        vector[0] = vocab.lookup_token(vocab._begin_seq_token)
        
        for i in range(len(text)):
            vector[i+1] = vocab.lookup_token(text[i])
        
        vector[len(text) + 1] = vocab.lookup_token(vocab._end_seq_token)
        
        return vector
    
    def vectorize(self, source_text, target_text):
        source_text = punct_regex.tokenize(source_text)
        target_text = punct_regex.tokenize(target_text)
        
        source_vector = self.get_vector(source_text)
        target_vector = self.get_vector(target_text, False, True)
        
        return source_vector, target_vector
    
class NMTDataset(Dataset):
    def __init__(self, text_df, vectorizer):
        self.text_df = text_df
        self._vectorizer = vectorizer
 
        self.train_df = self.text_df[self.text_df.split=='train']
        self.train_size = len(self.train_df)
 
        self.val_df = self.text_df[self.text_df.split=='val']
        self.validation_size = len(self.val_df)
 
        self.test_df = self.text_df[self.text_df.split=='test']
        self.test_size = len(self.test_df)
 
        self._lookup_dict = {'train': (self.train_df, self.train_size),
                             'val': (self.val_df, self.validation_size),
                             'test': (self.test_df, self.test_size)}
 
        self.set_split('train')
 
    @classmethod
    def load_dataset_and_make_vectorizer(cls, dataset_csv):
        text_df = pd.read_csv(dataset_csv)
        train_subset = text_df[text_df.split=='train']
        return cls(text_df, NMTVectorizer.from_dataframe(train_subset))
 
    def set_split(self, split="train"):
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]
    
    def get_vectorizer(self):
        return self._vectorizer
 
    def __len__(self):
        return self._target_size
 
    def __getitem__(self, index):
        row = self._target_df.iloc[index]
 
        source_vector, target_vector = self._vectorizer.vectorize(row.source_language, row.target_language)
        
        return {"x_source": source_vector,
               "y_target": target_vector}

In [4]:
def get_batch_loader(dataset, batch_size):
    data_indices = np.arange(dataset._target_size)    
    np.random.shuffle(data_indices)
    
    data = []
    for index in data_indices:
        data.append(dataset[index])
        
    data_loader = DataLoader(data, batch_size=batch_size)
        
    return data_loader

class PositionalEncoder(nn.Module):
    def __init__(self, d_model, max_seq_len = 80):
        super().__init__()
        self.d_model = d_model
        
        # create constant 'pe' matrix with values dependant on 
        # pos and i
        pe = torch.zeros(max_seq_len, d_model)
        for pos in range(max_seq_len):
            for i in range(0, d_model, 2):
                pe[pos, i] = \
                math.sin(pos / (10000 ** ((2 * i)/d_model)))
                pe[pos, i + 1] = \
                math.cos(pos / (10000 ** (((2 * i) + 1)/d_model)))
                
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)
 
    
    def forward(self, x):
        # make embeddings relatively larger
        x = x * math.sqrt(self.d_model)
        #add constant to embedding
        seq_len = x.size(1)
        x = x + Variable(self.pe[:,:seq_len], requires_grad=False).cuda()
        return x

In [6]:
dataset = NMTDataset.load_dataset_and_make_vectorizer("../../PyTorchNLPBook/data/nmt/simplest_eng_fra.csv")
vectorizer = dataset.get_vectorizer()
padding_index = vectorizer.source_vocab.mask_index

EMBEDDING_DIM = 512
FF_DIM = 4
NUM_HEADS = 8
NUM_LAYERS = 3
DROPOUT = 0.1
SOURCE_VOCAB_SIZE = len(vectorizer.source_vocab._token_to_idx)
TARGET_VOCAB_SIZE = len(vectorizer.target_vocab._token_to_idx)

BATCH_SIZE = 128
LEARNING_RATE = 3e-4

NUM_EPOCHS = 50
PRINT_EVERY = NUM_EPOCHS / 10

dataset.set_split("train")
train_loader = get_batch_loader(dataset, BATCH_SIZE)

dataset.set_split("val")
val_loader = get_batch_loader(dataset, BATCH_SIZE)

In [7]:
class AttentionHead(nn.Module):
    def __init__(self, emb_dim, dim_kqv):
        super(AttentionHead, self).__init__()
        self.dim_kqv = dim_kqv
        
        self.wq = nn.Linear(emb_dim, dim_kqv)
        self.wk = nn.Linear(emb_dim, dim_kqv)        
        self.wv = nn.Linear(emb_dim, dim_kqv)
        
    def forward(self, q, k, v, mask):
        queries = self.wq(q)
        keys = self.wk(k)
        values = self.wv(v)
        
        score = queries.bmm(keys.transpose(1, 2))     

        score = torch.div(score, self.dim_kqv ** 0.5, rounding_mode='floor')
        
        if mask is not None:
            score = score.masked_fill(mask == 0, -1e9)
#             print("scaled score with mask:")
#             print(score)
        
        softmax = F.softmax(score, dim = -1)

        return softmax.bmm(values)
    
class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, emb_dim, dim_kqv):
        super(MultiHeadAttention, self).__init__()
        self.heads = nn.ModuleList(
            [AttentionHead(emb_dim, dim_kqv) for _ in range(num_heads)]
        )
        
        self.w0 = nn.Linear(num_heads * dim_kqv, emb_dim)
        
    def forward(self, q, k, v, mask):
        attentions = [h(q, k, v, mask) for h in self.heads]
        attentions = torch.cat(attentions, dim = -1)
        out = self.w0(attentions)
        
        return out

class Residual(nn.Module):
    def __init__(self, sublayer, dimension, dropout):
        super(Residual, self).__init__()
        self.sublayer = sublayer
        self.norm = nn.LayerNorm(dimension)
        self.dropout = nn.Dropout(dropout)

    def forward(self, *tensors):
        return self.dropout(self.norm(tensors[0] + self.sublayer(*tensors)))
    
class FeedForward(nn.Module):
    def __init__(self, emb_dim, ff_dim):
        super(FeedForward, self).__init__()
        self.network = nn.Sequential(
            nn.Linear(emb_dim, ff_dim),
            nn.ReLU(),
            nn.Linear(ff_dim, emb_dim)
        )
        
    def forward(self, residual_out):
        return self.network(residual_out)
    

class EncoderLayer(nn.Module):
    def __init__(self, emb_dim, num_heads, ff_dim, dropout):
        super(EncoderLayer, self).__init__()
        self.dim_kqv = emb_dim // num_heads
        
        assert (self.dim_kqv * num_heads == emb_dim), "Embedding size must be divisible by number of heads" 
        
        self.attention = Residual(
            MultiHeadAttention(num_heads, emb_dim, self.dim_kqv),
            dimension=emb_dim,
            dropout=dropout,
        )
    
        self.feed_forward = Residual(
            FeedForward(emb_dim, ff_dim),
            dimension=emb_dim,
            dropout=dropout,
        )
        
    def forward(self, src, mask):
        src = self.attention(src, src, src, mask)
        out = self.feed_forward(src)
        return out
    
class Encoder(nn.Module):
    def __init__(self, 
                 emb_dim, 
                 num_heads, 
                 ff_dim, 
                 num_layers, 
                 src_vocab_size,
                 padding_index,
                 dropout):
        
        super(Encoder, self).__init__()
        self.layers = nn.ModuleList([
            EncoderLayer(emb_dim,
                         num_heads,
                         ff_dim,
                         dropout)
            for _ in range(num_layers)
        ])
        
        self.embedding = nn.Embedding(src_vocab_size, emb_dim, padding_idx=0)
        self.pe = PositionalEncoder(emb_dim)
        
    def forward(self, src):
        src = self.embedding(src)
        
        src = self.pe(src)
        
        for layer in self.layers:
            src = layer(src, None)
            
        return src
    
class DecoderLayer(nn.Module):
    def __init__(self, emb_dim, num_heads, ff_dim, dropout):
        super(DecoderLayer, self).__init__()
        
        self.dim_kqv = emb_dim // num_heads
        
        assert (self.dim_kqv * num_heads == emb_dim), "Embedding size must be divisible by number of heads"
        
        self.attention_1 = Residual(
            MultiHeadAttention(num_heads, emb_dim, self.dim_kqv),
            dimension=emb_dim,
            dropout=dropout
        )
        
        self.attention_2 = Residual(
            MultiHeadAttention(num_heads, emb_dim, self.dim_kqv),
            dimension=emb_dim,
            dropout=dropout
        )
        
        self.feed_forward = Residual(
            FeedForward(emb_dim, ff_dim),
            dimension=emb_dim,
            dropout=dropout
        )
        
    def forward(self, trg, memory, mask):
        query = self.attention_1(trg, trg, trg, mask)
        attentions = self.attention_2(query, memory, memory, None)
        out = self.feed_forward(attentions)
        
        return out

class Decoder(nn.Module):
    def __init__(self, 
                 emb_dim, 
                 num_heads, 
                 ff_dim, 
                 num_layers, 
                 out_size, 
                 padding_index,
                 dropout):
        super(Decoder, self).__init__()
        self.layers = nn.ModuleList([
            DecoderLayer(emb_dim, num_heads, ff_dim, dropout)
            for _ in range(num_layers)
        ])
        
        self.embedding = nn.Embedding(out_size, emb_dim, padding_idx=padding_index)
        self.pe = PositionalEncoder(emb_dim)

    def make_trg_mask(self, trg):
        batch_size, seq_len = trg.shape[0], trg.shape[1]
        mask = torch.tril(torch.ones(batch_size, seq_len, seq_len))
        return mask
        
    def forward(self, trg, encoder_out):
        trg = self.embedding(trg)
        
        trg = self.pe(trg)

        mask = self.make_trg_mask(trg).to(trg.get_device())
        
        for layer in self.layers:
            trg = layer(trg, encoder_out, mask)
            
        # return self.lin(trg)
        return trg

class VanillaTransformer(nn.Module):
    def __init__(self, 
                 emb_dim, 
                 num_heads, 
                 ff_dim, 
                 num_layers, 
                 src_vocab_size, 
                 trg_vocab_size,
                 device,
                 padding_index,
                 dropout):
        super(VanillaTransformer, self).__init__()
        
        self.encoder = Encoder(emb_dim, 
                               num_heads, 
                               ff_dim,
                               num_layers, 
                               src_vocab_size,
                               padding_index,
                               dropout).to(device)
        
        self.decoder = Decoder(emb_dim,
                               num_heads,
                               ff_dim, 
                               num_layers,
                               trg_vocab_size,
                               padding_index,
                               dropout).to(device)

        self.lin = nn.Linear(emb_dim, trg_vocab_size)
        
        
    def forward(self, src, trg):
        encoder_out = self.encoder(src)

        decoder_out = self.decoder(trg, encoder_out)
        
        out = self.lin(decoder_out)

        return out
    
model = VanillaTransformer(EMBEDDING_DIM,
                           NUM_HEADS,
                           FF_DIM,
                           NUM_LAYERS,
                           SOURCE_VOCAB_SIZE,
                           TARGET_VOCAB_SIZE,
                           device,
                           padding_index,
                           DROPOUT).to(device)


optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
loss_func = nn.CrossEntropyLoss(ignore_index=padding_index)

for epoch in tqdm(range(NUM_EPOCHS), position = 0, leave = True):
    
    running_loss = 0.0
    model.train()
    
    for batch_index, batch in enumerate(train_loader):
        optimizer.zero_grad()
        
        src = batch['x_source']
        trg = batch['y_target']
        
        y_pred = model(src.to(device), trg[:, :-1].to(device))
        y_pred = y_pred.reshape(-1, y_pred.shape[2])
        
        loss = loss_func(y_pred, trg[:, 1:].reshape(-1).to(device))
        loss.backward()

        running_loss += (loss.item() - running_loss) / (batch_index + 1)
        optimizer.step()

    val_running_loss = 0.0
    model.eval()

    for batch_index, batch in enumerate(val_loader):
        src = batch['x_source']
        trg = batch['y_target']
        
        y_pred = model(src.to(device), trg[:, :-1].to(device))
        y_pred = y_pred.reshape(-1, y_pred.shape[2])
        
        loss = loss_func(y_pred, trg[:, 1:].reshape(-1).to(device))
        
        val_running_loss += (loss.item() - val_running_loss) / (batch_index + 1)
        
    if epoch == 0 or (epoch + 1) % PRINT_EVERY == 0:
        print('Epoch: {:<2} Train loss: {:0.4f}  Validation Loss: {:0.4f}'.format(epoch + 1 , running_loss, val_running_loss))

  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 1  Train loss: 5.0105  Validation Loss: 4.0048
Epoch: 5  Train loss: 2.6616  Validation Loss: 2.7817
Epoch: 10 Train loss: 1.6908  Validation Loss: 2.3907
Epoch: 15 Train loss: 1.1512  Validation Loss: 2.2675
Epoch: 20 Train loss: 0.8499  Validation Loss: 2.2756
Epoch: 25 Train loss: 0.6860  Validation Loss: 2.2951
Epoch: 30 Train loss: 0.5662  Validation Loss: 2.4076
Epoch: 35 Train loss: 0.4888  Validation Loss: 2.5013
Epoch: 40 Train loss: 0.4424  Validation Loss: 2.5455
Epoch: 45 Train loss: 0.4061  Validation Loss: 2.6160
Epoch: 50 Train loss: 0.3711  Validation Loss: 2.6822


In [10]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"([.,!?])", r" \1 ", text)
    text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text)
    return text

def translate_sentence(source_sentence, target_sentence, classifier, vectorizer, decision_threshold=0.5):
    source_sentence = preprocess_text(source_sentence)
    target_sentence = preprocess_text(target_sentence)
    
    source_vector, target_vector = vectorizer.vectorize(source_sentence, target_sentence)

    source_tensor = torch.tensor(source_vector).unsqueeze(0)
    target_tensor = torch.tensor(target_vector).unsqueeze(0)
    
    with torch.no_grad():
        result = classifier(source_tensor.to(device), target_tensor[:, :-1].to(device))
        result = result.reshape(-1, result.shape[2])
        result = F.softmax(result, -1)

    indices = torch.argmax(result, 1)
    word_arr = []
    for i in range(indices.shape[0]):
        index = indices[i].item()
        if vectorizer.target_vocab._idx_to_token[index] == "<END>":
            break
        word_arr.append(vectorizer.target_vocab._idx_to_token[index])

    return ' '.join(word_arr)

source_sentence = "you 're a prude"
target_sentence = "vous êtes un puritain"

model = model.to(device)
prediction = translate_sentence(source_sentence, target_sentence, model, dataset.get_vectorizer())

print("Source:", source_sentence)
print("Target:", target_sentence)
print("Prediction:", prediction)

Source: you 're a prude
Target: vous êtes un puritain
Prediction: vous êtes êtes puritaine puritaine
