In [None]:
!pip install spacy

In [None]:
!pip install portalocker
!python -m spacy download en_core_web_sm
!python -m spacy download fr_core_news_sm

In [3]:
from torchtext.data.utils import get_tokenizer

In [4]:
from torchtext.vocab import build_vocab_from_iterator
from typing import Iterable,List
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader,Dataset
from timeit import default_timer as timer
from torch.nn import Transformer
from torch import Tensor
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm

In [5]:
import torch.nn as NN
import torch
import torch.nn.functional as F
import numpy as np
import math
import os
import pandas as pd
import matplotlib.pyplot as plt
import spacy
spacy.prefer_gpu()

True

In [6]:
seed=42
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic=True
torch.backends.cudnn.benchmark=True

In [7]:
SRC_LANGUAGE='en'
TGT_LANGUAGE='fr'

In [8]:
token_transform={}
vocab_transform={}
token_transform[SRC_LANGUAGE]=get_tokenizer('spacy',language='en_core_web_sm')
token_transform[TGT_LANGUAGE]=get_tokenizer('spacy',language='fr_core_news_sm')



In [9]:
csv=pd.read_csv(
    'eng_-french.csv',
    usecols=['English words/sentences', 'French words/sentences']
)
csv.head()

Unnamed: 0,English words/sentences,French words/sentences
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !


In [10]:
train_csv,test_csv=train_test_split(csv,test_size=0.1)

In [11]:
len(train_csv)


158058

In [12]:
len(test_csv)

17563

In [13]:
train_csv

Unnamed: 0,English words/sentences,French words/sentences
158383,They kept him waiting outside for a long time.,Ils le firent poireauter dehors.
146722,How much money did you spend on your car?,Combien d'argent avez-vous dépensé pour votre ...
120085,I heard it from a reliable source.,Je l'ai entendu d'une source fiable.
152460,My parents met each other in the mountains.,Mes parents se sont rencontrés dans les montag...
63136,My teacher drove me home.,Mon professeur m'a reconduit chez moi.
...,...,...
119879,I don't want to lose my boyfriend.,Je ne veux pas perdre mon petit ami.
103694,I will never forget seeing you.,Je n'oublierai jamais t'avoir vu.
131932,Who told you that we should do that?,Qui vous a dit que nous devrions faire cela ?
146867,I decided to go out and explore the town.,J’ai décidé de sortir et d’explorer la ville.


In [14]:
class TranslationDataset(Dataset):
  def __init__(self,csv):
    self.csv=csv

  def __len__(self):
    return len(self.csv)

  def __getitem__(self,idx):
    return(
        self.csv['English words/sentences'].iloc[idx],
        self.csv['French words/sentences'].iloc[idx]
    )


In [15]:
train_dataset=TranslationDataset(train_csv)
valid_dataset=TranslationDataset(test_csv)

iterator=iter(train_dataset)
print(next(iterator))

('They kept him waiting outside for a long time.', 'Ils le firent poireauter dehors.')


In [16]:
def yield_tokens(data_iter:Iterable,language:str)->List[str]:
  language_index={SRC_LANGUAGE:0,TGT_LANGUAGE:1}

  for data_sample in data_iter:
    yield token_transform[language](data_sample[language_index[language]])

In [17]:
UNK_IDX,PAD_IDX,BOS_IDX,EOS_IDX=0,1,2,3
special_symbols=['<unk>','<pad>','<bos>','<eos>']

for ln in [SRC_LANGUAGE,TGT_LANGUAGE]:
  vocab_transform[ln]=build_vocab_from_iterator(
      yield_tokens(train_dataset,ln),
      min_freq=1,
      specials=special_symbols,
      special_first=True,
  )

for ln in [SRC_LANGUAGE,TGT_LANGUAGE]:
  vocab_transform[ln].set_default_index(UNK_IDX)

In [18]:
def sequential_transform(*transforms):
  def func(txt_input):
    for transform in transforms:
      txt_input=transform(txt_input)
    return txt_input
  return func

def tensor_transform(token_ids:List[int]):
  return torch.cat((torch.tensor([BOS_IDX]),
                    torch.tensor(token_ids),
                    torch.tensor([EOS_IDX])))

text_transform={}
for ln in [SRC_LANGUAGE,TGT_LANGUAGE]:
  text_transform[ln]=sequential_transform(token_transform[ln],
                                          vocab_transform[ln],
                                          tensor_transform)

def collate_fn(batch):
  src_batch,tgt_batch=[],[]
  for src_sample,tgt_sample in batch:
    src_batch.append(text_transform[SRC_LANGUAGE](src_sample.rstrip("\n")))
    tgt_batch.append(text_transform[TGT_LANGUAGE](tgt_sample.rstrip("\n")))
  src_batch = pad_sequence(src_batch, padding_value=PAD_IDX, batch_first=True)
  tgt_batch = pad_sequence(tgt_batch, padding_value=PAD_IDX, batch_first=True)
  return src_batch, tgt_batch


In [19]:
SRC_VOCAB_SIZE = len(vocab_transform[SRC_LANGUAGE])
TGT_VOCAB_SIZE = len(vocab_transform[TGT_LANGUAGE])
EMB_SIZE = 192
NHEAD = 6
FFN_HID_DIM = 192
BATCH_SIZE = 192
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3
DEVICE = 'cuda'
NUM_EPOCHS = 4

In [20]:
def generate_square_subsequent_mask(sz):
  mask=(torch.triu(torch.ones((sz,sz),device=DEVICE))==1).transpose(0,1)
  mask=mask.float().masked_fill(mask==0,float('-inf')).masked_fill(mask==1,float(0.0))
  return mask

def create_mask(src,tgt):
  src_seq_len=src.shape[1]
  tgt_seq_len=tgt.shape[1]

  tgt_mask=generate_square_subsequent_mask(tgt_seq_len)
  src_mask=torch.zeros((src_seq_len,src_seq_len),device=DEVICE).type(torch.bool)

  src_padding_mask=(src==PAD_IDX)
  tgt_padding_mask = (tgt == PAD_IDX)
  return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

In [21]:
class PositionalEncoding(NN.Module):
  def __init__(self,d_model,dropout,max_len=5000):
    super(PositionalEncoding,self).__init__()
    self.dropout=NN.Dropout(p=dropout)

    pe=torch.zeros(max_len,d_model)
    position=torch.arange(0,max_len,dtype=torch.float).unsqueeze(1)
    div_term=torch.exp(torch.arange(0,d_model,2).float()*(-math.log(10000_0)/d_model))
    pe[:, 0::2] = torch.sin(position * div_term)
    pe[:, 1::2] = torch.cos(position * div_term)
    pe = pe.unsqueeze(0)
    self.register_buffer('pe', pe)

  def forward(self,x):
    x=x+self.pe[:,:x.size(1)]
    return self.dropout(x)



In [22]:
class TokenEmbedding(NN.Module):
  def __init__(self,vocab_size:int,emb_size):
    super(TokenEmbedding,self).__init__()
    self.embedding=NN.Embedding(vocab_size,emb_size)
    self.emb_size=emb_size

  def forward(self,tokens:Tensor):
    return self.embedding(tokens.long())*math.sqrt(self.emb_size)

In [23]:
class Seq2SeqTransformer(NN.Module):
  def __init__(
      self,
      num_encoder_layers:int,
      num_decoder_layers:int,
      emb_size:int,
      nhead:int,
      src_vocab_size:int,
      tgt_vocab_size:int,
      dim_feedforward:int=512,
      dropout:float=0.1
  ):
    super(Seq2SeqTransformer,self).__init__()
    self.transformer=Transformer(
      d_model=emb_size,
      nhead=nhead,
      num_encoder_layers=num_encoder_layers,
      num_decoder_layers=num_decoder_layers,
      dim_feedforward=dim_feedforward,
      dropout=dropout,
      batch_first=True
    )
    self.generator = NN.Linear(emb_size, tgt_vocab_size)
    self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
    self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
    self.positional_encoding = PositionalEncoding(emb_size, dropout=dropout)

  def forward(
      self,
      src:Tensor,
      trg:Tensor,
      src_mask:Tensor,
      tgt_mask:Tensor,
      src_padding_mask:Tensor,
      tgt_padding_mask:Tensor,
      memory_key_padding_mask:Tensor):

    src_emb = self.positional_encoding(self.src_tok_emb(src))
    tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
    outs = self.transformer(src_emb, tgt_emb, src_mask, tgt_mask, None,src_padding_mask, tgt_padding_mask, memory_key_padding_mask)
    return self.generator(outs)
  def encode(self, src: Tensor, src_mask: Tensor):
    return self.transformer.encoder(self.positional_encoding(
                            self.src_tok_emb(src)), src_mask)
  def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
    return self.transformer.decoder(self.positional_encoding(
                          self.tgt_tok_emb(tgt)), memory,
                          tgt_mask)


In [24]:
model = Seq2SeqTransformer(
    NUM_ENCODER_LAYERS,
    NUM_DECODER_LAYERS,
    EMB_SIZE,
    NHEAD,
    SRC_VOCAB_SIZE,
    TGT_VOCAB_SIZE,
    FFN_HID_DIM
).to(DEVICE)
# Total parameters and trainable parameters.
total_params = sum(p.numel() for p in model.parameters())
print(f"{total_params:,} total parameters.")
total_trainable_params = sum(
    p.numel() for p in model.parameters() if p.requires_grad)
print(f"{total_trainable_params:,} training parameters.")
print(model)

14,487,719 total parameters.
14,487,719 training parameters.
Seq2SeqTransformer(
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0-2): 3 x TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=192, out_features=192, bias=True)
          )
          (linear1): Linear(in_features=192, out_features=192, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=192, out_features=192, bias=True)
          (norm1): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
      )
      (norm): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
    )
    (decoder): TransformerDecoder(
      (layers): ModuleList(
        (0-2): 3 x Tr

In [25]:
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)


In [26]:
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn)
def train_epoch(model, optimizer):
    print('Training')
    model.train()
    losses = 0
    for src, tgt in tqdm(train_dataloader, total=len(list(train_dataloader))):
        # print(" ".join(vocab_transform[SRC_LANGUAGE].lookup_tokens(list(src[0].cpu().numpy()))).replace("<bos>", "").replace("<eos>", ""))
        # print(" ".join(vocab_transform[TGT_LANGUAGE].lookup_tokens(list(tgt[0].cpu().numpy()))).replace("<bos>", "").replace("<eos>", ""))
        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)

        tgt_input = tgt[:, :-1]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)
        logits = model(
            src,
            tgt_input,
            src_mask,
            tgt_mask,
            src_padding_mask,
            tgt_padding_mask,
            src_padding_mask
        )
        optimizer.zero_grad()
        tgt_out = tgt[:, 1:]
        loss = loss_fn(logits.view(-1, TGT_VOCAB_SIZE), tgt_out.contiguous().view(-1))
        loss.backward()
        optimizer.step()
        losses += loss.item()
    return losses / len(list(train_dataloader))

In [27]:
len(train_dataloader)

824

In [28]:
val_dataloader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn)
def evaluate(model):
    print('Validating')
    model.eval()
    losses = 0
    for src, tgt in tqdm(val_dataloader, total=len(list(val_dataloader))):
        # print(" ".join(vocab_transform[SRC_LANGUAGE].lookup_tokens(list(src[0].cpu().numpy()))).replace("<bos>", "").replace("<eos>", ""))
        # print(" ".join(vocab_transform[TGT_LANGUAGE].lookup_tokens(list(tgt[0].cpu().numpy()))).replace("<bos>", "").replace("<eos>", ""))
        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)

        tgt_input = tgt[:, :-1]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        logits = model(
            src,
            tgt_input,
            src_mask,
            tgt_mask,
            src_padding_mask,
            tgt_padding_mask,
            src_padding_mask
        )
        tgt_out = tgt[:, 1:]
        loss = loss_fn(logits.view(-1, TGT_VOCAB_SIZE), tgt_out.contiguous().view(-1))
        losses += loss.item()
    return losses / len(list(val_dataloader))

In [29]:
len(val_dataloader)

92

In [None]:
train_loss_list, valid_loss_list = [], []
for epoch in range(1, NUM_EPOCHS+1):
    start_time = timer()
    train_loss = train_epoch(model, optimizer)
    valid_loss = evaluate(model)
    end_time = timer()
    train_loss_list.append(train_loss)
    valid_loss_list.append(valid_loss)
    print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {valid_loss:.3f}, "f"Epoch time = {(end_time - start_time):.3f}s \n"))

In [33]:
os.makedirs('outputs', exist_ok=True)

In [37]:
def save_plots(train_loss, valid_loss):
    """
    Function to save the loss plots to disk.
    """
    # Loss plots.
    plt.figure(figsize=(10, 7))
    plt.plot(
        train_loss, color='blue', linestyle='-',
        label='train loss'
    )
    plt.plot(
        valid_loss, color='red', linestyle='-',
        label='validataion loss'
    )
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.savefig(os.path.join('outputs', 'loss.png'))
    plt.show()

In [None]:
save_plots(train_loss_list, valid_loss_list)

In [39]:
torch.save(model, 'outputs/model.pth')


In [40]:
model = torch.load('outputs/model.pth')


In [49]:
print(model)

Seq2SeqTransformer(
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0-2): 3 x TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=192, out_features=192, bias=True)
          )
          (linear1): Linear(in_features=192, out_features=192, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=192, out_features=192, bias=True)
          (norm1): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
      )
      (norm): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
    )
    (decoder): TransformerDecoder(
      (layers): ModuleList(
        (0-2): 3 x TransformerDecoderLayer(
          (self_attn): MultiheadAttent

In [42]:
# Helper function to generate output sequence using greedy algorithm.
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)
    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)
    for i in range(max_len-1):
        memory = memory.to(DEVICE)
        if i == 0:
            ys = ys.transpose(1, 0)
        tgt_mask = (generate_square_subsequent_mask(ys.size(1))
                    .type(torch.bool)).to(DEVICE)
        out = model.decode(ys, memory, tgt_mask)
        out = out
        prob = model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim=1)
        next_word = next_word.item()
        ys = torch.cat([ys,
                        torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=1)
        if next_word == EOS_IDX:
            break
    return ys
# Translation function.
def translate(model: torch.nn.Module, src_sentence: str):
    model.eval()
    src = text_transform[SRC_LANGUAGE](src_sentence).view(1, -1)
    num_tokens = src.shape[1]
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
    tgt_tokens = greedy_decode(
        model,  src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX).flatten()
    return " ".join(vocab_transform[TGT_LANGUAGE].lookup_tokens(list(tgt_tokens.cpu().numpy()))).replace("<bos>", "").replace("<eos>", "")

In [43]:
# SRC, GT pairs from the validation set.
infer_sentences = [
    ["Take a seat.", "Prends place !"],
    ["I'm not scared to die", "Je ne crains pas de mourir."],
    ["You'd better make sure that it is true.", "Tu ferais bien de t'assurer que c'est vrai."],
    ["The clock has stopped.", "L'horloge s'est arrêtée."],
    ["Take any two cards you like.", "Prends deux cartes de ton choix."]
]
for sentence in infer_sentences:
    print(f"SRC: {sentence[0]}")
    print(f"GT: {sentence[1]}")
    print(f"PRED: {translate(model, sentence[0])}\n")

SRC: Take a seat.
GT: Prends place !
PRED:  Arrête ! 

SRC: I'm not scared to die
GT: Je ne crains pas de mourir.
PRED:  Je ne suis pas à croire que je suis en suis pas

SRC: You'd better make sure that it is true.
GT: Tu ferais bien de t'assurer que c'est vrai.
PRED:  Tu ferais mieux que ça est mieux . 

SRC: The clock has stopped.
GT: L'horloge s'est arrêtée.
PRED:  Le chat a été en train . 

SRC: Take any two cards you like.
GT: Prends deux cartes de ton choix.
PRED:  Les gens vous êtes deux deux . 

