## *Machine Translation English2French Transformer Pytorch*

# Import libraries and Datasets

In [1]:
from google.colab import drive
drive.mount('/content/drive')
import os
os.chdir("/content/drive/MyDrive/DATA/Machine_Translation_Dataset/MT_E2F_Transformer_Pytorch")
!ls

Mounted at /content/drive
small_vocab_en.csv  small_vocab_fr.csv


In [2]:
!pip install torchtext
!pip install python-math

Collecting python-math
  Downloading https://files.pythonhosted.org/packages/ff/8c/60c13be29a2f2e74c0313f2e62c7f751c944fe54b917afa5f88144e71a66/python_math-0.0.1-py3-none-any.whl
Installing collected packages: python-math
Successfully installed python-math-0.0.1


In [3]:
import math
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from torchtext.legacy.data  import Field, BucketIterator, TabularDataset
import numpy as np
import spacy
import random
from torch.utils.tensorboard import SummaryWriter
import pandas as pd
from sklearn.model_selection import train_test_split

In [4]:
# load the data
df_english = pd.read_csv('small_vocab_en.csv', sep = '/t', names = ['english'])
df_french = pd.read_csv('small_vocab_fr.csv', sep = '/t', names = ['french'])

  
  This is separate from the ipykernel package so we can avoid doing imports until


In [5]:
df_english.head()

Unnamed: 0,english
0,"new jersey is sometimes quiet during autumn , ..."
1,the united states is usually chilly during jul...
2,"california is usually quiet during march , and..."
3,the united states is sometimes mild during jun...
4,"your least liked fruit is the grape , but my l..."


In [6]:
df_french.head()

Unnamed: 0,french
0,new jersey est parfois calme pendant l' automn...
1,les états-unis est généralement froid en juill...
2,"california est généralement calme en mars , et..."
3,"les états-unis est parfois légère en juin , et..."
4,"votre moins aimé fruit est le raisin , mais mo..."


In [7]:
df = pd.concat([df_english, df_french], axis=1)

In [8]:
df.head()

Unnamed: 0,english,french
0,"new jersey is sometimes quiet during autumn , ...",new jersey est parfois calme pendant l' automn...
1,the united states is usually chilly during jul...,les états-unis est généralement froid en juill...
2,"california is usually quiet during march , and...","california est généralement calme en mars , et..."
3,the united states is sometimes mild during jun...,"les états-unis est parfois légère en juin , et..."
4,"your least liked fruit is the grape , but my l...","votre moins aimé fruit est le raisin , mais mo..."


In [9]:
print("Total English Records = {}".format(len(df['english'])))
print("Total French Records = {}".format(len(df['french'])))

Total English Records = 137860
Total French Records = 137860


# Text Cleaning & Preprocessing 

In [10]:
import string
import re
import os
import nltk
#nltk.download('stopwords')
nltk.download('punkt')
#from nltk.corpus import stopwords
#stopwords_english = stopwords.words('english')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [11]:
def process_text(text):
    '''
    Input: 
        text: a string containing a text
    Output:
        text_clean: a list of words containing the processed text
    
    '''
    # remove number 
    text = re.sub('[0-9]', '', text)
    # remove stock market tickers like $GE
    text = re.sub(r'\$\w*', '', text)
    # remove old style text "RT"
    text = re.sub(r'^RT[\s]+', '', text)
    # remove hyperlinks
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
    text = re.sub(r'#', '', text)
    # remove the dates like Mar 30 2013
    text = re.sub('(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s\d{2}\s\d{4}', ' ', text)
    text = re.sub(r"[/(){}\[\]\|,;.:\?\-\'\"$^]", '', text)
 
    #text = " ".join(word for word in text.split() if word not in stopwords_english)
    
   
    specials = ["’", "‘", "´", "`"]
    for s in specials:
        text = text.replace(s, "'")
   
     

    return  text

In [12]:
df['eng'] = df['english'].apply(str).apply(process_text)
df['fr'] = df['french'].apply(str).apply(process_text)

In [13]:
df = df.drop(['english','french'],axis=1)

In [14]:
df.head()

Unnamed: 0,eng,fr
0,new jersey is sometimes quiet during autumn a...,new jersey est parfois calme pendant l automne...
1,the united states is usually chilly during jul...,les étatsunis est généralement froid en juille...
2,california is usually quiet during march and ...,california est généralement calme en mars et ...
3,the united states is sometimes mild during jun...,les étatsunis est parfois légère en juin et i...
4,your least liked fruit is the grape but my le...,votre moins aimé fruit est le raisin mais mon...


In [15]:
!python -m spacy download fr
!python -m spacy download en

Collecting fr_core_news_sm==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-2.2.5/fr_core_news_sm-2.2.5.tar.gz (14.7MB)
[K     |████████████████████████████████| 14.7MB 10.3MB/s 
Building wheels for collected packages: fr-core-news-sm
  Building wheel for fr-core-news-sm (setup.py) ... [?25l[?25hdone
  Created wheel for fr-core-news-sm: filename=fr_core_news_sm-2.2.5-cp37-none-any.whl size=14727024 sha256=3660d4cd4d01542274a0122acaed8fed17883f9a0959dd26802a2850b59b3695
  Stored in directory: /tmp/pip-ephem-wheel-cache-hhhg9159/wheels/46/1b/e6/29b020e3f9420a24c3f463343afe5136aaaf955dbc9e46dfc5
Successfully built fr-core-news-sm
Installing collected packages: fr-core-news-sm
Successfully installed fr-core-news-sm-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('fr_core_news_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/fr_core_news_sm -->
/usr/local

In [16]:
spacy_eng = spacy.load("en")
spacy_fr = spacy.load("fr")


def tokenize_eng(text):
    return [tok.text for tok in spacy_eng.tokenizer(text)]


def tokenize_fr(text):
    return [tok.text for tok in spacy_fr.tokenizer(text)]

In [17]:
french = Field(tokenize=tokenize_fr, lower=True, init_token="<sos>", eos_token="<eos>")

english = Field(tokenize=tokenize_eng, lower=True, init_token="<sos>", eos_token="<eos>")
fields = {("eng", english), ("fr", french)}

            

In [18]:
# create train and test set
train, test = train_test_split(df, test_size=0.1)

In [19]:
test['eng'][1:2].astype(str)

94354    the united states is pleasant during july  and...
Name: eng, dtype: object

In [20]:
train.to_csv("train.csv", index=False)
test.to_csv("test.csv", index=False)

train_data, test_data = TabularDataset.splits( path="", train="train.csv", test="test.csv", format="csv", fields=fields)

In [21]:
english.build_vocab(train_data.eng)
french.build_vocab(train_data.fr)

In [22]:
len(french.vocab)

351

In [23]:
len(english.vocab)

203

In [24]:
train_iterator, test_iterator = BucketIterator.splits((train_data, test_data), batch_size=32, device="cuda")

# Model 

In [25]:
class Transformer(nn.Module):
    def __init__(
        self,
        embedding_size,
        src_vocab_size,
        trg_vocab_size,
        src_pad_idx,
        num_heads,
        num_encoder_layers,
        num_decoder_layers,
        forward_expansion,
        dropout,
        max_len,
        device,
    ):
        super(Transformer, self).__init__()
        self.src_word_embedding = nn.Embedding(src_vocab_size, embedding_size)
        self.src_position_embedding = nn.Embedding(max_len, embedding_size)
        self.trg_word_embedding = nn.Embedding(trg_vocab_size, embedding_size)
        self.trg_position_embedding = nn.Embedding(max_len, embedding_size)

        self.device = device
        self.transformer = nn.Transformer(
            embedding_size,
            num_heads,
            num_encoder_layers,
            num_decoder_layers,
            forward_expansion,
            dropout,
        )
        self.fc_out = nn.Linear(embedding_size, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.src_pad_idx = src_pad_idx

    def make_src_mask(self, src):
        src_mask = src.transpose(0, 1) == self.src_pad_idx

        # (N, src_len)
        return src_mask.to(self.device)

    def forward(self, src, trg):
        src_seq_length, N = src.shape
        trg_seq_length, N = trg.shape

#adding zeros is an easy way
        src_positions = (
            torch.arange(0, src_seq_length)
            .reshape(src_seq_length,1)  + torch.zeros(src_seq_length,N) 
        ).to(device)
        
        trg_positions = (
            torch.arange(0, trg_seq_length)
            .reshape(trg_seq_length,1)  + torch.zeros(trg_seq_length,N) 
        ).to(device)



        src_positions = (
            torch.arange(0, src_seq_length)
            .unsqueeze(1)
            .expand(src_seq_length, N)
            .to(self.device)
        )

        trg_positions = (
            torch.arange(0, trg_seq_length)
            .unsqueeze(1)
            .expand(trg_seq_length, N)
            .to(self.device)
        )

        embed_src = self.dropout(
            (self.src_word_embedding(src) + self.src_position_embedding(src_positions))
        )
        embed_trg = self.dropout(
            (self.trg_word_embedding(trg) + self.trg_position_embedding(trg_positions))
        )

        src_padding_mask = self.make_src_mask(src)
        trg_mask = self.transformer.generate_square_subsequent_mask(trg_seq_length).to(
            self.device
        )

        out = self.transformer(
            embed_src,
            embed_trg,
            src_key_padding_mask=src_padding_mask,
            tgt_mask=trg_mask,
        )
        out = self.fc_out(out)
        return out

# Training

In [26]:
def save_checkpoint(state, filename="MT_Transformer.pth.tar"):
    print("=> Saving checkpoint")
    torch.save(state, filename)


def load_checkpoint(checkpoint, model, optimizer):
    print("=> Loading checkpoint")
    model.load_state_dict(checkpoint["state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer"])

In [27]:
# We're ready to define everything we need for training our Seq2Seq model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Training hyperparameters
num_epochs = 20
learning_rate = 3e-4
batch_size = 32

# Model hyperparameters
src_vocab_size = len(english.vocab)
trg_vocab_size = len(french.vocab)
embedding_size = 512
num_heads = 8
num_encoder_layers = 3
num_decoder_layers = 3
dropout = 0.10
max_len = 50
forward_expansion = 4
src_pad_idx = english.vocab.stoi["<pad>"]

# Tensorboard to get nice loss plot
writer = SummaryWriter("runs/loss_plot")
step = 0


model = Transformer(
    embedding_size,
    src_vocab_size,
    trg_vocab_size,
    src_pad_idx,
    num_heads,
    num_encoder_layers,
    num_decoder_layers,
    forward_expansion,
    dropout,
    max_len,
    device,
).to(device)

optimizer = optim.Adam(model.parameters(), lr=learning_rate)

pad_idx = french.vocab.stoi["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)



In [28]:
def translate_sentence(model, sentence, english, french, device, max_length=50):
   
    spacy_eng = spacy.load("en")

    # Create tokens using spacy and everything in lower case (which is what our vocab is)
    if type(sentence) == str:
        tokens = [token.text.lower() for token in spacy_eng(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    # Add <SOS> and <EOS> in beginning and end respectively
    tokens.insert(0, english.init_token)
    tokens.append(english.eos_token)

    # Go through each german token and convert to an index
    text_to_indices = [english.vocab.stoi[token] for token in tokens]

    # Convert to Tensor
    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)

    outputs = [french.vocab.stoi["<sos>"]]
    for i in range(max_length):
        trg_tensor = torch.LongTensor(outputs).unsqueeze(1).to(device)

        with torch.no_grad():
            output = model(sentence_tensor, trg_tensor)

        best_guess = output.argmax(2)[-1, :].item()
        outputs.append(best_guess)

        if best_guess == french.vocab.stoi["<eos>"]:
            break

    translated_sentence = [french.vocab.itos[idx] for idx in outputs]
    # remove start token
    return ' '.join(ix for ix in translated_sentence[1:])

In [29]:
Original_english_sentence = test.iloc[3]['eng']
Original_French_word = test.iloc[3]['fr']

In [30]:
for epoch in range(num_epochs):
    print(f"[Epoch {epoch} / {num_epochs}]")
    stepLoss=[]
    checkpoint = {"state_dict": model.state_dict(), "optimizer": optimizer.state_dict()}
    save_checkpoint(checkpoint)
    
    if epoch%5 == 0:
       model.eval()

       translated_sentence = translate_sentence(model, Original_english_sentence, english, french, device, max_length=50)

       print(f"Original English example sentence: \n {Original_english_sentence}")
       print(f"original French translated example sentence: \n {Original_French_word}")
       print(f"Trained French  translated example sentence: \n {translated_sentence}")

    model.train()

    losses = []

    for batch_idx, batch in enumerate(train_iterator):
        # Get input and targets and get to cuda
        inp_data = batch.eng.to(device)
        target = batch.fr.to(device)

        # Forward prop
        output = model(inp_data, target[:-1, :])

        # Output is of shape (trg_len, batch_size, output_dim) but Cross Entropy Loss
        # doesn't take input in that form. For example if we have MNIST we want to have
        # output to be: (N, 10) and targets just (N). Here we can view it in a similar
        # way that we have output_words * batch_size that we want to send in into
        # our cost function, so we need to do some reshapin.
        # Let's also remove the start token while we're at it
        output = output.reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        optimizer.zero_grad()

        loss = criterion(output, target)
        losses.append(loss.item())

        # Back prop
        loss.backward()
        # Clip to avoid exploding gradient issues, makes sure grads are
        # within a healthy range
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        # Gradient descent step
        optimizer.step()

        # plot to tensorboard
        writer.add_scalar("Training loss", loss, global_step=step)
        step += 1

    mean_loss = sum(losses) / len(losses)

    print("train crossentropy at epoch {} loss: ".format(epoch), mean_loss)

[Epoch 0 / 20]
=> Saving checkpoint
Original English example sentence: 
 she dislikes peaches  oranges  and lemons 
original French translated example sentence: 
 elle déteste les pêches  les oranges  les citrons et les 
Trained French  translated example sentence: 
 que lapins pommes petit dans    dernière aller aller aller aller aller dans       lapins animal mais aller aller aller français juin lapins pommes lapins dans petit animal mais aller aller bleu    singe dans lapins étaient aller aller bleu singe petit étaient bleu mais aller visite aimentils mais
train crossentropy at epoch 0 loss:  0.13965346891281222
[Epoch 1 / 20]
=> Saving checkpoint
train crossentropy at epoch 1 loss:  0.055853391360383926
[Epoch 2 / 20]
=> Saving checkpoint
train crossentropy at epoch 2 loss:  0.051899451493703945
[Epoch 3 / 20]
=> Saving checkpoint
train crossentropy at epoch 3 loss:  0.05086200933158013
[Epoch 4 / 20]
=> Saving checkpoint
train crossentropy at epoch 4 loss:  0.049960382380837394
[E

# Testing

In [31]:
load_checkpoint(torch.load("MT_Transformer.pth.tar"), model, optimizer)

=> Loading checkpoint


In [32]:
Original_english_sentence = test.iloc[3]['eng']
Original_French_word = test.iloc[3]['fr']
model.eval()

translated_sentence = translate_sentence(model, Original_english_sentence, english, french, device, max_length=50)

print(f"Original English example sentence: \n {Original_english_sentence}")
print(f"original French translated example sentence: \n {Original_French_word}")
print(f"Trained French  translated example sentence: \n {translated_sentence}")

Original English example sentence: 
 she dislikes peaches  oranges  and lemons 
original French translated example sentence: 
 elle déteste les pêches  les oranges  les citrons et les 
Trained French  translated example sentence: 
 elle déteste les pêches   les oranges et les citrons <eos>
