## *Machine Translation: English-to-French Seq2Seq in PyTorch*

# Import libraries and Datasets

In [1]:
# Mount Google Drive into the Colab VM so the dataset files are reachable.
from google.colab import drive
drive.mount('/content/drive')
import os
# Work from the dataset folder; later cells read and write CSV files with relative paths.
os.chdir("/content/drive/MyDrive/DATA/Machine_Translation_Dataset/MT_E2F_Seq2Seq_Pytorch")
# List the folder contents as a quick sanity check.
!ls

Mounted at /content/drive
small_vocab_en.csv  small_vocab_fr.csv


In [2]:
!pip install torchtext



In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.legacy.data  import Field, BucketIterator, TabularDataset
import numpy as np
import spacy
import random
from torch.utils.tensorboard import SummaryWriter
import pandas as pd
from sklearn.model_selection import train_test_split

In [4]:
# Load the two corpora: one sentence per line, one column per file.
# '\t' (a real tab) replaces the '/t' typo, which pandas interpreted as a
# two-character regex separator and therefore parsed with the slower python engine.
df_english = pd.read_csv('small_vocab_en.csv', sep='\t', names=['english'])
df_french = pd.read_csv('small_vocab_fr.csv', sep='\t', names=['french'])

  
  This is separate from the ipykernel package so we can avoid doing imports until


In [5]:
# Preview the first rows of the English corpus.
df_english.head()

Unnamed: 0,english
0,"new jersey is sometimes quiet during autumn , ..."
1,the united states is usually chilly during jul...
2,"california is usually quiet during march , and..."
3,the united states is sometimes mild during jun...
4,"your least liked fruit is the grape , but my l..."


In [6]:
# Preview the first rows of the French corpus.
df_french.head()

Unnamed: 0,french
0,new jersey est parfois calme pendant l' automn...
1,les états-unis est généralement froid en juill...
2,"california est généralement calme en mars , et..."
3,"les états-unis est parfois légère en juin , et..."
4,"votre moins aimé fruit est le raisin , mais mo..."


In [7]:
# Place the two corpora side by side (column-wise), aligned on the row index.
df = pd.concat([df_english, df_french], axis=1)

In [8]:
# Preview the combined English/French frame.
df.head()

Unnamed: 0,english,french
0,"new jersey is sometimes quiet during autumn , ...",new jersey est parfois calme pendant l' automn...
1,the united states is usually chilly during jul...,les états-unis est généralement froid en juill...
2,"california is usually quiet during march , and...","california est généralement calme en mars , et..."
3,the united states is sometimes mild during jun...,"les états-unis est parfois légère en juin , et..."
4,"your least liked fruit is the grape , but my l...","votre moins aimé fruit est le raisin , mais mo..."


In [9]:
# Report how many rows each language column holds.
english_count = len(df['english'])
french_count = len(df['french'])
print("Total English Records = {}".format(english_count))
print("Total French Records = {}".format(french_count))

Total English Records = 137860
Total French Records = 137860


# Text Cleaning & Preprocessing 

In [10]:
import string
import re
import os
import nltk
# Stopword support is currently disabled (commented out).
#nltk.download('stopwords')
# Fetch the NLTK 'punkt' tokenizer data.
# NOTE(review): no visible cell uses punkt (tokenization below goes through spaCy) - confirm it is still needed.
nltk.download('punkt')
#from nltk.corpus import stopwords
#stopwords_english = stopwords.words('english')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [11]:
def process_text(text):
    '''
    Strip noise (dates, digits, tickers, links, punctuation) from a sentence.

    Input:
        text: a string containing a text
    Output:
        text_clean: the cleaned text as a single string (it is NOT tokenized);
            whitespace left behind by removed characters is kept as-is
    '''
    # Remove dates like "Mar 30 2013". This has to run before the digit removal
    # below: once the digits are gone the pattern can never match.
    text = re.sub(r'(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s\d{2}\s\d{4}', ' ', text)
    # remove numbers
    text = re.sub(r'[0-9]', '', text)
    # remove stock market tickers like $GE
    text = re.sub(r'\$\w*', '', text)
    # remove old style text "RT"
    text = re.sub(r'^RT[\s]+', '', text)
    # remove hyperlinks
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
    # remove the hash sign but keep the tag word
    text = re.sub(r'#', '', text)
    # remove punctuation and bracket characters
    text = re.sub(r"[/(){}\[\]\|,;.:\?\-\'\"$^]", '', text)

    return text

In [12]:
# Stringify every cell, then clean it; results go into new short-named columns.
for source_column, cleaned_column in (('english', 'eng'), ('french', 'fr')):
    df[cleaned_column] = df[source_column].apply(str).apply(process_text)

In [13]:
# Keep only the cleaned columns from here on.
df = df.drop(columns=['english', 'french'])

In [14]:
# Preview the cleaned sentence pairs.
df.head()

Unnamed: 0,eng,fr
0,new jersey is sometimes quiet during autumn a...,new jersey est parfois calme pendant l automne...
1,the united states is usually chilly during jul...,les étatsunis est généralement froid en juille...
2,california is usually quiet during march and ...,california est généralement calme en mars et ...
3,the united states is sometimes mild during jun...,les étatsunis est parfois légère en juin et i...
4,your least liked fruit is the grape but my le...,votre moins aimé fruit est le raisin mais mon...


In [15]:
# Download the French and English spaCy models that are loaded for tokenization below.
!python -m spacy download fr
!python -m spacy download en

Collecting fr_core_news_sm==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-2.2.5/fr_core_news_sm-2.2.5.tar.gz (14.7MB)
[K     |████████████████████████████████| 14.7MB 8.5MB/s 
Building wheels for collected packages: fr-core-news-sm
  Building wheel for fr-core-news-sm (setup.py) ... [?25l[?25hdone
  Created wheel for fr-core-news-sm: filename=fr_core_news_sm-2.2.5-cp37-none-any.whl size=14727024 sha256=88e754e0d06a7925033ad40894e5859fe4485c098f0bed9c9fa703520d91fbd7
  Stored in directory: /tmp/pip-ephem-wheel-cache-sxh9pdvp/wheels/46/1b/e6/29b020e3f9420a24c3f463343afe5136aaaf955dbc9e46dfc5
Successfully built fr-core-news-sm
Installing collected packages: fr-core-news-sm
Successfully installed fr-core-news-sm-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('fr_core_news_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/fr_core_news_sm -->
/usr/local/

In [16]:
# Load the spaCy pipelines once; the tokenizer helpers below reuse these objects.
# NOTE(review): the "en"/"fr" shortcut names appear to rely on the links created by the
# spaCy 2.x download step above - confirm before upgrading spaCy.
spacy_eng = spacy.load("en")
spacy_fr = spacy.load("fr")


def tokenize_eng(text):
    """Return the token strings produced by the spaCy English tokenizer for `text`."""
    tokens = []
    for tok in spacy_eng.tokenizer(text):
        tokens.append(tok.text)
    return tokens


def tokenize_fr(text):
    """Return the token strings produced by the spaCy French tokenizer for `text`."""
    tokens = []
    for tok in spacy_fr.tokenizer(text):
        tokens.append(tok.text)
    return tokens

In [17]:
# Display the loaded English pipeline object.
spacy_eng

<spacy.lang.en.English at 0x7fd5bef66950>

In [18]:
# Display the loaded French pipeline object.
spacy_fr

<spacy.lang.fr.French at 0x7fd5be015bd0>

In [19]:
# Both languages share the same casing and sentence-boundary settings.
shared_field_options = {"lower": True, "init_token": "<sos>", "eos_token": "<eos>"}

french = Field(tokenize=tokenize_fr, **shared_field_options)
english = Field(tokenize=tokenize_eng, **shared_field_options)

# Map each CSV column name to (attribute name on the example/batch, Field).
fields = {"eng": ("eng", english), "fr": ("fr", french)}

In [20]:
# Hold out 10% of the sentence pairs for testing. The fixed seed makes the split
# (and everything derived from it: vocabularies, sample rows, metrics)
# reproducible across kernel restarts.
train, test = train_test_split(df, test_size=0.1, random_state=42)

# TabularDataset reads from disk, so both splits are round-tripped through CSV files.
train.to_csv("train.csv", index=False)
test.to_csv("test.csv", index=False)

train_data, test_data = TabularDataset.splits(
    path="", train="train.csv", test="test.csv", format="csv", fields=fields)

In [21]:
# Read the written training split back in to inspect what was saved.
df_train = pd.read_csv('train.csv')

In [22]:
# Preview the first rows of the saved training split.
df_train.head()

Unnamed: 0,eng,fr
0,she was driving a rusty red truck,elle conduisait un camion rouge rouillé
1,he dislikes pears peaches and grapefruit,il aime pas les poires les pêches et le pampl...
2,india is sometimes cold during april and it i...,l inde est parfois froid en avril et il est p...
3,the lime is his most loved fruit but the grap...,la chaux est son fruit le plus cher mais le r...
4,france is never cold during winter and it is ...,france ne fait jamais froid pendant l hiver e...


In [23]:
# Printing the dataset only shows its object repr (see the cell output).
print(train_data)

<torchtext.legacy.data.dataset.TabularDataset object at 0x7fd692a9b350>


In [24]:
# Build each vocabulary from the training split only.
english.build_vocab(train_data.eng)
french.build_vocab(train_data.fr)

In [25]:
# Size of the French (target) vocabulary.
len(french.vocab)

348

In [26]:
# Size of the English (source) vocabulary.
len(english.vocab)

202

In [27]:
# Fall back to the CPU when no GPU is present instead of hard-coding "cuda",
# so this cell also runs on CPU-only runtimes.
train_iterator, test_iterator = BucketIterator.splits(
    (train_data, test_data),
    batch_size=32,
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"))

In [28]:
# NOTE(review): `data` is a method, so this prints the bound-method repr rather than
# any examples (see the cell output) - confirm what was meant to be inspected.
print(train_iterator.data)

<bound method Iterator.data of <torchtext.legacy.data.iterator.BucketIterator object at 0x7fd5a2f3f590>>


# Model 

In [29]:
class Encoder(nn.Module):
    """Embed a source sequence and run it through a stacked LSTM, returning the final states."""

    def __init__(self, input_size, embedding_size, hidden_size, num_layers, dp):
        """
        input_size: number of rows in the embedding table
        embedding_size: width of each embedding vector
        hidden_size: LSTM hidden width
        num_layers: number of stacked LSTM layers
        dp: dropout probability, used on the embeddings and inside the LSTM
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.dropout = nn.Dropout(p=dp)
        self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=embedding_size)
        self.rnn = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dp,
        )

    def forward(self, x):
        """Return the (hidden, cell) pair produced by the LSTM for the index tensor `x`."""
        # x: (seq_length, N) where N is the batch size
        embedded = self.embedding(x)
        embedded = self.dropout(embedded)
        # embedded: (seq_length, N, embedding_size)

        # The per-step outputs are not needed; only the final states are passed on.
        _, final_state = self.rnn(embedded)
        hidden, cell = final_state
        return hidden, cell

In [30]:
class Decoder(nn.Module):
    """Single-step LSTM decoder: one token per sequence in, one score vector per sequence out."""

    def __init__(self, input_size, embedding_size, hidden_size, output_size, num_layers, dp):
        """
        input_size: number of rows in the embedding table
        embedding_size: width of each embedding vector
        hidden_size: LSTM hidden width
        output_size: number of scores produced by the final linear layer
        num_layers: number of stacked LSTM layers
        dp: dropout probability, used on the embeddings and inside the LSTM
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.dropout = nn.Dropout(p=dp)
        self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=embedding_size)
        self.rnn = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dp,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x, hidden, cell):
        """Advance the LSTM by one step and return (predictions, hidden, cell)."""
        # x arrives as (N,); the LSTM wants a leading sequence axis, so make it (1, N).
        step_input = x.unsqueeze(0)

        embedded = self.dropout(self.embedding(step_input))
        # embedded: (1, N, embedding_size)

        rnn_out, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        # rnn_out: (1, N, hidden_size)

        # Project to output scores, then drop the length-1 sequence axis so the
        # result is (N, output_size), the shape the loss function expects.
        predictions = self.fc(rnn_out).squeeze(0)

        return predictions, hidden, cell

In [31]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes step by step with optional teacher forcing."""

    def __init__(self, encoder, decoder):
        """
        encoder: module mapping a source batch to a (hidden, cell) pair
        decoder: module mapping (tokens, hidden, cell) to (scores, hidden, cell);
            it must expose its output layer as `fc` (as the Decoder class here does)
        """
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_force_ratio=0.5):
        """
        source: (source_len, N) token indices
        target: (target_len, N) token indices; row 0 is fed as the first decoder input
        teacher_force_ratio: probability, per step, of feeding the true next token
            instead of the decoder's own best guess

        Returns a (target_len, N, vocab_size) tensor of scores; row 0 stays zero
        because no prediction is made for the first position.
        """
        batch_size = source.shape[1]
        target_len = target.shape[0]
        # Read the vocabulary size from the decoder's output layer and the device
        # from the input batch, so the module does not depend on notebook globals.
        target_vocab_size = self.decoder.fc.out_features

        outputs = torch.zeros(
            target_len, batch_size, target_vocab_size, device=source.device)

        hidden, cell = self.encoder(source)

        # Grab the first input to the Decoder which will be <SOS> token
        x = target[0]

        for t in range(1, target_len):
            # Use previous hidden, cell as context from encoder at start
            output, hidden, cell = self.decoder(x, hidden, cell)

            # Store next output prediction
            outputs[t] = output

            # Get the best word the Decoder predicted (index in the vocabulary)
            best_guess = output.argmax(1)

            # With probability teacher_force_ratio feed the actual next word,
            # otherwise feed the decoder's own prediction. Mixing the two lets the
            # model also see its own (possibly wrong) outputs during training,
            # which is the only kind of input it gets at inference time.
            x = target[t] if random.random() < teacher_force_ratio else best_guess

        return outputs

# Training

In [32]:
# Training hyperparameters.
num_epochs = 50        # number of passes over the training iterator
learning_rate = 0.001  # handed to the Adam optimizer
batch_size = 32        # batch size for the iterators built in this section

In [33]:
# NOTE(review): `load_model` is not read by any visible cell - confirm whether it can be dropped.
load_model = False
# Use the GPU when one is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# The vocabulary sizes set the embedding-table and output-layer dimensions.
input_size_encoder = len(english.vocab)
input_size_decoder = len(french.vocab)
output_size = len(french.vocab)
encoder_embedding_size = 100
decoder_embedding_size = 100
hidden_size = 1024  # Must match in both RNNs: the encoder states initialise the decoder
num_layers = 2
enc_dropout = 0.5
dec_dropout = 0.5

In [34]:
# TensorBoard writer for the training-loss curve; events are written under runs/loss_plot.
writer = SummaryWriter(f"runs/loss_plot")
# Global step counter, incremented once per training batch.
step = 0


In [35]:
# Rebuild the iterators with the configured batch size and device.
# Examples are keyed by English sentence length and sorted within each batch.
train_iterator, test_iterator = BucketIterator.splits(
    (train_data, test_data),
    batch_size=batch_size,
    sort_within_batch=True,
    sort_key=lambda x: len(x.eng),
    device=device)

In [36]:
# Build the encoder first, then the decoder, then wrap both; each module is moved to `device`.
encoder_net = Encoder(
    input_size=input_size_encoder,
    embedding_size=encoder_embedding_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    dp=enc_dropout,
).to(device)

decoder_net = Decoder(
    input_size=input_size_decoder,
    embedding_size=decoder_embedding_size,
    hidden_size=hidden_size,
    output_size=output_size,
    num_layers=num_layers,
    dp=dec_dropout,
).to(device)

model = Seq2Seq(encoder=encoder_net, decoder=decoder_net).to(device)

In [37]:
# Adam over every parameter of the seq2seq model.
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Padding positions in the French target are excluded from the loss.
pad_idx = french.vocab.stoi["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

In [38]:
def save_checkpoint(state, filename="MT_Seq2Seq.pth.tar"):
    """Announce the save on stdout, then serialise `state` to `filename` with torch.save."""
    print("=> Saving checkpoint")
    torch.save(state, f=filename)


def load_checkpoint(checkpoint, model, optimizer):
    """
    Restore `model` and `optimizer` in place from a checkpoint dict.

    The dict must hold the model weights under "state_dict" and the optimizer
    state under "optimizer"; the model is restored first.
    """
    print("=> Loading checkpoint")
    for restorable, key in ((model, "state_dict"), (optimizer, "optimizer")):
        restorable.load_state_dict(checkpoint[key])

In [63]:
def translate_sentence(model, sentence, english, french, device, max_length=50):
    """
    Greedily translate one English sentence into French.

    Args:
        model: object whose `encoder(src)` returns (hidden, cell) and whose
            `decoder(prev_token, hidden, cell)` returns (scores, hidden, cell).
        sentence: either a raw string (tokenized with `tokenize_eng`) or an
            already tokenized sequence of strings.
        english: source field; its `init_token`, `eos_token` and `vocab.stoi` are used.
        french: target field; its `vocab.stoi` and `vocab.itos` are used.
        device: device the index tensors are moved to.
        max_length: maximum number of decoding steps.

    Returns:
        The predicted tokens joined with single spaces, without <sos>/<eos>.

    The caller is responsible for putting the model in eval mode.
    """
    # Lower-case everything (which is what the vocab holds). Raw strings reuse
    # the notebook's English tokenizer rather than reloading the spaCy model on
    # every call.
    if isinstance(sentence, str):
        tokens = [token.lower() for token in tokenize_eng(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    # Add <SOS> and <EOS> in beginning and end respectively
    tokens = [english.init_token] + tokens + [english.eos_token]

    # Go through each English token and convert it to an index
    text_to_indices = [english.vocab.stoi[token] for token in tokens]

    # Convert to a (seq_length, 1) tensor: a batch holding one sentence
    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)

    eos_idx = french.vocab.stoi["<eos>"]
    outputs = [french.vocab.stoi["<sos>"]]

    with torch.no_grad():
        # Build encoder hidden, cell state
        hidden, cell = model.encoder(sentence_tensor)

        for _ in range(max_length):
            previous_word = torch.LongTensor([outputs[-1]]).to(device)
            output, hidden, cell = model.decoder(previous_word, hidden, cell)
            best_guess = output.argmax(1).item()
            outputs.append(best_guess)

            # Model predicts it's the end of the sentence
            if best_guess == eos_idx:
                break

    # Drop the leading <sos>. Only drop the last token when it really is <eos>:
    # if decoding stopped at max_length the last token is a real word.
    predicted = outputs[1:]
    if predicted and predicted[-1] == eos_idx:
        predicted = predicted[:-1]

    return ' '.join(french.vocab.itos[idx] for idx in predicted)
   

In [40]:
# Fixed held-out pair (second row of the test split), translated during training
# as a qualitative progress check. Despite its name, `Original_French_word`
# holds the whole French reference sentence.
Original_english_sentence = test.iloc[1]['eng']
Original_French_word = test.iloc[1]['fr']

In [41]:
for epoch in range(num_epochs):
    print(f"[Epoch {epoch} / {num_epochs}]")

    # Every 5th epoch, translate the fixed sample sentence as a qualitative check.
    if epoch % 5 == 0:
        model.eval()

        translated_sentence = translate_sentence(model, Original_english_sentence, english, french, device, max_length=50)

        print(f"Original English example sentence: \n {Original_english_sentence}")
        print(f"original French translated example sentence: \n {Original_French_word}")
        print(f"Trained French  translated example sentence: \n {translated_sentence}")

    model.train()

    for batch in train_iterator:
        # Get input and targets and get to cuda
        inp_data = batch.eng.to(device)
        target = batch.fr.to(device)

        # Forward prop
        output = model(inp_data, target)

        # Skip position 0 (never predicted) and flatten (time, batch) into one
        # axis so the loss sees (tokens, vocab) against (tokens,).
        output = output[1:].reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        optimizer.zero_grad()
        loss = criterion(output, target)

        # Back prop
        loss.backward()

        # Clip to avoid exploding gradient issues, makes sure grads are
        # within a healthy range
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        # Gradient descent step
        optimizer.step()

        # Plot to tensorboard (as a plain float, not a graph-attached tensor)
        writer.add_scalar("Training loss", loss.item(), global_step=step)
        step += 1

    # Save AFTER the epoch's updates. Saving at the top of the loop left the
    # final epoch out of the checkpoint, so reloading it rolled the model back.
    checkpoint = {"state_dict": model.state_dict(), "optimizer": optimizer.state_dict()}
    save_checkpoint(checkpoint)

[Epoch 0 / 50]
=> Saving checkpoint
Original English example sentence: 
 china is never freezing during april  and it is sometimes wet in may 
original French translated example sentence: 
 chine est jamais le gel en avril  et il est parfois humide en mai 
Trained French  translated example sentence: 
 lépicerie étatsunis mangues mangues cet août mangues sec sec sec beau sec trouvé beau sec beau sec sec beau sec trouvé beau sec beau sec sec beau sec trouvé beau sec beau sec sec beau sec trouvé beau sec beau sec sec beau sec trouvé beau sec beau sec sec
[Epoch 1 / 50]
=> Saving checkpoint
[Epoch 2 / 50]
=> Saving checkpoint
[Epoch 3 / 50]
=> Saving checkpoint
[Epoch 4 / 50]
=> Saving checkpoint
[Epoch 5 / 50]
=> Saving checkpoint
Original English example sentence: 
 china is never freezing during april  and it is sometimes wet in may 
original French translated example sentence: 
 chine est jamais le gel en avril  et il est parfois humide en mai 
Trained French  translated example sente

# Testing

In [42]:
# map_location lets a checkpoint written on a GPU be restored on whichever
# `device` this session selected (for example a CPU-only runtime).
load_checkpoint(torch.load("MT_Seq2Seq.pth.tar", map_location=device), model, optimizer)

=> Loading checkpoint


In [64]:
from torchtext.data.metrics import bleu_score
def bleu(data, model, english, french, device):
    """
    Compute the corpus BLEU score of the model's translations over `data`.

    Args:
        data: iterable of examples exposing tokenized `eng` and `fr` attributes.
        model, english, french, device: forwarded to `translate_sentence`.

    Returns:
        The value returned by `bleu_score` for all predictions against their
        single reference each.
    """
    targets = []
    outputs = []

    for example in data:
        src = vars(example)['eng']
        trg = vars(example)["fr"]

        prediction = translate_sentence(model, src, english, french, device)

        # bleu_score works on token lists. translate_sentence returns one
        # space-joined string, so split it back into tokens; whitespace-only
        # tokens cannot survive that round trip, so drop them from the
        # reference as well to keep both sides comparable.
        targets.append([[token for token in trg if token.strip()]])
        outputs.append(prediction.split())

    return bleu_score(outputs, targets)

In [65]:
# Translate one held-out pair (fourth row of the test split) with the restored model.
Original_english_sentence = test.iloc[3]['eng']
Original_French_word = test.iloc[3]['fr']
# Switch to eval mode before translating.
model.eval()

translated_sentence = translate_sentence(model, Original_english_sentence, english, french, device, max_length=50)

# Show the source, the reference translation and the model output.
print(f"Original English example sentence: \n {Original_english_sentence}")
print(f"original French translated example sentence: \n {Original_French_word}")
print(f"Trained French  translated example sentence: \n {translated_sentence}")

Original English example sentence: 
 france is never beautiful during august  but it is never nice in summer 
original French translated example sentence: 
 la france est jamais belle au mois d août  mais il est jamais agréable en été 
Trained French  translated example sentence: 
 la france est jamais belle au mois d août   mais il est jamais agréable en été
