<a href="https://colab.research.google.com/github/GucciZhang/english-to-french/blob/main/English_to_French.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Fetch the project repository and make it the working directory.
# NOTE(review): re-running this cell in the same session nests the checkout
# (the recorded output shows .../english-to-french/english-to-french).
!git clone https://github.com/GucciZhang/english-to-french.git
%cd english-to-french/

Cloning into 'english-to-french'...
remote: Enumerating objects: 16, done.[K
remote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects: 100% (6/6), done.[K
remote: Total 16 (delta 0), reused 2 (delta 0), pack-reused 10[K
Unpacking objects: 100% (16/16), done.
/content/english-to-french/english-to-french


In [None]:
'''
  Installing additional required modules 
'''
# Download the small English and French spaCy pipelines; they are loaded
# further down to tokenize the source and target sentences.
!python -m spacy download en_core_web_sm
!python -m spacy download fr_core_news_sm

Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 14.3 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
Collecting fr_core_news_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-2.2.5/fr_core_news_sm-2.2.5.tar.gz (14.7 MB)
[K     |████████████████████████████████| 14.7 MB 1.7 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('fr_core_news_sm')


In [None]:
'''
  Setup Pytorch and other imports
'''
import torch
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
import torch.nn as nn
import torch.optim as optim

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

from torchtext.datasets import IWSLT2016
from torchtext.vocab import build_vocab_from_iterator

import en_core_web_sm
import fr_core_news_sm

import spacy

from collections import Counter
import random

In [None]:
'''
  Data processing
'''

# Tokenizers
# Load the English and French spaCy pipelines; only their `.tokenizer`
# component is used by tokenize_en / tokenize_fr.
spacy_en = en_core_web_sm.load()
spacy_fr = fr_core_news_sm.load()

def tokenize_en(text):
  """Split an English string into a list of token strings with the spaCy English tokenizer."""
  tokens = []
  for tok in spacy_en.tokenizer(text):
    tokens.append(tok.text)
  return tokens

def tokenize_fr(text):
  """Split a French string into a list of token strings with the spaCy French tokenizer."""
  tokens = []
  for tok in spacy_fr.tokenizer(text):
    tokens.append(tok.text)
  return tokens

# Load the IWSLT2016 English/French language pair; the call is unpacked into
# train, validation and test iterators.
train_iter, valid_iter, test_iter = IWSLT2016(language_pair=('en', 'fr'))

def tokenize_data(data_iter):
  """
    Lowercase, strip and tokenize every (English, French) sentence pair.

    data_iter: iterable yielding (en, fr) pairs of strings
    Returns a list of dicts, each holding the English tokens under 'src'
    and the French tokens under 'trg'.
  """
  def _prepare(sentence, tokenizer):
    # Normalize case and surrounding whitespace before tokenizing
    return tokenizer(sentence.lower().strip())

  return [
    {'src': _prepare(en_text, tokenize_en), 'trg': _prepare(fr_text, tokenize_fr)}
    for en_text, fr_text in data_iter
  ]

# Data splits
# Each iterator is consumed once and turned into an in-memory list of
# {'src': [...], 'trg': [...]} token dictionaries.
train_data_raw = tokenize_data(train_iter)
valid_data_raw = tokenize_data(valid_iter)
test_data_raw = tokenize_data(test_iter)

In [None]:
# Sanity check: show the first ten tokenized training pairs.
print(train_data_raw[:10])

[{'src': ['david', 'gallo', ':', 'this', 'is', 'bill', 'lange', '.', 'i', "'m", 'dave', 'gallo', '.'], 'trg': ['david', 'gallo', ':', 'voici', 'bill', 'lange', '.', 'je', 'suis', 'dave', 'gallo', '.']}, {'src': ['and', 'we', "'re", 'going', 'to', 'tell', 'you', 'some', 'stories', 'from', 'the', 'sea', 'here', 'in', 'video', '.'], 'trg': ['nous', 'allons', 'vous', 'raconter', 'quelques', 'histoires', 'de', 'la', 'mer', 'en', 'vidéo', '.']}, {'src': ['we', "'ve", 'got', 'some', 'of', 'the', 'most', 'incredible', 'video', 'of', 'titanic', 'that', "'s", 'ever', 'been', 'seen', ',', 'and', 'we', "'re", 'not', 'going', 'to', 'show', 'you', 'any', 'of', 'it', '.'], 'trg': ['nous', 'avons', 'des', 'vidéos', 'du', 'titanic', 'parmi', 'les', 'plus', 'spectaculaires', 'jamais', 'vues', '.', 'et', 'nous', "n'", 'allons', 'pas', 'vous', 'en', 'montrer', 'une', 'image', '.']}, {'src': ['the', 'truth', 'of', 'the', 'matter', 'is', 'that', 'the', 'titanic', '--', 'even', 'though', 'it', "'s", 'breakin

We define a vocabulary for English and French, which is a set of all the words that show up in the training data. Each word will also be assigned a numeric value which will be used later to encode the words (one-hot encoding) as tensors. 

Note that we only consider the words in the training data. Words not included in the training data (and hence not learned by the neural network) will be mapped to the '<unk>' token, which stands for unknown.

In [None]:
'''
 Building the vocabulary
'''

def build_vocab(data):
  """
    Build a vocabulary from the provided data (an iterable of token lists).
    Four special tokens are registered as well: <unk> for unknown tokens, <pad> for padding,
    <bos> for beginning of string and <eos> for end of string.
  """
  special_tokens = ['<unk>', '<pad>', '<bos>', '<eos>']
  vocab = build_vocab_from_iterator(data, specials=special_tokens)
  return vocab

# Vocabularies are built from the training split only. Setting the default
# index to <unk> makes lookups of tokens outside the vocabulary return the
# <unk> index instead of failing.
en_vocab = build_vocab((pair['src'] for pair in train_data_raw))
en_vocab.set_default_index(en_vocab['<unk>'])
fr_vocab = build_vocab((pair['trg'] for pair in train_data_raw))
fr_vocab.set_default_index(fr_vocab['<unk>'])

In [None]:
# Report the vocabulary sizes (these counts include the four special tokens).
print(f"Unique tokens in source (en) vocabulary: {len(en_vocab)}")
print(f"Unique tokens in target (fr) vocabulary: {len(fr_vocab)}")

# Uncomment to dump the full index-to-token lists (very long output).
# print(en_vocab.get_itos())
# print(fr_vocab.get_itos())

Unique tokens in source (en) vocabulary: 53678
Unique tokens in target (fr) vocabulary: 74590


In [None]:
def encode_data(raw_data):
  """
    Encode tokenized sentence pairs as tensors of vocabulary indices.

    raw_data: list of dicts with token lists under 'src' and 'trg'
    Returns a list of (src_tensor, trg_tensor) tuples of dtype torch.long, where
    source tokens are looked up in en_vocab and target tokens in fr_vocab.
  """
  def _to_indices(tokens, vocab):
    # One vocabulary index per token
    return torch.tensor([vocab[tok] for tok in tokens], dtype=torch.long)

  return [
    (_to_indices(pair['src'], en_vocab), _to_indices(pair['trg'], fr_vocab))
    for pair in raw_data
  ]

# Encode every split with the vocabularies built from the training data.
train_data = encode_data(train_data_raw)
valid_data = encode_data(valid_data_raw)
test_data = encode_data(test_data_raw)

In [None]:
# Number of sentence pairs per batch
BATCH_SIZE = 128
# The en and fr values for the special tokens are the same
# (both vocabularies are created by build_vocab with the same specials list),
# so the indices are read from the English vocabulary only.
PAD_IDX = en_vocab['<pad>']
BOS_IDX = en_vocab['<bos>']
EOS_IDX = en_vocab['<eos>']

def generate_batch(batch, pad_idx=None, bos_idx=None, eos_idx=None):
  """
    Collate function: wrap every sentence in <bos> ... <eos> and pad the batch.

    batch: list of (en_tensor, fr_tensor) pairs of 1-D index tensors
    pad_idx, bos_idx, eos_idx: optional overrides for the special-token indices;
      when left as None the module-level PAD_IDX / BOS_IDX / EOS_IDX are used
    Returns (en_batch, fr_batch), each of shape [ longest sequence in batch, batch size ],
    with shorter sequences filled with the padding index.
  """
  # Resolve defaults at call time so the DataLoader can keep calling generate_batch(batch)
  pad_idx = PAD_IDX if pad_idx is None else pad_idx
  bos_idx = BOS_IDX if bos_idx is None else bos_idx
  eos_idx = EOS_IDX if eos_idx is None else eos_idx

  # The marker tensors are the same for every sentence, so build them once
  bos = torch.tensor([bos_idx])
  eos = torch.tensor([eos_idx])

  en_batch, fr_batch = [], []
  for en_tensor, fr_tensor in batch:
    en_batch.append(torch.cat([bos, en_tensor, eos], dim=0))
    fr_batch.append(torch.cat([bos, fr_tensor, eos], dim=0))

  # Pad each language separately; pad_sequence stacks along a new batch dimension (dim 1)
  en_batch = pad_sequence(en_batch, padding_value=pad_idx)
  fr_batch = pad_sequence(fr_batch, padding_value=pad_idx)
  return en_batch, fr_batch

# One DataLoader per split; generate_batch is the collate function that turns
# a list of (src, trg) tensor pairs into a batch.
# NOTE(review): shuffle=True is also set for the validation and test loaders —
# confirm this is intended, since evaluation order then changes between runs.
train_data_iter = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=generate_batch)
valid_data_iter = DataLoader(valid_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=generate_batch)
test_data_iter = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=generate_batch)


**Encoder**

Input vectors are one-hot encodings (very sparse, with many extra dimensions); they will be converted to embeddings (denser, more efficient).

We will have an initial dropout layer from the embedded inputs into our LSTM.
The LSTM also takes a dropout argument - note this is dropout between layers of a multi-layer LSTM, not between recurrent iterations.

The forward defines how our encoder handles input data. It first converts from one-hot vectors to embedded vectors using the embedding layer. Then, it applies dropout. The embeddings are then passed into the LSTM. Note we are passing in a sequence, but the Torch RNNs automatically handle the recurrent iterations for us.

In [None]:
class Encoder(nn.Module):
  """
    LSTM encoder: embeds a batch of source token indices, applies dropout and runs
    the result through a multi-layer LSTM, returning the final hidden and cell states.

    input_dim: size of the source vocabulary (number of rows in the embedding table)
    embedding_dim: size of each token embedding
    hidden_dim: size of the LSTM hidden and cell states
    num_layers: number of stacked LSTM layers
    dropout: dropout probability, used on the embeddings and between LSTM layers
  """
  def __init__(self, input_dim, embedding_dim, hidden_dim, num_layers, dropout):
    super().__init__()

    self.hidden_dim = hidden_dim
    self.num_layers = num_layers

    # Embedding layer
    self.embedding = nn.Embedding(input_dim, embedding_dim)

    # LSTM RNN
    self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, dropout=dropout)

    # Initial dropout layer
    self.dropout = nn.Dropout(dropout)

  def forward(self, seq):
    """
      seq: [ seq length, batch size ] tensor of source token indices
      Returns (hidden, cell), each [ num layers, batch size, hidden dim ].
    """
    embedded = self.dropout(self.embedding(seq))

    # embedded: [ seq length, batch size, embedding_dim ]

    # nn.LSTM returns (output, (h_n, c_n)); the per-step outputs are not needed,
    # only the final states that will seed the decoder
    _, (hidden, cell) = self.lstm(embedded)

    # hidden: [ num layers, batch size, hidden dim ]
    # cell: [ num layers, batch size, hidden dim]

    return hidden, cell


**Decoder**

The output dimension, analogously to the encoder's input dimension, is the size of the vocabulary for the target language (French).

In [None]:
class Decoder(nn.Module):
  """
    LSTM decoder: given one target token per batch element plus the previous
    hidden and cell states, produces a score for every target-vocabulary entry
    and the updated states.

    output_dim: size of the target vocabulary
    embedding_dim: size of each token embedding
    hidden_dim: size of the LSTM hidden and cell states
    num_layers: number of stacked LSTM layers
    dropout: dropout probability, used on the embeddings and between LSTM layers
  """
  def __init__(self, output_dim, embedding_dim, hidden_dim, num_layers, dropout):
    super().__init__()

    self.output_dim = output_dim
    # Record the LSTM state size (not the embedding size)
    self.hidden_dim = hidden_dim
    self.num_layers = num_layers

    self.embedding = nn.Embedding(output_dim, embedding_dim)

    self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, dropout=dropout)

    self.dropout = nn.Dropout(dropout)

    # Projects the LSTM output onto the target vocabulary
    self.predict = nn.Linear(hidden_dim, output_dim)

  def forward(self, input, hidden, cell):
    """
      input: [ batch size ] tensor of token indices for the current step
      hidden, cell: [ num layers, batch size, hidden dim ] states from the previous step
      Returns (prediction, hidden, cell) with prediction of shape [ batch size, output dim ].
    """
    # Add a sequence dimension of length 1: [ 1, batch size ]
    input = input.unsqueeze(0)

    embedded = self.dropout(self.embedding(input))

    # nn.LSTM returns (output, (h_n, c_n)), so the states must be unpacked from the nested tuple
    output, (hidden, cell) = self.lstm(embedded, (hidden, cell))

    # output: [ 1, batch size, hidden dim ] -> drop the length-1 sequence dimension
    prediction = self.predict(output.squeeze(0))

    return prediction, hidden, cell


**Seq2Seq**

In [None]:
class Seq2Seq(nn.Module):
  """
    Ties an encoder and a decoder together: the encoder's final hidden and cell
    states initialize the decoder, which is then unrolled one target step at a time.
  """
  def __init__(self, encoder, decoder, device):
    super().__init__()

    self.device = device
    self.encoder = encoder
    self.decoder = decoder

  def forward(self, src, trg, teacher_forcing = 0.5):
    """
      src: source batch, passed to the encoder unchanged
      trg: [ trg_length, batch_size ] target batch
      teacher_forcing: probability of feeding the ground-truth token (rather than
        the decoder's own prediction) as the next decoder input
      Returns a [ trg_length, batch_size, target vocab size ] tensor of decoder outputs;
      position 0 is never written and stays zero.
    """
    steps, batch = trg.shape
    vocab_size = self.decoder.output_dim

    # buffer collecting the decoder output of every step
    outputs = torch.zeros(steps, batch, vocab_size, device=self.device)

    # the encoder's final states become the decoder's initial states
    hidden, cell = self.encoder(src)

    # the first decoder input is the first row of the target batch
    decoder_input = trg[0]

    for t in range(1, steps):
      step_scores, hidden, cell = self.decoder(decoder_input, hidden, cell)
      outputs[t] = step_scores

      # draw once per step to choose between ground truth and own prediction
      use_teacher = random.random() < teacher_forcing
      decoder_input = trg[t] if use_teacher else step_scores.argmax(1)

    return outputs




**Training**

In [None]:
# Vocabulary sizes determine the embedding tables and the decoder's output layer
INPUT_DIM = len(en_vocab)
OUTPUT_DIM = len(fr_vocab)
# Embedding sizes for source and target tokens
ENCODER_EMBEDDING_DIM = 256
DECODER_EMBEDDING_DIM = 256
# The encoder's final states are fed to the decoder as its initial states,
# so both are built with the same hidden size and number of layers
HIDDEN_DIM = 512
NUM_LAYERS = 4
ENCODER_DROPOUT = 0.5
DECODER_DROPOUT = 0.5

encoder = Encoder(INPUT_DIM, ENCODER_EMBEDDING_DIM, HIDDEN_DIM, NUM_LAYERS, ENCODER_DROPOUT)
decoder = Decoder(OUTPUT_DIM, DECODER_EMBEDDING_DIM, HIDDEN_DIM, NUM_LAYERS, DECODER_DROPOUT)

# NOTE(review): `device` is only stored on the model here; the parameters are not
# moved with .to(device) in this cell, while Seq2Seq.forward allocates its output
# buffer on `device` — confirm the model is moved before training on a GPU.
model = Seq2Seq(encoder, decoder, device)

def init_weights(model: nn.Module):
  """Overwrite every parameter of `model` in place with values drawn from U(-0.05, 0.05)."""
  low, high = -0.05, 0.05
  for _, weight in model.named_parameters():
    nn.init.uniform_(weight.data, low, high)

# Module.apply calls init_weights on the model itself and on each of its submodules.
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(53678, 256)
    (lstm): LSTM(256, 512, num_layers=4, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(74590, 256)
    (lstm): LSTM(256, 512, num_layers=4, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
    (predict): Linear(in_features=512, out_features=74590, bias=True)
  )
)

In [None]:
def count_parameters(model):
    """Return the total number of elements across the parameters of `model` that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the trainable parameter count with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 86,862,686 trainable parameters
