In [1]:
!git clone https://github.com/GucciZhang/english-to-french.git
%cd english-to-french/

fatal: destination path 'english-to-french' already exists and is not an empty directory.
/content/english-to-french


In [None]:
'''
  Installing additional required modules 
'''
# Download the small English and French spaCy pipelines; the data-processing
# cell loads them by these exact names with spacy.load().
!python -m spacy download en_core_web_sm
!python -m spacy download fr_core_news_sm

In [44]:
'''
  Setup Pytorch and other imports
'''
import torch
# Run on the first CUDA GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda:0")
else:
  device = torch.device("cpu")

from torchtext.datasets import IWSLT2016
from torchtext.legacy.data import Field
from torchtext.vocab import build_vocab_from_iterator

import spacy

from collections import Counter

In [52]:
'''
  Data processing
'''

# Tokenizers
# Load each spaCy pipeline once; the tokenize_* helpers below only use the
# pipeline's `.tokenizer` attribute.
spacy_en = spacy.load('en_core_web_sm')
spacy_fr = spacy.load("fr_core_news_sm")

def tokenize_en(text):
  """Return the list of token strings produced by the English spaCy tokenizer for `text`."""
  tokens = []
  for piece in spacy_en.tokenizer(text):
    tokens.append(piece.text)
  return tokens

def tokenize_fr(text):
  """Return the list of token strings produced by the French spaCy tokenizer for `text`."""
  tokens = []
  for piece in spacy_fr.tokenizer(text):
    tokens.append(piece.text)
  return tokens

# Load the three IWSLT2016 splits for the ('en', 'fr') language pair; each
# split is later iterated as (en, fr) sentence pairs by tokenize_data.
train_iter, valid_iter, test_iter = IWSLT2016(language_pair=('en', 'fr'))

def tokenize_data(data_iter):
  """
    Tokenize every (en, fr) sentence pair yielded by `data_iter`.

    Each sentence is lower-cased and stripped of surrounding whitespace before
    tokenization. Returns a list of dicts, one per pair, with the English
    tokens under 'src' and the French tokens under 'trg'.
  """
  return [
    {
      'src': tokenize_en(en_sentence.lower().strip()),
      'trg': tokenize_fr(fr_sentence.lower().strip()),
    }
    for en_sentence, fr_sentence in data_iter
  ]

# Data splits: run the same tokenization over the train, validation and test
# iterators, in that order.
train_data, valid_data, test_data = map(
  tokenize_data, (train_iter, valid_iter, test_iter)
)

In [31]:
# Peek at the first ten tokenized training pairs.
# NOTE(review): the recorded output below shows French tokens under 'src' and
# English under 'trg', which does not match the (en, fr) order used in the
# data-processing cell; this cell's execution count is also lower than that
# cell's, so the output is presumably stale - re-run to confirm.
print(train_data[:10])

[{'src': ['david', 'gallo', ':', 'voici', 'bill', 'lange', '.', 'je', 'suis', 'dave', 'gallo', '.'], 'trg': ['david', 'gallo', ':', 'this', 'is', 'bill', 'lange', '.', "i'", 'm', 'dave', 'gallo', '.']}, {'src': ['nous', 'allons', 'vous', 'raconter', 'quelques', 'histoires', 'de', 'la', 'mer', 'en', 'vidéo', '.'], 'trg': ['and', "we'", 're', 'going', 'to', 'tell', 'you', 'some', 'stories', 'from', 'the', 'sea', 'here', 'in', 'video', '.']}, {'src': ['nous', 'avons', 'des', 'vidéos', 'du', 'titanic', 'parmi', 'les', 'plus', 'spectaculaires', 'jamais', 'vues', '.', 'et', 'nous', "n'allons", 'pas', 'vous', 'en', 'montrer', 'une', 'image', '.'], 'trg': ["we'", 've', 'got', 'some', 'of', 'the', 'most', 'incredible', 'video', 'of', 'titanic', "that'", 's', 'ever', 'been', 'seen', ',', 'and', "we'", 're', 'not', 'going', 'to', 'show', 'you', 'any', 'of', 'it', '.']}, {'src': ['la', 'vérité', 'est', 'que', 'le', 'titanic', '--', 'même', "s'il", 'continue', 'de', 'battre', 'toutes', 'les', 'reco

In [57]:
'''
 Building the vocabulary
'''

def build_vocab(data):
  """
    Generates the vocabulary of the provided data.

    Args:
      data: iterable of token lists (one list of string tokens per sentence).

    Returns:
      The vocabulary built by build_vocab_from_iterator with the special tokens
      <unk> (unknown tokens), <pad> (padding), <bos> (beginning of string) and
      <eos> (end of string) registered, and with <unk> set as the default index
      so that looking up an out-of-vocabulary token yields the <unk> index.
  """
  vocab = build_vocab_from_iterator(data, specials=['<unk>', '<pad>', '<bos>', '<eos>'])
  # Without a default index, looking up a token that is missing from the
  # vocabulary (e.g. one seen only in the validation/test split) raises an
  # error instead of mapping to <unk>.
  vocab.set_default_index(vocab['<unk>'])
  return vocab

# Build one vocabulary per language side, using the training split only.
en_vocab = build_vocab(example['src'] for example in train_data)
fr_vocab = build_vocab(example['trg'] for example in train_data)

In [58]:
# Report the size of each vocabulary.
print(f"Unique tokens in source (en) vocabulary: {len(en_vocab)}")
print(f"Unique tokens in target (fr) vocabulary: {len(fr_vocab)}")
# Show only the first few entries of the index-to-token list; printing the
# whole vocabulary would flood the cell output.
print(en_vocab.get_itos()[:20])

Unique tokens in source (en) vocabulary: 53221
Unique tokens in target (fr) vocabulary: 74684
