In [2]:
import spacy

en_nlp = spacy.load('en_core_web_sm')
fr_nlp = spacy.load('fr_core_news_sm')

In [3]:
en_test = "This is a sample sentence."
en_document = en_nlp(en_test)

for token in en_document:
    print(token.text)

This
is
a
sample
sentence
.


In [4]:
fr_test = "Ceci est une phrase de test."
fr_document = fr_nlp(fr_test)

for token in fr_document:
    print(token.text)

Ceci
est
une
phrase
de
test
.


In [6]:
from datasets import load_from_disk
import os


data_path = os.path.join("F:", "Datasets", "wmt14_fr_en")
if os.path.exists(data_path):
    print(data_path)
wmt14_en_fr_dataset = load_from_disk(dataset_path=data_path)

F:Datasets\wmt14_fr_en


Loading dataset from disk:   0%|          | 0/30 [00:00<?, ?it/s]

In [7]:
print(wmt14_en_fr_dataset['test'][0])

{'translation': {'en': 'Spectacular Wingsuit Jump Over Bogota', 'fr': 'Spectaculaire saut en "wingsuit" au-dessus de Bogota'}}


In [8]:
from enum import Enum
class Language(Enum):
    EN = 0
    FR = 1

In [9]:
def tokenize_en(sentence, lang: int = Language.EN.value):
    if lang:
        return [token.text.lower() for token in fr_nlp.tokenizer(sentence)]
    else:
        return [token.text.lower() for token in en_nlp.tokenizer(sentence)]

In [15]:
from collections import Counter
from tqdm.notebook import tqdm

def build_vocab(language_sentences, tokenizer, lang:int = Language.EN.value, min_freq: int = 2):
    vocab = {'<pad>': 0, '<sos>': 1, '<eos>': 2, '<unk>': 3}
    counter = Counter()
    
    if lang == Language.EN.value:
        for sentence in tqdm(language_sentences, desc="Building vocab for English"):
            counter.update(tokenizer(sentence['translation']['en'], lang))
    else:
        for sentence in tqdm(language_sentences, desc="Building vocab for French"):
            counter.update(tokenizer(sentence['translation']['fr'], lang))
    
    for word, freq in counter.items():
        if freq > min_freq and word not in vocab:
            ind = len(vocab)
            vocab[word] = ind
    return vocab

In [16]:
train_dataset = wmt14_en_fr_dataset['train']

In [17]:
# fr_sentences = [data['translation']['fr'] for data in train_dataset]
# en_sentences = [data['translation']['en'] for data in train_dataset]

In [18]:
# len(fr_sentences)

In [19]:
# len(en_sentences)

In [20]:
import pickle

processed_data_path = os.path.join("F:", "Datasets", "wmt14_fr_en_processed")
if os.path.exists(processed_data_path):
    print(processed_data_path)

F:Datasets\wmt14_fr_en_processed


In [None]:
vocab_train_en = build_vocab(language_sentences=train_dataset, tokenizer=tokenize_en, lang=Language.EN.value)
vocab_en_path = os.path.join(processed_data_path, "vocab_en.pkl")y

Building vocab for English:   0%|          | 0/40836715 [00:00<?, ?it/s]

In [23]:
with open(vocab_en_path, "wb") as f:
    pickle.dump(vocab_train_en, f)

In [24]:
vocab_train_fr = build_vocab(language_sentences=train_dataset, tokenizer=tokenize_en, lang=Language.FR.value)
vocab_fr_path = os.path.join(processed_data_path, "vocab_fr.pkl")

Building vocab for French:   0%|          | 0/40836715 [00:00<?, ?it/s]

In [25]:
with open(vocab_fr_path, "wb") as f:
    pickle.dump(vocab_train_fr, f)

In [26]:
vocab_fr_path = os.path.join(processed_data_path, "vocab_fr.pkl")
vocab_en_path = os.path.join(processed_data_path, "vocab_en.pkl")

with open(vocab_fr_path, "rb") as f:
    vocab_train_fr = pickle.load(f)
with open(vocab_en_path, "rb") as f:
    vocab_train_en = pickle.load(f)

In [27]:
print(len(vocab_train_en))
print(len(vocab_train_fr))

1066763
1142327


In [28]:
def encode(tokens, vocab):
    return [vocab.get(token, vocab['<unk>']) for token in tokens]

def preprocess(sample):
    fr_sentence = sample['translation']['fr']
    en_sentence = sample['translation']['en']
    
    fr_ids = [vocab_train_fr['<sos>']] + encode(tokenize_en(fr_sentence, lang = Language.FR.value), vocab_train_fr) + [vocab_train_fr['<eos>']]
    en_ids = [vocab_train_en['<sos>']] + encode(tokenize_en(en_sentence, lang = Language.EN.value), vocab_train_en) + [vocab_train_en['<eos>']]
    
    return {'fr_ids': fr_ids, 'en_ids': en_ids}

train_encoded_data = train_dataset.map(preprocess)

  StockPickler.save(self, obj, save_persistent_id)
  StockPickler.save(self, obj, save_persistent_id)


Map:   0%|          | 0/40836715 [00:00<?, ? examples/s]

In [29]:
import torch

class TranslationData(torch.utils.data.Dataset):
    def __init__(self, data):
        self.data = data
        # super().__init__(data)
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        fr_indices = self.data[index]['fr_ids']
        en_indices = self.data[index]['en_ids']
        
        src = torch.tensor(fr_indices, dtype=torch.long)
        trg = torch.tensor(en_indices, dtype=torch.long)
        return src, trg

In [None]:
train_data_torch = TranslationData(train_encoded_data)
train_data_torch_path = os.path.join(processed_data_path, 'training_fr_en_encoded_data.pt')
torch.save(train_data_torch, train_data_torch_path)

In [33]:
def collate_fn(batch):
    print(type(batch))
    src_batch, trg_batch = zip(*batch)
    src_batch = torch.nn.utils.rnn.pad_sequence(src_batch, padding_value=0, batch_first=True)
    trg_batch = torch.nn.utils.rnn.pad_sequence(trg_batch, padding_value=0, batch_first=True)
    
    return src_batch, trg_batch

In [34]:
data_loader = torch.utils.data.DataLoader(train_data_torch, batch_size=32, shuffle=True, collate_fn=collate_fn)

for src, trg in data_loader:
    print(src.shape, trg.shape)
    break

<class 'list'>
torch.Size([32, 75]) torch.Size([32, 56])
