In [None]:
# Reference: PyTorch seq2seq translation tutorial
# https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html

In [32]:
import pickle

# Read the pickled sentence pairs back from disk.
# NOTE(review): pickle.load can execute arbitrary code while unpickling -- only
# open a "cleaned_pairs.txt" that you produced yourself.
with open("cleaned_pairs.txt", mode='rb') as pairs_file:
    paired_sent = pickle.load(pairs_file)

# Quick sanity check: show the first five entries.
preview = paired_sent[:5]
print(preview)

[['resumption of the session', 'hervatting van de zitting'], ['i declare resumed the session of the european parliament adjourned on friday december and i would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period', 'ik verklaar de zitting van het europees parlement die op vrijdag december werd onderbroken te zijn hervat ik wens u allen een gelukkig nieuwjaar en hoop dat u een goede vakantie heeft gehad'], ['although as you will have seen the dreaded millennium bug failed to materialise still the people in a number of countries suffered a series of natural disasters that truly were dreadful', 'zoals u heeft kunnen constateren is de grote millenniumbug uitgebleven de burgers van een aantal van onze lidstaten zijn daarentegen door verschrikkelijke natuurrampen getroffen'], ['you have requested a debate on this subject in the course of the next few days during this partsession', 'u heeft aangegeven dat u deze vergaderperiode een debat wilt ov

In [26]:
class MakeVocab():
    """Word-level vocabulary built incrementally from whitespace-separated sentences.

    Indices 0, 1 and 2 are reserved in ``index2word`` for the special tokens
    "PAD", "SOS" and "EOS". They are not entered into ``word2index``, so
    ``to_index`` only resolves words that were added from sentences. Added
    words are numbered from 3 upwards in the order they are first seen.
    """

    def __init__(self) -> None:
        self.word2index = {}    # word -> index (words added via add_word only)
        self.word2count = {}    # word -> number of times it has been added
        self.index2word = {0: "PAD", 1: "SOS", 2: "EOS"}
        self.num_words = 3      # next free index; the 3 special tokens come first
        self.num_sentences = 0  # how many sentences have been added
        self.longest_sentence = 0  # word count of the longest sentence so far

    def add_word(self, word):
        """Register one occurrence of ``word``, assigning a new index on first sight."""
        if word not in self.word2index:
            # First entry of word into vocabulary
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            # Word exists; increase word count
            self.word2count[word] += 1

    def add_sentence(self, sentence):
        """Add every word of ``sentence`` and update the sentence statistics.

        The sentence is tokenized with ``str.split()`` (no separator argument),
        so repeated, leading or trailing whitespace never produces an
        empty-string "word", and an empty sentence contributes no words. The
        sentence is still counted in ``num_sentences``.
        """
        words = sentence.split()
        for word in words:
            self.add_word(word)
        if len(words) > self.longest_sentence:
            # This is the longest sentence
            self.longest_sentence = len(words)
        # Count the number of sentences
        self.num_sentences += 1

    def to_word(self, index):
        """Return the word stored at ``index``; raises KeyError if unknown."""
        return self.index2word[index]

    def to_index(self, word):
        """Return the index of ``word``; raises KeyError if it was never added."""
        return self.word2index[word]

In [27]:
# One vocabulary per language; each pair holds the English sentence at
# position 0 and the Dutch sentence at position 1.
vocab_eng, vocab_nl = MakeVocab(), MakeVocab()
for pair in paired_sent:
    eng_sentence, nl_sentence = pair[0], pair[1]
    vocab_eng.add_sentence(eng_sentence)
    vocab_nl.add_sentence(nl_sentence)

In [29]:
# Indices 0-2 are reserved for PAD/SOS/EOS, so index 3 holds the first word
# each vocabulary encountered.
for vocabulary in (vocab_eng, vocab_nl):
    print(vocabulary.to_word(3))

resumption
hervatting


In [None]:
import torch
import torch.nn as nn
# Toy example: look up a 10-dimensional embedding for a single Dutch word.
# `num_words` and the word index live on the vocabulary object (they are not
# module-level names), and "zitting" is Dutch, so both come from `vocab_nl`.
# `num_words` already counts the three reserved tokens, so the table has one
# row for every index the vocabulary can produce.
embeddings = nn.Embedding(vocab_nl.num_words, 10)
lookup_tensor = torch.tensor([vocab_nl.to_index("zitting")])
zitting_embed = embeddings(lookup_tensor)
print(zitting_embed)

In [1]:
#Method Two for embedding 

#create vocabulary
def vocab(sentences):
    """Build word<->index lookup tables for an iterable of sentences.

    Indices are assigned deterministically: the special tokens '<sos>',
    '<eos>' and '<pad>' get 0, 1 and 2, then every other word gets the next
    free index in the order it is first seen. (Enumerating a ``set`` instead
    would make the mapping depend on string hashing, which varies between
    interpreter runs, so saved indices could not be reused.)

    Args:
        sentences: iterable of strings; each is tokenized with ``str.split()``.

    Returns:
        A ``(word_to_index, index_to_word)`` tuple of dicts that are inverses
        of each other.
    """
    word_to_index = {}
    for token in ('<sos>', '<eos>', '<pad>'):
        word_to_index[token] = len(word_to_index)
    for sen in sentences:
        for word in sen.split():
            if word not in word_to_index:
                # New word: the current size is the next unused index.
                word_to_index[word] = len(word_to_index)
    index_to_word = {index: word for word, index in word_to_index.items()}
    return (word_to_index, index_to_word)
#change the words which are in the sentences into indices
def sentence_to_index(sentence, vocab):
    """Encode ``sentence`` as a list of indices wrapped in '<sos>' ... '<eos>'.

    Args:
        sentence: string, tokenized with ``str.split()``.
        vocab: mapping from word to index; must contain '<sos>', '<eos>' and
            every word of the sentence (a missing key raises ``KeyError``).

    Returns:
        ``[<sos> index, word indices..., <eos> index]`` as a list.
    """
    indices = [vocab['<sos>']]
    for token in sentence.split():
        indices.append(vocab[token])
    indices.append(vocab['<eos>'])
    return indices


In [2]:
import pickle
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
# Load the pickled English and Dutch sentence collections.
# NOTE(review): pickle.load can execute arbitrary code while unpickling -- only
# open files that you produced yourself.
with open("sen_eng.txt", 'rb') as f:
    sentences_eng = pickle.load(f)

with open("sen_nl.txt", 'rb') as f:
    sentences_nl = pickle.load(f)

# One (word -> index, index -> word) pair of lookup tables per language.
nl_vocab, nl_index_to_word = vocab(sentences_nl)
en_vocab, en_index_to_word = vocab(sentences_eng)

# Encode every sentence as a 1-D tensor of token indices. The dtype must be an
# integer type: torch.Tensor(list) would create float32 tensors, which
# nn.Embedding does not accept as indices.
nl_index = [torch.tensor(sentence_to_index(sen, nl_vocab), dtype=torch.long) for sen in sentences_nl]
en_index = [torch.tensor(sentence_to_index(sen, en_vocab), dtype=torch.long) for sen in sentences_eng]

len(nl_index), len(en_index)


(1997775, 1997775)

In [3]:
#create dataset for translation
#use DataLoader to build loaders for the training, validation and test datasets, split 49% : 21% : 30%

class Builddataset(Dataset):
    """Map-style dataset over two parallel, index-aligned sequences.

    Args:
        sentences_eng: sequence stored as ``english_sentences``.
        sentences_nl: sequence stored as ``nl_sentences``; must be the same
            length as ``sentences_eng``.

    Raises:
        ValueError: if the two sequences differ in length. ``__len__`` only
            consults one of them, so a mismatch would otherwise surface late
            as an IndexError or as silently dropped items.
    """

    def __init__(self, sentences_eng, sentences_nl):
        if len(sentences_eng) != len(sentences_nl):
            raise ValueError(
                "sentences_eng and sentences_nl must have the same length, "
                f"got {len(sentences_eng)} and {len(sentences_nl)}"
            )
        self.nl_sentences = sentences_nl
        self.english_sentences = sentences_eng

    def __getitem__(self, idx):
        """Return the pair at ``idx`` as ``(nl_sentence, english_sentence)``."""
        return self.nl_sentences[idx], self.english_sentences[idx]

    def __len__(self):
        """Number of pairs in the dataset."""
        return len(self.english_sentences)

def collate_fn(batch):
    """Pad a batch of tensor pairs into two batch-first tensors.

    Every item of ``batch`` is unpacked as ``(english, dutch)``. The English
    sequences are padded with ``en_vocab['<pad>']`` and the Dutch ones with
    ``nl_vocab['<pad>']`` (both vocabularies are module-level names).

    Returns:
        ``(padded_dutch, padded_english)`` -- note the Dutch batch comes first.
    """
    english_seqs, dutch_seqs = zip(*batch)

    padded_dutch = pad_sequence(
        dutch_seqs, batch_first=True, padding_value=nl_vocab['<pad>'])
    padded_english = pad_sequence(
        english_seqs, batch_first=True, padding_value=en_vocab['<pad>'])

    return padded_dutch, padded_english



# NOTE: Builddataset's signature is (sentences_eng, sentences_nl), yet the Dutch
# tensors are passed first here. Each item therefore comes out as
# (english, dutch), which is exactly the order collate_fn unpacks. The two
# swaps cancel out, so do not "fix" one without the other.
dataset = Builddataset(nl_index, en_index)

# 49% train / 21% validation; the test split takes the remainder (~30%) so the
# three sizes always add up to len(dataset) despite int() truncation.
train_size = int(0.49 * len(dataset))
val_size = int(0.21 * len(dataset))
test_size = len(dataset) - train_size - val_size

# Seed the split so the same items land in train/val/test on every run;
# without a generator the assignment changes each time the cell executes.
SPLIT_SEED = 42
train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
    dataset, [train_size, val_size, test_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED))

BATCH_SIZE = 72
# Only the training loader is shuffled; evaluation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)