In [None]:
import random
import collections
from tqdm import tqdm
from collections import Counter


import numpy as np
import pandas as pd
import spacy

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [None]:
SEED = 42

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [None]:
# load data
de_train_table = pd.read_csv('../data/train.de', sep='\t', header=None, names=['Text'])
de_val_table = pd.read_csv('../data/val.de', sep='\t', header=None, names=['Text'])

en_train_table = pd.read_csv('../data/train.en', sep='\t', header=None, names=['Text'])
en_val_table = pd.read_csv('../data/val.en', sep='\t', header=None, names=['Text'])

In [None]:
en_train_table.head()

In [None]:
de_train_table.head()

In [6]:
# Spacy load German and English models
de_nlp = spacy.load('de_core_news_sm')
en_nlp = spacy.load('en_core_web_sm')

In [61]:
def create_vocabs(df_data, nlp, vocab_size):
    """
        Creates token-to-index and index-to-token vocabularies
    """
    # create initial token-to-index vocab  
    vocab_token2idx = {
        '[PAD]': 0,
        '[UNK]': 1,
        '[BOS]': 2,
        '[EOS]': 3
    }
    
    # create initial index-to-token vocab  
    vocab_idx2token = {
        0: '[PAD]',
        1: '[UNK]',
        2: '[BOS]',
        3: '[EOS]'
    }
    
    # tokenize 
    tokenized_sents = nlp.pipe(df_data['Text'])
    
    # count frequences
    token_counter = Counter()

    for sentence in tqdm(tokenized_sents, total=len(df_data['Text'])):
        for token in sentence:
            token_counter[token.text] += 1
            
    print(len(token_counter))        
            
    
    
    # fill in the vocab
    for token, freq in token_counter.most_common(vocab_size - len(vocab_token2idx)):
        vocab_token2idx[token] = len(vocab_token2idx)
        vocab_idx2token[len(vocab_idx2token)] = token
        
    return vocab_token2idx, vocab_idx2token

In [62]:
VOCAB_SIZE = 2000

en_vocab_token2idx, en_vocab_idx2token = create_vocabs(en_train_table, en_nlp, VOCAB_SIZE)
de_vocab_token2idx, de_vocab_idx2token = create_vocabs(de_train_table, de_nlp, VOCAB_SIZE)

100%|██████████| 29000/29000 [00:38<00:00, 749.59it/s]
  0%|          | 0/29000 [00:00<?, ?it/s]

10833


100%|██████████| 29000/29000 [00:33<00:00, 866.04it/s]

19210





In [63]:
en_vocab_idx2token

{0: '[PAD]',
 1: '[UNK]',
 2: '[BOS]',
 3: '[EOS]',
 4: 'a',
 5: '.',
 6: 'A',
 7: 'in',
 8: 'the',
 9: 'on',
 10: 'is',
 11: 'and',
 12: 'man',
 13: 'of',
 14: 'with',
 15: ',',
 16: 'woman',
 17: 'are',
 18: 'to',
 19: 'Two',
 20: 'at',
 21: 'wearing',
 22: 'people',
 23: 'shirt',
 24: 'white',
 25: 'young',
 26: 'black',
 27: 'his',
 28: 'an',
 29: 'while',
 30: 'blue',
 31: 'red',
 32: 'sitting',
 33: 'girl',
 34: 'dog',
 35: 'boy',
 36: 'men',
 37: 'standing',
 38: 'playing',
 39: 'group',
 40: 'street',
 41: 'down',
 42: 'walking',
 43: '-',
 44: 'front',
 45: 'her',
 46: 'holding',
 47: 'water',
 48: 'by',
 49: 'The',
 50: 'up',
 51: 'green',
 52: 'women',
 53: 'one',
 54: 'An',
 55: 'for',
 56: 'looking',
 57: 'outside',
 58: 'child',
 59: 'Three',
 60: 'as',
 61: 'little',
 62: 'large',
 63: 'through',
 64: 'yellow',
 65: 'brown',
 66: 'two',
 67: 'from',
 68: 'hat',
 69: 'their',
 70: 'ball',
 71: 'into',
 72: 'person',
 73: 'children',
 74: 'next',
 75: 'other',
 76: 'dresse

In [64]:
class CustomDataset(Dataset):
    def __init__(self, df_src, df_trg, vocab_src, vocab_trg, nlp_src, nlp_trg, pad_max_size):
        super().__init__()
        self.df_src = df_src['Text']
        self.df_trg = df_trg['Text']
        
        self.vocab_src = vocab_src
        self.vocab_trg = vocab_trg
        
        self.nlp_src = nlp_src
        self.nlp_trg = nlp_trg
        
        self.pad_max_size = pad_max_size
        
        self.pad_id = vocab_src['[PAD]']
        self.unk_id = vocab_src['[UNK]']
        self.bos_id = vocab_src['[BOS]']
        self.eos_id = vocab_src['[EOS]']

        
    def __len__(self):
        return len(self.df_src)
    
    def pad_sequence(self, seq):
        if len(seq) - 1 > self.pad_max_size:
            seq = seq[:self.pad_max_size + 1]
        elif len(seq) - 1 < self.pad_max_size:
            seq.extend([self.pad_id] * (self.pad_max_size - len(seq) + 1))
        
        return seq
    
    def __getitem__(self, indx):
        # souorce language
        src_seq = []
        
        src_sent = self.df_src[indx]
        src_seq.append(self.bos_id)
        
        src_seq.extend([self.vocab_src[token.text] 
                        if token.text in self.vocab_src 
                        else self.unk_id 
                        for token in self.nlp_src.tokenizer(src_sent)])

        
        # pad sequence or trancate to pad_max_size
        src_seq = self.pad_sequence(src_seq)    

        src_seq.append(self.eos_id)
        
        # target language
        trg_seq = []
        
        trg_sent = self.df_trg[indx]
        trg_seq.append(self.bos_id)
        
        trg_seq.extend([self.vocab_trg[token.text] 
                        if token.text in self.vocab_trg 
                        else self.unk_id 
                        for token in self.nlp_trg.tokenizer(trg_sent)])
        
        # pad sequence or trancate to pad_max_size
        trg_seq = self.pad_sequence(trg_seq)   
        
        trg_seq.append(self.eos_id)
            
            
        
        return (src_seq, trg_seq)
        

In [65]:
pad_max_size = 10

dataset = CustomDataset(en_train_table, de_train_table, en_vocab_token2idx, de_vocab_token2idx, en_nlp, de_nlp, pad_max_size)

In [66]:
dataset[6]

([2, 6, 12, 10, 136, 20, 4, 926, 1, 0, 0, 3],
 [2, 5, 12, 172, 20, 1, 1, 23, 4, 0, 0, 3])

In [90]:
class Encoder(nn.Module):
    def __init__(self, input_size: int, embedding_size: int, hidden_size: int, num_layers: int = 1, bidirectional: bool = False):
        super().__init__()
        
        self.embedding_layer = nn.Embedding(input_size, embedding_size) 
        
        self.rnn_layer = nn.RNN(input_size=embedding_size,
                                hidden_size=hidden_size,
                                num_layers=num_layers,
                                bidirectional=bidirectional,
                                batch_first=True
                               )
        
        self.h0 = torch.zeros(hidden_size)
        
    
    def forward(self, x: torch.Tensor):
        x = self.embedding_layer(x)
        output, hn = self.rnn(x, self.h0)
        
        
        return hn
        

In [91]:
encoder = Encoder(VOCAB_SIZE, 200, 100, 1)

In [92]:
def lookup_table(idx_arr, en_vocab_idx2token, de_vocab_idx2token):
    en_arr, de_arr = idx_arr
    
    en_tokens = []
    for en_token_idx in en_arr:
        en_tokens.append(en_vocab_idx2token[en_token_idx])
        
        
    de_tokens = []
    for de_token_idx in de_arr:
        de_tokens.append(de_vocab_idx2token[de_token_idx])    
    
    return en_tokens, de_tokens

In [93]:
data_exp = lookup_table(dataset[9], en_vocab_idx2token, de_vocab_idx2token)
data_exp

(['[BOS]',
  'Boys',
  'dancing',
  'on',
  'poles',
  'in',
  'the',
  'middle',
  'of',
  'the',
  'night',
  '[EOS]'],
 ['[BOS]',
  'Jungen',
  'tanzen',
  'mitten',
  'in',
  'der',
  'Nacht',
  'auf',
  'Pfosten',
  '.',
  '[PAD]',
  '[EOS]'])

In [95]:
class Decoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super().__init__()
        
        self.input_size = input_size
        self.hidden_size = hidden_size
        
        self.layer = nn.RNN(input_size, hidden_size, batch_first=True)
        
    def forward(self, x):
        output, hidden = self.layer(x)
        
        return output
        
        