# Imports and load models

In [None]:
import numpy as np
import pandas as pd
import torch
import joblib
import matplotlib.pyplot as plt
from flair.data import Sentence
from flair.data import Token
from flair.embeddings import WordEmbeddings
%matplotlib inline

# gpu
# Use the first CUDA device when one is available; otherwise fall back to the
# CPU so the notebook still runs on machines without a GPU.
# NOTE(review): flair places its embeddings on its own `flair.device`;
# confirm it resolves to the same device as the one chosen here.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
dtype = torch.float

# Language codes of the two sides of the corpus; they select which side gets
# the <EOS> marker and which gets the <SOS> marker when the dataset is built.
input_language = 'de'
output_language = 'en'
embedding_model_en = WordEmbeddings('en')
embedding_model_de = WordEmbeddings('de')

# embedding size of fasttext models
d_model = 300

# load data

In [None]:
# data
# Tab-separated sentence pairs; only the first two columns are read and named
# 'en' and 'de'. The raw string keeps the backslashes literal instead of
# relying on invalid escape sequences such as "\d" and "\p".
df = pd.read_csv(r'data\deu-eng\pairs.txt', delimiter='\t',
                 usecols=[0, 1], encoding='utf-8', names=['en', 'de'])

# Number of leading rows to keep. The original note reads "decrease max
# sequence length" -- presumably the file is ordered by sentence length, so a
# smaller value drops the longest sentences (TODO confirm).
number_sentences = 224351
en = df['en'][0:number_sentences]
de = df['de'][0:number_sentences]

# Print the last selected pair. Positional access keeps this working when
# number_sentences is lowered or the file has fewer rows than requested.
print(f'{en.iloc[-1]} --- {de.iloc[-1]}')

# make vocabulary dict for en and de

In [None]:
def _collect_vocabulary(texts):
    """Tokenize every text and return ``(token_set, longest_sentence_len)``.

    ``longest_sentence_len`` is the largest token count of any sentence, or
    -1 when ``texts`` is empty. It does not include <SOS> or <EOS>.
    """
    tokens = set()
    longest = -1
    for text in texts:
        sentence = Sentence(text)
        longest = max(longest, len(sentence))
        tokens.update(token.text for token in sentence)
    return tokens, longest


def _build_lookup(tokens, max_sentence_len):
    """Build the combined vocabulary dictionary for one language.

    The same dict holds:
      * ``"vocab_size"``       -> number of distinct tokens (without <EOS>)
      * ``"max_sentence_len"`` -> longest sentence, without <SOS> or <EOS>
      * ``token -> index`` and ``index -> token`` in both directions
      * ``"<EOS>"`` mapped to index ``vocab_size`` (and back)
    """
    lookup = {
        "vocab_size": len(tokens),
        "max_sentence_len": max_sentence_len,  # without <SOS> or <EOS>
    }
    lookup["<EOS>"] = lookup["vocab_size"]
    lookup[lookup["vocab_size"]] = "<EOS>"
    # Sort before numbering: iterating a set of strings directly yields an
    # order that changes between interpreter runs, which would make the
    # saved vocabulary indices irreproducible.
    # NOTE: a corpus token spelled exactly like one of the metadata keys
    # above would overwrite that entry, as in the original implementation.
    for index, token in enumerate(sorted(tokens)):
        lookup[index] = token
        lookup[token] = index
    return lookup


en_set, max_sentence_len_en = _collect_vocabulary(en)
de_set, max_sentence_len_de = _collect_vocabulary(de)

en_dict = _build_lookup(en_set, max_sentence_len_en)
de_dict = _build_lookup(de_set, max_sentence_len_de)

In [None]:
# Report the vocabulary statistics, then persist both lookup dictionaries.
print("Vocabulary en: ",len(en_set)," | max sentence length: ", max_sentence_len_en)
print("Vocabulary de: ",len(de_set)," | max sentence length: ", max_sentence_len_de)
# Raw strings keep the Windows path separators literal; the resulting paths
# are identical to the original ones, which mixed "\T" with "\\".
filename = r'D:\Transformer\vocab_en.data'
joblib.dump(en_dict, filename)
filename = r'D:\Transformer\vocab_de.data'
joblib.dump(de_dict, filename)

# Create positional encodings

In [None]:
# Even embedding dimensions receive the sine terms, odd ones the cosine terms.
sine_mask = [2*i for i in range(d_model//2)]
cosine_mask = [2*i+1 for i in range(d_model//2)]
# Indices 0 .. d_model//2 - 1 as a float tensor on the target device.
i = torch.tensor([i for i in range(d_model//2)]).to(device).to(dtype)

# Position-independent divisor 10000^(2i/d_model), computed once and stored
# under a private name so later rebinding of `i` cannot affect the encoder.
_pe_divisor = 10000 ** (2*i/d_model)


def _encode_position(position):
    """Return the ``d_model``-sized sinusoidal encoding of one position."""
    angle = position / _pe_divisor
    position_enc = torch.zeros(d_model).to(device)
    position_enc[sine_mask] = torch.sin(angle)
    position_enc[cosine_mask] = torch.cos(angle)
    return position_enc


# pre compute positional encodings for 50 tokens
positional_encodings = [_encode_position(position) for position in range(50)]


# return pre computed pos encoding
def positional_encoding(sentence):
    """Return one positional-encoding tensor per token of ``sentence``.

    Args:
        sentence: any object supporting ``len()``; its length is the number
            of positions requested.

    Returns:
        A list of ``len(sentence)`` tensors taken from the module-level
        ``positional_encodings`` cache.
    """
    num_tokens = len(sentence)
    # Grow the cache when a sentence is longer than what was precomputed;
    # slicing alone would silently return too few encodings.
    for position in range(len(positional_encodings), num_tokens):
        positional_encodings.append(_encode_position(position))
    return positional_encodings[0:num_tokens]

# Test positional encoding

In [None]:
# Visual check: encode a short sample sentence and draw its positional
# encodings as a heat map with one row per token position.
s = Sentence('This is a test a b c r g.',language_code='en')
enc_list = positional_encoding(s)
# Stack the per-token tensors into one (tokens x dimensions) array.
enc = torch.stack(enc_list).cpu().numpy()

fig, ax = plt.subplots(figsize=(12,8))
mesh = ax.pcolormesh(enc, cmap='viridis')
ax.set_xlabel('Embedding Dimensions')
ax.set_ylabel('Token Position')
fig.colorbar(mesh, ax=ax)
plt.show()

# make dataset (without zero padding) 

In [None]:
def make_target(sentence, position_dict):
    """Convert a sentence into the list of target indices for training.

    Targets are stored as plain vocabulary indices, the form PyTorch's
    cross-entropy loss expects for class labels.

    Args:
        sentence: iterable of tokens, each exposing a ``text`` attribute.
        position_dict: mapping from token text to vocabulary index; it must
            also contain the key ``'<EOS>'``.

    Returns:
        The index of every token except ``'<SOS>'`` markers, in order,
        followed by the ``'<EOS>'`` index.

    Raises:
        KeyError: if a token text is missing from ``position_dict``.
    """
    # <SOS> only shifts the decoder input, so it never becomes a target.
    indices = [
        position_dict[token.text]
        for token in sentence
        if token.text != '<SOS>'
    ]
    # The sequence always ends with the <EOS> marker.
    indices.append(position_dict['<EOS>'])
    return indices

In [None]:
batch_size = 32
dataset = []

# Only the first fifth of the available batches is converted.
num_batches = len(de)//batch_size//5


def _prepare_sentence(text, language_code):
    """Tokenize ``text`` and attach the marker required by its role.

    A sentence in the input language gets a trailing <EOS> token. A sentence
    in the output language gets a leading <SOS> token, which shifts the
    output by one step.
    """
    sentence = Sentence(text, language_code=language_code)
    if input_language == language_code:  # add <EOS> token
        sentence.add_token("<EOS>")
    else:  # add <SOS> used for shifting output by one step
        sentence.tokens = [Token("<SOS>")] + sentence.tokens
    return sentence


for line in range(num_batches):
    start = line*batch_size
    stop = start + batch_size

    sentences_en = [_prepare_sentence(s, 'en') for s in en[start:stop]]
    sentences_de = [_prepare_sentence(s, 'de') for s in de[start:stop]]

    # Embed a whole batch per call rather than sentence by sentence.
    embedding_model_en.embed(sentences_en)
    embedding_model_de.embed(sentences_de)

    for sentence_en, sentence_de in zip(sentences_en, sentences_de):

        pos_enc_en = positional_encoding(sentence_en)
        pos_enc_de = positional_encoding(sentence_de)

        # Indexing (not zip) keeps a too-short encoding list a loud error.
        en_embedding = torch.stack([token.embedding + pos_enc_en[i] for i, token in enumerate(sentence_en)])
        de_embedding = torch.stack([token.embedding + pos_enc_de[i] for i, token in enumerate(sentence_de)])

        # Each entry is [input embeddings, output embeddings, target indices],
        # with the tensors moved to the CPU before being stored.
        if output_language == 'en':
            target = make_target(sentence_en, en_dict)
            dataset.append([de_embedding.cpu(), en_embedding.cpu(), target])
        else:
            target = make_target(sentence_de, de_dict)
            dataset.append([en_embedding.cpu(), de_embedding.cpu(), target])

    # Progress is reported against the number of batches actually processed.
    if line % 100 == 0:
        print(f'{line} / {num_batches}')

In [None]:
# Persist the dataset. The raw f-string keeps the backslashes literal (the
# original relied on the invalid escapes "\T" and "\{"); the resulting path
# is unchanged.
filename = rf'D:\Transformer\{input_language}_to_{output_language}.data'
joblib.dump(dataset, filename)