In [45]:
import torch
from torchtext.data import get_tokenizer
from typing import List, Iterable
from torchtext.datasets import Multi30k
from torchtext.vocab import build_vocab_from_iterator

In [1]:
source_lang = 'de'
target_lang = 'en'

In [2]:
token_transform = {}

In [8]:
en_tokenizer =

In [12]:
token_transform[source_lang] = get_tokenizer(tokenizer='spacy', language='de_core_news_sm')
token_transform[target_lang] = get_tokenizer(tokenizer='spacy', language='en_core_web_sm')

In [13]:
token_transform

{'de': functools.partial(<function _spacy_tokenize at 0x7fedf9cf9e50>, spacy=<spacy.lang.de.German object at 0x7fed11123f10>),
 'en': functools.partial(<function _spacy_tokenize at 0x7fedf9cf9e50>, spacy=<spacy.lang.en.English object at 0x7fed12284f40>)}

In [29]:
def yield_tokens(data_iter: Iterable, language: str) -> List[str]:
    language_index = {source_lang: 0, target_lang: 1}

    for data_sample in data_iter:
        yield token_transform[language](data_sample[language_index[language]])


In [42]:
train_iter = Multi30k(split='train')

In [43]:
list(yield_tokens(train_iter, 'en'))[1]

['Several',
 'men',
 'in',
 'hard',
 'hats',
 'are',
 'operating',
 'a',
 'giant',
 'pulley',
 'system',
 '.',
 '\n']

In [44]:
vocab_transform = {}

In [61]:
for ln in [source_lang, target_lang]:
    train_iter = Multi30k(split='train')
    vocab_transform[ln] = build_vocab_from_iterator(yield_tokens(train_iter, language=ln), min_freq=1)
    

In [62]:
vocab_transform["en"]

Vocab()

In [64]:
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']

In [66]:
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3

In [57]:
train_iter = Multi30k(split='train')

In [60]:
next(yield_tokens(train_iter, language='en'))

['Several',
 'men',
 'in',
 'hard',
 'hats',
 'are',
 'operating',
 'a',
 'giant',
 'pulley',
 'system',
 '.',
 '\n']

In [67]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import Multi30k
from typing import Iterable, List


SRC_LANGUAGE = 'de'
TGT_LANGUAGE = 'en'

# Place-holders
token_transform = {}
vocab_transform = {}


# Create source and target language tokenizer. Make sure to install the dependencies.
# pip install -U spacy
# python -m spacy download en_core_web_sm
# python -m spacy download de_core_news_sm
token_transform[SRC_LANGUAGE] = get_tokenizer('spacy', language='de_core_news_sm')
token_transform[TGT_LANGUAGE] = get_tokenizer('spacy', language='en_core_web_sm')


# helper function to yield list of tokens
def yield_tokens(data_iter: Iterable, language: str) -> List[str]:
    language_index = {SRC_LANGUAGE: 0, TGT_LANGUAGE: 1}

    for data_sample in data_iter:
        yield token_transform[language](data_sample[language_index[language]])

# Define special symbols and indices
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3
# Make sure the tokens are in order of their indices to properly insert them in vocab
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']

for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    # Training data Iterator
    train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    # Create torchtext's Vocab object
    vocab_transform[ln] = build_vocab_from_iterator(yield_tokens(train_iter, ln),
                                                    min_freq=1,
                                                    specials=special_symbols,
                                                    special_first=True)

# Set UNK_IDX as the default index. This index is returned when the token is not found.
# If not set, it throws RuntimeError when the queried token is not found in the Vocabulary.
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    vocab_transform[ln].set_default_index(UNK_IDX)

In [69]:
vocab_transform['de'].

Vocab()

In [74]:
import math

import torch
import torch.nn as nn

In [130]:
class PositionalEmbedding(nn.Module):
    def __init__(self,
                 emb_size,
                 dropout,
                 maxlen):
        super(PositionalEmbedding, self).__init__()
        den = torch.exp(- torch.arange(0, emb_size, 2)* math.log(10000) / emb_size)
        print("den shape", den.shape)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        print("pos shape", pos.shape)
        pos_embedding = torch.zeros(maxlen, emb_size)
        pos_embedding[:, 0::2] = torch.sin(pos*den)
        pos_embedding[:, 1::2] = torch.cos(pos * den)
        print(pos_embedding[:, 3])
        print(pos_embedding.shape)
        pos_embedding = pos_embedding.unsqueeze(-2)
        print(pos_embedding.shape)
        
        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)
        
        
    def forward(self, token_embedding):
        x = self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(), 0])
        return x

In [128]:
p_em = PositionalEmbedding(300, 0.2, 25)

den shape torch.Size([150])
pos shape torch.Size([25, 1])
tensor([ 1.0000,  0.5894, -0.3051, -0.9492, -0.8138, -0.0102,  0.8018,  0.9554,
         0.3244, -0.5729, -0.9998, -0.6057,  0.2857,  0.9426,  0.8254,  0.0305,
        -0.7895, -0.9612, -0.3436,  0.5561,  0.9992,  0.6218, -0.2662, -0.9356,
        -0.8367])
torch.Size([25, 300])
torch.Size([25, 1, 300])


In [84]:
v = torch.exp(- torch.arange(0, 300, 2)* math.log(10000) / 300)
print(v.shape)
print(v)

torch.Size([150])
tensor([1.0000e+00, 9.4044e-01, 8.8444e-01, 8.3176e-01, 7.8223e-01, 7.3564e-01,
        6.9183e-01, 6.5063e-01, 6.1188e-01, 5.7544e-01, 5.4117e-01, 5.0894e-01,
        4.7863e-01, 4.5013e-01, 4.2332e-01, 3.9811e-01, 3.7440e-01, 3.5210e-01,
        3.3113e-01, 3.1141e-01, 2.9286e-01, 2.7542e-01, 2.5902e-01, 2.4359e-01,
        2.2909e-01, 2.1544e-01, 2.0261e-01, 1.9055e-01, 1.7920e-01, 1.6853e-01,
        1.5849e-01, 1.4905e-01, 1.4017e-01, 1.3183e-01, 1.2397e-01, 1.1659e-01,
        1.0965e-01, 1.0312e-01, 9.6977e-02, 9.1201e-02, 8.5770e-02, 8.0662e-02,
        7.5858e-02, 7.1340e-02, 6.7091e-02, 6.3096e-02, 5.9338e-02, 5.5804e-02,
        5.2481e-02, 4.9355e-02, 4.6416e-02, 4.3652e-02, 4.1052e-02, 3.8607e-02,
        3.6308e-02, 3.4145e-02, 3.2112e-02, 3.0200e-02, 2.8401e-02, 2.6710e-02,
        2.5119e-02, 2.3623e-02, 2.2216e-02, 2.0893e-02, 1.9649e-02, 1.8478e-02,
        1.7378e-02, 1.6343e-02, 1.5370e-02, 1.4454e-02, 1.3594e-02, 1.2784e-02,
        1.2023e-02, 1.

In [78]:
math.log(10000)

9.210340371976184

In [86]:
torch.arange(0, 300, 2)

tensor([  0,   2,   4,   6,   8,  10,  12,  14,  16,  18,  20,  22,  24,  26,
         28,  30,  32,  34,  36,  38,  40,  42,  44,  46,  48,  50,  52,  54,
         56,  58,  60,  62,  64,  66,  68,  70,  72,  74,  76,  78,  80,  82,
         84,  86,  88,  90,  92,  94,  96,  98, 100, 102, 104, 106, 108, 110,
        112, 114, 116, 118, 120, 122, 124, 126, 128, 130, 132, 134, 136, 138,
        140, 142, 144, 146, 148, 150, 152, 154, 156, 158, 160, 162, 164, 166,
        168, 170, 172, 174, 176, 178, 180, 182, 184, 186, 188, 190, 192, 194,
        196, 198, 200, 202, 204, 206, 208, 210, 212, 214, 216, 218, 220, 222,
        224, 226, 228, 230, 232, 234, 236, 238, 240, 242, 244, 246, 248, 250,
        252, 254, 256, 258, 260, 262, 264, 266, 268, 270, 272, 274, 276, 278,
        280, 282, 284, 286, 288, 290, 292, 294, 296, 298])

In [88]:
math.log(10000)/300

0.030701134573253946

In [89]:
math.e**9.21

9996.59685943787

In [137]:
import numpy as np
from torch import Tensor

In [133]:
a = np.array([[1,2,11,22], [3,4, 33, 44]])

In [134]:
a[:, 0::2]

array([[ 1, 11],
       [ 3, 33]])

In [139]:
class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size: int, emb_size: int) -> None:
        super(TokenEmbedding, self).__init__()
        
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size
        
    def forward(self, tokens: Tensor):
        return self.embedding(x.long()) * math.sqrt(self.emb_size)