In [190]:
import torch as t
from os.path import exists
from math import sqrt
import spacy
from torchtext.vocab import build_vocab_from_iterator
import torchtext.datasets as datasets
import torch.nn as nn
import time
import warnings
warnings.filterwarnings('ignore')

In [191]:
train, val, test = datasets.Multi30k(language_pair=("de", "en"))

In [192]:
spacy_de = spacy.load("de_core_news_sm")
spacy_en = spacy.load("en_core_web_sm")

In [193]:
def tokenize(text, tokenizer):
    return [tok.text for tok in tokenizer.tokenizer(text)]

In [194]:
text = "Good"
A = tokenize(text,spacy_en)

In [195]:
A = spacy_en(text)
A

Good

In [196]:
A.vector.shape

(96,)

In [197]:
A.vector

array([-1.3925309e+00, -8.2032859e-01, -3.3235990e-02,  5.5622303e-01,
       -2.6535493e-01, -7.4802530e-01, -1.8004468e-01,  4.3667999e-01,
       -3.7732559e-01, -1.5458945e+00,  1.6216996e+00, -2.7939260e-01,
        2.1640056e-01,  1.3305581e+00, -1.1649944e+00,  3.7744302e-01,
       -3.1456047e-01, -1.9912491e+00,  2.2011568e-01,  9.1943771e-01,
       -1.2784649e+00,  8.0309922e-01, -1.0825787e+00, -4.6609959e-01,
        2.9785094e-01,  8.5351229e-02, -6.1701751e-01,  4.2586559e-01,
       -6.8177646e-01, -4.8855549e-01,  3.6563525e-01, -3.1858730e-01,
        1.4377862e+00, -1.1866851e+00,  1.2619048e-04,  7.1142912e-02,
        8.2902855e-01, -8.1801403e-01,  1.8817236e+00,  9.5357406e-01,
       -1.5417643e+00,  7.6864004e-02,  3.9940736e-01,  4.9163008e-01,
        3.8675052e-01, -1.3487972e+00, -3.0620223e-01, -9.1533399e-01,
        5.9590173e-01, -7.0674175e-01,  3.5050794e-01,  3.4263593e-01,
       -3.2672539e-01,  3.9872783e-01,  7.7480209e-01, -6.3176453e-03,
      

In [198]:
def yield_tokens(data_iter, tokenizer, index):
    for from_to_tuple in data_iter:
        yield tokenizer(from_to_tuple[index])

In [199]:
for label, line in train:
    print(label)
    print(line)
    break

Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.
Two young, White males are outside near many bushes.


In [201]:
def build_vocabulary(spacy_de, spacy_en):
    def tokenize_de(text):
        return tokenize(text, spacy_de)

    def tokenize_en(text):
        return tokenize(text, spacy_en)

    print("Building German Vocabulary ...")
    train, val, test = datasets.Multi30k(language_pair=("de", "en"))
    vocab_src = build_vocab_from_iterator(
        yield_tokens(train + val + test, tokenize_de, index=0),
        min_freq=2,
        specials=["<s>", "</s>", "<blank>", "<unk>"],
    )

    print("Building English Vocabulary ...")
    train, val, test = datasets.Multi30k(language_pair=("de", "en"))
    vocab_tgt = build_vocab_from_iterator(
        yield_tokens(train + val + test, tokenize_en, index=1),
        min_freq=2,
        specials=["<s>", "</s>", "<blank>", "<unk>"],
    )

    vocab_src.set_default_index(vocab_src["<unk>"])
    vocab_tgt.set_default_index(vocab_tgt["<unk>"])

    return vocab_src, vocab_tgt

In [202]:
def load_vocab(spacy_de, spacy_en):
    if not exists("vocab.pt"):
        vocab_src, vocab_tgt = build_vocabulary(spacy_de, spacy_en)
        t.save((vocab_src, vocab_tgt), "vocab.pt")
    else:
        vocab_src, vocab_tgt = t.load("vocab.pt")
    print("Finished.\nVocabulary sizes:")
    print(len(vocab_src))
    print(len(vocab_tgt))
    return vocab_src, vocab_tgt

In [203]:
source,translated = load_vocab(spacy_de, spacy_en)

Finished.
Vocabulary sizes:
8315
6384


In [204]:
len(translated)

6384

In [205]:
translated.lookup_tokens(range(translated.__len__()))

['<s>',
 '</s>',
 '<blank>',
 '<unk>',
 'a',
 '.',
 'A',
 'in',
 'the',
 'on',
 'is',
 'and',
 'man',
 'of',
 'with',
 ',',
 'woman',
 'are',
 'to',
 'Two',
 'at',
 'wearing',
 'people',
 'shirt',
 'white',
 'young',
 'black',
 'his',
 'an',
 'while',
 'blue',
 'sitting',
 'red',
 'girl',
 'dog',
 'boy',
 'men',
 'standing',
 'playing',
 'group',
 'street',
 'down',
 'walking',
 '-',
 'front',
 'her',
 'holding',
 'water',
 'by',
 'The',
 'up',
 'An',
 'one',
 'green',
 'women',
 'for',
 'looking',
 'child',
 'outside',
 'Three',
 'as',
 'large',
 'little',
 'through',
 'yellow',
 'brown',
 'two',
 'from',
 'ball',
 'their',
 'hat',
 'into',
 'person',
 'children',
 'other',
 'next',
 'dressed',
 'small',
 'out',
 'over',
 'building',
 'People',
 'riding',
 'running',
 'near',
 'around',
 'another',
 'jacket',
 'some',
 'field',
 'sidewalk',
 'beach',
 'orange',
 'crowd',
 'jumping',
 'pink',
 'sits',
 'stands',
 'behind',
 'table',
 'snow',
 'hair',
 'grass',
 'background',
 "'s",
 's

In [208]:
# EMBED_SIZE refers to representation size of each word
# HIDDEN_SIZE refers to size of query, key and value vectors.

In [209]:
nHEADS = 8
EMBED_SIZE = 512
HIDDEN_SIZE = 64

In [210]:
print(t.__version__)

1.11.0+cu102


In [211]:
X = t.rand(7,512)
X

tensor([[0.0446, 0.3989, 0.6646,  ..., 0.1572, 0.0438, 0.4550],
        [0.7909, 0.7255, 0.1799,  ..., 0.6089, 0.5568, 0.9677],
        [0.5927, 0.5514, 0.9579,  ..., 0.3743, 0.0576, 0.8229],
        ...,
        [0.0322, 0.2345, 0.3550,  ..., 0.7652, 0.2395, 0.5010],
        [0.7640, 0.1187, 0.2920,  ..., 0.2644, 0.4300, 0.5027],
        [0.5731, 0.9691, 0.9493,  ..., 0.8721, 0.9441, 0.8067]])

In [212]:
X.shape

torch.Size([7, 512])

In [213]:
class EncAttention(nn.Module):
    def __init__(self):
        super(EncAttention, self).__init__()
        self.Wq = nn.Parameter(t.rand(nHEADS, EMBED_SIZE, HIDDEN_SIZE))
        self.Wk = nn.Parameter(t.rand(nHEADS, EMBED_SIZE, HIDDEN_SIZE))
        self.Wv = nn.Parameter(t.rand(nHEADS, EMBED_SIZE, HIDDEN_SIZE))
        self.Wo = nn.Parameter(t.rand(nHEADS*HIDDEN_SIZE, EMBED_SIZE))
        
    def forward(self,X):
        Q = X@self.Wq
        K = X@self.Wk
        V = X@self.Wv
        Z = t.bmm(Q,K.transpose(1,2))/sqrt(HIDDEN_SIZE)
        Z = nn.Softmax(dim=2)(Z)
        Z = t.einsum('ijj->ij',[Z])
        Z = t.einsum('ij,ijk->ijk',Z,V)
        Z = t.reshape(Z,(Z.shape[1],-1))
        Z = Z@self.Wo
        Z = nn.Dropout(p=0.1)(Z)
        Z = nn.LayerNorm(Z.shape)(Z+X)
        return Z

In [216]:
class FeedForward(nn.Module):
    def __init__(self):
        super(FeedForward, self).__init__()
        self.linear1 = nn.Linear(512,2048)
        self.relu1 = nn.ReLU()
        self.linear2 = nn.Linear(2048,512)
        self.relu2 = nn.ReLU()
            
    def forward(self,X):
        Z = self.linear1(X)
        Z = self.relu1(Z)
        Z = self.linear2(Z)
        Z = self.relu2(Z)
        Z = nn.Dropout(p=0.1)(Z)
        Z = nn.LayerNorm(Z.shape)(Z+X)
        return Z

In [217]:
class Encoder(nn.Module):
    def __init__(self):
        super(Encoder, self).__init__()
        self.attention = EncAttention()
        self.feedforward = FeedForward()
        
    def forward(self,X):
        Z = self.attention(X)
        Z = self.feedforward(Z)
        return Z

In [223]:
class MaskedAttention(nn.Module):
    def __init__(self):
        super(MaskedAttention, self).__init__()
        self.Wq = nn.Parameter(t.rand(nHEADS, EMBED_SIZE, HIDDEN_SIZE))
        self.Wk = nn.Parameter(t.rand(nHEADS, EMBED_SIZE, HIDDEN_SIZE))
        self.Wv = nn.Parameter(t.rand(nHEADS, EMBED_SIZE, HIDDEN_SIZE))
        self.Wo = nn.Parameter(t.rand(nHEADS*HIDDEN_SIZE, EMBED_SIZE))
        
        
    def forward(self,X):
        Q = X@self.Wq
        K = X@self.Wk
        V = X@self.Wv
        Z = t.bmm(Q,K.transpose(1,2))/sqrt(HIDDEN_SIZE)
        r,c = t.triu_indices(Z.shape[1],Z.shape[1],1)
        Z[:,r,c] = float('-inf')
        Z = nn.Softmax(dim=2)(Z)
        Z = t.einsum('ijj->ij',[Z])
        Z = t.einsum('ij,ijk->ijk',Z,V)
        Z = t.reshape(Z,(Z.shape[1],-1))
        Z = Z@self.Wo
        Z = nn.Dropout(p=0.1)(Z)
        Z = nn.LayerNorm(Z.shape)(Z+X)
        return Z, Q

In [224]:
class EncDecAttention(nn.Module):
    def __init__(self):
        super(EncDecAttention,self).__init__()
        self.Wo = nn.Parameter(t.rand(nHEADS*HIDDEN_SIZE, EMBED_SIZE))
        
        
    def forward(self, X, maskquery, enc_output):
        Q = maskquery
        K = enc_output.reshape(nHEADS,enc_output.shape[0],-1)
        V = enc_output.reshape(nHEADS,enc_output.shape[0],-1)
        Z = t.bmm(Q,K.transpose(1,2))/sqrt(HIDDEN_SIZE)
        r,c = t.triu_indices(Z.shape[1],Z.shape[1],1)
        Z[:,r,c] = float('-inf')
        Z = nn.Softmax(dim=2)(Z)
        Z = t.einsum('ijj->ij',[Z])
        Z = t.einsum('ij,ijk->ijk',Z,V)
        Z = t.reshape(Z,(Z.shape[1],-1))
        Z = Z@self.Wo
        Z = nn.Dropout(p=0.1)(Z)
        Z = nn.LayerNorm(Z.shape)(Z+X)
        return Z

In [225]:
class Decoder(nn.Module):
    def __init__(self):
        super(Decoder,self).__init__()
        self.masked = MaskedAttention()
        self.encdec = EncDecAttention()
        self.feedforward = FeedForward()
        
    def forward(self,X,enc_output):
        Z, Q = self.masked(X)
        Z = self.encdec(Z, Q, enc_output)
        Z = self.feedforward(Z)
        return Z

In [226]:
class EncoderStack(nn.Module):
    def __init__(self):
        super(EncoderStack,self).__init__()
        self.enc1 = Encoder()
        self.enc2 = Encoder()
        self.enc3 = Encoder()
        self.enc4 = Encoder()
        self.enc5 = Encoder()
        self.enc6 = Encoder()
        
    def forward(self,X):
        Z = self.enc1(X)
        Z = self.enc2(Z)
        Z = self.enc3(Z)
        Z = self.enc4(Z)
        Z = self.enc5(Z)
        Z = self.enc6(Z)
        return Z

In [227]:
class DecoderStack(nn.Module):
    def __init__(self):
        super(DecoderStack,self).__init__()
        self.dec1 = Decoder()
        self.dec2 = Decoder()
        self.dec3 = Decoder()
        self.dec4 = Decoder()
        self.dec5 = Decoder()
        self.dec6 = Decoder()
        
    def forward(self,X, enc_output):
        Z = self.dec1(X, enc_output)
        Z = self.dec2(Z, enc_output)
        Z = self.dec3(Z, enc_output)
        Z = self.dec4(Z, enc_output)
        Z = self.dec5(Z, enc_output)
        Z = self.dec6(Z, enc_output)
        return Z

In [228]:
class finalComponent(nn.Module):
    def __init__(self):
        super(finalComponent,self).__init__()
        self.fc = nn.Linear(EMBED_SIZE,len(translated))
        self.softmax = nn.Softmax()
        
    def forward(self,X):
        Z = self.fc(X)
        Z = self.softmax(Z)
        return Z

In [229]:
class Transformer(nn.Module):
    def __init__(self):
        super(Transformer,self).__init__()
        self.EncStack = EncoderStack()
        self.DecStack = DecoderStack()
        self.finalComponent = finalComponent()
    
    def forward(self,X):
        Z = self.EncStack(X)
        Z = self.DecStack(X,Z)
        Z = self.finalComponent(Z)
        return Z

In [230]:
truemodel = Transformer()

In [231]:
begin = time.time()
Z = truemodel(X)
end = time.time()

In [232]:
end-begin

0.03658580780029297

In [233]:
Z.shape

torch.Size([7, 6384])