In [1]:
# Torch Imports
import torch as t
import torch.nn as nn
from torch.optim import Adam,lr_scheduler
from torch.nn.functional import pad
from torch.utils.data import DataLoader

# NLP Imports
from torchtext.vocab import build_vocab_from_iterator
import torchtext.datasets as datasets
import spacy
from torchtext.data.functional import to_map_style_dataset

# Miscellaneous Imports
from os.path import exists
from math import sqrt
import time
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Model hyper-parameters shared by every layer defined below.
# nHEADS is the number of heads in multi-head attention.
# d_model is the representation size of each word.
# HIDDEN_SIZE is the size of the query, key and value vectors of one head
# (the per-head projections are d_model x HIDDEN_SIZE; nHEADS * HIDDEN_SIZE == d_model here).
nHEADS = 8
d_model = 512
HIDDEN_SIZE = 64

In [3]:
# Setting device.
# The notebook was written for the third GPU ('cuda:2'). Selecting that index
# unconditionally makes every later tensor allocation fail on machines with
# fewer GPUs, so fall back to the first GPU and finally to the CPU.
if t.cuda.is_available():
    device = t.device('cuda:2' if t.cuda.device_count() > 2 else 'cuda:0')
else:
    device = t.device('cpu')

In [4]:
# Loading Data: the Multi30k German -> English dataset, unpacked into its
# train / validation / test splits.
train, val, test = datasets.Multi30k(language_pair=("de", "en"))

In [5]:
# Converting Generator Style Dataset to Map Style Dataset.
# `train` is later shuffled by the DataLoader and `test` is indexed directly,
# so both are converted; `val` is left exactly as Multi30k returned it.
train = to_map_style_dataset(train)
test = to_map_style_dataset(test)

In [6]:
# Loading Dependencies: the spaCy pipelines whose tokenizers back
# tokenize_de (German) and tokenize_en (English).
spacy_de = spacy.load("de_core_news_sm")
spacy_en = spacy.load("en_core_web_sm")

In [7]:
# Tokenisation helpers for the source (German) and target (English) language
def tokenize(text, tokenizer):
    """Return the token strings of ``text``.

    ``tokenizer`` is any object exposing a ``tokenizer`` callable that yields
    items with a ``.text`` attribute (here: a loaded spaCy pipeline).
    """
    pieces = []
    for token in tokenizer.tokenizer(text):
        pieces.append(token.text)
    return pieces


def tokenize_de(text):
    """Tokenize ``text`` with the German pipeline ``spacy_de``."""
    return tokenize(text, tokenizer=spacy_de)


def tokenize_en(text):
    """Tokenize ``text`` with the English pipeline ``spacy_en``."""
    return tokenize(text, tokenizer=spacy_en)

In [8]:
# Lazily tokenizes one side of every sentence pair in the dataset
def yield_tokens(data_iter, tokenizer, index):
    """Yield ``tokenizer(item[index])`` for each item of ``data_iter``.

    ``index`` selects which element of each item is tokenized
    (the callers in this notebook pass 0 for German and 1 for English).
    """
    yield from (tokenizer(pair[index]) for pair in data_iter)

In [10]:
# Builds the vocabularies of the source (German) and target (English) language
def build_vocabulary(spacy_de, spacy_en):
    """Build and return ``(vocab_src, vocab_tgt)`` from all Multi30k splits.

    Tokens seen fewer than twice are dropped and unknown tokens map to
    ``<unk>``. The ``spacy_de`` / ``spacy_en`` arguments are accepted but not
    read here; tokenisation goes through ``tokenize_de`` / ``tokenize_en``.
    """
    special_tokens = ["<s>", "</s>", "<blank>", "<unk>"]

    def _vocab_for(tokenizer, column):
        # The splits are reloaded for every vocabulary, so each build starts
        # from fresh iterators.
        train, val, test = datasets.Multi30k(language_pair=("de", "en"))
        return build_vocab_from_iterator(
            yield_tokens(train + val + test, tokenizer, index=column),
            min_freq=2,
            specials=list(special_tokens),
        )

    print("Building German Vocabulary ...")
    vocab_src = _vocab_for(tokenize_de, 0)

    print("Building English Vocabulary ...")
    vocab_tgt = _vocab_for(tokenize_en, 1)

    for vocab in (vocab_src, vocab_tgt):
        vocab.set_default_index(vocab["<unk>"])

    return vocab_src, vocab_tgt

In [11]:
# Loads the vocabularies, building and caching them on first use
def load_vocab(spacy_de, spacy_en):
    """Return ``(vocab_src, vocab_tgt)``, using ``vocab.pt`` as an on-disk cache.

    When ``vocab.pt`` exists it is loaded; otherwise the vocabularies are
    built with ``build_vocabulary`` and saved there. The two vocabulary sizes
    are printed either way.
    """
    cache_path = "vocab.pt"
    if exists(cache_path):
        # t.load unpickles the file: only point this at a vocab.pt you trust.
        vocab_src, vocab_tgt = t.load(cache_path)
    else:
        vocab_src, vocab_tgt = build_vocabulary(spacy_de, spacy_en)
        t.save((vocab_src, vocab_tgt), cache_path)
    print("Finished.\nVocabulary sizes:")
    for vocab in (vocab_src, vocab_tgt):
        print(len(vocab))
    return vocab_src, vocab_tgt

In [12]:
# Load (or build and cache) the German and English vocabularies; the collate
# function and the model below read them as notebook-level globals.
source_vocab,target_vocab = load_vocab(spacy_de, spacy_en)

Finished.
Vocabulary sizes:
8315
6384


In [13]:
# Uncomment for the length of the target vocabulary
#len(target_vocab)

In [14]:
# Uncomment for indices of each word
#target_vocab(tokenize_en("Two young, White males are outside near many bushes."))

In [15]:
# Uncomment this for vocabulary in target language
#target_vocab.lookup_tokens(range(target_vocab.__len__()))

In [16]:
# Sentence pre-processing helper: numericalise, wrap in <s>/</s> and pad a batch
def collate_batch(
    batch,
    device,
    max_padding=128,
    pad_id=2,
):
    """Turn a list of ``(german, english)`` string pairs into two id tensors.

    Each sentence is tokenized, mapped to vocabulary ids, wrapped in the
    ``<s>`` (0) and ``</s>`` (1) ids and right-padded with ``pad_id`` to
    ``max_padding`` entries. Returns ``(src, tgt)``, each the stack of the
    processed sentences, allocated on ``device``.
    """
    start = t.tensor([0], device=device)  # <s> token id
    end = t.tensor([1], device=device)  # </s> token id

    def _prepare(sentence, vocab, tokenizer):
        ids = t.tensor(vocab(tokenizer(sentence)), dtype=t.int64, device=device)
        wrapped = t.cat([start, ids, end])
        # warning - when the wrapped sentence is longer than max_padding the
        # pad amount is negative, which removes trailing ids instead of
        # padding (presumably including </s> - confirm).
        return pad(wrapped, (0, max_padding - len(wrapped)), value=pad_id)

    padded_src, padded_tgt = [], []
    for german, english in batch:
        padded_src.append(_prepare(german, source_vocab, tokenize_de))
        padded_tgt.append(_prepare(english, target_vocab, tokenize_en))

    return (t.stack(padded_src), t.stack(padded_tgt))

In [17]:
# Collate function handed to the DataLoader
def collate_fn(batch):
    """Collate ``batch`` via ``collate_batch`` on the notebook-level ``device``."""
    return collate_batch(batch, device)

In [18]:
# Positional Encoding
class PositionalEncoding(nn.Module):
    """Add sinusoidal position information to a batch of embeddings.

    For position ``pos`` and feature pair ``i`` the encoding is
    ``sin(pos / 10000**(2i/d_model))`` on even columns and the matching
    ``cos`` on odd columns. Dropout is applied to the encoding, it is added to
    the input, and the sum is layer-normalised over the feature dimension.

    Fixes over the previous version:
      * the position index now runs along the sequence axis (it used to be
        multiplied along the feature axis, giving every position the same
        encoding);
      * the frequency exponent is ``2i/d_model`` (the factor 2 was missing);
      * no hard-coded 128 x 512 shape - any ``(..., seq_len, width)`` works;
      * dropout is a registered sub-module, so ``model.eval()`` disables it;
      * normalisation is per token rather than over the whole
        ``(seq_len, width)`` matrix.

    Args:
        dim: model width used in the frequency formula; defaults to the
            notebook-level ``d_model``.
        dropout: dropout probability applied to the encoding.
    """

    def __init__(self, dim=None, dropout=0.1):
        super(PositionalEncoding, self).__init__()
        self.d_model = d_model if dim is None else dim
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, data):
        """Return ``data`` plus positional encoding, same shape and device as ``data``."""
        seq_len, width = data.shape[-2], data.shape[-1]
        # Column c belongs to frequency pair i = c // 2.
        pair = t.arange(width, device=data.device) // 2
        inv_freq = 1.0 / (1e4 ** (2.0 * pair.to(data.dtype) / self.d_model))
        positions = t.arange(seq_len, device=data.device, dtype=data.dtype).unsqueeze(1)
        angles = positions * inv_freq  # (seq_len, width)
        encoding = t.empty_like(angles)
        encoding[:, 0::2] = t.sin(angles[:, 0::2])
        encoding[:, 1::2] = t.cos(angles[:, 1::2])
        encoding = self.dropout(encoding)
        # Parameter-free layer norm: equivalent to a freshly initialised
        # nn.LayerNorm (weight 1, bias 0), which is what the old code built on
        # every call.
        return nn.functional.layer_norm(data + encoding, (width,))

In [19]:
# Word Embeddings
class Embeddings(nn.Module):
    """Token-id lookup table whose output is scaled by ``sqrt(d_model)``.

    ``vocab`` is the number of rows of the table and ``d_model`` the width of
    each embedding; the table is allocated on the notebook-level ``device``.
    """

    def __init__(self, d_model, vocab):
        super().__init__()
        self.d_model = d_model
        self.lut = nn.Embedding(num_embeddings=vocab, embedding_dim=d_model, device=device)

    def forward(self, X):
        embedded = self.lut(X)
        return embedded * sqrt(self.d_model)

In [20]:
# Self-Attention in Encoder
class EncAttention(nn.Module):
    """Multi-head self-attention sub-layer with residual connection and layer norm.

    Input and output are both ``(batch, seq_len, model_dim)``.

    Fixes over the previous version:
      * the projection matrices are created directly on the target device and
        stored as ``nn.Parameter``; ``nn.Parameter(...).to(device)`` returns a
        plain tensor on a non-CPU device, so the weights were never
        registered, initialised by the Xavier loop, or trained;
      * the attention weights are multiplied with the values
        (``softmax(QK^T / sqrt(d)) @ V``); previously only the diagonal of the
        attention matrix was used, so tokens never mixed;
      * heads are concatenated per token (transpose before reshape) instead of
        reshaping ``(heads, seq, hidden)`` memory directly;
      * dropout and layer norm are registered sub-modules (dropout honours
        ``eval()``, the norm is learnable and acts on the feature dimension);
      * the whole batch is processed at once.

    The weights are drawn with ``t.rand`` as before and are expected to be
    re-initialised by the caller (e.g. with Xavier initialisation).

    Args (all optional, defaulting to the notebook-level globals):
        n_heads: number of heads (``nHEADS``).
        model_dim: token representation size (``d_model``).
        head_dim: query/key/value size per head (``HIDDEN_SIZE``).
        dropout: dropout probability on the sub-layer output.
        dev: device for the parameters (``device``).
    """

    def __init__(self, n_heads=None, model_dim=None, head_dim=None, dropout=0.1, dev=None):
        super(EncAttention, self).__init__()
        n_heads = nHEADS if n_heads is None else n_heads
        model_dim = d_model if model_dim is None else model_dim
        head_dim = HIDDEN_SIZE if head_dim is None else head_dim
        dev = device if dev is None else dev
        self.head_dim = head_dim
        self.Wq = nn.Parameter(t.rand(n_heads, model_dim, head_dim, device=dev))
        self.Wk = nn.Parameter(t.rand(n_heads, model_dim, head_dim, device=dev))
        self.Wv = nn.Parameter(t.rand(n_heads, model_dim, head_dim, device=dev))
        self.Wo = nn.Parameter(t.rand(n_heads * head_dim, model_dim, device=dev))
        self.dropout = nn.Dropout(p=dropout)
        self.norm = nn.LayerNorm(model_dim, device=dev)

    def forward(self, data):
        """Return the attended representation of ``data`` (same shape)."""
        X = data.to(self.Wq.device)
        batch, seq_len = X.shape[0], X.shape[1]
        # (batch, 1, seq, model) @ (heads, model, hidden) -> (batch, heads, seq, hidden)
        per_head = X.unsqueeze(1)
        Q = per_head @ self.Wq
        K = per_head @ self.Wk
        V = per_head @ self.Wv
        scores = Q @ K.transpose(-2, -1) / sqrt(self.head_dim)
        weights = t.softmax(scores, dim=-1)
        context = weights @ V
        # Put the head axis next to the hidden axis before flattening so each
        # token's row is the concatenation of its own heads.
        merged = context.transpose(1, 2).reshape(batch, seq_len, -1)
        projected = self.dropout(merged @ self.Wo)
        return self.norm(projected + X)

In [21]:
# Feed-Forward Network
class FeedForward(nn.Module):
    """Position-wise feed-forward sub-layer with residual connection and layer norm.

    Computes ``LayerNorm(x + Dropout(W2 * relu(W1 * x + b1) + b2))`` for input
    of shape ``(..., model_dim)`` and returns the same shape.

    Fixes over the previous version:
      * the extra ReLU after the second linear layer is gone (it forced the
        sub-layer output to be non-negative before the residual addition);
      * dropout and layer norm are registered sub-modules instead of being
        rebuilt on every call, so dropout honours ``eval()`` and the norm is
        learnable and acts on the feature dimension;
      * layer sizes are parameters instead of hard-coded 512 / 2048;
      * the whole batch is processed at once.

    Args:
        model_dim: input/output width; defaults to the notebook-level ``d_model``.
        inner_dim: width of the hidden layer.
        dropout: dropout probability on the sub-layer output.
        dev: device for the parameters; defaults to the notebook-level ``device``.
    """

    def __init__(self, model_dim=None, inner_dim=2048, dropout=0.1, dev=None):
        super(FeedForward, self).__init__()
        model_dim = d_model if model_dim is None else model_dim
        dev = device if dev is None else dev
        self.linear1 = nn.Linear(model_dim, inner_dim, device=dev)
        self.relu1 = nn.ReLU()
        self.linear2 = nn.Linear(inner_dim, model_dim, device=dev)
        self.dropout = nn.Dropout(p=dropout)
        self.norm = nn.LayerNorm(model_dim, device=dev)

    def forward(self, data):
        """Apply the feed-forward sub-layer to ``data`` and return the same shape."""
        hidden = self.relu1(self.linear1(data))
        projected = self.dropout(self.linear2(hidden))
        return self.norm(projected + data)

In [22]:
# Encoder layer
class Encoder(nn.Module):
    """One encoder layer: self-attention followed by the feed-forward block."""

    def __init__(self):
        super().__init__()
        self.attention = EncAttention()
        self.feedforward = FeedForward()

    def forward(self, X):
        attended = self.attention(X)
        return self.feedforward(attended)

In [23]:
# Masked Attention in Decoder
class MaskedAttention(nn.Module):
    """Causal multi-head self-attention sub-layer of the decoder.

    Takes ``(batch, seq_len, model_dim)`` and returns the attended,
    residual-added and layer-normalised result reshaped to
    ``(batch, n_heads, seq_len, model_dim // n_heads)``. That final reshape is
    a plain memory reshape, kept unchanged because the encoder-decoder
    attention that consumes it undoes it the same way.

    Fixes over the previous version:
      * projection matrices are real registered ``nn.Parameter`` objects
        created on the target device (``nn.Parameter(...).to(device)`` yields
        an unregistered plain tensor on a non-CPU device);
      * attention weights are multiplied with the values instead of keeping
        only the diagonal of the attention matrix;
      * heads are concatenated per token (transpose before reshape);
      * the future-position mask is applied out of place with ``masked_fill``;
      * dropout and layer norm are registered sub-modules;
      * the whole batch is processed at once.

    Args (all optional, defaulting to the notebook-level globals):
        n_heads: number of heads (``nHEADS``).
        model_dim: token representation size (``d_model``).
        head_dim: query/key/value size per head (``HIDDEN_SIZE``).
        dropout: dropout probability on the sub-layer output.
        dev: device for the parameters (``device``).
    """

    def __init__(self, n_heads=None, model_dim=None, head_dim=None, dropout=0.1, dev=None):
        super(MaskedAttention, self).__init__()
        n_heads = nHEADS if n_heads is None else n_heads
        model_dim = d_model if model_dim is None else model_dim
        head_dim = HIDDEN_SIZE if head_dim is None else head_dim
        dev = device if dev is None else dev
        self.n_heads = n_heads
        self.head_dim = head_dim
        self.Wq = nn.Parameter(t.rand(n_heads, model_dim, head_dim, device=dev))
        self.Wk = nn.Parameter(t.rand(n_heads, model_dim, head_dim, device=dev))
        self.Wv = nn.Parameter(t.rand(n_heads, model_dim, head_dim, device=dev))
        self.Wo = nn.Parameter(t.rand(n_heads * head_dim, model_dim, device=dev))
        self.dropout = nn.Dropout(p=dropout)
        self.norm = nn.LayerNorm(model_dim, device=dev)

    def forward(self, data):
        """Return causally attended ``data`` as ``(batch, n_heads, seq_len, -1)``."""
        X = data.to(self.Wq.device)
        batch, seq_len = X.shape[0], X.shape[1]
        per_head = X.unsqueeze(1)  # broadcast the input over the head axis
        Q = per_head @ self.Wq
        K = per_head @ self.Wk
        V = per_head @ self.Wv
        scores = Q @ K.transpose(-2, -1) / sqrt(self.head_dim)
        # True above the diagonal: position i must not look at positions > i.
        future = t.ones(seq_len, seq_len, dtype=t.bool, device=scores.device).triu(1)
        scores = scores.masked_fill(future, float('-inf'))
        weights = t.softmax(scores, dim=-1)
        context = weights @ V
        merged = context.transpose(1, 2).reshape(batch, seq_len, -1)
        out = self.norm(self.dropout(merged @ self.Wo) + X)
        return out.reshape(batch, self.n_heads, seq_len, -1)

In [24]:
# Encoder-Decoder Attention
class EncDecAttention(nn.Module):
    """Multi-head attention from decoder states (queries) to encoder output (keys/values).

    Inputs:
        maskquery: decoder states as ``(batch, n_heads, tgt_len, model_dim // n_heads)``,
            i.e. a plain reshape of ``(batch, tgt_len, model_dim)``; it is
            reshaped back before use.
        enc_output: encoder output, ``(batch, src_len, model_dim)``.
    Returns ``(batch, tgt_len, model_dim)``.

    Fixes over the previous version:
      * ``Wo`` is a registered ``nn.Parameter`` created on the target device,
        and learned query/key/value projections are added so this layer
        mirrors the two self-attention layers;
      * no causal mask is applied: every decoder position may attend to every
        encoder position (the old mask was built from ``(heads, seq)`` indices
        and hid encoder positions from the first few decoder positions);
      * attention weights are multiplied with the values instead of keeping
        only the diagonal of the attention matrix;
      * heads are concatenated per token (transpose before reshape);
      * dropout and layer norm are registered sub-modules;
      * the whole batch is processed at once.

    Args (all optional, defaulting to the notebook-level globals):
        n_heads: number of heads (``nHEADS``).
        model_dim: token representation size (``d_model``).
        head_dim: query/key/value size per head (``HIDDEN_SIZE``).
        dropout: dropout probability on the sub-layer output.
        dev: device for the parameters (``device``).
    """

    def __init__(self, n_heads=None, model_dim=None, head_dim=None, dropout=0.1, dev=None):
        super(EncDecAttention,self).__init__()
        n_heads = nHEADS if n_heads is None else n_heads
        model_dim = d_model if model_dim is None else model_dim
        head_dim = HIDDEN_SIZE if head_dim is None else head_dim
        dev = device if dev is None else dev
        self.head_dim = head_dim
        self.Wq = nn.Parameter(t.rand(n_heads, model_dim, head_dim, device=dev))
        self.Wk = nn.Parameter(t.rand(n_heads, model_dim, head_dim, device=dev))
        self.Wv = nn.Parameter(t.rand(n_heads, model_dim, head_dim, device=dev))
        self.Wo = nn.Parameter(t.rand(n_heads * head_dim, model_dim, device=dev))
        self.dropout = nn.Dropout(p=dropout)
        self.norm = nn.LayerNorm(model_dim, device=dev)

    def forward(self, maskquery, enc_output):
        """Attend from ``maskquery`` to ``enc_output``; see the class docstring for shapes."""
        batch, tgt_len = maskquery.shape[0], maskquery.shape[2]
        # Undo the plain reshape applied by the masked-attention layer.
        decoder_states = maskquery.reshape(batch, tgt_len, -1).to(self.Wq.device)
        memory = enc_output.to(self.Wq.device)
        Q = decoder_states.unsqueeze(1) @ self.Wq
        K = memory.unsqueeze(1) @ self.Wk
        V = memory.unsqueeze(1) @ self.Wv
        scores = Q @ K.transpose(-2, -1) / sqrt(self.head_dim)
        weights = t.softmax(scores, dim=-1)
        context = weights @ V
        merged = context.transpose(1, 2).reshape(batch, tgt_len, -1)
        projected = self.dropout(merged @ self.Wo)
        return self.norm(projected + decoder_states)

In [25]:
# Decoder layer
class Decoder(nn.Module):
    """One decoder layer: masked attention, encoder-decoder attention, feed-forward."""

    def __init__(self):
        super().__init__()
        self.masked = MaskedAttention()
        self.encdec = EncDecAttention()
        self.feedforward = FeedForward()

    def forward(self, X, enc_output):
        queries = self.masked(X)
        attended = self.encdec(queries, enc_output)
        return self.feedforward(attended)

In [26]:
# Encoder Stack
class EncoderStack(nn.Module):
    """Six ``Encoder`` layers applied one after another (``enc1`` ... ``enc6``)."""

    def __init__(self):
        super().__init__()
        # Registered under the attribute names enc1 ... enc6, in that order.
        for position in range(1, 7):
            setattr(self, f"enc{position}", Encoder())

    def forward(self, X):
        Z = X
        for position in range(1, 7):
            Z = getattr(self, f"enc{position}")(Z)
        return Z

In [27]:
# Decoder Stack
class DecoderStack(nn.Module):
    """Six ``Decoder`` layers applied in sequence (``dec1`` ... ``dec6``).

    Every layer receives the same ``enc_output`` together with the output of
    the previous layer.
    """

    def __init__(self):
        super().__init__()
        # Registered under the attribute names dec1 ... dec6, in that order.
        for position in range(1, 7):
            setattr(self, f"dec{position}", Decoder())

    def forward(self, X, enc_output):
        Z = X
        for position in range(1, 7):
            Z = getattr(self, f"dec{position}")(Z, enc_output)
        return Z

In [28]:
# Final Component of Fully-Connected and Softmax Layer
class finalComponent(nn.Module):
    """Project decoder output to vocabulary size and apply softmax.

    Maps ``(batch, seq_len, model_dim)`` to probabilities of shape
    ``(batch, seq_len, vocab_size)`` that sum to 1 over the last axis.

    Fixes over the previous version:
      * the result stays on the input's device (the old output buffer was
        always allocated on the CPU);
      * the sequence length is taken from the input instead of a hard-coded 128;
      * the whole batch is processed in one call.

    Args (all optional):
        model_dim: input width; defaults to the notebook-level ``d_model``.
        vocab_size: number of output classes; defaults to ``len(target_vocab)``.
        dev: device for the linear layer; defaults to the notebook-level ``device``.
    """

    def __init__(self, model_dim=None, vocab_size=None, dev=None):
        super(finalComponent,self).__init__()
        model_dim = d_model if model_dim is None else model_dim
        vocab_size = len(target_vocab) if vocab_size is None else vocab_size
        dev = device if dev is None else dev
        self.fc = nn.Linear(model_dim, vocab_size).to(dev)
        # dim=-1 is the vocabulary axis; for the 2-D slices handled before
        # this is the same axis as the former dim=1.
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, data):
        """Return per-position vocabulary probabilities for ``data``."""
        return self.softmax(self.fc(data))

In [46]:
# Merging all components into 1 class
class Transformer(nn.Module):
    """Full encoder-decoder model: embeddings, positional encoding, both stacks, output head.

    Fixes over the previous version:
      * the decoder embeddings now receive positional encoding too (only the
        encoder input was encoded, so the decoder had no position signal);
      * embedding width uses ``d_model`` instead of a hard-coded 512;
      * the placeholder decoder input is created on the input's device;
      * an optional ``Xeng`` argument lets callers supply the decoder input.
    """

    def __init__(self):
        super(Transformer,self).__init__()
        self.deu_embed = Embeddings(d_model,len(source_vocab))
        self.eng_embed = Embeddings(d_model,len(target_vocab))
        self.pe = PositionalEncoding()
        self.EncStack = EncoderStack()
        self.DecStack = DecoderStack()
        self.finalComponent = finalComponent()

    def forward(self, Xdeu, mode="train", Xeng=None):
        """Run the model on a batch of source token ids.

        Args:
            Xdeu: source token ids, ``(batch, seq_len)``.
            mode: ``"train"`` returns the output of the final component;
                any other value returns the argmax over its last axis.
            Xeng: optional decoder input token ids. When omitted, the former
                placeholder is used: id 0 (``<s>``) in the first position and
                id 1 (``</s>``) everywhere else.
                NOTE(review): the decoder layers appear to need the same
                sequence length as ``Xdeu`` - confirm before passing a
                shifted target.
        """
        if Xeng is None:
            Xeng = t.full(Xdeu.shape,1,device=Xdeu.device)
            Xeng[:,0] = 0
        Xdeu = self.pe(self.deu_embed(Xdeu))
        Z = self.EncStack(Xdeu)
        Xeng = self.pe(self.eng_embed(Xeng))
        Z = self.DecStack(Xeng,Z)
        Z = self.finalComponent(Z)
        if mode=="train":
            return Z
        else:
            return t.argmax(Z,dim=2)

In [53]:
# Instantiate the full model; its sub-modules allocate their weights on the
# notebook-level `device` inside their constructors.
model = Transformer()

In [54]:
# Xavier-initialise every registered parameter with more than one dimension;
# 1-D parameters (biases and the like) keep their default initialisation.
for param in model.parameters():
    if param.dim() <= 1:
        continue
    nn.init.xavier_uniform_(param)

In [31]:
# Shuffled training batches of 100 sentence pairs; collate_fn tokenizes,
# numericalises and pads each batch into a (src, tgt) pair of id tensors.
train_dataloader = DataLoader(
        train,
        batch_size=100,
        shuffle=True,
        collate_fn=collate_fn,
    )

In [35]:
# Training configuration: loss, optimiser and warm-up learning-rate schedule
epochs = 10
warmup_steps = 4000
criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
# LambdaLR multiplies the optimiser's *base* learning rate by the value the
# schedule function returns. The schedule below already yields the full
# learning rate, so the base rate must be 1.0; with Adam's default of 1e-3
# the effective rate was a thousand times smaller than the formula states.
# eps=1e-9 accompanies betas=(0.9, 0.98) in the "Attention Is All You Need"
# training setup that this schedule follows.
optimizer = Adam(model.parameters(), lr=1.0, betas=(0.9,0.98), eps=1e-9)


def lambda1(step_num):
    """Return d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5).

    ``step_num`` is the 0-based scheduler step, hence the ``+ 1``.
    """
    step = step_num + 1
    return ((d_model)**(-0.5))*min(step**(-0.5), step*(warmup_steps)**(-1.5))


scheduler = lr_scheduler.LambdaLR(optimizer,lr_lambda=lambda1)

In [None]:
# Training loop.
# Fixes over the previous version:
#   * optimizer.step() is called, so the weights are actually updated
#     (only the scheduler was stepped before);
#   * the loss is computed per token with the vocabulary as the class axis:
#     CrossEntropyLoss treats dimension 1 as classes, so passing
#     (batch, seq, vocab) tensors made the sequence axis the class axis;
#   * the model returns softmax probabilities, while CrossEntropyLoss expects
#     logits and applies log-softmax itself. log(p) is a valid logit vector
#     for p (log_softmax(log p) == log p), so the loss is now the intended
#     label-smoothed cross-entropy;
#   * targets are class indices, which avoids building a dense one-hot
#     tensor the size of the output.
# NOTE(review): the model is called with the source batch only, so the
# decoder never sees the target sentence (no teacher forcing) - confirm intent.
losses = []
model.train()
for epoch in tqdm(range(epochs)):
    for src, tgt in tqdm(train_dataloader):
        optimizer.zero_grad()
        probs = model(src)  # (batch, seq_len, vocab) probabilities
        tgt = tgt.to(probs.device)
        # clamp keeps log() finite when a probability underflows to 0
        logits = probs.clamp_min(1e-9).log().reshape(-1, probs.size(-1))
        loss = criterion(logits, tgt.reshape(-1))
        loss.backward()
        optimizer.step()
        scheduler.step()
        losses.append(loss.item())
model = model.to(t.device('cpu'))
t.save(model,'weights.pt')

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/291 [00:00<?, ?it/s]

In [None]:
def translate(num):
    """Print the model's predicted English tokens for the ``num``-th test sentence.

    Fixes over the previous version:
      * ``num`` is honoured (``test[0]`` was always translated);
      * the source batch is moved to the device the model currently lives on
        (the training cell moves the model to the CPU afterwards);
      * inference runs in eval mode without gradient tracking, and the
        previous train/eval mode is restored afterwards;
      * token ids are converted to plain ints before the vocabulary lookup.
    """
    src, _ = collate_fn([test[num]])
    model_device = next(model.parameters()).device
    was_training = model.training
    model.eval()
    try:
        with t.no_grad():
            predicted = model(src.to(model_device), "test")
    finally:
        model.train(was_training)
    token_ids = predicted.squeeze(0).tolist()
    for token in target_vocab.lookup_tokens(token_ids):
        print(token, end=" ")