In [58]:
import torch
import torch.nn as nn
from torch import Tensor
from torch.optim.lr_scheduler import LambdaLR
import math as m
import warnings
import copy
from data import Data
from tqdm import tqdm

warnings.filterwarnings('ignore')
torch.manual_seed(0)
RUN_EXAMPLES = True

In [59]:
config = {"layernorm_eps": 5e-7, 
          "d_model": 512,
          "hidden_size": 2048,
          "dropout": 0.1,
          "n_heads": 8,
          "n_layers": 6,
          "batch_size": 128,
          "max_length": 30,
          "warmup": 3000,
          "base_lr": 0.5}

In [60]:
def clones(module, N):
    "Produce N identical layers."
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

In [61]:
class Embeddings(nn.Module):
    def __init__(self, vocab_size, d_model = config["d_model"]):
        super().__init__()
        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        
        embedding = self.embedding(x)
        return embedding * m.sqrt(self.d_model)

class PositionalEncoding(nn.Module):
    def __init__(self, d_model = config["d_model"], dropout = config["dropout"], max_len=config["max_length"]):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2) * -(m.log(10000.0) / d_model)
        )
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer("pe", pe)

    def forward(self, x):
        x = x + self.pe[:, : x.size(1)].requires_grad_(False)
        return self.dropout(x)

In [62]:
class MultiHeadedAttention(nn.Module):
    def __init__(self, n_heads = config["n_heads"], d_model = config["d_model"]):
        super().__init__()
        self.Qs = []
        self.Ks = []
        self.Vs = []
        self.d_model = d_model
        self.n_head = n_heads
        self.softmax = nn.Softmax(dim=-1)
        
        self.Qs = nn.Linear(d_model,d_model,bias=True)
        self.Ks = nn.Linear(d_model,d_model,bias=True)
        self.Vs = nn.Linear(d_model,d_model,bias=True)
        self.mha = nn.Linear(d_model,d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask):
        scaled_dot = torch.matmul(Q, torch.transpose(K,dim0=-2,dim1=-1))/m.sqrt(self.d_model)
        if mask is not None:
            scaled_dot = scaled_dot.masked_fill(mask == 0, -1e9)
        score = self.softmax(scaled_dot)
        attention = torch.matmul(score,V)
        return attention



    def forward(self, q_in, k_in, v_in, mask):
        if mask is not None:
            # Same mask applied to all h heads.
            mask = mask.unsqueeze(1)

        batch_size = q_in.shape[0]
        head_size = int(self.d_model/self.n_head)
        Q = self.Qs(q_in).view(batch_size, -1, self.n_head, head_size).transpose(1, 2)
        K = self.Ks(k_in).view(batch_size, -1, self.n_head, head_size).transpose(1, 2)
        V = self.Vs(v_in).view(batch_size, -1, self.n_head, head_size).transpose(1, 2)

       
        scaled_dot = self.scaled_dot_product_attention(Q,K,V,mask)
        scaled_dot = (
            scaled_dot.transpose(1, 2)
            .contiguous()
            .view(batch_size, -1, self.d_model)
        )
        output = self.mha(scaled_dot)
        return output



In [63]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model = config["d_model"], 
                       n_heads=config["n_heads"], 
                       hidden_size= config["hidden_size"], 
                       dropout = config["dropout"]):
        super(EncoderLayer,self).__init__()
        self.layer_norm = nn.LayerNorm(d_model,config["layernorm_eps"])
        self.linear1 = nn.Linear(d_model,hidden_size)
        self.linear2 = nn.Linear(hidden_size, d_model)
        self.mha = MultiHeadedAttention(n_heads,d_model)
        self.activation = nn.ReLU()
        self.dropout = nn.Dropout(p = dropout)
        self.size = d_model

    def forward(self, x, src_mask = None):
        x = self.layer_norm(x)
        x = self.layer_norm(x + self.dropout(self.mha(x, x, x, src_mask)))
        x = x + self.dropout(self.linear_block(x))
        return x

    def linear_block(self, x:Tensor) -> Tensor:
        out = self.linear2(self.dropout(self.activation(self.linear1(x))))
        return out

In [64]:
class DecoderLayer(nn.Module):
    def __init__(self, d_model = config["d_model"], 
                       n_heads=config["n_heads"], 
                       hidden_size= config["hidden_size"], 
                       dropout = config["dropout"]):
        super().__init__()
        
        self.layer_norm = nn.LayerNorm(d_model,config["layernorm_eps"])
        self.linear1 = nn.Linear(d_model,hidden_size)
        self.linear2 = nn.Linear(hidden_size, d_model)
        self.mha_1 = MultiHeadedAttention(n_heads,d_model)
        self.mha_2 = MultiHeadedAttention(n_heads,d_model)
        self.activation = nn.ReLU()
        self.dropout = nn.Dropout(p = dropout)
        self.size = d_model


    def forward(self,x, encoder_in, src_mask = None, trg_mask = None):

        x = self.layer_norm(x)
        x = self.layer_norm(x + self.dropout(self.mha_1(x, x, x, trg_mask)))
        x = self.layer_norm(x + self.dropout(self.mha_2(x, encoder_in, encoder_in, src_mask)))
        x = x + self.dropout(self.linear_block(x))
       
        return x

    def linear_block(self, x:Tensor) -> Tensor:
        out = self.linear2(self.dropout(self.activation(self.linear1(x))))
        return out

In [65]:
class Encoder(nn.Module):
    "Core encoder is a stack of N layers"

    def __init__(self, N = config["n_layers"]):
        super(Encoder, self).__init__()
        EncLayer = EncoderLayer()
        self.layers = clones(EncLayer, N)
        self.norm = nn.LayerNorm(config["d_model"],config["layernorm_eps"])

    def forward(self, x, mask):
        "Pass the input (and mask) through each layer in turn."
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)

In [66]:
class Decoder(nn.Module):
    "Generic N layer decoder with masking."

    def __init__(self, N = config["n_layers"]):
        super(Decoder, self).__init__()
        DecLayer = DecoderLayer()
        self.layers = clones(DecLayer, N)
        self.norm = nn.LayerNorm(config["d_model"],config["layernorm_eps"])

    def forward(self, x, memory, src_mask, tgt_mask):
        for layer in self.layers:
            x = layer(x, memory, src_mask, tgt_mask)
        return self.norm(x)

In [67]:
class Transformer(nn.Module):
    def __init__(self, src_vocab_size, trg_vocab_size, padding_idx):
        super(Transformer, self).__init__()
        self.src_emb = nn.Sequential(Embeddings(src_vocab_size), PositionalEncoding())
        self.trg_emb = nn.Sequential(Embeddings(trg_vocab_size), PositionalEncoding())
        self.encoder = Encoder()
        self.decoder = Decoder()
        self.proj = nn.Linear(config["d_model"], trg_vocab_size)
        self.padding_idx = padding_idx

        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def make_mask(self, seq, seq_type):
        if seq_type == "src":
            mask = (seq != self.padding_idx).unsqueeze(-2)
            return mask

        else:
            pad_mask = (seq != self.padding_idx).unsqueeze(-2)
            no_peak_mask = torch.triu(torch.ones((1, seq.shape[1], seq.shape[1])))
            no_peak_mask = no_peak_mask.transpose(dim0=-1, dim1 = -2).type_as(seq.data)
            mask = no_peak_mask & pad_mask
            return mask

    def forward(self,src,trg):
        src_mask = self.make_mask(src,"src")
        trg_mask = self.make_mask(trg,"trg")
        enc_out = self.encoder(self.src_emb(src),src_mask)
        dec_out = self.decoder(self.src_emb(trg),enc_out,src_mask,trg_mask)
        out = self.proj(dec_out)
        return out


In [68]:
train_data = Data(batch_size=config["batch_size"],max_length=config["max_length"])
train_loader, val_loader, test_loader = train_data.prepare_data()
padding_idx, src_size, trg_size = train_data.get_properties()

Getting Dataset..
Building German Vocabulary..
Building English Vocabulary..
German Vocabulary: 8315 entries, English Vocabulary: 6475 entries


In [69]:
def rate(step, model_size, factor, warmup):
    """
    we have to default the step to 1 for LambdaLR function
    to avoid zero raising to negative power.
    """
    if step == 0:
        step = 1
    return factor * (
        model_size ** (-0.5) * min(step ** (-0.5), step * warmup ** (-1.5))
    )

In [70]:
def run_one_step(model, optimizer, loss, device, data, mode, trg_size, lr_scheduler = None):
    src = data[0].to(device)
    trg = data[1].to(device)
    trg_input = trg[:, :-1]
    ys = trg[:, 1:].reshape(-1)
    if mode == 'train':
        model.train()
        optimizer.zero_grad()
        logits = model(src, trg_input)
        loss_ = loss(logits.reshape(-1, trg_size), ys)
        loss_.backward()
        optimizer.step()
        
        if lr_scheduler != None:
            lr_scheduler.step()

    else:
        model.eval()
        logits = model(src, trg_input)
        loss_ = loss(logits.reshape(-1, trg_size), ys)

    return loss_.item() 

In [71]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Transformer(src_size, trg_size, padding_idx)
model.to(device)
optimizer = torch.optim.Adam(
        model.parameters(), lr=config["base_lr"], betas=(0.9, 0.98), eps=1e-9
    )
lr_scheduler = LambdaLR(
        optimizer=optimizer,
        lr_lambda=lambda step: rate(
            step, model_size=512, factor=1.0, warmup=config["warmup"]
        ),
    )
loss = nn.CrossEntropyLoss()
max_epoch = 30
best_val_loss = 1e9
for epoch in range(max_epoch):
    train_loss = 0.0
    val_loss = 0.0
    for batch in tqdm(train_loader):
        running_loss = run_one_step(model,optimizer,loss,device,batch,"train",trg_size,lr_scheduler)
        train_loss += running_loss
    train_loss /= len(train_loader)
    lr = optimizer.param_groups[0]["lr"]
    for batch in val_loader:
        running_loss = run_one_step(model,optimizer,loss,device,batch,"val",trg_size,lr_scheduler)
        val_loss += running_loss
    val_loss /= len(val_loader)

    if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), 'saved/model-{0}.pt'.format(val_loss))
    print("Epoch {}, train loss: {}, val_loss: {}, current_lr: {}".format(epoch, train_loss, val_loss, lr)) 

100%|██████████| 227/227 [00:52<00:00,  4.30it/s]


Epoch 0, train loss: 5.195475902851458, val_loss: 3.443099319934845, current_lr: 3.0526639569065404e-05


100%|██████████| 227/227 [00:52<00:00,  4.35it/s]


Epoch 1, train loss: 2.7050097986464983, val_loss: 2.381317913532257, current_lr: 6.105327913813081e-05


100%|██████████| 227/227 [00:52<00:00,  4.35it/s]


Epoch 2, train loss: 2.2128206349679553, val_loss: 1.9941923171281815, current_lr: 9.157991870719621e-05


100%|██████████| 227/227 [00:52<00:00,  4.35it/s]


Epoch 3, train loss: 1.88513056192104, val_loss: 1.761987343430519, current_lr: 0.00012210655827626162


100%|██████████| 227/227 [00:52<00:00,  4.35it/s]


Epoch 4, train loss: 1.7086672089698556, val_loss: 1.623069241642952, current_lr: 0.00015263319784532704


100%|██████████| 227/227 [00:52<00:00,  4.35it/s]


Epoch 5, train loss: 1.5715585022770886, val_loss: 1.515848845243454, current_lr: 0.00018315983741439243


100%|██████████| 227/227 [00:52<00:00,  4.35it/s]


Epoch 6, train loss: 1.4590840271391008, val_loss: 1.4197117984294891, current_lr: 0.00021368647698345784


100%|██████████| 227/227 [00:52<00:00,  4.35it/s]


Epoch 7, train loss: 1.362291153832154, val_loss: 1.36534982919693, current_lr: 0.00024421311655252324


100%|██████████| 227/227 [00:52<00:00,  4.36it/s]


Epoch 8, train loss: 1.2816076872107216, val_loss: 1.304085150361061, current_lr: 0.00027473975612158865


100%|██████████| 227/227 [00:52<00:00,  4.35it/s]


Epoch 9, train loss: 1.2082267668803883, val_loss: 1.2541113644838333, current_lr: 0.00030526639569065407


100%|██████████| 227/227 [00:52<00:00,  4.35it/s]


Epoch 10, train loss: 1.1408612365764668, val_loss: 1.2276903241872787, current_lr: 0.0003357930352597195


100%|██████████| 227/227 [00:52<00:00,  4.33it/s]


Epoch 11, train loss: 1.0826170360464356, val_loss: 1.1833254247903824, current_lr: 0.00036631967482878485


100%|██████████| 227/227 [00:52<00:00,  4.36it/s]


Epoch 12, train loss: 1.0274923160212681, val_loss: 1.1562179550528526, current_lr: 0.0003968463143978502


100%|██████████| 227/227 [00:52<00:00,  4.35it/s]


Epoch 13, train loss: 0.9759329397247751, val_loss: 1.1438519731163979, current_lr: 0.00039197473702003524


100%|██████████| 227/227 [00:52<00:00,  4.35it/s]


Epoch 14, train loss: 0.9163881839634563, val_loss: 1.116587609052658, current_lr: 0.0003786835726097239


100%|██████████| 227/227 [00:52<00:00,  4.35it/s]


Epoch 15, train loss: 0.8636436811627796, val_loss: 1.1045258268713951, current_lr: 0.000366658792549947


100%|██████████| 227/227 [00:52<00:00,  4.36it/s]


Epoch 16, train loss: 0.8113794032697635, val_loss: 1.0945205986499786, current_lr: 0.0003557112777046742


100%|██████████| 227/227 [00:52<00:00,  4.35it/s]


Epoch 17, train loss: 0.7664469703703725, val_loss: 1.1010716632008553, current_lr: 0.0003456892247916521


100%|██████████| 227/227 [00:52<00:00,  4.33it/s]


Epoch 18, train loss: 0.7243505713698097, val_loss: 1.1017155796289444, current_lr: 0.0003364691838917606


100%|██████████| 227/227 [00:52<00:00,  4.33it/s]


Epoch 19, train loss: 0.6839913368750249, val_loss: 1.1098638772964478, current_lr: 0.00032794959387586994


100%|██████████| 227/227 [00:52<00:00,  4.36it/s]


Epoch 20, train loss: 0.6492766333571615, val_loss: 1.1128222793340683, current_lr: 0.00032004603258690333


  9%|▉         | 21/227 [00:05<00:49,  4.14it/s]


KeyboardInterrupt: 

In [None]:
def idx_to_word(x, vocab):
    words = []
    for i in x:
        word = vocab.lookup_token(i)
        if '<' not in word:
            words.append(word)
    words = " ".join(words)
    return words

In [None]:
model.load_state_dict(torch.load("saved/model-1.0646797716617584.pt"))
with torch.no_grad():
        for i,batch in enumerate(test_loader):
                model.eval()
                src = batch[0].to(device)
                trg = batch[1].to(device)

                trg_input = trg[:, :-1]
                ys = trg[:, 1:].reshape(-1)
                logits = model(src, trg_input)
                for j in range(64):
                        src_words = idx_to_word(src[j], train_data.vocab_src)
                        trg_words = idx_to_word(trg[j], train_data.vocab_trg)
                        output_words = logits[j].max(dim=1)[1]
                        output_words = idx_to_word(output_words, train_data.vocab_trg)

                        print('source :', src_words)
                        print('target :', trg_words)
                        print('predicted :', output_words)
                

source : Ein Mann mit einem orangefarbenen Hut , der etwas .
target : A man in an orange hat starring at something .
predicted : A man in an orange hat is at something .
source : Ein Boston Terrier läuft über Gras vor einem weißen Zaun .
target : A Boston Terrier is running on lush green grass in front of a white fence .
predicted : A and and running and grass grass grass and front of a fence fence .
source : Ein Mädchen in einem Karateanzug bricht einen Stock mit einem Tritt .
target : A girl in karate uniform breaking a stick with a front kick .
predicted : A girl in a is is with gun with a net hand .
source : Fünf Leute in Winterjacken und mit Helmen stehen im Schnee mit im Hintergrund .
target : Five people wearing winter jackets and helmets stand in the snow , with in the background .
predicted : Five people wearing orange hats are helmets are in the snow with with their on the background .
source : Leute Reparieren das Dach eines Hauses .
target : People are fixing the roof of a 