In [2]:
import torch
import torch.nn as nn
from torch import Tensor

from torch.nn.functional import log_softmax
from torchtext.datasets import Multi30k
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator, Vocab
from torch.utils.data import DataLoader,IterableDataset,Dataset
from torch.optim.lr_scheduler import LambdaLR
import math as m
import pandas as pd
import spacy
from spacy.symbols import ORTH
import os
import warnings
warnings.filterwarnings('ignore')
torch.manual_seed(0)

In [3]:
# class CustomDataset(Dataset):
#     def __init__(self,txts_file,labels_file):
#         self.texts = pd.read_csv("Data/"+txts_file,delimiter="\n", header= None)
#         self.label = pd.read_csv("Data/"+labels_file,delimiter="\n", header= None)

#     def __len__(self):
#         return len(self.labels)

#     def __getitem__(self,idx):
#         label = self.labels[idx]
#         text = self.text[idx]
#         sample = {"Text": text, "Class": label}
#         return sample

# class CustomDataset(IterableDataset):
#     def __init__(self,filename_en,filename_gm):
#         self.filename_en = "Data/"+filename_en
#         self.filename_gm = "Data/"+filename_gm

#     def preprocess(self, text):

#         ### Do something with text here
#         trimmed_text = ''.join([x for x in text if x in string.ascii_letters + '\'- '])
#         trimmed_text = trimmed_text.replace("AT-AT"," ")
#         text_pp = trimmed_text.split()
#         ###

#         return text_pp

#     def line_mapper(self, line):
        
#         #We only have the text in the file for this case
#         text = line
#         text = self.preprocess(text)
#         return text


#     def __iter__(self):

#         #Create an iterator
#         en_itr = open(self.filename_en)
#         gm_itr = open(self.filename_gm)
        
#         #Map each element using the line_mapper
#         mapped_en_itr = map(self.line_mapper, en_itr)
#         mapped_gm_itr = map(self.line_mapper, gm_itr)
        
#         #Zip both iterators
#         zipped_itr = zip(mapped_en_itr, mapped_gm_itr)
        
#         return zipped_itr

In [4]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, n_head):
        super().__init__()
        self.Qs = []
        self.Ks = []
        self.Vs = []
        self.d_model = d_model
        self.n_head = n_head
        self.softmax = nn.Softmax(dim=-1)
        
        self.Qs = nn.Linear(d_model,d_model,bias=True)
        self.Ks = nn.Linear(d_model,d_model,bias=True)
        self.Vs = nn.Linear(d_model,d_model,bias=True)
        self.mha = nn.Linear(d_model,d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask):
        scaled_dot = torch.matmul(Q, torch.transpose(K,dim0=-2,dim1=-1))/m.sqrt(self.d_model)
        if mask is not None:
            scaled_dot = scaled_dot.masked_fill(mask == 0, -1e9)
        score = self.softmax(scaled_dot)
        attention = torch.matmul(score,V)
        return attention



    def forward(self, q_in, k_in, v_in, mask):
        Q = self.Qs(q_in)
        K = self.Ks(k_in)
        V = self.Vs(v_in)

        batch_size = q_in.shape[0]
        head_size = int(self.d_model/self.n_head)

        Q = Q.reshape(batch_size, self.n_head, -1, head_size)
        K = K.reshape(batch_size, self.n_head, -1, head_size)
        V = V.reshape(batch_size, self.n_head, -1, head_size)
        scaled_dot = self.scaled_dot_product_attention(Q,K,V,mask)
        scaled_dot = scaled_dot.reshape(batch_size, -1, self.d_model)
        output = self.mha(scaled_dot)
        return output



In [5]:
class Embeddinglayers(nn.Module):
    def __init__(self, vocab_size, padding_idx, d_model):
        super().__init__()
        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size, d_model, padding_idx=padding_idx)

    def forward(self, x):
        
        embedding = self.embedding(x)
        return embedding * m.sqrt(self.d_model)

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout, max_len=1000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2) * -(m.log(10000.0) / d_model)
        )
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer("pe", pe)

    def forward(self, x):
        x = x + self.pe[:, : x.size(1)].requires_grad_(False)
        return self.dropout(x)

In [6]:
class Encoder(nn.Module):
    def __init__(self,d_model, n_head, hidden_size, dropout):
        super().__init__()
        self.layer_norm = nn.LayerNorm(d_model)
        self.linear1 = nn.Linear(d_model,hidden_size)
        self.linear2 = nn.Linear(hidden_size, d_model)
        self.mha = MultiHeadAttention(d_model,n_head)
        self.activation = nn.ReLU()
        self.dropout = nn.Dropout(p = dropout)

    def forward(self, x, src_mask = None):
        x = self.layer_norm(x + self.dropout(self.mha(x, x, x, src_mask)))
        x = self.layer_norm(x + self.dropout(self.linear_block(x)))
        return x

    def linear_block(self, x:Tensor) -> Tensor:
        out = self.linear2(self.activation(self.linear1(x)))
        return out

In [7]:

class Encoder_Block(nn.Module):
    def __init__(self, vocab_size, padding_idx, max_length, n_block = 6, d_model = 512, n_head = 8, hidden_size = 2048, dropout=0.1):
        super().__init__()
        self.embedding = Embeddinglayers(vocab_size, padding_idx,d_model)
        self.positional_embedding = PositionalEncoding(d_model, dropout, max_length)
        self.layers = nn.ModuleList([Encoder(d_model, n_head, hidden_size, dropout) for i in range(n_block)])
    def forward(self, x, src_mask = None):
        
            
        x = self.embedding(x)
        x = self.positional_embedding(x)
        for encode in self.layers:
            x = encode(x, src_mask)
        return x

In [8]:
class Decoder(nn.Module):
    def __init__(self,d_model, n_head, hidden_size, dropout):
        super().__init__()
        
        self.layer_norm = nn.LayerNorm(d_model)
        self.linear1 = nn.Linear(d_model,hidden_size)
        self.linear2 = nn.Linear(hidden_size, d_model)
        self.mha_1 = MultiHeadAttention(d_model,n_head)
        self.mha_2 = MultiHeadAttention(d_model,n_head)
        self.activation = nn.ReLU()

        self.dropout = nn.Dropout(p = dropout)

    def forward(self,x, encoder_in, src_mask = None, trg_mask = None):
        x = self.layer_norm(x + self.dropout(self.mha_1(x, x, x, trg_mask)))
        x = self.layer_norm(x + self.dropout(self.mha_2(x, encoder_in, encoder_in, src_mask)))
        x = self.layer_norm(x + self.dropout(self.linear_block(x)))
        return x

    def linear_block(self, x:Tensor) -> Tensor:
        out = self.linear2(self.activation(self.linear1(x)))
        return out

In [9]:
class Decoder_Block(nn.Module):
    def __init__(self, vocab_size, padding_idx, max_length, n_block = 6, d_model = 512, n_head = 8, hidden_size = 2048, dropout=0.1):
        super().__init__()
        self.embedding = Embeddinglayers(vocab_size, padding_idx,d_model)
        self.positional_embedding = PositionalEncoding(d_model, dropout, max_length)
        self.layers = nn.ModuleList([Decoder(d_model,n_head,hidden_size,dropout) for i in range(n_block)])
    def forward(self,x, enconder_in, src_mask = None, trg_mask = None):
        x = self.embedding(x)
        x = self.positional_embedding(x)
        for decode in self.layers:
            x = decode(x, enconder_in, src_mask,trg_mask)
        return x

In [10]:
class Transformer(nn.Module):
    def __init__(self, src_vocab_size, trg_vocab_size, padding_idx, max_length, device = 'cuda:0', n_block = 6, d_model = 512, n_head = 8, hidden_size = 2048, dropout=0.1):
        super().__init__()
        self.padding_idx = padding_idx
        self.device = device
        self.enc_block = Encoder_Block(src_vocab_size,padding_idx,max_length,n_block,d_model,n_head,hidden_size,dropout)
        self.dec_block = Decoder_Block(trg_vocab_size,padding_idx,max_length,n_block,d_model,n_head,hidden_size,dropout)
        self.proj = nn.Linear(d_model, trg_vocab_size)
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def forward(self,src,trg):
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)
        enc_out = self.enc_block(src,src_mask)
        dec_out = self.dec_block(trg,enc_out,src_mask,trg_mask)
        
        out = self.proj(dec_out)
        return out

    def make_src_mask(self, src):
        src_mask = (src != self.padding_idx).unsqueeze(-2)
       
        return src_mask.unsqueeze(1)

    def make_trg_mask(self, trg):
        pad_mask = (trg != self.padding_idx).unsqueeze(-2)
        no_peak_mask = torch.triu(torch.ones((1, trg.shape[1], trg.shape[1])))
        no_peak_mask = no_peak_mask.transpose(dim0=-1, dim1 = -2).type_as(trg.data)

        trg_mask = no_peak_mask & pad_mask
        return trg_mask.unsqueeze(1)

In [11]:
def tokenize(text, tokenizer):
    return [tok.text for tok in tokenizer.tokenizer(text)]


def yield_tokens(data_iter, tokenizer, index):
    for from_to_tuple in data_iter:
        yield tokenizer(from_to_tuple[index])

def build_vocabulary(spacy_de, spacy_en):
    def tokenize_de(text):
        return tokenize(text, spacy_de)

    def tokenize_en(text):
        return tokenize(text, spacy_en)

    print("Building German Vocabulary ...")
    train, val, test = Multi30k(language_pair=("de", "en"))
    vocab_src = build_vocab_from_iterator(
        yield_tokens(train + val + test, tokenize_de, index=0),
        min_freq=1,
        specials=["<s>", "</s>", "<blank>", "<unk>"],
    )

    print("Building English Vocabulary ...")
    train, val, test = Multi30k(language_pair=("de", "en"))
    vocab_tgt = build_vocab_from_iterator(
        yield_tokens(train + val + test, tokenize_en, index=1),
        min_freq=1,
        specials=["<s>", "</s>", "<blank>", "<unk>"],
    )

    vocab_src.set_default_index(vocab_src["<unk>"])
    vocab_tgt.set_default_index(vocab_tgt["<unk>"])

    return vocab_src, vocab_tgt, [train, val, test]

def load_tokenizers():

    try:
        spacy_de = spacy.load("de_core_news_sm")
    except IOError:
        os.system("python -m spacy download de_core_news_sm")
        spacy_de = spacy.load("de_core_news_sm")

    try:
        spacy_en = spacy.load("en_core_web_sm")
    except IOError:
        os.system("python -m spacy download en_core_web_sm")
        spacy_en = spacy.load("en_core_web_sm")

    spacy_de.tokenizer.add_special_case(u'<s>', [{ORTH: u'<s>'}])
    spacy_de.tokenizer.add_special_case(u'</s>', [{ORTH: u'</s>'}])
    spacy_de.tokenizer.add_special_case(u'<black>', [{ORTH: u'<black>'}])
    spacy_de.tokenizer.add_special_case(u'<unk>', [{ORTH: u'<unk>'}])

    spacy_en.tokenizer.add_special_case(u'<s>', [{ORTH: u'<s>'}])
    spacy_en.tokenizer.add_special_case(u'</s>', [{ORTH: u'</s>'}])
    spacy_en.tokenizer.add_special_case(u'<black>', [{ORTH: u'<black>'}])
    spacy_en.tokenizer.add_special_case(u'<unk>', [{ORTH: u'<unk>'}])


    return spacy_de, spacy_en

In [12]:
spacy_de, spacy_en = load_tokenizers()

src_vocab, trg_vocab, data = build_vocabulary(spacy_de,spacy_en)
train,val,test = data
src_vocab_size = src_vocab.__len__()
trg_vocab_size = trg_vocab.__len__()
print("Finished preparing data, source vocabulary: {} entries, target vocabulary: {} entries".format(src_vocab_size,trg_vocab_size))

padding_idx = src_vocab.__getitem__("<blank>")


MAX_SEQ_LEN = 30
def pad_to_max(tokens):
    return tokens[:MAX_SEQ_LEN] + ["<blank>"] * max(0, MAX_SEQ_LEN - len(tokens))

def collate_fn(batch):
    # batch = [(<src1>, <trg1>), (<src2>, <trg2>), ...]
    srcs = []
    trgs = []
    for pair in batch:
        src = pair[0]
        trg = pair[1]
        p = tokenize("<s> " + src + " </s>",spacy_de)
        c = src_vocab(p)

        tokenized_src = src_vocab(pad_to_max(tokenize("<s> " + src + " </s>",spacy_de)))
        tokenized_trg = trg_vocab(pad_to_max(tokenize("<s> " + trg + " </s>",spacy_en)))
        
        srcs.append(tokenized_src)
        trgs.append(tokenized_trg)

    srcs = torch.tensor(srcs, dtype=torch.long)
    trgs = torch.tensor(trgs, dtype=torch.long)
    return srcs, trgs

train_dataloader = DataLoader(list(train), batch_size=64, shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(list(val), batch_size=64, shuffle=False, collate_fn=collate_fn)
test_dataloader = DataLoader(list(test), batch_size=64, shuffle=False, collate_fn=collate_fn)



Building German Vocabulary ...
Building English Vocabulary ...
Finished preparing data, source vocabulary: 19962 entries, target vocabulary: 11158 entries


In [13]:
def rate(step, model_size, factor, warmup):
    """
    we have to default the step to 1 for LambdaLR function
    to avoid zero raising to negative power.
    """
    if step == 0:
        step = 1
    return factor * (
        model_size ** (-0.5) * min(step ** (-0.5), step * warmup ** (-1.5))
    )

In [16]:
from tqdm import tqdm
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Transformer(src_vocab_size, trg_vocab_size, padding_idx = padding_idx,max_length = MAX_SEQ_LEN)
model.to(device)
optimizer = torch.optim.Adam(
        model.parameters(), lr=0.5, betas=(0.9, 0.98), eps=1e-9
    )
lr_scheduler = LambdaLR(
        optimizer=optimizer,
        lr_lambda=lambda step: rate(
            step, model_size=512, factor=1.0, warmup=1000
        ),
    )
loss = nn.CrossEntropyLoss()
max_epoch = 30
best_val_loss = 1e9
for epoch in range(max_epoch):
    train_loss = 0.0
    val_loss = 0.0
    for batch in tqdm(train_dataloader):
        model.train()
        src = batch[0].to(device)
        trg = batch[1].to(device)
        optimizer.zero_grad()
        trg_input = trg[:, :-1]
        ys = trg[:, 1:].reshape(-1)
        logits = model(src, trg_input)
        loss_ = loss(logits.reshape(-1, trg_vocab_size), ys)
        loss_.backward()
        optimizer.step()
        train_loss += loss_.item()
        lr_scheduler.step()

    train_loss /= len(train_dataloader)
    lr = optimizer.param_groups[0]["lr"]
    for i,batch in enumerate(val_dataloader):
        model.eval()
        src = batch[0].to(device)
        trg = batch[1].to(device)
        
        trg_input = trg[:, :-1]
        ys = trg[:, 1:].reshape(-1)
        logits = model(src, trg_input)
        loss_ = loss(logits.reshape(-1, trg_vocab_size), ys) 
        val_loss += loss_.item()
    val_loss /= len(val_dataloader)

    if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), 'saved/model-{0}.pt'.format(val_loss))
    print("Epoch {}, train loss: {}, val_loss: {}, current_lr: {}".format(epoch, train_loss, val_loss, lr))        

100%|██████████| 454/454 [01:01<00:00,  7.42it/s]


Epoch 0, train loss: 3.0537059230951487, val_loss: 1.810008317232132, current_lr: 0.0003172421443077827


100%|██████████| 454/454 [01:01<00:00,  7.40it/s]


Epoch 1, train loss: 1.686610659313622, val_loss: 1.617099106311798, current_lr: 0.0006344842886155654


100%|██████████| 454/454 [01:01<00:00,  7.38it/s]


Epoch 2, train loss: 1.5004332409556216, val_loss: 1.486424334347248, current_lr: 0.0005987513009682401


100%|██████████| 454/454 [01:01<00:00,  7.39it/s]


Epoch 3, train loss: 1.3343559881664058, val_loss: 1.406144268810749, current_lr: 0.0005185338371874781


100%|██████████| 454/454 [01:01<00:00,  7.37it/s]


Epoch 4, train loss: 1.2001650469943816, val_loss: 1.3636165708303452, current_lr: 0.0004637907634340038


100%|██████████| 454/454 [01:01<00:00,  7.35it/s]


Epoch 5, train loss: 1.082822863094607, val_loss: 1.350212201476097, current_lr: 0.0004233811051589101


100%|██████████| 454/454 [01:01<00:00,  7.37it/s]


Epoch 6, train loss: 0.9736135562611047, val_loss: 1.3588689044117928, current_lr: 0.00039197473702003524


100%|██████████| 454/454 [01:01<00:00,  7.34it/s]


Epoch 7, train loss: 0.8673764309694063, val_loss: 1.3874894976615906, current_lr: 0.000366658792549947


100%|██████████| 454/454 [01:01<00:00,  7.37it/s]


Epoch 8, train loss: 0.7679736329332847, val_loss: 1.4158494025468826, current_lr: 0.0003456892247916521


100%|██████████| 454/454 [01:01<00:00,  7.35it/s]


Epoch 9, train loss: 0.6743433169593895, val_loss: 1.4676295891404152, current_lr: 0.00032794959387586994


100%|██████████| 454/454 [01:01<00:00,  7.36it/s]


Epoch 10, train loss: 0.5893330889782716, val_loss: 1.5180744156241417, current_lr: 0.00031268766891892743


100%|██████████| 454/454 [01:01<00:00,  7.37it/s]


Epoch 11, train loss: 0.5151225890357064, val_loss: 1.5647674351930618, current_lr: 0.00029937565048412007


100%|██████████| 454/454 [01:01<00:00,  7.39it/s]


Epoch 12, train loss: 0.4528150697636709, val_loss: 1.6206425577402115, current_lr: 0.00028763082123731517


100%|██████████| 454/454 [01:01<00:00,  7.36it/s]


Epoch 13, train loss: 0.40311537525464786, val_loss: 1.6784055158495903, current_lr: 0.00027716799460068055


100%|██████████| 454/454 [01:01<00:00,  7.37it/s]


Epoch 14, train loss: 0.3634573796903509, val_loss: 1.7151869013905525, current_lr: 0.00026776972211628414


100%|██████████| 454/454 [01:01<00:00,  7.37it/s]


Epoch 15, train loss: 0.33278053335943936, val_loss: 1.745475947856903, current_lr: 0.0002592669185937391


100%|██████████| 454/454 [01:01<00:00,  7.35it/s]


Epoch 16, train loss: 0.3083986405627843, val_loss: 1.7905724868178368, current_lr: 0.00025152585660950626


100%|██████████| 454/454 [01:01<00:00,  7.36it/s]


Epoch 17, train loss: 0.2898093166592888, val_loss: 1.815447211265564, current_lr: 0.000244439195033298


100%|██████████| 454/454 [01:01<00:00,  7.35it/s]


Epoch 18, train loss: 0.272984097248132, val_loss: 1.8364530801773071, current_lr: 0.0002379196415901674


100%|██████████| 454/454 [01:01<00:00,  7.37it/s]


Epoch 19, train loss: 0.259294811357223, val_loss: 1.8616209924221039, current_lr: 0.0002318953817170019


100%|██████████| 454/454 [01:01<00:00,  7.36it/s]


Epoch 20, train loss: 0.2468863021631598, val_loss: 1.8846192732453346, current_lr: 0.00022630671993405008


100%|██████████| 454/454 [01:01<00:00,  7.38it/s]


Epoch 21, train loss: 0.2373071599111683, val_loss: 1.8904675245285034, current_lr: 0.0002211035710859876


100%|██████████| 454/454 [01:01<00:00,  7.36it/s]


Epoch 22, train loss: 0.22879801533033145, val_loss: 1.903694286942482, current_lr: 0.0002162435584598272


100%|██████████| 454/454 [01:00<00:00,  7.45it/s]


Epoch 23, train loss: 0.2210141707556363, val_loss: 1.9218398407101631, current_lr: 0.00021169055257945504


100%|██████████| 454/454 [00:57<00:00,  7.83it/s]


Epoch 24, train loss: 0.21464804491019984, val_loss: 1.932796649634838, current_lr: 0.00020741353487499126


100%|██████████| 454/454 [00:57<00:00,  7.83it/s]


Epoch 25, train loss: 0.20833056574350936, val_loss: 1.943101480603218, current_lr: 0.00020338570417516115


100%|██████████| 454/454 [00:58<00:00,  7.82it/s]


Epoch 26, train loss: 0.20334299227083308, val_loss: 1.9492912590503693, current_lr: 0.00019958376698941336


100%|██████████| 454/454 [00:58<00:00,  7.83it/s]


Epoch 27, train loss: 0.19924095909763537, val_loss: 1.9737034440040588, current_lr: 0.00019598736851001762


100%|██████████| 454/454 [00:58<00:00,  7.82it/s]


Epoch 28, train loss: 0.1943246356256733, val_loss: 1.9802769050002098, current_lr: 0.0001925786325055462


100%|██████████| 454/454 [00:57<00:00,  7.83it/s]


Epoch 29, train loss: 0.1907908774026165, val_loss: 1.984381079673767, current_lr: 0.00018934178630486195


In [18]:
def idx_to_word(x, vocab):
    words = []
    for i in x:
        word = vocab.lookup_token(i)
        if '<' not in word:
            words.append(word)
    words = " ".join(words)
    return words

In [19]:
model.load_state_dict(torch.load("saved/model-1.350212201476097.pt"))
with torch.no_grad():
        for i,batch in enumerate(val_dataloader):
                model.eval()
                src = batch[0].to(device)
                trg = batch[1].to(device)

                trg_input = trg[:, :-1]
                ys = trg[:, 1:].reshape(-1)
                logits = model(src, trg_input)
                for j in range(64):
                        src_words = idx_to_word(src[j], src_vocab)
                        trg_words = idx_to_word(trg[j], trg_vocab)
                        output_words = logits[j].max(dim=1)[1]
                        output_words = idx_to_word(output_words, trg_vocab)

                        print('source :', src_words)
                        print('target :', trg_words)
                        print('predicted :', output_words)
                

source : Eine Gruppe von Männern lädt Baumwolle auf einen Lastwagen
target : A group of men are loading cotton onto a truck
predicted : A woman of men are taking a a a sidewalk
source : Ein Mann schläft in einem grünen Raum auf einem Sofa .
target : A man sleeping in a green room on a couch .
predicted : A man in in a chair chair in a bed .
source : Ein Junge mit Kopfhörern sitzt auf den Schultern einer Frau .
target : A boy wearing headphones sits on a woman 's shoulders .
predicted : A man in headphones sitting in a couch 's shoulders .
source : Zwei Männer bauen eine blaue Eisfischerhütte auf einem zugefrorenen See auf
target : Two men setting up a blue ice fishing hut on an iced over lake
predicted : Two men are up a blue and workers game on a outdoor day .
source : Ein Mann mit beginnender Glatze , der eine rote Rettungsweste trägt , sitzt in einem kleinen Boot .
target : A balding man wearing a red life jacket is sitting in a small boat .
predicted : A man man with a red shirt ja

IndexError: index 55 is out of bounds for dimension 0 with size 55