In [None]:
"""

We use the data and some of the code pipeline from DLS 2021 part2 homework 5 (Attention)

http://jalammar.github.io/illustrated-transformer/

Code for the transformer is from:
http://nlp.seas.harvard.edu/2018/04/03/attention.html

https://pytorch.org/tutorials/beginner/translation_transformer.html 

"""
None

In [1]:
!python -m spacy download en_core_web_sm
!python -m spacy download de_core_news_sm

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
Collecting de_core_news_sm==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.2.5/de_core_news_sm-2.2.5.tar.gz (14.9MB)
[K     |████████████████████████████████| 14.9MB 8.1MB/s 
Building wheels for collected packages: de-core-news-sm
  Building wheel for de-core-news-sm (setup.py) ... [?25l[?25hdone
  Created wheel for de-core-news-sm: filename=de_core_news_sm-2.2.5-cp37-none-any.whl size=14907055 sha256=869b725ffee6b58198c398d15824278f10214f3d817e3a9cd233cd189fcfd163
  Stored in directory: /tmp/pip-ephem-wheel-cache-01uqn0pv/wheels/ba/3f/ed/d4aa8e45e7191b7f32db4bfad565e7da1edbf05c916ca7a1ca
Successfully built de-core-news-sm
Installing collected packages: de-core-news-sm
Successfully installed de-core-news-sm-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('de_core_ne

In [None]:
"""

Restart session after running the previous cell

"""
None

In [1]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import Multi30k
from typing import Iterable, List


SRC_LANGUAGE = 'de'
TGT_LANGUAGE = 'en'

# Place-holders
token_transform = {}
vocab_transform = {}


# Create source and target language tokenizer. Make sure to install the dependencies.
# pip install -U spacy
# python -m spacy download en_core_web_sm
# python -m spacy download de_core_news_sm
token_transform[SRC_LANGUAGE] = get_tokenizer('spacy', language='de_core_news_sm')
token_transform[TGT_LANGUAGE] = get_tokenizer('spacy', language='en_core_web_sm')


# helper function to yield list of tokens
def yield_tokens(data_iter: Iterable, language: str) -> List[str]:
    language_index = {SRC_LANGUAGE: 0, TGT_LANGUAGE: 1}

    for data_sample in data_iter:
        yield token_transform[language](data_sample[language_index[language]])

# Define special symbols and indices
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3
# Make sure the tokens are in order of their indices to properly insert them in vocab
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']

for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    # Training data Iterator
    train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    # Create torchtext's Vocab object
    vocab_transform[ln] = build_vocab_from_iterator(yield_tokens(train_iter, ln),
                                                    min_freq=1,
                                                    specials=special_symbols,
                                                    special_first=True)

# Set UNK_IDX as the default index. This index is returned when the token is not found.
# If not set, it throws RuntimeError when the queried token is not found in the Vocabulary.
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
  vocab_transform[ln].set_default_index(UNK_IDX)

training.tar.gz: 100%|██████████| 1.21M/1.21M [00:01<00:00, 1.00MB/s]


**Changing model**

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
%cd ./drive/MyDrive/DLS_nlp/

/content/drive/MyDrive/DLS_nlp


In [3]:
from torch import Tensor
import torch
import torch.nn as nn
import math

import numpy as np
from torch.autograd import Variable


DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [6]:
# For reloading 


import project_part2 as modules
import imp
imp.reload(modules)

SRC_VOCAB_SIZE = len(vocab_transform[SRC_LANGUAGE])
TGT_VOCAB_SIZE = len(vocab_transform[TGT_LANGUAGE])

model = modules.make_model(SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, 3, pruning=False)
model = model.to(DEVICE)

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

  nn.init.xavier_uniform(p)


The model has 43,015,768 trainable parameters


In [7]:
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

BATCH_SIZE = 128

In [8]:
def subsequent_mask(size):
    "Mask out subsequent positions."
    attn_shape = (1, size, size)
    subsequent_mask = np.triu(np.ones(attn_shape), k=1).astype('uint8')
    return torch.from_numpy(subsequent_mask) == 0

def make_std_mask(tgt, pad):
    "Create a mask to hide padding and future words."
    tgt_mask = (tgt != pad).unsqueeze(-2)
    tgt_mask = tgt_mask & Variable(
        subsequent_mask(tgt.size(-1)).type_as(tgt_mask.data))
    return tgt_mask

In [9]:
from torch.nn.utils.rnn import pad_sequence

# helper function to club together sequential operations
def sequential_transforms(*transforms):
    def func(txt_input):
        for transform in transforms:
            txt_input = transform(txt_input)
        return txt_input
    return func

# function to add BOS/EOS and create tensor for input sequence indices
def tensor_transform(token_ids: List[int]):
    return torch.cat((torch.tensor([BOS_IDX]),
                      torch.tensor(token_ids),
                      torch.tensor([EOS_IDX])))

# src and tgt language text transforms to convert raw strings into tensors indices
text_transform = {}
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    text_transform[ln] = sequential_transforms(token_transform[ln], #Tokenization
                                               vocab_transform[ln], #Numericalization
                                               tensor_transform) # Add BOS/EOS and create tensor


# function to collate data samples into batch tesors
def collate_fn(batch):
    src_batch, tgt_batch = [], []
    for src_sample, tgt_sample in batch:
        src_batch.append(text_transform[SRC_LANGUAGE](src_sample.rstrip("\n")))
        tgt_batch.append(text_transform[TGT_LANGUAGE](tgt_sample.rstrip("\n")))

    src_batch = pad_sequence(src_batch, padding_value=PAD_IDX)
    tgt_batch = pad_sequence(tgt_batch, padding_value=PAD_IDX)
    return src_batch, tgt_batch

In [None]:
# def get_penalty(model):
#     loss = 0
#     count = 0
#     for name in model.state_dict():
#         if 'L0' in name:

#             import pdb
#             pdb.set_trace()
#             loss += model.state_dict()[name]
#             count += 1
#     return loss / count

# def loss_penalised(loss_bce, model, reg):
#     # import pdb
#     # pdb.set_trace()
#     return loss_bce + reg * get_penalty(model)

In [10]:
def get_penalty(model):
    
    loss = (model.encoder.layers[0].self_attn.L0 + model.encoder.layers[1].self_attn.L0 + 
            model.encoder.layers[2].self_attn.L0 + 
            model.decoder.layers[0].self_attn.L0 + model.decoder.layers[1].self_attn.L0 +
            model.decoder.layers[2].self_attn.L0 + 
            model.decoder.layers[0].src_attn.L0 + model.decoder.layers[1].src_attn.L0 +
            model.decoder.layers[2].src_attn.L0)

    return loss / 9

def loss_penalised(loss_bce, model, reg):
    return loss_bce + reg * get_penalty(model)

In [11]:
from torch.utils.data import DataLoader

reg = 0.01

def train_epoch(model, optimizer, pruned=False):
    model.train()
    losses = 0
    train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    train_dataloader = DataLoader(train_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)

    for src, tgt in train_dataloader:
        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)

        # tgt_input = tgt
        tgt_input = tgt[:-1, :]
        # src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        src = src.transpose(0, 1)
        tgt_input = tgt_input.transpose(0, 1)

        src_mask = (src != PAD_IDX).unsqueeze(-2)
        tgt_mask = make_std_mask(tgt_input, PAD_IDX)

        # logits = model(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask)
        logits = model(src, tgt_input, src_mask, tgt_mask)
        logits = model.generator(logits)
        logits = logits.transpose(0, 1)


        optimizer.zero_grad()

        tgt_out = tgt[1:, :]

        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))

        if pruned:
            loss = loss_penalised(loss, model, reg)
        loss.backward()

        optimizer.step()
        losses += loss.item()

    return losses / len(train_dataloader)


def evaluate(model, pruned=False):
    model.eval()
    losses = 0

    val_iter = Multi30k(split='valid', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    val_dataloader = DataLoader(val_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)

    for src, tgt in val_dataloader:
        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)

        # tgt_input = tgt
        tgt_input = tgt[:-1, :]

        src = src.transpose(0, 1)
        tgt_input = tgt_input.transpose(0, 1)

        src_mask = (src != PAD_IDX).unsqueeze(-2)
        tgt_mask = make_std_mask(tgt_input, PAD_IDX)

        # src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        # logits = model(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask)

        logits = model(src, tgt_input, src_mask, tgt_mask)
        logits = model.generator(logits)
        logits = logits.transpose(0, 1)

        tgt_out = tgt[1:, :]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))

        if pruned:
            loss = loss_penalised(loss, model, reg)
        losses += loss.item()

    return losses / len(val_dataloader)


In [None]:
# from timeit import default_timer as timer
# NUM_EPOCHS = 18

# prune = False

# for epoch in range(1, NUM_EPOCHS+1):
#     start_time = timer()
#     train_loss = train_epoch(model, optimizer, pruned=prune)
#     end_time = timer()
#     val_loss = evaluate(model, pruned=prune)
#     print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, "f"Epoch time = {(end_time - start_time):.3f}s"))

Epoch: 1, Train loss: 5.235, Val loss: 4.073, Epoch time = 59.895s
Epoch: 2, Train loss: 3.737, Val loss: 3.352, Epoch time = 64.530s
Epoch: 3, Train loss: 3.181, Val loss: 2.966, Epoch time = 63.945s
Epoch: 4, Train loss: 2.796, Val loss: 2.683, Epoch time = 64.330s
Epoch: 5, Train loss: 2.501, Val loss: 2.477, Epoch time = 64.254s
Epoch: 6, Train loss: 2.255, Val loss: 2.339, Epoch time = 64.313s
Epoch: 7, Train loss: 2.054, Val loss: 2.225, Epoch time = 64.476s
Epoch: 8, Train loss: 1.883, Val loss: 2.131, Epoch time = 64.409s
Epoch: 9, Train loss: 1.742, Val loss: 2.076, Epoch time = 64.477s
Epoch: 10, Train loss: 1.607, Val loss: 2.017, Epoch time = 64.404s
Epoch: 11, Train loss: 1.497, Val loss: 2.002, Epoch time = 64.395s
Epoch: 12, Train loss: 1.397, Val loss: 1.991, Epoch time = 64.432s
Epoch: 13, Train loss: 1.305, Val loss: 2.004, Epoch time = 64.393s
Epoch: 14, Train loss: 1.224, Val loss: 1.982, Epoch time = 64.430s
Epoch: 15, Train loss: 1.147, Val loss: 1.968, Epoch time

In [None]:
# torch.save(model.state_dict(), "model.pt")

In [18]:
import project_part2 as modules
import imp
imp.reload(modules)


model = modules.make_model(SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, 3, pruning=True)
model.load_state_dict(torch.load("model.pt", map_location=DEVICE),strict=False)
model = model.to(DEVICE)

optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

  nn.init.xavier_uniform(p)


In [19]:
for name in model.state_dict():
    if 'log_a' in name:
        print(model.state_dict()[name])


tensor([-1., -1., -1., -1., -1., -1., -1., -1.], device='cuda:0')
tensor([-1., -1., -1., -1., -1., -1., -1., -1.], device='cuda:0')
tensor([-1., -1., -1., -1., -1., -1., -1., -1.], device='cuda:0')
tensor([-1., -1., -1., -1., -1., -1., -1., -1.], device='cuda:0')
tensor([-1., -1., -1., -1., -1., -1., -1., -1.], device='cuda:0')
tensor([-1., -1., -1., -1., -1., -1., -1., -1.], device='cuda:0')
tensor([-1., -1., -1., -1., -1., -1., -1., -1.], device='cuda:0')
tensor([-1., -1., -1., -1., -1., -1., -1., -1.], device='cuda:0')
tensor([-1., -1., -1., -1., -1., -1., -1., -1.], device='cuda:0')


In [20]:
from timeit import default_timer as timer
NUM_EPOCHS = 10

prune = True
for epoch in range(1, NUM_EPOCHS+1):
    start_time = timer()
    train_loss = train_epoch(model, optimizer, pruned=prune)
    end_time = timer()
    val_loss = evaluate(model, pruned=prune)
    print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, "f"Epoch time = {(end_time - start_time):.3f}s"))

Epoch: 1, Train loss: 3.564, Val loss: 5.917, Epoch time = 64.026s
Epoch: 2, Train loss: 3.246, Val loss: 5.873, Epoch time = 63.470s
Epoch: 3, Train loss: 3.114, Val loss: 6.193, Epoch time = 63.697s
Epoch: 4, Train loss: 3.029, Val loss: 6.028, Epoch time = 63.772s
Epoch: 5, Train loss: 2.965, Val loss: 5.881, Epoch time = 63.835s
Epoch: 6, Train loss: 2.870, Val loss: 6.036, Epoch time = 63.988s
Epoch: 7, Train loss: 2.800, Val loss: 5.925, Epoch time = 63.879s
Epoch: 8, Train loss: 2.754, Val loss: 5.924, Epoch time = 63.821s
Epoch: 9, Train loss: 2.686, Val loss: 6.019, Epoch time = 63.733s
Epoch: 10, Train loss: 2.645, Val loss: 5.981, Epoch time = 63.789s


In [21]:
for name in model.state_dict():
    if 'log_a' in name:
        print(model.state_dict()[name])

tensor([-1.0738, -1.0532, -1.0645, -1.0651, -1.0434, -1.0551, -1.0622, -1.0574],
       device='cuda:0')
tensor([-1.1560, -1.1823, -1.1647, -1.1514, -1.1686, -1.1264, -1.1351, -1.1370],
       device='cuda:0')
tensor([-1.2156, -1.2210, -1.2083, -1.2142, -1.1929, -1.2121, -1.2164, -1.2171],
       device='cuda:0')
tensor([-1.0711, -1.0816, -1.0465, -1.0960, -1.0847, -1.0490, -1.0461, -1.0731],
       device='cuda:0')
tensor([-1.0320, -1.0416, -1.0709, -1.0677, -1.0247, -1.0646, -1.0756, -1.0368],
       device='cuda:0')
tensor([-1.1216, -1.1259, -1.1274, -1.1230, -1.1859, -1.1526, -1.1257, -1.1611],
       device='cuda:0')
tensor([-1.0166, -1.0278, -1.0306, -1.0439, -1.0284, -1.0256, -1.0388, -1.0510],
       device='cuda:0')
tensor([-1.1823, -1.1922, -1.2087, -1.2043, -1.2004, -1.1923, -1.1844, -1.1870],
       device='cuda:0')
tensor([-1.0573, -1.0283, -1.0327, -1.0335, -1.0524, -1.0499, -1.0386, -1.0401],
       device='cuda:0')


In [15]:
for name in model.state_dict():
    if 'log_a' in name:
        print(model.state_dict()[name])

tensor([0.9276, 0.9380, 0.9437, 0.9008, 0.9455, 0.9289, 0.9310, 0.9360],
       device='cuda:0')
tensor([0.7990, 0.7888, 0.7947, 0.8069, 0.8021, 0.8222, 0.7984, 0.8075],
       device='cuda:0')
tensor([0.7780, 0.7791, 0.7819, 0.7782, 0.7852, 0.7790, 0.7772, 0.7807],
       device='cuda:0')
tensor([0.8897, 0.8649, 0.9213, 0.8338, 0.8869, 0.9179, 0.9033, 0.8673],
       device='cuda:0')
tensor([0.8673, 0.8737, 0.8338, 0.8506, 0.8389, 0.8424, 0.8383, 0.8914],
       device='cuda:0')
tensor([0.8369, 0.8120, 0.8131, 0.8129, 0.7894, 0.8055, 0.8084, 0.8083],
       device='cuda:0')
tensor([0.8511, 0.8490, 0.8539, 0.8524, 0.8412, 0.8483, 0.8575, 0.8645],
       device='cuda:0')
tensor([0.7852, 0.7865, 0.7802, 0.7806, 0.7791, 0.7846, 0.7860, 0.7856],
       device='cuda:0')
tensor([0.8381, 0.8504, 0.8575, 0.8446, 0.8380, 0.8413, 0.8484, 0.8438],
       device='cuda:0')


In [14]:
for name in model.state_dict():
    if 'log_a' in name:
        print(model.state_dict()[name])


tensor([0.9187, 0.9491, 0.9551, 0.9114, 0.9647, 0.9513, 0.9256, 0.9518],
       device='cuda:0')
tensor([0.7909, 0.7851, 0.7917, 0.7937, 0.7878, 0.8185, 0.7952, 0.7912],
       device='cuda:0')
tensor([0.7768, 0.7765, 0.7770, 0.7758, 0.7787, 0.7773, 0.7757, 0.7764],
       device='cuda:0')
tensor([0.8677, 0.8561, 0.9204, 0.8310, 0.8815, 0.9197, 0.9119, 0.8596],
       device='cuda:0')
tensor([0.8631, 0.8621, 0.8133, 0.8460, 0.8335, 0.8200, 0.8250, 0.8618],
       device='cuda:0')
tensor([0.8142, 0.7970, 0.7921, 0.7973, 0.7832, 0.7972, 0.7920, 0.7967],
       device='cuda:0')
tensor([0.8346, 0.8336, 0.8448, 0.8210, 0.8455, 0.8319, 0.8329, 0.8353],
       device='cuda:0')
tensor([0.7805, 0.7807, 0.7772, 0.7766, 0.7769, 0.7799, 0.7803, 0.7796],
       device='cuda:0')
tensor([0.8172, 0.8384, 0.8282, 0.8259, 0.8208, 0.8169, 0.8235, 0.8313],
       device='cuda:0')


In [None]:
# torch.save(model.state_dict(), "model_pruned_18_7.pt")

In [15]:
model.encoder.layers[0].self_attn.log_a

Parameter containing:
tensor([0.9187, 0.9491, 0.9551, 0.9114, 0.9647, 0.9513, 0.9256, 0.9518],
       device='cuda:0', requires_grad=True)

In [17]:
model.decoder.layers[2]

DecoderLayer(
  (self_attn): MultiHeadedAttention(
    (linears): ModuleList(
      (0): Linear(in_features=512, out_features=512, bias=True)
      (1): Linear(in_features=512, out_features=512, bias=True)
      (2): Linear(in_features=512, out_features=512, bias=True)
      (3): Linear(in_features=512, out_features=512, bias=True)
    )
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (src_attn): MultiHeadedAttention(
    (linears): ModuleList(
      (0): Linear(in_features=512, out_features=512, bias=True)
      (1): Linear(in_features=512, out_features=512, bias=True)
      (2): Linear(in_features=512, out_features=512, bias=True)
      (3): Linear(in_features=512, out_features=512, bias=True)
    )
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (feed_forward): PositionwiseFeedForward(
    (w_1): Linear(in_features=512, out_features=2048, bias=True)
    (w_2): Linear(in_features=2048, out_features=512, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (sublayer): M

In [None]:
model.decoder.layers[2].src_attn.L0

Parameter containing:
tensor(438.8282, requires_grad=True)

In [None]:
def generate_square_subsequent_mask(sz):
    mask = (torch.triu(torch.ones((sz, sz), device=DEVICE)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask


def create_mask(src, tgt):
    src_seq_len = src.shape[0]
    tgt_seq_len = tgt.shape[0]

    tgt_mask = generate_square_subsequent_mask(tgt_seq_len)
    src_mask = torch.zeros((src_seq_len, src_seq_len),device=DEVICE).type(torch.bool)

    src_padding_mask = (src == PAD_IDX).transpose(0, 1)
    tgt_padding_mask = (tgt == PAD_IDX).transpose(0, 1)
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

In [None]:
# function to generate output sequence using greedy algorithm
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)

    c = 1

    memory = model.encode(src, src_mask)

    ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)
    for i in range(max_len-1):
        memory = memory.to(DEVICE)
        tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                    .type(torch.bool)).to(DEVICE)

        tgt_mask = torch.unsqueeze(tgt_mask, 0)

        out = model.decode(memory, src_mask, ys.transpose(0,1), tgt_mask)
        prob = model.generator(out)

        _, next_word = torch.max(prob[:,-1,:], dim=1)


        # if c > 1:
        #     import pdb
        #     pdb.set_trace()
        next_word = next_word.item()

        ys = torch.cat([ys,
                        torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
        if next_word == EOS_IDX:
            break
        c+=1
    return ys


# actual function to translate input sentence into target language
def translate(model: torch.nn.Module, src_sentence: str):
    model.eval()
    src = text_transform[SRC_LANGUAGE](src_sentence).view(-1, 1)
    num_tokens = src.shape[0]

    src = src.transpose(0, 1)

    # src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
    src_mask = (src != PAD_IDX).unsqueeze(-2)

    

    tgt_tokens = greedy_decode(
        model,  src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX).flatten()
    return " ".join(vocab_transform[TGT_LANGUAGE].lookup_tokens(list(tgt_tokens.cpu().numpy()))).replace("<bos>", "").replace("<eos>", "")

print(translate(model, "Eine Gruppe von Menschen steht vor einem Iglu ."))


 A group of people stand in front of an indoor Asia . 
