In [1]:
import sys
sys.path.append('../src')

import torch
import torch.nn as nn
import torch.optim as optim

import os
import json
import time
import random
import numpy as np
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix, f1_score, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score

from IPython.display import Image

In [2]:
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('cpu')

SEED = 1
random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

batch_size = 8
# Percentage of training data
learning_rate = 0.001
epochs = 100

# Double-copy task dataset

The entire dataset comprises of the binary representation of all numbers uptil a range defined. The binary sequence from left to right (most significant to least significant) is the input. The target is just the reverse sequence.

In [108]:
import copy

# Generating data
state_size = 12
data_x = []
for i in range(pow(2, state_size)):
    data_x.append([int(x) for x in list(np.binary_repr(i, width=state_size))])
data_x = np.array(data_x)
data_x.shape

(4096, 12)

In [109]:
# Reshaping for tensors
data_x = np.transpose(data_x).reshape(state_size, pow(2, state_size), 1)
data_x = torch.from_numpy(data_x).float()
data_x = torch.zeros(data_x.shape[0], data_x.shape[1], 2).scatter_(2, data_x.long(), 1)
data_y = data_x.clone()
data_x.shape, data_y.shape

(torch.Size([12, 4096, 2]), torch.Size([12, 4096, 2]))

In [110]:
# Doubling the target sequence
data_y = torch.cat((data_y, data_y), dim=0)

In [111]:
# add a start-of-sequence tag (1,1)
START = torch.ones(data_x.shape[-1]).long().to(device)
data_x = torch.nn.functional.pad(data_x, (0,0,0,0,1,0), 'constant', 1)
data_y = torch.nn.functional.pad(data_y, (0,0,0,0,1,0), 'constant', 1)

In [112]:
# Creating training and test sets
train_size = 0.75
ordering = torch.randperm(pow(2, state_size))
data_x = data_x[:, ordering, :]
data_y = data_y[:, ordering, :]
train_x = data_x[:,:int(train_size * len(ordering)),:]
train_y = data_y[:,:int(train_size * len(ordering)),:]
test_x = data_x[:,int(train_size * len(ordering)):,:]
test_y = data_y[:,int(train_size * len(ordering)):,:]

# Creating training and validation sets
## TODO

print(train_x.shape, train_y.shape, test_x.shape, test_y.shape)

torch.Size([13, 3072, 2]) torch.Size([25, 3072, 2]) torch.Size([13, 1024, 2]) torch.Size([25, 1024, 2])


# LSTM

## Modelling

In [3]:
# Input dim
input_dim = 2
# Number of hidden nodes
hidden_dim = 16
# Number of output nodes
output_dim = 2
# Number of LSTMs cells to be stacked
layers = 1
# Boolean value for bidirectioanl or not
bidirectional = True
# Boolean value to use LayerNorm or not
layernorm = False

In [4]:
def train(model, train_x, train_y, test_x, test_y, epochs, loss_fn, optimizer, teacher_forcing=0.5):
    train_size = train_x.shape[1]
    device = torch.device("cpu")
    if train_x.is_cuda:
        device = torch.device("cuda")
    layers = model.layers
    hidden_dim = model.hidden_dim
    for i in range(1, epochs + 1):
        model.train()
        loss_tracker = []
        ordering = torch.randperm(train_size)
        train_x = train_x[:,ordering,:]
        train_y = train_y[:,ordering,:]
        
        epoch_time = time.time()
        
        for j in range(int(float(train_size)/batch_size) + 1):
            optimizer.zero_grad()
            start = j*batch_size
            end = min((j+1)*batch_size, train_size)
            batch = end - start
            if batch is 0:
                continue
                
            st = time.time()
            
            if model.bidirectional:
                hidden_state = torch.zeros(2 * layers, batch, hidden_dim).to(device)
                cell_state = torch.zeros(2 * layers, batch, hidden_dim).to(device)
            else:
                hidden_state = torch.zeros(layers, batch, hidden_dim).to(device)
                cell_state = torch.zeros(layers, batch, hidden_dim).to(device)
            o = model(train_x[:,start:end,:], train_y[:,start:end,:], hidden_state, 
                      cell_state, teacher_forcing)
            gt = torch.argmax(train_y[:,start:end,:], 2, keepdim=True).view(-1)
            loss = loss_fn(o.view(-1, 2), gt)
            loss_tracker.append(loss.item())
            loss.backward()
            optimizer.step()
            print("Epoch #{:<3d}: Batch {:>3d}/{:<3d} -- "
                  "Loss: {:2.5}  time: {:2.5}".format(i, j+1, int(train_size/batch_size), 
                                        loss_tracker[-1], (time.time()-st)), end='\r')
        print()
        f1_train = evaluate(model, train_x, train_y)
        f1_test = evaluate(model, test_x, test_y)
        print("Average Loss: {:2.6}".format(np.mean(loss_tracker)))
        print("Total time: {:2.5}".format(time.time() - epoch_time))
        print("Training F1: {:3.4}".format(f1_train))
        print("Test F1: {:3.4}".format(f1_test))
        print("=" * 50)
    
    return model


def evaluate(model, x, y):
    model.eval()
    test_size = x.shape[1]
    device = torch.device("cpu")
    if x.is_cuda:
        device = torch.device("cuda")
    layers = model.layers
    hidden_dim = model.hidden_dim
    labels = []
    preds = []
    for j in range(int(test_size/batch_size) + 1):
        optimizer.zero_grad()
        start = j*batch_size
        end = min((j+1)*batch_size, test_size)
        batch = end - start
        if batch == 0:
            continue
        if model.bidirectional:
            hidden_state = torch.zeros(2 * layers, batch, hidden_dim).to(device)
            cell_state = torch.zeros(2 * layers, batch, hidden_dim).to(device)
        else:
            hidden_state = torch.zeros(layers, batch, hidden_dim).to(device)
            cell_state = torch.zeros(layers, batch, hidden_dim).to(device)
        with torch.no_grad():
            o = model(x[:,start:end,:], y[:,start:end,:], hidden_state, cell_state, teacher_forcing=0)
        pred = torch.argmax(o, 2, keepdim=True).view(-1).cpu().detach().numpy()
        preds.extend(pred)
        label = torch.argmax(y[:,start:end,:], 2, 
                             keepdim=True).view(-1).cpu().detach().numpy()
        labels.extend(label)
    return f1_score(labels, preds)

## Our implementation

In [5]:
from lstm import LSTM

class LSTMSeq2SeqDifferent(nn.Module):
    """ LSTM Class for Sequence Labelling (many-to-many-different)

    The class creates the LSTM architecture as specified by the parameters.
    A fully connected layer is added to reduce the last hidden state to output_dim.

    Parameters
    ==========
    vocab_len: int from imdb dataset
    embed_dim: dimensions of the embeddings
    hidden_dim: number of hidden nodes required
    output_dim: numer of output nodes required (1 for sentiment analysis)
    pretrained_vec: weights from imdb object
    layers: number of LSTM cells to be stacked for depth
    bidirectional: boolean
    layernorm: boolean

    """
    def __init__(self, input_dim, hidden_dim, output_dim, layers=1,
                 bidirectional=False, layernorm=False):
        super().__init__()

        self.input_dim = input_dim
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.layers = layers
        self.bidirectional = bidirectional
        self.layernorm = layernorm

        self.encoder = LSTM(input_dim=input_dim, hidden_dim=hidden_dim, layers=layers,
                         bidirectional=bidirectional, layernorm=layernorm)
        if self.bidirectional:
            self.decoder = LSTM(input_dim=output_dim, hidden_dim=2 * hidden_dim, layers=layers,
                                bidirectional=False, layernorm=layernorm)
            self.fc = nn.Linear(2 * hidden_dim, output_dim)
        else:
            self.decoder = LSTM(input_dim=output_dim, hidden_dim=hidden_dim, layers=layers,
                                bidirectional=False, layernorm=layernorm)
            self.fc = nn.Linear(hidden_dim, output_dim)
        self.softmax = nn.Softmax(dim=2)

    def forward(self, x, target, hidden_state, cell_state, teacher_forcing=0.5):
        device = 'cpu'
        if x.is_cuda:
            device = 'cuda'
        # encoding
        _, (hidden_state, cell_state) = self.encoder(x, hidden_state, cell_state)
        batch_size = x.shape[1]
        timesteps = target.shape[0]
        x = torch.zeros(1, batch_size, self.output_dim).to(device)
        output = torch.tensor([]).to(device)
        if self.bidirectional:
            # concatenating hidden states from two directions
            hidden_state = torch.cat((hidden_state[:self.layers,:,:], 
                                      hidden_state[self.layers:,:,:]), dim=2)
            cell_state = torch.cat((cell_state[:self.layers,:,:], 
                                    cell_state[self.layers:,:,:]), dim=2)
        # decoding
        for t in range(timesteps):           
            x, (hidden_state, cell_state) = self.decoder(x, hidden_state, cell_state)            
            x = self.softmax(self.fc(x))
            output = torch.cat((output, x), dim=0)
            choice = random.random() 
            if choice < teacher_forcing:
                x = target[t].float().to(device)
                x = x.unsqueeze(0)
            else:
                # converting x to a one-hot encoding
                x = torch.zeros(x.shape).to(device).scatter_(2, torch.argmax(x, -1, keepdim=True), 1)
        return output

    def save(self, file_path='./model.pkl'):
        torch.save(self.state_dict(), file_path)

    def load(self, file_path):
        self.load_state_dict(torch.load(file_path))

    def count_parameters(self):
        tot_sum = sum(p.numel() for p in self.encoder.parameters() if p.requires_grad)
        tot_sum += sum(p.numel() for p in self.decoder.parameters() if p.requires_grad)
        tot_sum += sum(p.numel() for p in self.fc.parameters() if p.requires_grad)
        return tot_sum


In [6]:
our = LSTMSeq2SeqDifferent(input_dim, hidden_dim, output_dim, bidirectional=bidirectional, layers=layers).to(device)
print(our.count_parameters())
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(our.parameters(), lr=learning_rate)

6978


In [None]:
train_x = train_x.to(device)
train_y = train_y.to(device)
test_x = test_x.to(device)
test_y = test_y.to(device)

our = train(our, train_x, train_y, test_x, test_y, epochs=30, loss_fn=loss_fn, optimizer=optimizer, teacher_forcing=0.5)

## PyTorch implementation

In [7]:
class PyTorchBaseline(nn.Module):
    """ LSTM Class for Sequence Labelling (many-to-many-different)

    The class creates the LSTM architecture as specified by the parameters.
    A fully connected layer is added to reduce the last hidden state to output_dim.

    Parameters
    ==========
    vocab_len: int from imdb dataset
    embed_dim: dimensions of the embeddings
    hidden_dim: number of hidden nodes required
    output_dim: numer of output nodes required (1 for sentiment analysis)
    pretrained_vec: weights from imdb object
    layers: number of LSTM cells to be stacked for depth
    bidirectional: boolean
    layernorm: boolean

    """
    def __init__(self, input_dim, hidden_dim, output_dim, layers=1,
                 bidirectional=False, layernorm=False):
        super().__init__()

        self.input_dim = input_dim
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.layers = layers
        self.bidirectional = bidirectional
        self.layernorm = layernorm

        self.encoder = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim, num_layers=layers,
                         bidirectional=bidirectional) #, layernorm=layernorm)
        if self.bidirectional:
            self.decoder = nn.LSTM(input_size=output_dim, hidden_size=2 * hidden_dim, num_layers=layers,
                                bidirectional=False) #, layernorm=layernorm)
            self.fc = nn.Linear(2 * hidden_dim, output_dim)
        else:
            self.decoder = nn.LSTM(input_size=output_dim, hidden_size=hidden_dim, num_layers=layers,
                                bidirectional=False) #, layernorm=layernorm)
            self.fc = nn.Linear(hidden_dim, output_dim)
        self.softmax = nn.Softmax(dim=2)

    def forward(self, x, target, hidden_state, cell_state, teacher_forcing=0.5):
        device = 'cpu'
        if x.is_cuda:
            device = 'cuda'
        # encoding
        _, (hidden_state, cell_state) = self.encoder(x, (hidden_state, cell_state))
        batch_size = x.shape[1]
        timesteps = target.shape[0]
        x = torch.zeros(1, batch_size, self.output_dim).to(device)
        output = torch.tensor([]).to(device)
        if self.bidirectional:
            # concatenating hidden states from two directions
            hidden_state = torch.cat((hidden_state[:self.layers,:,:], 
                                      hidden_state[self.layers:,:,:]), dim=2)
            cell_state = torch.cat((cell_state[:self.layers,:,:], 
                                    cell_state[self.layers:,:,:]), dim=2)
        # decoding
        for t in range(timesteps):           
            x, (hidden_state, cell_state) = self.decoder(x, (hidden_state, cell_state))
            x = self.softmax(self.fc(x))
            output = torch.cat((output, x), dim=0)
            choice = random.random() 
            if choice < teacher_forcing:
                x = target[t].float().to(device)
                x = x.unsqueeze(0)
            else:
                # converting x to a one-hot encoding
                x = torch.zeros(x.shape).to(device).scatter_(2, torch.argmax(x, -1, keepdim=True), 1)
        return output

    def save(self, file_path='./model.pkl'):
        torch.save(self.state_dict(), file_path)

    def load(self, file_path):
        self.load_state_dict(torch.load(file_path))

    def count_parameters(self):
        tot_sum = sum(p.numel() for p in self.encoder.parameters() if p.requires_grad)
        tot_sum += sum(p.numel() for p in self.decoder.parameters() if p.requires_grad)
        tot_sum += sum(p.numel() for p in self.fc.parameters() if p.requires_grad)
        return tot_sum


In [8]:
pytorch = PyTorchBaseline(input_dim, hidden_dim, output_dim, layers, bidirectional).to(device)
print(pytorch.count_parameters())
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(pytorch.parameters(), lr=learning_rate)

7234


In [16]:
train_x = train_x.to(device)
train_y = train_y.to(device)
test_x = test_x.to(device)
test_y = test_y.to(device)

pytorch = train(pytorch, train_x, train_y, test_x, test_y, epochs=30, loss_fn=loss_fn, optimizer=optimizer, teacher_forcing=0.5)

Epoch #1  : Batch 384/384 -- Loss: 0.64964
Average Loss: 0.672174
Training F1: 0.6418
Test F1: 0.6348
Epoch #2  : Batch 384/384 -- Loss: 0.62229
Average Loss: 0.62696
Training F1: 0.6057
Test F1: 0.5931
Epoch #3  : Batch 384/384 -- Loss: 0.61447
Average Loss: 0.592932
Training F1: 0.6805
Test F1: 0.6745
Epoch #4  : Batch 384/384 -- Loss: 0.57014
Average Loss: 0.568732
Training F1: 0.6969
Test F1: 0.6871
Epoch #5  : Batch 384/384 -- Loss: 0.57319
Average Loss: 0.555202
Training F1: 0.7054
Test F1: 0.699
Epoch #6  : Batch 384/384 -- Loss: 0.55299
Average Loss: 0.547974
Training F1: 0.6893
Test F1: 0.6802
Epoch #7  : Batch 384/384 -- Loss: 0.51903
Average Loss: 0.542967
Training F1: 0.6951
Test F1: 0.6885
Epoch #8  : Batch 384/384 -- Loss: 0.53368
Average Loss: 0.534724
Training F1: 0.7279
Test F1: 0.7192
Epoch #9  : Batch 384/384 -- Loss: 0.50965
Average Loss: 0.528096
Training F1: 0.7049
Test F1: 0.6949
Epoch #10 : Batch 384/384 -- Loss: 0.51742
Average Loss: 0.523615
Training F1: 0.727

In [0]:
print("Our implementation\n{}".format("=" * len("Our implementation")))
print("# of parameters: {}".format(our.count_parameters()))
for name, param in our.named_parameters():
    print("{:<25}: {}".format(name, param.shape))

In [18]:
print("PyTorch implementation\n{}".format("=" * len("PyTorch implementation")))
print("# of parameters: {}".format(pytorch.count_parameters()))
for name, param in pytorch.named_parameters():
    print("{:<30}: {}".format(name, param.shape))

PyTorch implementation
# of parameters: 7234
encoder.weight_ih_l0          : torch.Size([64, 2])
encoder.weight_hh_l0          : torch.Size([64, 16])
encoder.bias_ih_l0            : torch.Size([64])
encoder.bias_hh_l0            : torch.Size([64])
encoder.weight_ih_l0_reverse  : torch.Size([64, 2])
encoder.weight_hh_l0_reverse  : torch.Size([64, 16])
encoder.bias_ih_l0_reverse    : torch.Size([64])
encoder.bias_hh_l0_reverse    : torch.Size([64])
decoder.weight_ih_l0          : torch.Size([128, 2])
decoder.weight_hh_l0          : torch.Size([128, 32])
decoder.bias_ih_l0            : torch.Size([128])
decoder.bias_hh_l0            : torch.Size([128])
fc.weight                     : torch.Size([2, 32])
fc.bias                       : torch.Size([2])


PyTorch uses $Wh + b_h + Wx + b_x$ whereas we are using $Wx' + b$, where $x'$ is $h, x$ concatenated. Therefore PyTorch has an extra set of biases for each direction for the encoder and also for the decoder.

For one direction - 64 <br>
For reverse direction - 64 <br>
For the decoder - 128 <br>

Our model has $6978$ parameters while the PyTorch model has $6978 + 64 + 64 + 128 = 7234$ parameters.

# Transformer

## harvard NLP

In [9]:
from torch.autograd import Variable

## Data stuff
class Batch:
    "Object for holding a batch of data with mask during training."
    def __init__(self, src, trg=None, pad=0):
        self.src = src
        self.src_mask = (src != pad).unsqueeze(-2)
        if trg is not None:
            self.trg = trg[:, :-1]
            self.trg_y = trg[:, 1:]
            self.trg_mask = \
                self.make_std_mask(self.trg, pad)
            self.ntokens = (self.trg_y != pad).data.sum()
    
    @staticmethod
    def make_std_mask(tgt, pad):
        "Create a mask to hide padding and future words."
        tgt_mask = (tgt != pad).unsqueeze(-2)
        tgt_mask = tgt_mask & Variable(
            subsequent_mask(tgt.size(-1)).type_as(tgt_mask.data))
        return tgt_mask

      
def subsequent_mask(size):
    "Mask out subsequent positions."
    attn_shape = (1, size, size)
    subsequent_mask = np.triu(np.ones(attn_shape), k=1).astype('uint8')
    return torch.from_numpy(subsequent_mask) == 0


def data_gen(V, batch, nbatches):
    "Generate random data for a src-tgt copy task."
    for i in range(nbatches):
        data = torch.from_numpy(np.random.randint(2, V, size=(batch, 10)))
        src = Variable(data, requires_grad=False)
        tgt = Variable(data.repeat(1, 2), requires_grad=False)
        src = torch.nn.functional.pad(src, (1,0), 'constant', 1)
        tgt = torch.nn.functional.pad(tgt, (1,0), 'constant', 1)
        yield Batch(src, tgt, 0)
        

global max_src_in_batch, max_tgt_in_batch
def batch_size_fn(new, count, sofar):
    "Keep augmenting batch and calculate total number of tokens + padding."
    global max_src_in_batch, max_tgt_in_batch
    if count == 1:
        max_src_in_batch = 0
        max_tgt_in_batch = 0
    max_src_in_batch = max(max_src_in_batch,  len(new.src))
    max_tgt_in_batch = max(max_tgt_in_batch,  len(new.trg) + 2)
    src_elements = count * max_src_in_batch
    tgt_elements = count * max_tgt_in_batch
    return max(src_elements, tgt_elements)
        
        
## Optimizer
class NoamOpt:
    "Optim wrapper that implements rate."
    def __init__(self, model_size, factor, warmup, optimizer):
        self.optimizer = optimizer
        self._step = 0
        self.warmup = warmup
        self.factor = factor
        self.model_size = model_size
        self._rate = 0
        
    def step(self):
        "Update parameters and rate"
        self._step += 1
        rate = self.rate()
        for p in self.optimizer.param_groups:
            p['lr'] = rate
        self._rate = rate
        self.optimizer.step()
        
    def rate(self, step = None):
        "Implement `lrate` above"
        if step is None:
            step = self._step
        return self.factor * \
            (self.model_size ** (-0.5) *
            min(step ** (-0.5), step * self.warmup ** (-1.5)))
        
def get_std_opt(model):
    return NoamOpt(model.src_embed[0].d_model, 2, 4000,
            torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))
  

## Loss function
class LabelSmoothing(nn.Module):
    "Implement label smoothing."
    def __init__(self, size, padding_idx, smoothing=0.0):
        super(LabelSmoothing, self).__init__()
        self.criterion = nn.KLDivLoss(size_average=False)
        self.padding_idx = padding_idx
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.size = size
        self.true_dist = None
        
    def forward(self, x, target):
        assert x.size(1) == self.size
        true_dist = x.data.clone()
        true_dist.fill_(self.smoothing / (self.size - 2))
        true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
        true_dist[:, self.padding_idx] = 0
        mask = torch.nonzero(target.data == self.padding_idx)
        if mask.dim() > 0:
            true_dist.index_fill_(0, mask.squeeze(), 0.0)
        self.true_dist = true_dist
        return self.criterion(x, Variable(true_dist, requires_grad=False))

      
class SimpleLossCompute:
    "A simple loss compute and train function."
    def __init__(self, criterion, opt=None):
        self.criterion = criterion
        self.opt = opt
        
    def __call__(self, x, y, norm, plot_grad=None):
        loss = self.criterion(x.contiguous().view(-1, x.size(-1)), 
                              y.contiguous().view(-1)) / norm
        loss.backward()
        # model as parameters for plotting gradient
        if plot_grad:
            plot_grad_flow(plot_grad.named_parameters())
        if self.opt is not None:
            self.opt.step()
            self.opt.optimizer.zero_grad()
        return loss * norm

      
## Training loop
def run_epoch(data_iter, model, loss_compute, plot_grad=False):
    "Standard Training and Logging Function"
    start = time.time()
    total_tokens = 0
    total_loss = 0
    tokens = 0
    plot_grad = model if plot_grad else None
    for i, batch in enumerate(data_iter):
        out = model.forward(batch.src, batch.trg)
        loss = loss_compute(out, batch.trg_y, batch.ntokens, plot_grad)
        total_loss += loss
        total_tokens += batch.ntokens
        tokens += batch.ntokens
        if i % 50 == 1:
            elapsed = time.time() - start
            print("Epoch Step: %d Loss: %f Tokens per Sec: %f" %
                (i, loss.item() / float(batch.ntokens), tokens.item() / elapsed))
            start = time.time()
            tokens = 0
    return total_loss / total_tokens

In [10]:
V = 10
b = batch = next(data_gen(V, 30, 20))
print(b.src[0])
print(b.trg[0])
print(b.trg_y[0])

tensor([1, 2, 9, 5, 5, 9, 8, 4, 7, 2, 6])
tensor([1, 2, 9, 5, 5, 9, 8, 4, 7, 2, 6, 2, 9, 5, 5, 9, 8, 4, 7, 2])
tensor([2, 9, 5, 5, 9, 8, 4, 7, 2, 6, 2, 9, 5, 5, 9, 8, 4, 7, 2, 6])


In [11]:
import transformer_baseline as tb

## Model
class TransformerBaselineNLP(nn.Module):
    def __init__(self, in_dim, out_dim, N, heads, model_dim, key_dim, value_dim, ff_dim, 
                 max_len=10000, batch_first=True, pad=tb.Constants.PAD):
        super().__init__()
        self.name = 'transformer'
        
        self.batch_first = batch_first
        self.pad = pad
        
        self.encoder = tb.Models.Encoder(
            n_src_vocab=in_dim, len_max_seq=max_len,
            d_word_vec=model_dim, d_model=model_dim, d_inner=ff_dim,
            n_layers=N, n_head=heads, d_k=key_dim, d_v=value_dim,
            dropout=0.0, embedding='embed')
        
        self.decoder = tb.Models.Decoder(
            n_tgt_vocab=out_dim, len_max_seq=max_len,
            d_word_vec=model_dim, d_model=model_dim, d_inner=ff_dim,
            n_layers=N, n_head=heads, d_k=key_dim, d_v=value_dim,
            dropout=0.0, embedding='embed')
        
        self.fc = nn.Linear(model_dim, out_dim, bias=False)
        
        # This was important from their code. 
        # Initialize parameters with Glorot / fan_avg.
        for p in self.parameters():
            if p.dim() > 1 and p.requires_grad:
                nn.init.xavier_uniform_(p)
    
    def forward(self, x, t):
        
        if not self.batch_first:
            x = x.transpose(0,1)
            t = t.transpose(0,1)
        
        # encoder requires source sequence & positions of each 
        x_pos = torch.arange(x.shape[1]).unsqueeze(0).repeat(x.shape[0], 1)
        t_pos = torch.arange(t.shape[1]).unsqueeze(0).repeat(t.shape[0], 1)
        
        # -- Prepare masks
        # encoder
        e_slf_attn_mask = tb.Models.get_attn_key_pad_mask(seq_k=x, seq_q=x, pad=self.pad)
        e_non_pad_mask = torch.ones(x.shape[0], x.shape[1], 1)
        e_non_pad_mask = tb.Models.get_non_pad_mask(x, pad=self.pad)
        
        # decoder
        d_non_pad_mask = tb.Models.get_non_pad_mask(t)
        slf_attn_mask_subseq = tb.Models.get_subsequent_mask(t)
        slf_attn_mask_keypad = tb.Models.get_attn_key_pad_mask(seq_k=t, seq_q=t)
        d_slf_attn_mask = (slf_attn_mask_keypad + slf_attn_mask_subseq).gt(0)
        d_dec_enc_attn_mask = tb.Models.get_attn_key_pad_mask(seq_k=x, seq_q=t)
        
        enc_attn, *_ = self.encoder(x, x_pos, e_slf_attn_mask, e_non_pad_mask)
        attn, *_ = self.decoder(t, t_pos, x, enc_attn, 
                                  d_slf_attn_mask, d_non_pad_mask, d_dec_enc_attn_mask)
        x = self.fc(attn)
        
        if not self.batch_first:
            x = x.transpose(0,1)
        return x
    
    def save(self, file_path='./model.pkl'):
        torch.save(self.state_dict(), file_path)

    def load(self, file_path):
        self.load_state_dict(torch.load(file_path))

    def count_parameters(self):
        tot_sum = sum(p.numel() for p in self.parameters() if p.requires_grad)
        return tot_sum
    
    def generate(self, x, start_symbol, max_len=1):
        
        if not self.batch_first:
            x = x.transpose(0,1)
        
        # initialize target with start symbol
        t = torch.tensor(start_symbol).view(1,-1).repeat(x.shape[0],1).long()
        
        for i in range(max_len-1):    
            pred = self.forward(x, t)
            # get last prediction & combine with target for next iteration
            # pred = torch.zeros_like(pred).scatter_(2, torch.argmax(pred, -1, keepdim=True), 1)
            pred = torch.argmax(pred, -1)
            t = torch.cat((t, pred[:,-1:]), dim=1)
            
        if not self.batch_first:
            t = t.transpose(0,1)
        return t

In [12]:
# Train the simple copy task.
V = 11
# criterion = LabelSmoothing(size=V, padding_idx=0, smoothing=0.0)
criterion = nn.CrossEntropyLoss()
model = TransformerBaselineNLP(in_dim=V, out_dim=V, N=1, heads=4, model_dim=16, key_dim=4, value_dim=4, ff_dim=64, 
                            batch_first=True, pad=0)

print('Model parameters: ', model.count_parameters())


Model parameters:  8208


In [23]:
model_opt = NoamOpt(16, 1, 400,
        torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))

for epoch in range(20):
    print("---- Epoch #", epoch)
    model.train()
    run_epoch(data_gen(V, 30, 20), model, 
              SimpleLossCompute(criterion, model_opt), plot_grad=False)
    model.eval()
    run_epoch(data_gen(V, 30, 10), model, 
              SimpleLossCompute(criterion, None))

Model parameters:  8208
---- Epoch # 0
Epoch Step: 1 Loss: 0.004773 Tokens per Sec: 13282.045679
Epoch Step: 1 Loss: 0.004339 Tokens per Sec: 15194.276296
---- Epoch # 1
Epoch Step: 1 Loss: 0.004164 Tokens per Sec: 13937.381586
Epoch Step: 1 Loss: 0.003875 Tokens per Sec: 15196.983046
---- Epoch # 2
Epoch Step: 1 Loss: 0.003827 Tokens per Sec: 14078.585754
Epoch Step: 1 Loss: 0.003746 Tokens per Sec: 14890.755785
---- Epoch # 3
Epoch Step: 1 Loss: 0.003720 Tokens per Sec: 13823.350343
Epoch Step: 1 Loss: 0.003687 Tokens per Sec: 14976.358778
---- Epoch # 4
Epoch Step: 1 Loss: 0.003668 Tokens per Sec: 13827.565763
Epoch Step: 1 Loss: 0.003636 Tokens per Sec: 15156.893556
---- Epoch # 5
Epoch Step: 1 Loss: 0.003623 Tokens per Sec: 13110.204422
Epoch Step: 1 Loss: 0.003518 Tokens per Sec: 15111.840774
---- Epoch # 6
Epoch Step: 1 Loss: 0.003520 Tokens per Sec: 14081.815664
Epoch Step: 1 Loss: 0.003419 Tokens per Sec: 15122.828925
---- Epoch # 7
Epoch Step: 1 Loss: 0.003387 Tokens per Sec:

In [24]:
print(b.src[:1])
print(b.trg[:1, :-1])
print('---')
print(b.trg_y[:1, :-1])
print('=====')
print('predictions')
pred = model(b.src[:1], b.trg[:1, :-1]).argmax(-1)
print(pred)
print('=====')
print('inference')
gen = model.generate(b.src[:1], start_symbol=1, max_len=b.trg.shape[1])
print(gen)

tensor([[1, 8, 7, 4, 8, 6, 3, 6, 9, 4, 9]])
tensor([[1, 8, 7, 4, 8, 6, 3, 6, 9, 4, 9, 8, 7, 4, 8, 6, 3, 6, 9]])
---
tensor([[8, 7, 4, 8, 6, 3, 6, 9, 4, 9, 8, 7, 4, 8, 6, 3, 6, 9, 4]])
=====
predictions
tensor([[8, 7, 4, 8, 6, 3, 6, 9, 4, 9, 8, 7, 4, 8, 6, 3, 6, 9, 4]])
=====
inference
tensor([[1, 8, 7, 4, 8, 6, 3, 6, 9, 4, 9, 8, 7, 4, 8, 6, 3, 6, 9, 4]])


## Modeling

In [146]:
def train_transformer(model, train_x, train_y, test_x, test_y, epochs, loss_fn, optimizer):
    train_size = train_x.shape[1]
    device = torch.device("cpu")
    if train_x.is_cuda:
        device = torch.device("cuda")
        
    for i in range(1, epochs + 1):
        model.train()
        loss_tracker = []
        ordering = torch.randperm(train_size)
        train_x = train_x[:,ordering,:]
        train_y = train_y[:,ordering,:]
        
        epoch_time = time.time()
        
        for j in range(int(float(train_size)/batch_size) + 1):
            optimizer.zero_grad()
            start = j*batch_size
            end = min((j+1)*batch_size, train_size)
            batch = end - start
            if batch is 0:
                continue
            # 0:(n-1) of target is sent to model
            # 1:n is used as label to predict
            target = train_y[:-1,start:end,:]
            label = train_y[1:,start:end,:]
            
            st = time.time()
            
            o = model(train_x[:,start:end,:], target)
            
            o = o.contiguous().view(-1, 2)
            gt = torch.argmax(label, 2, keepdim=True).view(-1)
            loss = loss_fn(o, gt)
            loss_tracker.append(loss.item())
            loss.backward()
            optimizer.step()
            print("Epoch #{:<3d}: Batch {:>3d}/{:<3d} -- "
                  "Loss: {:2.5}  time: {:2.5}".format(i, j+1, int(train_size/batch_size), 
                                        loss_tracker[-1], (time.time()-st)), end='\r')
        print()
        f1_train = evaluate_transformer(model, train_x, train_y)
        f1_test = evaluate_transformer(model, test_x, test_y)
        print("Average Loss: {:2.6}".format(np.mean(loss_tracker)))
        print("Total time: {:2.5}".format(time.time() - epoch_time))
        print("Training F1: {:3.4}".format(f1_train))
        print("Test F1: {:3.4}".format(f1_test))
        print("=" * 50)
    
    return model


def evaluate_transformer(model, x, y, start_token=START):
    model.eval()
    test_size = x.shape[1]
    device = torch.device("cpu")
    if x.is_cuda:
        device = torch.device("cuda")

    labels = []
    preds = []
    for j in range(int(test_size/batch_size) + 1):
        optimizer.zero_grad()
        start = j*batch_size
        end = min((j+1)*batch_size, test_size)
        batch = end - start
        if batch == 0:
            continue
        with torch.no_grad():
            o = model.generate(x[:,start:end,:], start_token=start_token, max_len=y.shape[0])
        pred = torch.argmax(o, 2, keepdim=True).view(-1).cpu().detach().numpy()
        preds.extend(pred)
        label = torch.argmax(y[:,start:end,:], 2, 
                             keepdim=True).view(-1).cpu().detach().numpy()
        labels.extend(label)
    return f1_score(labels, preds)

## Our implementation

In [180]:
from transformer import PositionalEncoding, Encoder, Decoder, get_subsequent_mask

class TransformerSeq2SeqDifferent(nn.Module):
    
    def __init__(self, in_dim, out_dim, N, heads, model_dim, key_dim, value_dim, ff_dim, 
                 max_len=10000, batch_first=True):
        
        super().__init__()
        self.name = 'transformer'
        
        self.batch_first = batch_first
        
        # define layers
        # embedding layers
        self.src_embed = nn.Linear(in_dim, model_dim)
        self.tgt_embed = nn.Linear(in_dim, model_dim)
        self.pos_enc = PositionalEncoding(model_dim, max_len)
        # encoder-decoder
        self.encoder = Encoder(N, heads, model_dim, key_dim, value_dim, ff_dim)
        self.decoder = Decoder(N, heads, model_dim, key_dim, value_dim, ff_dim)
        # final output layer
        self.fc = nn.Linear(model_dim, out_dim)
    
        # xavier initialization
        for p in self.parameters():
            if p.dim() > 1 and p.requires_grad:
                nn.init.xavier_uniform_(p)
    
    def forward(self, src, tgt, src_mask=None, tgt_mask=None):

        # transpose to use [batch, seq_len, dim]
        if not self.batch_first:
            src = src.transpose(0, 1)
            tgt = tgt.transpose(0, 1)
        
        # get subsequent mask for target sequence
        tgt_subseq_mask = get_subsequent_mask(tgt)
        # combine with tgt mask if provided
        if tgt_mask is not None:
            tgt_subseq_mask = (tgt_mask + tgt_subseq_mask).gt(0)
        
        ## get encoder attention from source
        src = self.src_embed(src)
        src = self.pos_enc(src)
        src_attn = self.encoder(src, src_mask)
        
        ## get decoder attention from target & source attention
        tgt = self.tgt_embed(tgt)
        tgt = self.pos_enc(tgt)
        x = self.decoder(src_attn, tgt, src_mask, tgt_subseq_mask)
        
        x = self.fc(x)
        # transpose to use [batch, seq_len, dim]
        if not self.batch_first:
            x = x.transpose(0, 1)
        return x
        
    def save(self, file_path='./model.pkl'):
        torch.save(self.state_dict(), file_path)

    def load(self, file_path):
        self.load_state_dict(torch.load(file_path))

    def count_parameters(self):
        tot_sum = sum(p.numel() for p in self.parameters() if p.requires_grad)
        return tot_sum

    def generate(self, src, start_token, src_mask=None, max_len=1):
        
        # transpose to use [batch, seq_len, dim]
        if not self.batch_first:
            src = src.transpose(0, 1)
        
        ## get encoder attention from source
        src = self.src_embed(src)
        src = self.pos_enc(src)
        src_attn = self.encoder(src, src_mask)
        
        # initialize target with start symbol - 1 x b x dim
        tgt = torch.tensor(start_token).view(1,1,-1).repeat(src.shape[0],1,1).float()
        
        for i in range(max_len-1):
            # generate subsequent mask for target sequence
            tgt_subseq_mask = get_subsequent_mask(tgt)
            ## get decoder attention from target & source attention
            tgt_embed = self.tgt_embed(tgt)
            tgt_embed = self.pos_enc(tgt_embed)
            x = self.decoder(src_attn, tgt_embed, src_mask, tgt_subseq_mask)
            # get last predictions and combine with target for next iteration
            x = self.fc(x[:,-1:])
            x = torch.zeros_like(x).scatter_(2, torch.argmax(x, -1, keepdim=True), 1)            
            tgt = torch.cat((tgt, x), dim=1)
            
        # transpose to use [batch, seq_len, dim]
        if not self.batch_first:
            tgt = tgt.transpose(0, 1)
        return tgt
    
#     def generate(self, src, start_token, src_mask=None, max_len=1):

#         # initialize target with start symbol - 1 x b x dim
#         t = torch.tensor(start_token).view(1,1,-1).repeat(1,src.shape[1],1).float()

#         for i in range(max_len-1):
#             # get the last prediction only
#             pred = self.forward(src, t, src_mask)[-1:]
#             # get last prediction & combine with target for next iteration
#             pred = torch.zeros_like(pred).scatter_(2, torch.argmax(pred, -1, keepdim=True), 1)
#             t = torch.cat((t, pred), dim=0)

#         return t

In [181]:
pad = torch.tensor([0,0]).float()

transformer = TransformerSeq2SeqDifferent(in_dim=2, out_dim=2, N=1, heads=4, 
                                          model_dim=16, key_dim=4, value_dim=3, ff_dim=64,
                                          batch_first=False)
transformer = transformer.to(device)

print(transformer.count_parameters())
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(transformer.parameters(), lr=learning_rate)

7414


In [183]:
transformer = train_transformer(transformer, train_x, train_y, test_x, test_y, 
                                epochs=30, loss_fn=loss_fn, optimizer=optimizer)

Epoch #1  : Batch 384/384 -- Loss: 0.023669  time: 0.0229065




Average Loss: 0.0441945
Total time: 38.951
Training F1: 1.0
Test F1: 1.0
Epoch #2  : Batch 384/384 -- Loss: 0.02499  time: 0.01881367
Average Loss: 0.0253661
Total time: 39.108
Training F1: 1.0
Test F1: 1.0
Epoch #3  : Batch 384/384 -- Loss: 0.028833  time: 0.0302924
Average Loss: 0.0196663
Total time: 44.601
Training F1: 1.0
Test F1: 1.0
Epoch #4  : Batch 384/384 -- Loss: 0.0022561  time: 0.0152979
Average Loss: 0.0132855
Total time: 35.416
Training F1: 1.0
Test F1: 1.0
Epoch #5  : Batch 384/384 -- Loss: 0.0013465  time: 0.0176073
Average Loss: 0.0108105
Total time: 41.438
Training F1: 1.0
Test F1: 1.0
Epoch #6  : Batch 384/384 -- Loss: 0.00057886  time: 0.025523
Average Loss: 0.00908422
Total time: 49.995
Training F1: 1.0
Test F1: 1.0
Epoch #7  : Batch 384/384 -- Loss: 0.00056462  time: 0.018888
Average Loss: 0.00711359
Total time: 49.959
Training F1: 1.0
Test F1: 1.0
Epoch #8  : Batch 384/384 -- Loss: 0.00039035  time: 0.021207
Average Loss: 0.00690112
Total time: 53.431
Training F1

## Baseline implementation

From github: https://github.com/jadore801120/attention-is-all-you-need-pytorch

In [16]:
import transformer_baseline as tb


## Padding masks - for 3 dim input
def get_attn_key_pad_mask(seq_k, seq_q, pad=tb.Constants.PAD):
    ''' For masking out the padding part of key sequence. '''
    assert seq_k.dim() == 3 and seq_q.dim() == 3
    
    # Expand to fit the shape of key query attention matrix.
    len_q = seq_q.size(1)
    padding_mask = torch.all(seq_k.eq(pad), dim=-1)  # b x lq
    padding_mask = padding_mask.unsqueeze(1).expand(-1, len_q, -1)  # b x lq x lk
    return padding_mask

def get_non_pad_mask(seq, pad=tb.Constants.PAD):
    assert seq.dim() == 3
    padding_mask = ~torch.all(seq.ne(pad), dim=-1)  # b x l
#     padding_mask = padding_mask.repeat(1, 1, seq.shape[-1])  # b x l x d (repeated)
    return padding_mask.type(torch.float).unsqueeze(-1)


## Model
class TransformerBaseline(nn.Module):
    def __init__(self, in_dim, out_dim, N, heads, model_dim, key_dim, value_dim, ff_dim, 
                 max_len=10000, batch_first=True, pad=tb.Constants.PAD):
        super().__init__()
        self.name = 'transformer'
        
        self.batch_first = batch_first
        self.pad = pad
        
        self.encoder = tb.Models.Encoder(
            n_src_vocab=in_dim, len_max_seq=max_len,
            d_word_vec=model_dim, d_model=model_dim, d_inner=ff_dim,
            n_layers=N, n_head=heads, d_k=key_dim, d_v=value_dim,
            dropout=0.0, embedding='linear')
        
        self.decoder = tb.Models.Decoder(
            n_tgt_vocab=out_dim, len_max_seq=max_len,
            d_word_vec=model_dim, d_model=model_dim, d_inner=ff_dim,
            n_layers=N, n_head=heads, d_k=key_dim, d_v=value_dim,
            dropout=0.0, embedding='linear')
        
        self.fc = nn.Linear(model_dim, out_dim, bias=False)
        
        # This was important from their code. 
        # Initialize parameters with Glorot / fan_avg.
        for p in self.parameters():
            if p.dim() > 1 and p.requires_grad:
                nn.init.xavier_uniform_(p)
    
    def forward(self, x, t):
        
        if not self.batch_first:
            x = x.transpose(0,1)
            t = t.transpose(0,1)
        
        # encoder requires source sequence & positions of each 
        x_pos = torch.arange(x.shape[1]).unsqueeze(0).repeat(x.shape[0], 1)
        t_pos = torch.arange(t.shape[1]).unsqueeze(0).repeat(t.shape[0], 1)
        
        # -- Prepare masks
        # encoder
        e_slf_attn_mask = get_attn_key_pad_mask(seq_k=x, seq_q=x, pad=self.pad)
        e_non_pad_mask = torch.ones(x.shape[0], x.shape[1], 1)
        e_non_pad_mask = get_non_pad_mask(x, pad=self.pad)
        
        # decoder
        d_non_pad_mask = get_non_pad_mask(t)
        slf_attn_mask_subseq = tb.Models.get_subsequent_mask(t)
        slf_attn_mask_keypad = get_attn_key_pad_mask(seq_k=t, seq_q=t)
        d_slf_attn_mask = (slf_attn_mask_keypad + slf_attn_mask_subseq).gt(0)
        d_dec_enc_attn_mask = get_attn_key_pad_mask(seq_k=x, seq_q=t)
        
        start = time.time()
        
        enc_attn, *_ = self.encoder(x, x_pos, e_slf_attn_mask, e_non_pad_mask)
        
        print('enc time...', time.time() - start)
        
        attn, *_ = self.decoder(t, t_pos, x, enc_attn, 
                                  d_slf_attn_mask, d_non_pad_mask, d_dec_enc_attn_mask)
        print('decoder time............', time.time() - start)
        
        x = self.fc(attn)
        
        if not self.batch_first:
            x = x.transpose(0,1)
        return x
      
    def save(self, file_path='./model.pkl'):
        torch.save(self.state_dict(), file_path)

    def load(self, file_path):
        self.load_state_dict(torch.load(file_path))

    def count_parameters(self):
        tot_sum = sum(p.numel() for p in self.parameters() if p.requires_grad)
        return tot_sum

In [17]:
pad = torch.tensor([0,0]).float()

baseline = TransformerBaseline(in_dim=2, out_dim=2, N=1, heads=4, model_dim=16, key_dim=4, value_dim=3, ff_dim=64,
                               batch_first=False, pad=0)
baseline = baseline.to(device)

print(baseline.count_parameters())
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(baseline.parameters(), lr=learning_rate)

7412


In [18]:
baseline = train_transformer(baseline, train_x, train_y, test_x, test_y, epochs=30, loss_fn=loss_fn, optimizer=optimizer)

NameError: name 'train_x' is not defined