In [2]:
import torch
import torch.nn as nn
import torch.optim as optim

import os
import json
import time
import random
import numpy as np
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix, f1_score, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score

from IPython.display import Image

In [3]:
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('cpu')

SEED = 1
random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

batch_size = 8
# Percentage of training data
learning_rate = 0.001
epochs = 100

# Reverse-copy task dataset

The entire dataset comprises of the binary representation of all numbers uptil a range defined. The binary sequence from left to right (most significant to least significant) is the input. The target is just the reverse sequence.

In [4]:
import copy

# Generating data
state_size = 12
data_x = []
for i in range(pow(2, state_size)):
    data_x.append([int(x) for x in list(np.binary_repr(i, width=state_size))])
data_x = np.array(data_x)
data_x.shape

(4096, 12)

In [5]:
# Reshaping for tensors
data_x = np.transpose(data_x).reshape(state_size, pow(2, state_size), 1)
data_x = torch.from_numpy(data_x).float()
data_x = torch.zeros(data_x.shape[0], data_x.shape[1], 2).scatter_(2, data_x.long(), 1)
data_y = data_x.clone()
data_x.shape, data_y.shape

(torch.Size([12, 4096, 2]), torch.Size([12, 4096, 2]))

In [6]:
# Execute for reverse-copy (comment for copy-task)
data_y = torch.flip(data_y, [0])

In [7]:
# Creating training and test sets
train_size = 0.75
ordering = torch.randperm(pow(2, state_size))
data_x = data_x[:, ordering, :]
data_y = data_y[:, ordering, :]
train_x = data_x[:,:int(train_size * len(ordering)),:]
train_y = data_y[:,:int(train_size * len(ordering)),:]
test_x = data_x[:,int(train_size * len(ordering)):,:]
test_y = data_y[:,int(train_size * len(ordering)):,:]

# Creating training and validation sets
## TODO

print(train_x.shape, train_y.shape, test_x.shape, test_y.shape)

torch.Size([12, 3072, 2]) torch.Size([12, 3072, 2]) torch.Size([12, 1024, 2]) torch.Size([12, 1024, 2])


# LSTM

## Modelling

In [14]:
# Input dim
input_dim = 2
# Number of hidden nodes
hidden_dim = 16
# Number of output nodes
output_dim = 2
# Number of LSTMs cells to be stacked
layers = 1
# Boolean value for bidirectioanl or not
bidirectional = True
# Boolean value to use LayerNorm or not
layernorm = False

In [16]:
def train(model, train_x, train_y, test_x, test_y, epochs, loss_fn, optimizer):
    train_size = train_x.shape[1]
    device = torch.device("cpu")
    if train_x.is_cuda:
        device = torch.device("cuda")
    layers = model.layers
    hidden_dim = model.hidden_dim
    for i in range(1, epochs + 1):
        model.train()
        loss_tracker = []
        ordering = torch.randperm(train_size)
        train_x = train_x[:,ordering,:]
        train_y = train_y[:,ordering,:]
        for j in range(int(float(train_size)/batch_size) + 1):
            optimizer.zero_grad()
            start = j*batch_size
            end = min((j+1)*batch_size, train_size)
            batch = end - start
            if batch is 0:
                continue
            if model.bidirectional:
                hidden_state = torch.zeros(2 * layers, batch, hidden_dim).to(device)
                cell_state = torch.zeros(2 * layers, batch, hidden_dim).to(device)
            else:
                hidden_state = torch.zeros(layers, batch, hidden_dim).to(device)
                cell_state = torch.zeros(layers, batch, hidden_dim).to(device)
                
            o = model(train_x[:,start:end,:], hidden_state, cell_state)
            gt = torch.argmax(train_y[:,start:end,:], 2, keepdim=True).view(-1)
            loss = loss_fn(o.view(-1, train_x.shape[-1]), gt)            
            loss_tracker.append(loss.item())
            loss.backward()
            optimizer.step()
            print("Epoch #{:<3d}: Batch {:>3d}/{:<3d} -- "
                  "Loss: {:2.5}".format(i, j+1, int(train_size/batch_size), 
                                        loss_tracker[-1]), end='\r')
        print()
        f1_train = evaluate(model, train_x, train_y)
        f1_test = evaluate(model, test_x, test_y)
        print("Average Loss: {:2.6}".format(np.mean(loss_tracker)))
        print("Training F1: {:3.4}".format(f1_train))
        print("Test F1: {:3.4}".format(f1_test))
        print("=" * 50)
    
    return model


def evaluate(model, x, y):
    model.eval()
    test_size = x.shape[1]
    device = torch.device("cpu")
    if x.is_cuda:
        device = torch.device("cuda")
    layers = model.layers
    hidden_dim = model.hidden_dim
    labels = []
    preds = []
    for j in range(int(test_size/batch_size) + 1):
        optimizer.zero_grad()
        start = j*batch_size
        end = min((j+1)*batch_size, test_size)
        batch = end - start
        if batch == 0:
            continue
        if model.bidirectional:
            hidden_state = torch.zeros(2 * layers, batch, hidden_dim).to(device)
            cell_state = torch.zeros(2 * layers, batch, hidden_dim).to(device)
        else:
            hidden_state = torch.zeros(layers, batch, hidden_dim).to(device)
            cell_state = torch.zeros(layers, batch, hidden_dim).to(device)
        with torch.no_grad():
            o = model(x[:,start:end,:], hidden_state, cell_state)
        pred = torch.argmax(o, 2, keepdim=True).view(-1).cpu().detach().numpy()
        preds.extend(pred)
        label = torch.argmax(y[:,start:end,:], 2, 
                             keepdim=True).view(-1).cpu().detach().numpy()
        labels.extend(label)
    return f1_score(labels, preds)

## Our implementation

In [22]:
from lstm import LSTM

class LSTMSeq2SeqSame(nn.Module):
    """ LSTM Class for Sequence to Sequence (many-to-many same)

    The class creates the LSTM architecture as specified by the parameters.
    A fully connected layer is added to reduce the last hidden state to output_dim.

    Parameters
    ==========
    input_dim: input dimensions
    hidden_dim: number of hidden nodes required
    output_dim: numer of output nodes required (1 for sentiment analysis)
    layers: number of LSTM cells to be stacked for depth
    bidirectional: boolean
    layernorm: boolean

    """

    def __init__(self, input_dim, hidden_dim, output_dim,
                 layers=1, bidirectional=False, layernorm=False):
        super().__init__()

        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.layers = layers
        self.bidirectional = bidirectional
        self.layernorm = layernorm

        self.lstm = LSTM(input_dim=input_dim, hidden_dim=hidden_dim, layers=layers,
                         bidirectional=bidirectional, layernorm=layernorm)
        if self.bidirectional:
            self.fc = nn.Linear(2 * hidden_dim, output_dim)
        else:
            self.fc = nn.Linear(hidden_dim, output_dim)
        self.softmax = nn.Softmax(dim=2)

    def forward(self, x, hidden_state, cell_state):
        output, (_, _) = self.lstm(x, hidden_state, cell_state)
        orig_dims = output.shape
        # fc computation for each element
        output = self.fc(output.view(-1, output.shape[-1]))  
        # reshaping to have (seq_len, batch, output)
        output = output.view(orig_dims[0], orig_dims[1], output.shape[1])  
        output = self.softmax(output)
        return output

    def save(self, file_path='./model.pkl'):
        torch.save(self.state_dict(), file_path)

    def load(self, file_path):
        self.load_state_dict(torch.load(file_path))

    def count_parameters(self):
        tot_sum = sum(p.numel() for p in self.lstm.parameters() if p.requires_grad)
        tot_sum += sum(p.numel() for p in self.fc.parameters() if p.requires_grad)
        return tot_sum


In [23]:
our = LSTMSeq2SeqSame(input_dim, hidden_dim, output_dim, bidirectional=bidirectional, layers=layers).to(device)
print(our.count_parameters())
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(our.parameters(), lr=learning_rate)

2498


In [24]:
train_x = train_x.to(device)
train_y = train_y.to(device)
test_x = test_x.to(device)
test_y = test_y.to(device)

train(our, train_x, train_y, test_x, test_y, epochs=100, loss_fn=loss_fn, optimizer=optimizer)

Epoch #1  : Batch 384/384 -- Loss: 0.64726
Average Loss: 0.654597
Training F1: 0.6457
Test F1: 0.6415
Epoch #2  : Batch 384/384 -- Loss: 0.62763
Average Loss: 0.627149
Training F1: 0.6482
Test F1: 0.6483
Epoch #3  : Batch 384/384 -- Loss: 0.61301
Average Loss: 0.620728
Training F1: 0.6592
Test F1: 0.6543
Epoch #4  : Batch 384/384 -- Loss: 0.54931
Average Loss: 0.604173
Training F1: 0.6756
Test F1: 0.6758
Epoch #5  : Batch 384/384 -- Loss: 0.57186
Average Loss: 0.588624
Training F1: 0.6523
Test F1: 0.6496
Epoch #6  : Batch 384/384 -- Loss: 0.57483
Average Loss: 0.57562
Training F1: 0.7473
Test F1: 0.7428
Epoch #7  : Batch 384/384 -- Loss: 0.54522
Average Loss: 0.564409
Training F1: 0.7227
Test F1: 0.7208
Epoch #8  : Batch 384/384 -- Loss: 0.60681
Average Loss: 0.554765
Training F1: 0.7592
Test F1: 0.7528
Epoch #9  : Batch 384/384 -- Loss: 0.53719
Average Loss: 0.544467
Training F1: 0.7726
Test F1: 0.7628
Epoch #10 : Batch 384/384 -- Loss: 0.55513
Average Loss: 0.537218
Training F1: 0.77

Epoch #55 : Batch 384/384 -- Loss: 0.34667
Average Loss: 0.367135
Training F1: 0.9486
Test F1: 0.9493
Epoch #56 : Batch 384/384 -- Loss: 0.37733
Average Loss: 0.367866
Training F1: 0.949
Test F1: 0.9496
Epoch #57 : Batch 384/384 -- Loss: 0.35764
Average Loss: 0.367203
Training F1: 0.9491
Test F1: 0.9497
Epoch #58 : Batch 384/384 -- Loss: 0.37835
Average Loss: 0.365407
Training F1: 0.9499
Test F1: 0.9503
Epoch #59 : Batch 384/384 -- Loss: 0.39697
Average Loss: 0.366656
Training F1: 0.9504
Test F1: 0.9511
Epoch #60 : Batch 384/384 -- Loss: 0.36733
Average Loss: 0.36454
Training F1: 0.9526
Test F1: 0.953
Epoch #61 : Batch 384/384 -- Loss: 0.34664
Average Loss: 0.362859
Training F1: 0.9528
Test F1: 0.9533
Epoch #62 : Batch 384/384 -- Loss: 0.32481
Average Loss: 0.364039
Training F1: 0.9527
Test F1: 0.9532
Epoch #63 : Batch 384/384 -- Loss: 0.34672
Average Loss: 0.362009
Training F1: 0.9528
Test F1: 0.9533
Epoch #64 : Batch 384/384 -- Loss: 0.35545
Average Loss: 0.362458
Training F1: 0.9528

LSTMSeq2SeqSame(
  (lstm): LSTM(
    (model): ModuleList(
      (0): LSTMCell(
        (g1): Sigmoid()
        (g2): Tanh()
      )
    )
    (model_rev): ModuleList(
      (0): LSTMCell(
        (g1): Sigmoid()
        (g2): Tanh()
      )
    )
  )
  (fc): Linear(in_features=32, out_features=2, bias=True)
  (softmax): Softmax()
)

## PyTorch implementation

In [19]:
class PyTorchBaseline(nn.Module):
    """ LSTM Class for Sequence to Sequence (many-to-many same)

    The class creates the LSTM architecture as specified by the parameters.
    A fully connected layer is added to reduce the last hidden state to output_dim.

    Parameters
    ==========
    input_dim: input dimensions
    hidden_dim: number of hidden nodes required
    output_dim: numer of output nodes required (1 for sentiment analysis)
    layers: number of LSTM cells to be stacked for depth
    bidirectional: boolean
    layernorm: boolean

    """

    def __init__(self, input_dim, hidden_dim, output_dim,
                 layers=1, bidirectional=False, layernorm=False):
        super().__init__()

        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.layers = layers
        self.bidirectional = bidirectional
        self.layernorm = layernorm

        self.lstm = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim, num_layers=layers,
                         bidirectional=bidirectional) #, layernorm=layernorm)
        if self.bidirectional:
            self.fc = nn.Linear(2 * hidden_dim, output_dim)
        else:
            self.fc = nn.Linear(hidden_dim, output_dim)
        self.softmax = nn.Softmax(dim=2)

    def forward(self, x, hidden_state, cell_state):
        output, (_, _) = self.lstm(x, (hidden_state, cell_state))
        orig_dims = output.shape
        # fc computation for each element
        output = self.fc(output.view(-1, output.shape[-1]))  
        # reshaping to have (seq_len, batch, output)
        output = output.view(orig_dims[0], orig_dims[1], output.shape[1])  
        output = self.softmax(output)
        return output

    def save(self, file_path='./model.pkl'):
        torch.save(self.state_dict(), file_path)

    def load(self, file_path):
        self.load_state_dict(torch.load(file_path))

    def count_parameters(self):
        tot_sum = sum(p.numel() for p in self.lstm.parameters() if p.requires_grad)
        tot_sum += sum(p.numel() for p in self.fc.parameters() if p.requires_grad)
        return tot_sum


In [20]:
pytorch = PyTorchBaseline(input_dim, hidden_dim, output_dim, bidirectional=bidirectional, layers=layers).to(device)
print(pytorch.count_parameters())
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(pytorch.parameters(), lr=learning_rate)

2626


In [21]:
train_x = train_x.to(device)
train_y = train_y.to(device)
test_x = test_x.to(device)
test_y = test_y.to(device)

train(pytorch, train_x, train_y, test_x, test_y, epochs=100, loss_fn=loss_fn, optimizer=optimizer)

Epoch #1  : Batch 384/384 -- Loss: 0.62225
Average Loss: 0.652653
Training F1: 0.6388
Test F1: 0.6373
Epoch #2  : Batch 384/384 -- Loss: 0.60822
Average Loss: 0.626826
Training F1: 0.6306
Test F1: 0.6264
Epoch #3  : Batch 384/384 -- Loss: 0.62273
Average Loss: 0.613017
Training F1: 0.7228
Test F1: 0.7238
Epoch #4  : Batch 384/384 -- Loss: 0.57933
Average Loss: 0.575599
Training F1: 0.7427
Test F1: 0.7443
Epoch #5  : Batch 384/384 -- Loss: 0.56489
Average Loss: 0.556984
Training F1: 0.7554
Test F1: 0.7574
Epoch #6  : Batch 384/384 -- Loss: 0.53143
Average Loss: 0.540668
Training F1: 0.7786
Test F1: 0.7814
Epoch #7  : Batch 384/384 -- Loss: 0.54065
Average Loss: 0.525078
Training F1: 0.7945
Test F1: 0.7987
Epoch #8  : Batch 384/384 -- Loss: 0.45112
Average Loss: 0.512275
Training F1: 0.8036
Test F1: 0.8036
Epoch #9  : Batch 384/384 -- Loss: 0.58038
Average Loss: 0.503921
Training F1: 0.8117
Test F1: 0.8129
Epoch #10 : Batch 384/384 -- Loss: 0.50724
Average Loss: 0.496757
Training F1: 0.8

Epoch #55 : Batch 384/384 -- Loss: 0.32171
Average Loss: 0.330995
Training F1: 0.9874
Test F1: 0.9883
Epoch #56 : Batch 384/384 -- Loss: 0.31439
Average Loss: 0.330461
Training F1: 0.9866
Test F1: 0.9876
Epoch #57 : Batch 384/384 -- Loss: 0.31405
Average Loss: 0.328323
Training F1: 0.9882
Test F1: 0.9892
Epoch #58 : Batch 384/384 -- Loss: 0.31426
Average Loss: 0.327092
Training F1: 0.9884
Test F1: 0.9892
Epoch #59 : Batch 384/384 -- Loss: 0.32476
Average Loss: 0.329888
Training F1: 0.9889
Test F1: 0.9901
Epoch #60 : Batch 384/384 -- Loss: 0.32083
Average Loss: 0.327942
Training F1: 0.9871
Test F1: 0.9874
Epoch #61 : Batch 384/384 -- Loss: 0.31494
Average Loss: 0.327057
Training F1: 0.9896
Test F1: 0.99
Epoch #62 : Batch 384/384 -- Loss: 0.33469
Average Loss: 0.324783
Training F1: 0.9896
Test F1: 0.9909
Epoch #63 : Batch 384/384 -- Loss: 0.31474
Average Loss: 0.328818
Training F1: 0.9905
Test F1: 0.9914
Epoch #64 : Batch 384/384 -- Loss: 0.32407
Average Loss: 0.324127
Training F1: 0.983

PyTorchBaseline(
  (lstm): LSTM(2, 16, bidirectional=True)
  (fc): Linear(in_features=32, out_features=2, bias=True)
  (softmax): Softmax()
)

In [25]:
print("Our implementation\n{}".format("=" * len("Our implementation")))
print("# of parameters: {}".format(our.count_parameters()))
for name, param in our.named_parameters():
    print("{:<25}: {}".format(name, param.shape))

Our implementation
# of parameters: 2498
lstm.model.0.weights     : torch.Size([18, 64])
lstm.model.0.bias        : torch.Size([64])
lstm.model_rev.0.weights : torch.Size([18, 64])
lstm.model_rev.0.bias    : torch.Size([64])
fc.weight                : torch.Size([2, 32])
fc.bias                  : torch.Size([2])


In [26]:
print("PyTorch implementation\n{}".format("=" * len("PyTorch implementation")))
print("# of parameters: {}".format(pytorch.count_parameters()))
for name, param in pytorch.named_parameters():
    print("{:<30}: {}".format(name, param.shape))

PyTorch implementation
# of parameters: 2626
lstm.weight_ih_l0             : torch.Size([64, 2])
lstm.weight_hh_l0             : torch.Size([64, 16])
lstm.bias_ih_l0               : torch.Size([64])
lstm.bias_hh_l0               : torch.Size([64])
lstm.weight_ih_l0_reverse     : torch.Size([64, 2])
lstm.weight_hh_l0_reverse     : torch.Size([64, 16])
lstm.bias_ih_l0_reverse       : torch.Size([64])
lstm.bias_hh_l0_reverse       : torch.Size([64])
fc.weight                     : torch.Size([2, 32])
fc.bias                       : torch.Size([2])


PyTorch uses $Wh + b_h + Wx + b_x$ whereas we are using $Wx' + b$, where $x'$ is $h, x$ concatenated. Therefore PyTorch has an extra set of biases for each direction for the encoder and also for the decoder.

For one direction - 64 <br>
For reverse direction - 64 <br>
For the decoder - 128 <br>

Our model has $2498$ parameters while the PyTorch model has $2498 + 64 + 64 = 2626$ parameters.

# Transformer

## Modeling

In [11]:
def train_transformer(model, train_x, train_y, test_x, test_y, epochs, loss_fn, optimizer):
    train_size = train_x.shape[1]
    
    for i in range(1, epochs + 1):
        model.train()
        loss_tracker = []
        ordering = torch.randperm(train_size)
        train_x = train_x[:,ordering,:]
        train_y = train_y[:,ordering,:]
        for j in range(int(float(train_size)/batch_size) + 1):
            optimizer.zero_grad()
            start = j*batch_size
            end = min((j+1)*batch_size, train_size)
            batch = end - start
            if batch is 0:
                continue
            # forward pass
            o = model(train_x[:,start:end,:])
            # backward pass
            o = o.contiguous().view(-1, train_x.shape[-1])
            gt = torch.argmax(train_y[:,start:end,:], 2, keepdim=True).view(-1)
            loss = loss_fn(o, gt)            
            loss_tracker.append(loss.item())
            loss.backward()
            optimizer.step()
            print("Epoch #{:<3d}: Batch {:>3d}/{:<3d} -- "
                  "Loss: {:2.5}".format(i, j+1, int(train_size/batch_size), 
                                        loss_tracker[-1]), end='\r')
        print()
        f1_train = evaluate_transformer(model, train_x, train_y)
        f1_test = evaluate_transformer(model, test_x, test_y)
        print("Average Loss: {:2.6}".format(np.mean(loss_tracker)))
        print("Training F1: {:3.4}".format(f1_train))
        print("Test F1: {:3.4}".format(f1_test))
        print("=" * 50)
    
    return model


def evaluate_transformer(model, x, y):
    model.eval()
    test_size = x.shape[1]    
    labels = []
    preds = []
    
    for j in range(int(test_size/batch_size) + 1):
        optimizer.zero_grad()
        start = j*batch_size
        end = min((j+1)*batch_size, test_size)
        batch = end - start
        if batch == 0:
            continue
        # forward pass
        with torch.no_grad():
            o = model(x[:,start:end,:])
        # get predictions
        pred = torch.argmax(o, 2, keepdim=True).view(-1).cpu().detach().numpy()
        preds.extend(pred)
        label = torch.argmax(y[:,start:end,:], 2, 
                             keepdim=True).view(-1).cpu().detach().numpy()
        labels.extend(label)
    return f1_score(labels, preds)

## Our implementation

In [14]:
from transformer import PositionalEncoding, Encoder

class TransformerSeq2SeqSame(nn.Module):
    
    def __init__(self, in_dim, out_dim, N, heads, model_dim, key_dim, value_dim, ff_dim, 
                 max_len=10000, batch_first=False):
        
        super().__init__()
        
        self.batch_first = batch_first
        
        # define layers
        # embedding layers
        self.src_embed = nn.Linear(in_dim, model_dim)
        self.pos_enc = PositionalEncoding(model_dim, max_len)
        # encoder-decoder
        self.encoder = Encoder(N, heads, model_dim, key_dim, value_dim, ff_dim)
        # final output layer
        self.fc = nn.Linear(model_dim, out_dim)
        
        for p in self.parameters():
            if p.dim() > 1 and p.requires_grad:
                nn.init.xavier_uniform_(p)
    
    def forward(self, src, src_mask=None):
        # transpose to use [batch, seq_len, dim]
        if not self.batch_first:
            src = src.transpose(0, 1)
            
        ## get encoder attention from source
        src = self.src_embed(src)
        src = self.pos_enc(src)
        x = self.encoder(src, src_mask)
          
        x = self.fc(x)
        
        # transpose to use [batch, seq_len, dim]
        if not self.batch_first:
            x = x.transpose(0, 1)
        return x
        
    def save(self, file_path='./model.pkl'):
        torch.save(self.state_dict(), file_path)

    def load(self, file_path):
        self.load_state_dict(torch.load(file_path))

    def count_parameters(self):
        tot_sum = sum(p.numel() for p in self.parameters() if p.requires_grad)
        return tot_sum

In [15]:
transformer = TransformerSeq2SeqSame(in_dim=2, out_dim=2, N=1, heads=4, model_dim=12, 
                                     key_dim=3, value_dim=3, ff_dim=64)

print(transformer.count_parameters())
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(transformer.parameters(), lr=learning_rate)

2344


In [16]:
transformer = train_transformer(transformer, train_x, train_y, test_x, test_y, 
                                epochs=epochs, loss_fn=loss_fn, optimizer=optimizer)

Epoch #1  : Batch 384/384 -- Loss: 0.56472
Average Loss: 0.64766
Training F1: 0.6942
Test F1: 0.6869
Epoch #2  : Batch 384/384 -- Loss: 0.39044
Average Loss: 0.508765
Training F1: 0.8178
Test F1: 0.8112
Epoch #3  : Batch 384/384 -- Loss: 0.41734
Average Loss: 0.4053
Training F1: 0.877
Test F1: 0.871
Epoch #4  : Batch 384/384 -- Loss: 0.14318
Average Loss: 0.286434
Training F1: 0.9308
Test F1: 0.9274
Epoch #5  : Batch 384/384 -- Loss: 0.217268
Average Loss: 0.16742
Training F1: 0.9738
Test F1: 0.9733
Epoch #6  : Batch 384/384 -- Loss: 0.108818
Average Loss: 0.116724
Training F1: 0.9902
Test F1: 0.9902
Epoch #7  : Batch 384/384 -- Loss: 0.061353
Average Loss: 0.0835265
Training F1: 0.9966
Test F1: 0.9966
Epoch #8  : Batch 384/384 -- Loss: 0.0126483
Average Loss: 0.0675894
Training F1: 0.9996
Test F1: 0.9997
Epoch #9  : Batch 384/384 -- Loss: 0.0817935
Average Loss: 0.0529117
Training F1: 0.9711
Test F1: 0.9715
Epoch #10 : Batch 384/384 -- Loss: 0.0157022
Average Loss: 0.0457065
Training 

KeyboardInterrupt: 

## Baseline implementation

From github: https://github.com/jadore801120/attention-is-all-you-need-pytorch

In [18]:
import transformer_baseline as tb


## Padding masks - for 3 dim input
def get_attn_key_pad_mask(seq_k, seq_q, pad=tb.Constants.PAD):
    ''' For masking out the padding part of key sequence. '''
    assert seq_k.dim() == 3 and seq_q.dim() == 3
    
    # Expand to fit the shape of key query attention matrix.
    len_q = seq_q.size(1)
    padding_mask = torch.all(seq_q.eq(pad), dim=-1)  # b x lq
    padding_mask = padding_mask.unsqueeze(1).expand(-1, len_q, -1)  # b x lq x lk
    return padding_mask

def get_non_pad_mask(seq, pad=tb.Constants.PAD):
    assert seq.dim() == 3
    padding_mask = ~torch.all(seq.ne(pad), dim=-1)  # b x l
#     padding_mask = padding_mask.repeat(1, 1, seq.shape[-1])  # b x l x d (repeated)
    return padding_mask.type(torch.float).unsqueeze(-1)


## Model
class TransformerBaseline(nn.Module):
    def __init__(self, in_dim, out_dim, N, heads, model_dim, key_dim, value_dim, ff_dim, 
                 max_len=10000, batch_first=True, pad=tb.Constants.PAD):
        super().__init__()
        self.name = 'transformer'
        
        self.batch_first = batch_first
        self.pad = pad
        
        self.encoder = tb.Models.Encoder(
            n_src_vocab=in_dim, len_max_seq=max_len,
            d_word_vec=model_dim, d_model=model_dim, d_inner=ff_dim,
            n_layers=N, n_head=heads, d_k=key_dim, d_v=value_dim,
            dropout=0.0, embedding='linear')
        
        self.fc = nn.Linear(model_dim, out_dim)
        
        # This was important from their code. 
        # Initialize parameters with Glorot / fan_avg.
        for p in self.parameters():
            if p.dim() > 1 and p.requires_grad:
                nn.init.xavier_uniform_(p)
    
    def forward(self, x):
        
        if not self.batch_first:
            x = x.transpose(0,1)
        
        # encoder requires source sequence & positions of each 
        x_pos = torch.arange(x.shape[1]).unsqueeze(0).repeat(x.shape[0], 1)
        
        # -- Prepare masks
        # encoder
        e_slf_attn_mask = get_attn_key_pad_mask(seq_k=x, seq_q=x, pad=self.pad)
        e_non_pad_mask = torch.ones(x.shape[0], x.shape[1], 1)
        e_non_pad_mask = get_non_pad_mask(x, pad=self.pad)
        
        attn, *_ = self.encoder(x, x_pos, e_slf_attn_mask, e_non_pad_mask)
        x = self.fc(attn)
        
        if not self.batch_first:
            x = x.transpose(0,1)
        return x
      
    def save(self, file_path='./model.pkl'):
        torch.save(self.state_dict(), file_path)

    def load(self, file_path):
        self.load_state_dict(torch.load(file_path))

    def count_parameters(self):
        tot_sum = sum(p.numel() for p in self.parameters() if p.requires_grad)
        return tot_sum

In [19]:
pad = torch.tensor([0,0]).float()

baseline = TransformerBaseline(in_dim=2, out_dim=2, N=1, heads=4, model_dim=12, 
                               key_dim=3, value_dim=3, ff_dim=64, 
                               batch_first=False, pad=pad)

print(baseline.count_parameters())
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(baseline.parameters(), lr=learning_rate)

2344


In [20]:
baseline = train_transformer(baseline, train_x, train_y, test_x, test_y, 
                                epochs=100, loss_fn=loss_fn, optimizer=optimizer)

Epoch #1  : Batch 384/384 -- Loss: 0.56578
Average Loss: 0.638133
Training F1: 0.711
Test F1: 0.7042
Epoch #2  : Batch 384/384 -- Loss: 0.26251
Average Loss: 0.409439
Training F1: 0.9372
Test F1: 0.9353
Epoch #3  : Batch 384/384 -- Loss: 0.106741
Average Loss: 0.175441
Training F1: 0.9952
Test F1: 0.9948
Epoch #4  : Batch 384/384 -- Loss: 0.0804535
Average Loss: 0.0773654
Training F1: 1.0
Test F1: 1.0
Epoch #5  : Batch 384/384 -- Loss: 0.0165111
Average Loss: 0.042578
Training F1: 1.0
Test F1: 1.0
Epoch #6  : Batch 384/384 -- Loss: 0.02163742
Average Loss: 0.0233776
Training F1: 0.9983
Test F1: 0.998
Epoch #7  : Batch 384/384 -- Loss: 0.01174728
Average Loss: 0.0159691
Training F1: 1.0
Test F1: 1.0
Epoch #8  : Batch 384/384 -- Loss: 0.00117437
Average Loss: 0.0135939
Training F1: 1.0
Test F1: 1.0
Epoch #9  : Batch 384/384 -- Loss: 0.00192751
Average Loss: 0.0116191
Training F1: 1.0
Test F1: 1.0
Epoch #10 : Batch 384/384 -- Loss: 0.00142676
Average Loss: 0.00964245
Training F1: 1.0
Test

KeyboardInterrupt: 