In [1]:
import sys
sys.path.append('../src')

import torch
import torch.nn as nn
import torch.optim as optim

import os
import json
import time
import random
import numpy as np
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix, f1_score, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score

from IPython.display import Image

In [2]:
# Everything runs on the CPU; swap in the commented line to use a GPU when one is available.
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('cpu')

# Seed every RNG this notebook imports so that Restart & Run All is reproducible.
SEED = 1
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Mini-batch size used by the train/evaluate loops
batch_size = 8
# Learning rate handed to the Adam optimizers
learning_rate = 0.001
# Default number of training epochs
epochs = 100

# XOR task

The dataset consists of the binary representations of all integers in $[0, 2^{10})$, i.e. every possible 10-bit sequence. Each sequence, read from left to right (most significant to least significant bit), is one input. The output $y$ for an input is computed as $(a_1 \oplus a_{10}) \wedge (a_3 \oplus a_7)$, where $a_1$ is the most significant bit and $a_{10}$ is the least significant bit.

In [3]:
# Generating data: one row per integer, one column per bit (column 0 = most significant)
state_size = 10
data_x = np.array([
    [int(bit) for bit in np.binary_repr(value, width=state_size)]
    for value in range(2 ** state_size)
])

# Label: (a1 xor a10) and (a3 xor a7), where a1 is column 0 and a10 is column 9
data_y = (data_x[:, 0] ^ data_x[:, 9]) & (data_x[:, 2] ^ data_x[:, 6])

In [4]:
# Reshaping for tensors: data_x becomes [state_size, number of samples, 1]
n_samples = pow(2, state_size)
bits = torch.from_numpy(data_x.T.reshape(state_size, n_samples, 1)).float()
data_y = torch.from_numpy(data_y).float()

# Reshaping X to 2-input dimensions: each bit is one-hot encoded along the last axis
one_hot = torch.zeros(bits.shape[0], bits.shape[1], 2)
data_x = one_hot.scatter_(2, bits.long(), 1).float()

In [5]:
# Expected layout: [state_size, number of samples, one-hot width]
data_x.shape

torch.Size([10, 1024, 2])

In [6]:
# Creating training and test sets: shuffle the samples once, then cut at the 70% mark
train_size = 0.7
ordering = torch.randperm(pow(2, state_size))
split_at = int(train_size * len(ordering))

data_x, data_y = data_x[:, ordering, :], data_y[ordering]
train_x, test_x = data_x[:, :split_at, :], data_x[:, split_at:, :]
train_y, test_y = data_y[:split_at], data_y[split_at:]

In [7]:
# Sanity check: the sample counts of the two splits should add up to 2 ** state_size
print(train_x.shape, train_y.shape, test_x.shape, test_y.shape)

torch.Size([10, 716, 2]) torch.Size([716]) torch.Size([10, 308, 2]) torch.Size([308])


# LSTM

## Modeling

In [7]:
# Input dim (width of the one-hot encoding of a single bit)
input_dim = 2
# Number of hidden nodes
hidden_dim = 16
# Number of output nodes
output_dim = 1
# Number of LSTM cells to be stacked
layers = 1
# Boolean value: bidirectional network or not
bidirectional = True
# Boolean value: use LayerNorm or not
layernorm = False

In [3]:
def train_lstm(model, train_x, train_y, test_x, test_y, epochs, loss_fn, optimizer):
    """Train an LSTM classifier with mini-batches, printing loss and F1 after every epoch.

    Reads the notebook-level globals ``batch_size``, ``layers``, ``hidden_dim``
    and ``device``, and calls ``evaluate_lstm`` for the per-epoch F1 scores.

    Parameters
    ==========
    model: module called as ``model(x, hidden_state, cell_state)``;
        must expose a boolean ``bidirectional`` attribute
    train_x: training inputs, indexed as [sequence, sample, feature]
    train_y: training labels, one per sample (aligned with axis 1 of train_x)
    test_x: held-out inputs in the same layout as train_x
    test_y: held-out labels
    epochs: number of passes over the training data
    loss_fn: callable taking (flattened model output, labels)
    optimizer: optimizer stepping the parameters of ``model``

    Returns
    =======
    model: the same model object, after training
    """
    train_size = train_x.shape[1]
    # Ceil division, so no empty trailing batch is scheduled when train_size
    # is an exact multiple of batch_size.
    n_batches = (train_size + batch_size - 1) // batch_size
    # A bidirectional network needs one initial state per direction and layer.
    directions = 2 if model.bidirectional else 1
    for i in range(1, epochs + 1):
        model.train()
        loss_tracker = []
        # Reshuffle the samples at the start of every epoch
        ordering = torch.randperm(train_size)
        train_x = train_x[:, ordering, :]
        train_y = train_y[ordering]
        for j in range(n_batches):
            optimizer.zero_grad()
            start = j * batch_size
            end = min(start + batch_size, train_size)
            batch = end - start
            # Fresh zero states for every batch (the last batch may be smaller)
            hidden_state = torch.zeros(directions * layers, batch, hidden_dim).to(device)
            cell_state = torch.zeros(directions * layers, batch, hidden_dim).to(device)
            o = model(train_x[:, start:end, :], hidden_state, cell_state)
            loss = loss_fn(o.view(-1), train_y[start:end])
            loss_tracker.append(loss.item())
            loss.backward()
            optimizer.step()
            print("Epoch #{:<3d}: Batch {:>3d}/{:<3d} -- "
                  "Loss: {:2.5}".format(i, j+1, n_batches,
                                        loss_tracker[-1]), end='\r')
        print()
        f1_train = evaluate_lstm(model, train_x, train_y)
        f1_test = evaluate_lstm(model, test_x, test_y)
        print("Average Loss: {:2.6}".format(np.mean(loss_tracker)))
        print("Training F1: {:3.4}".format(f1_train))
        print("Test F1: {:3.4}".format(f1_test))

    return model


def evaluate_lstm(model, x, y):
    """Compute the F1 score of an LSTM classifier on (x, y).

    Puts the model into eval mode and predicts in mini-batches without
    tracking gradients. Reads the notebook-level globals ``batch_size``,
    ``layers``, ``hidden_dim`` and ``device``. Unlike the previous version it
    does not touch any optimizer: nothing is back-propagated here, so there
    are no gradients to reset.

    Parameters
    ==========
    model: module called as ``model(x, hidden_state, cell_state)``;
        must expose a boolean ``bidirectional`` attribute
    x: inputs, indexed as [sequence, sample, feature]
    y: labels, one per sample

    Returns
    =======
    F1 score as returned by ``sklearn.metrics.f1_score(labels, preds)``
    """
    model.eval()
    test_size = x.shape[1]
    # Ceil division: covers every sample without scheduling an empty batch
    n_batches = (test_size + batch_size - 1) // batch_size
    directions = 2 if model.bidirectional else 1
    labels = []
    preds = []
    for j in range(n_batches):
        start = j * batch_size
        end = min(start + batch_size, test_size)
        batch = end - start
        hidden_state = torch.zeros(directions * layers, batch, hidden_dim).to(device)
        cell_state = torch.zeros(directions * layers, batch, hidden_dim).to(device)
        with torch.no_grad():
            o = model(x[:, start:end, :], hidden_state, cell_state)
        # Logits -> probabilities -> hard 0/1 decisions (threshold at 0.5)
        pred = torch.round(torch.sigmoid(o.view(-1))).cpu().detach().numpy()
        preds.extend(pred)
        labels.extend(y[start:end].int().detach().cpu().numpy())
    return f1_score(labels, preds)

## Our implementation

In [4]:
from lstm import LSTMCell

class LSTM(nn.Module):
    """A complete LSTM architecture

    Allows to stack multiple LSTM cells and also
    create a bidirectional LSTM network.

    The first cell of a stack reads the input sequence; every cell above it
    reads the output sequence of the cell below. With bidirectional=True the
    two directions are two independent stacks: the right-to-left stack only
    ever sees right-to-left outputs (unlike torch.nn.LSTM, the directions are
    not concatenated between layers).

    Parameters
    ==========
    input_dim: Dimension of input data
    hidden_dim: Size of hidden state
    layernorm: True/False
    layers: Number of LSTM cells to stack (must be >= 1)
    bidirectional: True/False

    """
    def __init__(self, input_dim, hidden_dim, layers=1, bidirectional=False, layernorm=False):
        super().__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.layers = layers
        self.bidirectional = bidirectional
        self.layernorm = layernorm

        if self.layers < 1:
            raise ValueError("layers need to be >= 1")
        self.model = nn.ModuleList(self._build_stack())
        if self.bidirectional:
            self.model_rev = nn.ModuleList(self._build_stack())

    def _build_stack(self):
        """Create one direction's cells: the first takes input_dim, the rest hidden_dim."""
        return [LSTMCell(self.input_dim if depth == 0 else self.hidden_dim,
                         self.hidden_dim, self.layernorm)
                for depth in range(self.layers)]

    def _run_stack(self, cells, x, hidden_state, cell_state):
        """Feed x through the cells of one direction, bottom layer first.

        Returns the top layer's output sequence and the final hidden/cell
        states of every layer, each shaped [layers, batch_size, hidden_dim].
        """
        # Work on copies: the per-layer results are written into these tensors
        # in place, and the caller's initial-state tensors must stay untouched
        # so they can be reused across calls.
        # index of states is equivalent to index of layer in LSTM stack
        hidden_states = hidden_state.clone().view(self.layers, 1,
                                                  hidden_state.shape[1],
                                                  hidden_state.shape[2])
        cell_states = cell_state.clone().view(self.layers, 1,
                                              cell_state.shape[1],
                                              cell_state.shape[2])
        output = x
        # forward pass for one cell at a time along layers; each cell consumes
        # the output sequence of the cell below it
        for j, cell in enumerate(cells):
            output, (hidden_states[j], cell_states[j]) = cell(output,
                                                              hidden_states[j].clone(),
                                                              cell_states[j].clone())
        return output, hidden_states.squeeze(1), cell_states.squeeze(1)

    def forward(self, x, hidden_state, cell_state):
        """Forward pass for the LSTM network

        Parameters
        ==========
        x: [sequence_length, batch_size, input_dim]
        hidden_state: [layers, batch_size, hidden_dim]
            ([2 * layers, batch_size, hidden_dim] if bidirectional=True)
        cell_state: [layers, batch_size, hidden_dim]
            ([2 * layers, batch_size, hidden_dim] if bidirectional=True)

        Returns
        =======
        output, (hidden_state, cell_state)
            output: [sequence_length, batch_size, hidden_dim]
                contains the output/hidden_state from all the timesteps
                for the final layer in sequence 1...T
            hidden_state: [layers, batch_size, hidden_dim]
                contains the hidden_state from the last timestep T
                from all the layers
            cell_state: [layers, batch_size, hidden_dim]
                contains the cell_state from the last timestep T
                from all the layers

            If bidirectional=True
                output: [sequence_length, batch_size, 2 * hidden_dim]
                    [:,:,:hidden_dim] - for left-to-right
                    [:,:,hidden_dim:] - for right-to-left
                hidden_state: [2 * layers, batch_size, hidden_dim]
                    [:layers,:,:] - for left-to-right
                    [layers:,:,:] - for right-to-left
                cell_state: [2 * layers, batch_size, hidden_dim]
                    [:layers,:,:] - for left-to-right
                    [layers:,:,:] - for right-to-left
        """
        # Left-to-right pass
        output, hidden_states, cell_states = self._run_stack(
            self.model, x,
            hidden_state[:self.layers, :, :],
            cell_state[:self.layers, :, :])

        ## TODO:
        ## The current code will work if bidirectional=False and
        ## the hidden_state.shape[0] > self.layers owing to [:layers,:,:].
        ## Maybe a warning should be raised without termination.

        # Right-to-left pass
        if self.bidirectional:
            # flipping inputs/rearranging x to be in reverse timestep order
            # (reversing only the sequence dimension)
            output_rev, hidden_states_rev, cell_states_rev = self._run_stack(
                self.model_rev, torch.flip(x, [0]),
                hidden_state[self.layers:, :, :],
                cell_state[self.layers:, :, :])
            # flipping outputs to be in correct timestep order
            output_rev = torch.flip(output_rev, [0])  # reversing only the sequence dimension
            # concatenating tensors
            ## creating tensors as expected in
            ## here: https://pytorch.org/docs/stable/nn.html#torch.nn.LSTM
            hidden_states = torch.cat((hidden_states,
                                       hidden_states_rev), dim=0)
            cell_states = torch.cat((cell_states,
                                     cell_states_rev), dim=0)
            output = torch.cat((output,
                                output_rev), dim=2)

        return output, (hidden_states, cell_states)


In [5]:
# from lstm import LSTM

class LSTMSeqLabel(nn.Module):
    """ LSTM classifier for many-to-one sequence tasks

    Wraps the LSTM network defined above and passes its output at the last
    timestep through a fully connected layer that reduces it to output_dim.

    Parameters
    ==========
    input_dim: dimension of the input at every timestep
    hidden_dim: number of hidden nodes required
    output_dim: number of output nodes required
    layers: number of LSTM cells to be stacked for depth
    bidirectional: boolean
    layernorm: boolean

    """
    def __init__(self, input_dim, hidden_dim, output_dim,
                 layers=1, bidirectional=False, layernorm=False):
        super().__init__()

        self.hidden_dim = hidden_dim
        self.bidirectional = bidirectional
        self.layernorm = layernorm

        self.lstm = LSTM(input_dim=input_dim, hidden_dim=hidden_dim, layers=layers,
                         bidirectional=bidirectional, layernorm=layernorm)
        # The bidirectional network emits both directions side by side
        fc_in_features = 2 * hidden_dim if self.bidirectional else hidden_dim
        self.fc = nn.Linear(fc_in_features, output_dim)

    def forward(self, x, hidden_state, cell_state):
        """Return the fc projection of the LSTM output at the last timestep."""
        sequence_output, _ = self.lstm(x, hidden_state, cell_state)
        last_step = sequence_output[-1].unsqueeze(0)
        return self.fc(last_step)

    def save(self, file_path='./model.pkl'):
        """Write the state dict of the model to file_path."""
        torch.save(self.state_dict(), file_path)

    def load(self, file_path):
        """Restore the state dict of the model from file_path."""
        self.load_state_dict(torch.load(file_path))

    def count_parameters(self):
        """Number of trainable scalars in the LSTM and the output layer."""
        return sum(p.numel()
                   for module in (self.lstm, self.fc)
                   for p in module.parameters()
                   if p.requires_grad)


In [8]:
# Build our LSTM classifier from the hyperparameters configured in the modeling
# cell (bidirectional/layernorm are taken from there instead of being hard-coded)
model = LSTMSeqLabel(input_dim, hidden_dim, output_dim, layers=layers,
                     bidirectional=bidirectional, layernorm=layernorm).to(device)
print(model.count_parameters())
loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

2465


In [18]:
# Put both splits on the target device
train_x, train_y = train_x.to(device), train_y.to(device)
test_x, test_y = test_x.to(device), test_y.to(device)

# NOTE(review): trains for 500 epochs rather than the configured `epochs` value -- confirm this is intended
train_lstm(model, train_x, train_y, test_x, test_y,
           epochs=500, loss_fn=loss_fn, optimizer=optimizer)

Epoch #1  : Batch  90/90  -- Loss: 0.57015


  'precision', 'predicted', average, warn_for)


Average Loss: 0.619753
Training F1: 0.0
Test F1: 0.0
Epoch #2  : Batch  90/90  -- Loss: 0.27476
Average Loss: 0.561626
Training F1: 0.0
Test F1: 0.0
Epoch #3  : Batch  90/90  -- Loss: 0.84467
Average Loss: 0.566193
Training F1: 0.0
Test F1: 0.0
Epoch #4  : Batch  90/90  -- Loss: 0.84708
Average Loss: 0.565686
Training F1: 0.0
Test F1: 0.0
Epoch #5  : Batch  90/90  -- Loss: 0.83391
Average Loss: 0.566083
Training F1: 0.0
Test F1: 0.0
Epoch #6  : Batch  90/90  -- Loss: 0.85345
Average Loss: 0.566442
Training F1: 0.0
Test F1: 0.0
Epoch #7  : Batch  27/90  -- Loss: 0.42597

KeyboardInterrupt: 

## PyTorch baseline

In [9]:
# using PyTorch LSTM module
class PyTorchBaseline(nn.Module):
    """ Baseline many-to-one classifier built on torch.nn.LSTM

    The LSTM output at the last timestep is (optionally) layer-normalised and
    then reduced to n_output by a fully connected layer.

    Parameters
    ==========
    n_input: dimension of the input at every timestep
    n_hidden: number of hidden nodes
    n_output: number of output nodes
    layers: number of stacked LSTM layers
    bidirectional: boolean
    layernorm: boolean, apply LayerNorm to the last-timestep output before fc

    """
    def __init__(self, n_input, n_hidden, n_output,
                 layers=1, bidirectional=False, layernorm=False):
        super().__init__()

        self.hidden_dim = n_hidden
        self.bidirectional = bidirectional
        self.layers = layers
        self.layernorm = layernorm

        self.lstm = nn.LSTM(n_input, n_hidden, bidirectional=self.bidirectional, num_layers=layers)
        # nn.LSTM concatenates both directions along the feature axis
        feature_dim = 2 * n_hidden if self.bidirectional else n_hidden
        self.fc = nn.Linear(feature_dim, n_output)
        if self.layernorm:
            # nn.LayerNorm: the bare name `LayerNorm` is not defined in this notebook
            self.ln = nn.LayerNorm(feature_dim)

    def forward(self, x, h, c):
        """Return fc (after optional LayerNorm) of the LSTM output at the last timestep.

        x: [sequence_length, batch_size, n_input]
        h, c: initial states, [directions * layers, batch_size, n_hidden]
        """
        o, (_, _) = self.lstm(x, (h, c))
        o = o[-1].unsqueeze(0)
        if self.layernorm:
            output = self.fc(self.ln(o))
        else:
            output = self.fc(o)
        return output

    def save(self, file_path='./model.pkl'):
        """Write the state dict of the model to file_path."""
        torch.save(self.state_dict(), file_path)

    def load(self, file_path):
        """Restore the state dict of the model from file_path."""
        self.load_state_dict(torch.load(file_path))

    def count_parameters(self):
        """Number of trainable scalars in the whole model (LSTM, fc and, if used, LayerNorm)."""
        return sum(p.numel() for p in self.parameters() if p.requires_grad)

In [10]:
# Same hyperparameters as our LSTM, but backed by torch.nn.LSTM
model = PyTorchBaseline(n_input=input_dim, n_hidden=hidden_dim, n_output=output_dim,
                        layers=layers, bidirectional=bidirectional)
model = model.to(device)
print(model.count_parameters())
loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

2593


In [22]:
# Put both splits on the target device
train_x, train_y = train_x.to(device), train_y.to(device)
test_x, test_y = test_x.to(device), test_y.to(device)

# NOTE(review): trains for 500 epochs rather than the configured `epochs` value -- confirm this is intended
train_lstm(model, train_x, train_y, test_x, test_y,
           epochs=500, loss_fn=loss_fn, optimizer=optimizer)

Epoch #1  : Batch  90/90  -- Loss: 0.57037
Average Loss: 0.617523
Training F1: 0.0
Test F1: 0.0
Epoch #2  : Batch  90/90  -- Loss: 0.56855
Average Loss: 0.564826
Training F1: 0.0
Test F1: 0.0
Epoch #3  : Batch  90/90  -- Loss: 0.56483
Average Loss: 0.563855
Training F1: 0.0
Test F1: 0.0
Epoch #4  : Batch  90/90  -- Loss: 0.55875
Average Loss: 0.563758
Training F1: 0.0
Test F1: 0.0
Epoch #5  : Batch  90/90  -- Loss: 0.28745
Average Loss: 0.561983
Training F1: 0.0
Test F1: 0.0
Epoch #6  : Batch  90/90  -- Loss: 0.56647
Average Loss: 0.56343
Training F1: 0.0
Test F1: 0.0
Epoch #7  : Batch  90/90  -- Loss: 0.56415
Average Loss: 0.563175
Training F1: 0.0
Test F1: 0.0
Epoch #8  : Batch  90/90  -- Loss: 0.56221
Average Loss: 0.563539
Training F1: 0.0
Test F1: 0.0
Epoch #9  : Batch  90/90  -- Loss: 0.29676
Average Loss: 0.562005
Training F1: 0.0
Test F1: 0.0
Epoch #10 : Batch  64/90  -- Loss: 0.56243

KeyboardInterrupt: 

In [52]:
# Fresh instances of both LSTM variants, built from the same configuration so
# that their parameter inventories are directly comparable
our = LSTMSeqLabel(input_dim, hidden_dim, output_dim, bidirectional=bidirectional, layers=layers).to(device)
pytorch = PyTorchBaseline(input_dim, hidden_dim, output_dim, bidirectional=bidirectional, layers=layers).to(device)

In [58]:
# Parameter inventory of our LSTM: total count, then the shape of every tensor
underline = "=" * len("Our implementation")
print("Our implementation\n{}".format(underline))
print("# of parameters: {}".format(our.count_parameters()))
for param_name, tensor in our.named_parameters():
    print("{:<25}: {}".format(param_name, tensor.shape))

Our implementation
# of parameters: 2465
lstm.model.0.weights     : torch.Size([18, 64])
lstm.model.0.bias        : torch.Size([64])
lstm.model_rev.0.weights : torch.Size([18, 64])
lstm.model_rev.0.bias    : torch.Size([64])
fc.weight                : torch.Size([1, 32])
fc.bias                  : torch.Size([1])


In [59]:
# Parameter inventory of the torch.nn.LSTM baseline: total count, then the shape of every tensor
underline = "=" * len("PyTorch implementation")
print("PyTorch implementation\n{}".format(underline))
print("# of parameters: {}".format(pytorch.count_parameters()))
for param_name, tensor in pytorch.named_parameters():
    print("{:<25}: {}".format(param_name, tensor.shape))

PyTorch implementation
# of parameters: 2593
lstm.weight_ih_l0        : torch.Size([64, 2])
lstm.weight_hh_l0        : torch.Size([64, 16])
lstm.bias_ih_l0          : torch.Size([64])
lstm.bias_hh_l0          : torch.Size([64])
lstm.weight_ih_l0_reverse: torch.Size([64, 2])
lstm.weight_hh_l0_reverse: torch.Size([64, 16])
lstm.bias_ih_l0_reverse  : torch.Size([64])
lstm.bias_hh_l0_reverse  : torch.Size([64])
fc.weight                : torch.Size([1, 32])
fc.bias                  : torch.Size([1])


PyTorch computes the gates as $W_h h + b_h + W_x x + b_x$, whereas we use $W x' + b$, where $x' = [h; x]$ is the concatenation of $h$ and $x$. PyTorch therefore has one extra bias vector for each direction.

For one direction - 64 <br>
For reverse direction - 64 <br>

Our model has $2465$ parameters while the PyTorch model has $2465 + 64 + 64 = 2593$ parameters.

# Transformer

## Modeling

In [11]:
def train_transformer(model, train_x, train_y, test_x, test_y, epochs, loss_fn, optimizer):
    """Train a transformer classifier with mini-batches, printing loss and F1 after every epoch.

    Reads the notebook-level global ``batch_size`` and calls
    ``evaluate_transformer`` for the per-epoch F1 scores.

    Parameters
    ==========
    model: module called as ``model(x)``; its output is averaged over axis 0
        to obtain one score per sample
    train_x: training inputs, indexed as [sequence, sample, feature]
    train_y: training labels, one per sample (aligned with axis 1 of train_x)
    test_x: held-out inputs in the same layout as train_x
    test_y: held-out labels
    epochs: number of passes over the training data
    loss_fn: callable taking (flattened model output, labels)
    optimizer: optimizer stepping the parameters of ``model``

    Returns
    =======
    model: the same model object, after training
    """
    train_size = train_x.shape[1]
    # Ceil division, so no empty trailing batch is scheduled when train_size
    # is an exact multiple of batch_size.
    n_batches = (train_size + batch_size - 1) // batch_size
    for i in range(1, epochs + 1):
        model.train()
        loss_tracker = []
        # Reshuffle the samples at the start of every epoch
        ordering = torch.randperm(train_size)
        train_x = train_x[:, ordering, :]
        train_y = train_y[ordering]
        for j in range(n_batches):
            optimizer.zero_grad()
            start = j * batch_size
            end = min(start + batch_size, train_size)
            o = model(train_x[:, start:end, :])
            # transform output to be of same dim as label
            o = o.mean(dim=0)  # mean over all attention output
            loss = loss_fn(o.view(-1), train_y[start:end])
            loss_tracker.append(loss.item())
            loss.backward()
            optimizer.step()
            print("Epoch #{:<3d}: Batch {:>3d}/{:<3d} -- "
                  "Loss: {:2.5}".format(i, j+1, n_batches,
                                        loss_tracker[-1]), end='\r')
        print()
        f1_train = evaluate_transformer(model, train_x, train_y)
        f1_test = evaluate_transformer(model, test_x, test_y)
        print("Average Loss: {:2.6}".format(np.mean(loss_tracker)))
        print("Training F1: {:3.4}".format(f1_train))
        print("Test F1: {:3.4}".format(f1_test))

    return model


def evaluate_transformer(model, x, y):
    """Compute the F1 score of a transformer classifier on (x, y).

    Puts the model into eval mode and predicts in mini-batches without
    tracking gradients. Reads the notebook-level global ``batch_size``.
    Unlike the previous version it does not touch any optimizer: nothing is
    back-propagated here, so there are no gradients to reset.

    Parameters
    ==========
    model: module called as ``model(x)``; its output is averaged over axis 0
        to obtain one score per sample
    x: inputs, indexed as [sequence, sample, feature]
    y: labels, one per sample

    Returns
    =======
    F1 score as returned by ``sklearn.metrics.f1_score(labels, preds)``
    """
    model.eval()
    test_size = x.shape[1]
    # Ceil division: covers every sample without scheduling an empty batch
    n_batches = (test_size + batch_size - 1) // batch_size
    labels = []
    preds = []
    for j in range(n_batches):
        start = j * batch_size
        end = min(start + batch_size, test_size)
        with torch.no_grad():
            o = model(x[:, start:end, :])
            # transform output to be of same dim as label
            o = o.mean(dim=0)  # mean over all attention output
        # Logits -> probabilities -> hard 0/1 decisions (threshold at 0.5)
        pred = torch.round(torch.sigmoid(o.view(-1))).cpu().detach().numpy()
        preds.extend(pred)
        labels.extend(y[start:end].int().detach().cpu().numpy())
    return f1_score(labels, preds)

## Our implementation

In [12]:
from transformer import PositionalEncoding, Encoder

class TransformerSeqLabel(nn.Module):
    """ Transformer encoder for sequence classification (many-to-one)

    Pipeline: linear embedding -> positional encoding -> encoder -> fully
    connected output layer. One out_dim-sized score is returned per sequence
    position; reducing the scores to a single prediction (e.g. averaging
    over the sequence) is left to the caller.

    Parameters
    ==========
    in_dim: size of the last axis of the input
    out_dim: output dimensions of the model
    N: forwarded to Encoder (presumably the number of encoder layers -- confirm in transformer.py)
    heads: forwarded to Encoder (presumably the number of attention heads -- confirm in transformer.py)
    model_dim: embedding dimension, also the dimensionality at which the transformer operates
    key_dim: dimensions for query & key in attention calculation
    value_dim: dimensions for value in attention calculation
    ff_dim: dimensions for Positionwise feed-forward sublayer
    max_len: max length to generate positional encodings (default=10000)
    batch_first: whether the input already has 'batch' as its first axis (default=False).
        When False, the first 2 axes are swapped on the way in and swapped
        back on the way out, so input and output are [seq_len, batch, dim].
    """
    def __init__(self, in_dim, out_dim, N, heads, model_dim, key_dim, value_dim, ff_dim,
                 max_len=10000, batch_first=False):

        super().__init__()

        self.batch_first = batch_first

        self.embed = nn.Linear(in_dim, model_dim)
        self.pos_enc = PositionalEncoding(model_dim, max_len)
        self.encoder = Encoder(N, heads, model_dim, key_dim, value_dim, ff_dim)
        # final output layer
        self.fc = nn.Linear(model_dim, out_dim)

        # xavier initialization of every trainable tensor with more than one axis
        for weight in self.parameters():
            if weight.requires_grad and weight.dim() > 1:
                nn.init.xavier_uniform_(weight)

    def forward(self, x, mask=None):
        """Return one out_dim-sized score per position, in the same axis order as x."""
        # the layers below operate on [batch, seq_len, dim]
        swap_axes = not self.batch_first
        if swap_axes:
            x = x.transpose(0, 1)

        embedded = self.pos_enc(self.embed(x))
        scores = self.fc(self.encoder(embedded, mask))

        # back to the caller's [seq_len, batch, dim] layout
        return scores.transpose(0, 1) if swap_axes else scores

    def save(self, file_path='./model.pkl'):
        """Write the state dict of the model to file_path."""
        torch.save(self.state_dict(), file_path)

    def load(self, file_path):
        """Restore the state dict of the model from file_path."""
        self.load_state_dict(torch.load(file_path))

    def count_parameters(self):
        """Number of trainable scalars in the whole model."""
        return sum(p.numel() for p in self.parameters() if p.requires_grad)


In [13]:
# Small encoder, sized to land near the parameter count of the LSTM models
transformer = TransformerSeqLabel(
    in_dim=2, out_dim=1, N=1, heads=4,
    model_dim=12, key_dim=4, value_dim=3, ff_dim=64,
).to(device)

print(transformer.count_parameters())
loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(transformer.parameters(), lr=learning_rate)

2437


In [22]:
# Train for the configured number of epochs; the function hands back the same model object
transformer = train_transformer(
    model=transformer,
    train_x=train_x, train_y=train_y,
    test_x=test_x, test_y=test_y,
    epochs=epochs, loss_fn=loss_fn, optimizer=optimizer,
)

Epoch #1  : Batch  90/90  -- Loss: 0.23617
Average Loss: 0.580998
Training F1: 0.0
Test F1: 0.0
Epoch #2  : Batch  90/90  -- Loss: 0.87053
Average Loss: 0.578611
Training F1: 0.0
Test F1: 0.0
Epoch #3  : Batch  90/90  -- Loss: 0.55954
Average Loss: 0.57496
Training F1: 0.0
Test F1: 0.0
Epoch #4  : Batch  90/90  -- Loss: 0.56821
Average Loss: 0.574023
Training F1: 0.0
Test F1: 0.0
Epoch #5  : Batch  90/90  -- Loss: 0.32981
Average Loss: 0.574171
Training F1: 0.0
Test F1: 0.0
Epoch #6  : Batch  90/90  -- Loss: 0.56612
Average Loss: 0.575961
Training F1: 0.0
Test F1: 0.0
Epoch #7  : Batch  90/90  -- Loss: 0.56355
Average Loss: 0.575225
Training F1: 0.0
Test F1: 0.0
Epoch #8  : Batch  90/90  -- Loss: 0.88615
Average Loss: 0.578117
Training F1: 0.0
Test F1: 0.0
Epoch #9  : Batch  90/90  -- Loss: 0.85113
Average Loss: 0.576824
Training F1: 0.0
Test F1: 0.0
Epoch #10 : Batch  90/90  -- Loss: 0.57687
Average Loss: 0.570679
Training F1: 0.0
Test F1: 0.0
Epoch #11 : Batch  90/90  -- Loss: 0.8558

Epoch #83 : Batch  90/90  -- Loss: 0.00014658
Average Loss: 0.0109825
Training F1: 1.0
Test F1: 1.0
Epoch #84 : Batch  90/90  -- Loss: 0.00124386
Average Loss: 0.00317428
Training F1: 1.0
Test F1: 1.0
Epoch #85 : Batch  90/90  -- Loss: 0.00233798
Average Loss: 0.00511538
Training F1: 1.0
Test F1: 1.0
Epoch #86 : Batch  90/90  -- Loss: 0.00070885
Average Loss: 0.00390235
Training F1: 1.0
Test F1: 1.0
Epoch #87 : Batch  90/90  -- Loss: 9.3216e-05
Average Loss: 0.00563968
Training F1: 1.0
Test F1: 1.0
Epoch #88 : Batch  90/90  -- Loss: 2.3722e-05
Average Loss: 0.00190103
Training F1: 1.0
Test F1: 1.0
Epoch #89 : Batch  90/90  -- Loss: 0.00049761
Average Loss: 0.002579
Training F1: 1.0
Test F1: 1.0
Epoch #90 : Batch  90/90  -- Loss: 0.00051212
Average Loss: 0.00335914
Training F1: 1.0
Test F1: 1.0
Epoch #91 : Batch  90/90  -- Loss: 0.00011241
Average Loss: 0.00586544
Training F1: 1.0
Test F1: 1.0
Epoch #92 : Batch  90/90  -- Loss: 0.00054141
Average Loss: 0.00320768
Training F1: 1.0
Test F

## Baseline implementation

From github: https://github.com/jadore801120/attention-is-all-you-need-pytorch

In [23]:
import transformer_baseline as tb


## Padding masks - for 3 dim input
def get_attn_key_pad_mask(seq_k, seq_q, pad=tb.Constants.PAD):
    ''' For masking out the padding part of key sequence.

    seq_k, seq_q: 3-dim tensors indexed as [batch, length, feature]
    pad: value (or per-feature vector) that marks a padding position

    Returns a b x lq x lk boolean mask that is True wherever the key
    position is padding, i.e. every feature of that key equals `pad`.
    '''
    assert seq_k.dim() == 3 and seq_q.dim() == 3

    # Expand to fit the shape of key query attention matrix.
    len_q = seq_q.size(1)
    # The mask describes the keys, so it has to be derived from seq_k
    padding_mask = torch.all(seq_k.eq(pad), dim=-1)  # b x lk
    padding_mask = padding_mask.unsqueeze(1).expand(-1, len_q, -1)  # b x lq x lk
    return padding_mask

def get_non_pad_mask(seq, pad=tb.Constants.PAD):
    ''' For zeroing out the padding positions of a sequence.

    seq: 3-dim tensor indexed as [batch, length, feature]
    pad: value (or per-feature vector) that marks a padding position

    Returns a b x l x 1 float mask: 1.0 for real positions and 0.0 for
    padding. A position is padding only when every feature equals `pad`,
    the same definition get_attn_key_pad_mask uses.
    '''
    assert seq.dim() == 3
    # non-pad <=> at least one feature differs from the pad value
    non_pad_mask = torch.any(seq.ne(pad), dim=-1)  # b x l
    return non_pad_mask.type(torch.float).unsqueeze(-1)


## Model
class TransformerBaseline(nn.Module):
    """ Baseline transformer encoder for sequence classification (many-to-one)

    Wraps the Encoder of the reference implementation (imported as `tb`) and
    adds a fully connected output layer. One out_dim-sized score is returned
    per sequence position.

    Parameters
    ==========
    in_dim: size of the last axis of the input
    out_dim: output dimensions of the model
    N: number of encoder layers (passed as n_layers)
    heads: number of attention heads (passed as n_head)
    model_dim: dimensionality at which the transformer operates
    key_dim: dimensions for query & key in attention calculation
    value_dim: dimensions for value in attention calculation
    ff_dim: dimensions for the inner feed-forward layer (passed as d_inner)
    max_len: maximum sequence length handed to the encoder (default=10000)
    batch_first: whether the input already has 'batch' as its first axis (default=True).
        When False, the first 2 axes are swapped on the way in and swapped
        back on the way out.
    pad: value (or per-feature vector) that marks a padding position
    """
    def __init__(self, in_dim, out_dim, N, heads, model_dim, key_dim, value_dim, ff_dim,
                 max_len=10000, batch_first=True, pad=tb.Constants.PAD):
        super().__init__()
        self.name = 'transformer'

        self.batch_first = batch_first
        self.pad = pad

        self.encoder = tb.Models.Encoder(
            n_src_vocab=in_dim, len_max_seq=max_len,
            d_word_vec=model_dim, d_model=model_dim, d_inner=ff_dim,
            n_layers=N, n_head=heads, d_k=key_dim, d_v=value_dim,
            dropout=0.0, embedding='linear')

        self.fc = nn.Linear(model_dim, out_dim)

        # This was important from their code.
        # Initialize parameters with Glorot / fan_avg.
        for p in self.parameters():
            if p.dim() > 1 and p.requires_grad:
                nn.init.xavier_uniform_(p)

    def forward(self, x):
        """Return one out_dim-sized score per position, in the same axis order as x."""
        if not self.batch_first:
            x = x.transpose(0, 1)

        # `pad` is a plain attribute, so model.to(device) does not move it;
        # bring a tensor pad onto the input's device before comparing.
        pad = self.pad.to(x.device) if torch.is_tensor(self.pad) else self.pad

        # encoder requires source sequence & positions of each
        # (created on the input's device so the model also runs off-CPU)
        x_pos = torch.arange(x.shape[1], device=x.device).unsqueeze(0).repeat(x.shape[0], 1)

        # -- Prepare masks
        # encoder
        e_slf_attn_mask = get_attn_key_pad_mask(seq_k=x, seq_q=x, pad=pad)
        e_non_pad_mask = get_non_pad_mask(x, pad=pad)

        attn, *_ = self.encoder(x, x_pos, e_slf_attn_mask, e_non_pad_mask)
        x = self.fc(attn)

        if not self.batch_first:
            x = x.transpose(0, 1)
        return x

    def save(self, file_path='./model.pkl'):
        """Write the state dict of the model to file_path."""
        torch.save(self.state_dict(), file_path)

    def load(self, file_path):
        """Restore the state dict of the model from file_path."""
        self.load_state_dict(torch.load(file_path))

    def count_parameters(self):
        """Number of trainable scalars in the whole model."""
        tot_sum = sum(p.numel() for p in self.parameters() if p.requires_grad)
        return tot_sum

In [24]:
# A position counts as padding when its 2-dim feature vector is all zeros
pad = torch.zeros(2)

baseline = TransformerBaseline(
    in_dim=2, out_dim=1, N=1, heads=4,
    model_dim=12, key_dim=4, value_dim=3, ff_dim=64,
    max_len=100, batch_first=False, pad=pad,
).to(device)

print(baseline.count_parameters())
loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(baseline.parameters(), lr=learning_rate)

2436


In [25]:
# Train for the configured number of epochs; the function hands back the same model object
baseline = train_transformer(
    model=baseline,
    train_x=train_x, train_y=train_y,
    test_x=test_x, test_y=test_y,
    epochs=epochs, loss_fn=loss_fn, optimizer=optimizer,
)

Epoch #1  : Batch  90/90  -- Loss: 0.74986
Average Loss: 0.636718
Training F1: 0.0
Test F1: 0.0
Epoch #2  : Batch  90/90  -- Loss: 0.57023
Average Loss: 0.578866
Training F1: 0.0
Test F1: 0.0
Epoch #3  : Batch  90/90  -- Loss: 0.56595
Average Loss: 0.581646
Training F1: 0.0
Test F1: 0.0
Epoch #4  : Batch  90/90  -- Loss: 0.55542
Average Loss: 0.57248
Training F1: 0.0
Test F1: 0.0
Epoch #5  : Batch  90/90  -- Loss: 0.84022
Average Loss: 0.574285
Training F1: 0.0
Test F1: 0.0
Epoch #6  : Batch  90/90  -- Loss: 0.84121
Average Loss: 0.579182
Training F1: 0.0
Test F1: 0.0
Epoch #7  : Batch  90/90  -- Loss: 1.20136
Average Loss: 0.574237
Training F1: 0.0
Test F1: 0.0
Epoch #8  : Batch  90/90  -- Loss: 0.42253
Average Loss: 0.573607
Training F1: 0.0
Test F1: 0.0
Epoch #9  : Batch  90/90  -- Loss: 0.77886
Average Loss: 0.573453
Training F1: 0.0
Test F1: 0.0
Epoch #10 : Batch  90/90  -- Loss: 0.24967
Average Loss: 0.564493
Training F1: 0.0
Test F1: 0.0
Epoch #11 : Batch  90/90  -- Loss: 0.5587

Epoch #82 : Batch  90/90  -- Loss: 0.1573321
Average Loss: 0.102991
Training F1: 0.9973
Test F1: 0.9929
Epoch #83 : Batch  90/90  -- Loss: 0.02472742
Average Loss: 0.129153
Training F1: 0.9686
Test F1: 0.9211
Epoch #84 : Batch  90/90  -- Loss: 0.1437944
Average Loss: 0.103316
Training F1: 0.9918
Test F1: 0.9929
Epoch #85 : Batch  90/90  -- Loss: 0.0716562
Average Loss: 0.102813
Training F1: 0.9973
Test F1: 0.9929
Epoch #86 : Batch  90/90  -- Loss: 0.0034786
Average Loss: 0.0883656
Training F1: 0.9973
Test F1: 0.9929
Epoch #87 : Batch  90/90  -- Loss: 0.00396269
Average Loss: 0.0888478
Training F1: 0.9973
Test F1: 0.9929
Epoch #88 : Batch  90/90  -- Loss: 0.1994835
Average Loss: 0.0626708
Training F1: 0.9973
Test F1: 0.9929
Epoch #89 : Batch  90/90  -- Loss: 0.0534288
Average Loss: 0.0649114
Training F1: 0.9918
Test F1: 0.9784
Epoch #90 : Batch  90/90  -- Loss: 0.0287147
Average Loss: 0.0736369
Training F1: 1.0
Test F1: 1.0
Epoch #91 : Batch  90/90  -- Loss: 0.0618817
Average Loss: 0.05

TransformerBaseline(
  (encoder): Encoder(
    (src_word_emb): Linear(in_features=2, out_features=12, bias=True)
    (position_enc): Embedding(101, 12)
    (layer_stack): ModuleList(
      (0): EncoderLayer(
        (slf_attn): MultiHeadAttention(
          (w_qs): Linear(in_features=12, out_features=16, bias=True)
          (w_ks): Linear(in_features=12, out_features=16, bias=True)
          (w_vs): Linear(in_features=12, out_features=12, bias=True)
          (attention): ScaledDotProductAttention(
            (dropout): Dropout(p=0.1)
            (softmax): Softmax()
          )
          (layer_norm): LayerNorm(torch.Size([12]), eps=1e-05, elementwise_affine=True)
          (fc): Linear(in_features=12, out_features=12, bias=True)
          (dropout): Dropout(p=0.0)
        )
        (pos_ffn): PositionwiseFeedForward(
          (w_1): Conv1d(12, 64, kernel_size=(1,), stride=(1,))
          (w_2): Conv1d(64, 12, kernel_size=(1,), stride=(1,))
          (layer_norm): LayerNorm(torch.S