In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

import os
import json
import time
import random
import numpy as np
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix, f1_score, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score

from IPython.display import Image

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
seed = 42

# Seed every random number generator the notebook draws from, so that the
# shuffled train/test split, the per-epoch shuffles and the weight
# initialisation are the same on every Restart & Run All.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

## XOR task

The entire dataset comprises the binary representations of all numbers up to a defined range. The binary sequence, read from left to right (most significant to least significant bit), is the input, while the output $y$ for an input is calculated as $(a_1 \oplus a_{10}) \wedge (a_3 \oplus a_7)$, where $a_1$ is the most significant bit and $a_{10}$ is the least significant bit.

In [39]:
# Generating data
state_size = 10

# Row i holds the `state_size`-bit binary representation of i,
# most significant bit first.
data_x = np.array([
    [int(bit) for bit in np.binary_repr(number, width=state_size)]
    for number in range(2 ** state_size)
])

# Label: (a1 xor a10) and (a3 xor a7), computed for all rows at once.
outer_bits = np.bitwise_xor(data_x[:, 0], data_x[:, 9])
inner_bits = np.bitwise_xor(data_x[:, 2], data_x[:, 6])
data_y = np.bitwise_and(outer_bits, inner_bits)

In [40]:
# Reshaping for tensors
n_samples = pow(2, state_size)
# [samples, bits] -> [bits, samples, 1]: sequence dimension first, one feature
bits_first = np.transpose(data_x).reshape(state_size, n_samples, 1)
data_y = torch.from_numpy(data_y).float()

# Reshaping X to 2-input dimensions
# Each bit becomes a one-hot pair: 0 -> [1, 0], 1 -> [0, 1]
bit_index = torch.from_numpy(bits_first).long()
one_hot = torch.zeros(bit_index.shape[0], bit_index.shape[1], 2)
data_x = one_hot.scatter_(2, bit_index, 1).float()

In [41]:
data_x.shape

torch.Size([10, 1024, 2])

In [42]:
# Creating training and test sets
train_size = 0.7
ordering = torch.randperm(pow(2, state_size))
split_at = int(train_size * len(ordering))  # number of training samples

# Shuffle the samples (dim 1 of X, dim 0 of y) with one shared permutation
data_x = data_x[:, ordering, :]
data_y = data_y[ordering]

# The first `split_at` shuffled samples train the model, the rest test it
train_x, test_x = data_x[:, :split_at, :], data_x[:, split_at:, :]
train_y, test_y = data_y[:split_at], data_y[split_at:]

In [43]:
print(train_x.shape, train_y.shape, test_x.shape, test_y.shape)

torch.Size([10, 716, 2]) torch.Size([716]) torch.Size([10, 308, 2]) torch.Size([308])


## Modelling

In [30]:
# Input dim
input_dim = 2
# Number of hidden nodes
hidden_dim = 16
# Number of output nodes
output_dim = 1
# Number of LSTMs cells to be stacked
layers = 1
# Boolean value for bidirectional or not
bidirectional = True
# Boolean value to use LayerNorm or not
layernorm = False

# Number of samples per mini-batch
batch_size = 8
# Step size for the Adam optimizer
learning_rate = 0.001
# Default number of training epochs
epochs = 100

# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook also runs on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [31]:
def train(model, train_x, train_y, test_x, test_y, epochs, loss_fn, optimizer):
    """Train `model` with shuffled mini-batches and report F1 after every epoch.

    Uses the notebook-level settings `batch_size`, `layers` and `hidden_dim`
    and the sibling function `evaluate`.

    Parameters
    ==========
    model: network called as model(x, hidden_state, cell_state); must expose
        a boolean `bidirectional` attribute
    train_x: training inputs, samples indexed along dim 1
    train_y: training targets, one per sample
    test_x: test inputs, samples indexed along dim 1
    test_y: test targets, one per sample
    epochs: number of passes over the training data
    loss_fn: callable taking (flattened model output, targets)
    optimizer: optimizer updating the parameters of `model`

    Returns
    =======
    The trained model (the same object that was passed in).
    """
    train_size = train_x.shape[1]
    for i in range(1, epochs + 1):
        model.train()
        loss_tracker = []
        # Fresh shuffle every epoch; inputs and targets share the permutation
        ordering = torch.randperm(train_size)
        train_x = train_x[:, ordering, :]
        train_y = train_y[ordering]

        # Ceiling division: no empty trailing batch when train_size is an
        # exact multiple of batch_size
        num_batches = (train_size + batch_size - 1) // batch_size
        num_directions = 2 if model.bidirectional else 1
        for j in range(num_batches):
            start = j * batch_size
            end = min(start + batch_size, train_size)
            batch = end - start

            optimizer.zero_grad()
            # Zero initial states, created on the same device as the data
            state_shape = (num_directions * layers, batch, hidden_dim)
            hidden_state = torch.zeros(state_shape, device=train_x.device)
            cell_state = torch.zeros(state_shape, device=train_x.device)

            o = model(train_x[:, start:end, :], hidden_state, cell_state)
            loss = loss_fn(o.view(-1), train_y[start:end])
            loss_tracker.append(loss.item())
            loss.backward()
            optimizer.step()
            # '\r' keeps the per-batch progress on a single console line
            print("Epoch #{:<3d}: Batch {:>3d}/{:<3d} -- "
                  "Loss: {:2.5}".format(i, j + 1, num_batches,
                                        loss_tracker[-1]), end='\r')
        print()
        f1_train = evaluate(model, train_x, train_y)
        f1_test = evaluate(model, test_x, test_y)
        print("Average Loss: {:2.6}".format(np.mean(loss_tracker)))
        print("Training F1: {:3.4}".format(f1_train))
        print("Test F1: {:3.4}".format(f1_test))

    return model


def evaluate(model, x, y):
    """Compute the F1 score of `model` on the samples in `x` against labels `y`.

    Uses the notebook-level settings `batch_size`, `layers` and `hidden_dim`.
    No gradients are computed and no optimizer is touched.

    Parameters
    ==========
    model: network called as model(x, hidden_state, cell_state); must expose
        a boolean `bidirectional` attribute; its output is treated as logits
    x: inputs, samples indexed along dim 1
    y: binary targets, one per sample

    Returns
    =======
    F1 score as returned by sklearn's f1_score.
    """
    model.eval()
    test_size = x.shape[1]
    num_directions = 2 if model.bidirectional else 1
    labels = []
    preds = []
    with torch.no_grad():
        for start in range(0, test_size, batch_size):
            end = min(start + batch_size, test_size)
            batch = end - start
            # Zero initial states, created on the same device as the data
            state_shape = (num_directions * layers, batch, hidden_dim)
            hidden_state = torch.zeros(state_shape, device=x.device)
            cell_state = torch.zeros(state_shape, device=x.device)

            o = model(x[:, start:end, :], hidden_state, cell_state)
            # logits -> probabilities -> hard 0/1 predictions
            pred = torch.round(torch.sigmoid(o.view(-1))).cpu().numpy()
            preds.extend(pred)
            labels.extend(y[start:end].int().cpu().numpy())
    return f1_score(labels, preds)

## Our implementation

In [47]:
from lstm import LSTMCell

class LSTM(nn.Module):
    """A complete LSTM architecture

    Allows to stack multiple LSTM cells and also
    create a bidirectional LSTM network.

    Parameters
    ==========
    input_dim: Dimension of input data
    hidden_dim: Size of hidden state
    layernorm: True/False
    layers: Number of LSTM cells to stack (must be >= 1)
    bidirectional: True/False

    """
    def __init__(self, input_dim, hidden_dim, layers=1, bidirectional=False, layernorm=False):
        super().__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.layers = layers
        self.bidirectional = bidirectional
        self.layernorm = layernorm

        if self.layers < 1:
            raise ValueError("layers need to be >= 1")
        # One cell per layer for the left-to-right pass
        self.model = nn.ModuleList(
            [LSTMCell(input_dim, hidden_dim, layernorm) for _ in range(self.layers)])
        if self.bidirectional:
            # A second, independent set of cells for the right-to-left pass
            self.model_rev = nn.ModuleList(
                [LSTMCell(input_dim, hidden_dim, layernorm) for _ in range(self.layers)])

    def _run_direction(self, cells, x, hidden_init, cell_init, device):
        """Run `x` through every cell in `cells` for one direction.

        Returns the output of the last cell and the final hidden/cell states
        of all cells, each of shape [len(cells), batch_size, hidden_dim].
        """
        # Work on copies so the caller's initial states are never modified;
        # the extra dim gives every layer its own [1, batch, hidden] slot.
        hidden_states = hidden_init.clone().unsqueeze(1)
        cell_states = cell_init.clone().unsqueeze(1)
        output = None
        # NOTE(review): every cell is fed the raw input `x`, not the output of
        # the previous cell, and only the last cell's output is kept, so
        # layers > 1 are not stacked in the usual sense. Kept as in the
        # original because changing it would alter the parameter shapes.
        for j, cell in enumerate(cells):
            output, (hidden_states[j], cell_states[j]) = cell(
                x, hidden_states[j].clone(), cell_states[j].clone(), device)
        return output, hidden_states.squeeze(1), cell_states.squeeze(1)

    def forward(self, x, hidden_state, cell_state):
        """Forward pass for the LSTM network

        Parameters
        ==========
        x: [sequence_length, batch_size, input_dim]
        hidden_state: [layers, batch_size, hidden_dim]
            initial hidden state; [2 * layers, batch_size, hidden_dim]
            if bidirectional=True
        cell_state: [layers, batch_size, hidden_dim]
            initial cell state; [2 * layers, batch_size, hidden_dim]
            if bidirectional=True

        The initial states are not modified.

        Returns
        =======
        output, (hidden_state, cell_state)
            output: [sequence_length, batch_size, hidden_dim]
                contains the output/hidden_state from all the timesteps
                for the final layer in sequence 1...T
            hidden_state: [layers, batch_size, hidden_dim]
                contains the hidden_state from the last timestep T
                from all the layers
            cell_state: [layers, batch_size, hidden_dim]
                contains the cell_state from the last timestep T
                from all the layers

            If bidirectional=True
                output: [sequence_length, batch_size, 2 * hidden_dim]
                    [:,:,:hidden_dim] - for left-to-right
                    [:,:,hidden_dim:] - for right-to-left
                hidden_state: [2 * layers, batch_size, hidden_dim]
                    [:layers,:,:] - for left-to-right
                    [layers:,:,:] - for right-to-left
                cell_state: [2 * layers, batch_size, hidden_dim]
                    [:layers,:,:] - for left-to-right
                    [layers:,:,:] - for right-to-left
        """
        device = 'cuda' if x.is_cuda else 'cpu'
        n_layers = self.layers

        # Left-to-right pass
        # index of state is equivalent to index of layer in LSTM stack
        output, hidden_states, cell_states = self._run_direction(
            self.model, x, hidden_state[:n_layers], cell_state[:n_layers], device)

        # Right-to-left pass
        if self.bidirectional:
            def reverse_init(state):
                # Rows [layers:2*layers] belong to the reverse direction; if
                # only `layers` rows were supplied, share them with the
                # forward direction instead of failing.
                if state.shape[0] >= 2 * n_layers:
                    return state[n_layers:2 * n_layers]
                return state[:n_layers]

            # flipping inputs/rearranging x to be in reverse timestep order
            x_rev = torch.flip(x, [0])  # reversing only the sequence dimension
            output_rev, hidden_states_rev, cell_states_rev = self._run_direction(
                self.model_rev, x_rev, reverse_init(hidden_state),
                reverse_init(cell_state), device)
            # flipping outputs to be in correct timestep order
            output_rev = torch.flip(output_rev, [0])  # reversing only the sequence dimension
            # concatenating tensors
            ## creating tensors as expected in
            ## here: https://pytorch.org/docs/stable/nn.html#torch.nn.LSTM
            hidden_states = torch.cat((hidden_states, hidden_states_rev), dim=0)
            cell_states = torch.cat((cell_states, cell_states_rev), dim=0)
            output = torch.cat((output, output_rev), dim=2)

        return output, (hidden_states, cell_states)

In [48]:
# from lstm import LSTM

class LSTMSeqLabel(nn.Module):
    """ LSTM Class for Sequence Labelling (many-to-one)

    The class creates the LSTM architecture as specified by the parameters.
    A fully connected layer is added to reduce the LSTM output at the last
    timestep to output_dim.

    Parameters
    ==========
    input_dim: dimension of the input at every timestep
    hidden_dim: number of hidden nodes required
    output_dim: number of output nodes required
    layers: number of LSTM cells to be stacked for depth
    bidirectional: boolean
    layernorm: boolean

    """
    def __init__(self, input_dim, hidden_dim, output_dim,
                 layers=1, bidirectional=False, layernorm=False):
        super().__init__()

        self.hidden_dim = hidden_dim
        self.bidirectional = bidirectional
        self.layernorm = layernorm

        self.lstm = LSTM(input_dim=input_dim, hidden_dim=hidden_dim, layers=layers,
                         bidirectional=bidirectional, layernorm=layernorm)
        # A bidirectional LSTM concatenates both directions along the feature dim
        fc_in = 2 * hidden_dim if self.bidirectional else hidden_dim
        self.fc = nn.Linear(fc_in, output_dim)

    def forward(self, x, hidden_state, cell_state):
        """Return the prediction for each sequence in the batch.

        The LSTM output at the last timestep is passed through the fully
        connected layer; the result keeps a leading dimension of size 1.
        """
        output, (_, _) = self.lstm(x, hidden_state, cell_state)
        output = output[-1].unsqueeze(0)
        output = self.fc(output)
        return output

    def save(self, file_path='./model.pkl'):
        """Write the model's state_dict to `file_path`."""
        torch.save(self.state_dict(), file_path)

    def load(self, file_path):
        """Load a state_dict written by `save` into this model.

        Tensors are mapped to the CPU while reading, so a checkpoint saved on
        a GPU also loads on a CPU-only machine; load_state_dict then copies
        the values into the model's existing parameters.
        """
        self.load_state_dict(torch.load(file_path, map_location='cpu'))

    def count_parameters(self):
        """Return the number of trainable parameters in the LSTM and the output layer."""
        tot_sum = sum(p.numel() for p in self.lstm.parameters() if p.requires_grad)
        tot_sum += sum(p.numel() for p in self.fc.parameters() if p.requires_grad)
        return tot_sum


In [49]:
# Build our LSTM classifier from the shared configuration so that it matches
# the settings train()/evaluate() and the PyTorch baseline use.
model = LSTMSeqLabel(input_dim, hidden_dim, output_dim, bidirectional=bidirectional,
                     layers=layers, layernorm=layernorm).to(device)
print(model.count_parameters())
# BCEWithLogitsLoss expects raw logits, which is what the final linear layer outputs
loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

2465


In [50]:
# Move the data to the configured device before training
train_x = train_x.to(device)
train_y = train_y.to(device)
test_x = test_x.to(device)
test_y = test_y.to(device)

# NOTE: epochs=500 is passed explicitly; the `epochs` value from the
# configuration cell is not used here.
train(model, train_x, train_y, test_x, test_y, epochs=500, loss_fn=loss_fn, optimizer=optimizer)

Epoch #1  : Batch  90/90  -- Loss: 0.32766
Average Loss: 0.617631
Training F1: 0.0
Test F1: 0.0
Epoch #2  : Batch  90/90  -- Loss: 0.28181
Average Loss: 0.576765
Training F1: 0.0
Test F1: 0.0
Epoch #3  : Batch  90/90  -- Loss: 0.27816
Average Loss: 0.57674
Training F1: 0.0
Test F1: 0.0
Epoch #4  : Batch  90/90  -- Loss: 0.30521
Average Loss: 0.575937
Training F1: 0.0
Test F1: 0.0
Epoch #5  : Batch  90/90  -- Loss: 0.88851
Average Loss: 0.579744
Training F1: 0.0
Test F1: 0.0
Epoch #6  : Batch  90/90  -- Loss: 0.79432
Average Loss: 0.578506
Training F1: 0.0
Test F1: 0.0
Epoch #7  : Batch  90/90  -- Loss: 0.32734
Average Loss: 0.576033
Training F1: 0.0
Test F1: 0.0
Epoch #8  : Batch  90/90  -- Loss: 0.33777
Average Loss: 0.575859
Training F1: 0.0
Test F1: 0.0
Epoch #9  : Batch  90/90  -- Loss: 0.56343
Average Loss: 0.578558
Training F1: 0.0
Test F1: 0.0
Epoch #10 : Batch  90/90  -- Loss: 0.28159
Average Loss: 0.574664
Training F1: 0.0
Test F1: 0.0
Epoch #11 : Batch  90/90  -- Loss: 0.6044

Epoch #86 : Batch  90/90  -- Loss: 0.524498
Average Loss: 0.360302
Training F1: 0.6393
Test F1: 0.5349
Epoch #87 : Batch  90/90  -- Loss: 0.5667565
Average Loss: 0.359741
Training F1: 0.6641
Test F1: 0.6132
Epoch #88 : Batch  90/90  -- Loss: 0.5048411
Average Loss: 0.357354
Training F1: 0.5561
Test F1: 0.3817
Epoch #89 : Batch  90/90  -- Loss: 0.716949
Average Loss: 0.357729
Training F1: 0.6143
Test F1: 0.5333
Epoch #90 : Batch  90/90  -- Loss: 0.363796
Average Loss: 0.355099
Training F1: 0.545
Test F1: 0.4058
Epoch #91 : Batch  90/90  -- Loss: 0.360429
Average Loss: 0.356629
Training F1: 0.5922
Test F1: 0.472
Epoch #92 : Batch  90/90  -- Loss: 0.175272
Average Loss: 0.353627
Training F1: 0.5237
Test F1: 0.3492
Epoch #93 : Batch  90/90  -- Loss: 0.3192449
Average Loss: 0.358039
Training F1: 0.58
Test F1: 0.4414
Epoch #94 : Batch  90/90  -- Loss: 0.6335727
Average Loss: 0.35488
Training F1: 0.5479
Test F1: 0.3411
Epoch #95 : Batch  90/90  -- Loss: 0.333491
Average Loss: 0.367108
Trainin

Epoch #166: Batch  90/90  -- Loss: 0.5307933
Average Loss: 0.341013
Training F1: 0.5905
Test F1: 0.4094
Epoch #167: Batch  90/90  -- Loss: 0.0016235
Average Loss: 0.337921
Training F1: 0.5995
Test F1: 0.4604
Epoch #168: Batch  90/90  -- Loss: 0.552672
Average Loss: 0.340644
Training F1: 0.654
Test F1: 0.5542
Epoch #169: Batch  90/90  -- Loss: 0.341618
Average Loss: 0.33855
Training F1: 0.6813
Test F1: 0.5914
Epoch #170: Batch  90/90  -- Loss: 0.3949449
Average Loss: 0.339179
Training F1: 0.6129
Test F1: 0.4526
Epoch #171: Batch  90/90  -- Loss: 0.2928178
Average Loss: 0.338057
Training F1: 0.6368
Test F1: 0.5283
Epoch #172: Batch  90/90  -- Loss: 0.3038718
Average Loss: 0.337565
Training F1: 0.6667
Test F1: 0.5389
Epoch #173: Batch  90/90  -- Loss: 0.0013661
Average Loss: 0.337087
Training F1: 0.6292
Test F1: 0.4286
Epoch #174: Batch  90/90  -- Loss: 0.346937
Average Loss: 0.337836
Training F1: 0.6193
Test F1: 0.48
Epoch #175: Batch  90/90  -- Loss: 0.0009979
Average Loss: 0.336242
Tra

Epoch #246: Batch  90/90  -- Loss: 0.0019791
Average Loss: 0.128787
Training F1: 0.8779
Test F1: 0.8027
Epoch #247: Batch  90/90  -- Loss: 0.43792791
Average Loss: 0.163685
Training F1: 0.8179
Test F1: 0.7087
Epoch #248: Batch  90/90  -- Loss: 0.00718799
Average Loss: 0.137024
Training F1: 0.8918
Test F1: 0.8138
Epoch #249: Batch  90/90  -- Loss: 0.00230159
Average Loss: 0.140846
Training F1: 0.8835
Test F1: 0.791
Epoch #250: Batch  90/90  -- Loss: 0.0847972
Average Loss: 0.122335
Training F1: 0.877
Test F1: 0.8029
Epoch #251: Batch  90/90  -- Loss: 0.00135468
Average Loss: 0.15093
Training F1: 0.7787
Test F1: 0.6522
Epoch #252: Batch  90/90  -- Loss: 0.09846629
Average Loss: 0.190612
Training F1: 0.7624
Test F1: 0.5937
Epoch #253: Batch  90/90  -- Loss: 0.00018368
Average Loss: 0.179504
Training F1: 0.7416
Test F1: 0.541
Epoch #254: Batch  90/90  -- Loss: 0.28273833
Average Loss: 0.175793
Training F1: 0.8421
Test F1: 0.7619
Epoch #255: Batch  90/90  -- Loss: 0.25879243
Average Loss: 0

Epoch #325: Batch  90/90  -- Loss: 0.03327974
Average Loss: 0.0119746
Training F1: 1.0
Test F1: 0.9926
Epoch #326: Batch  90/90  -- Loss: 0.01036117
Average Loss: 0.00972016
Training F1: 1.0
Test F1: 0.9926
Epoch #327: Batch  90/90  -- Loss: 0.00631843
Average Loss: 0.0080781
Training F1: 1.0
Test F1: 0.9926
Epoch #328: Batch  90/90  -- Loss: 7.7569e-05
Average Loss: 0.00686039
Training F1: 1.0
Test F1: 0.9926
Epoch #329: Batch  90/90  -- Loss: 0.00626395
Average Loss: 0.00643535
Training F1: 1.0
Test F1: 0.9926
Epoch #330: Batch  90/90  -- Loss: 0.00476993
Average Loss: 0.0053989
Training F1: 1.0
Test F1: 0.9926
Epoch #331: Batch  90/90  -- Loss: 0.00679585
Average Loss: 0.00490524
Training F1: 1.0
Test F1: 0.9926
Epoch #332: Batch  90/90  -- Loss: 0.01178458
Average Loss: 0.00448421
Training F1: 1.0
Test F1: 0.9926
Epoch #333: Batch  90/90  -- Loss: 5.674e-055
Average Loss: 0.00408089
Training F1: 1.0
Test F1: 0.9926
Epoch #334: Batch  90/90  -- Loss: 0.00941173
Average Loss: 0.00378

Epoch #404: Batch  90/90  -- Loss: 0.00014869
Average Loss: 6.61542e-05
Training F1: 1.0
Test F1: 0.9926
Epoch #405: Batch  90/90  -- Loss: 0.00013719
Average Loss: 6.26816e-05
Training F1: 1.0
Test F1: 0.9926
Epoch #406: Batch  90/90  -- Loss: 3.5763e-07
Average Loss: 5.86767e-05
Training F1: 1.0
Test F1: 0.9926
Epoch #407: Batch  90/90  -- Loss: 0.00010688
Average Loss: 5.64128e-05
Training F1: 1.0
Test F1: 0.9926
Epoch #408: Batch  90/90  -- Loss: 0.00020811
Average Loss: 5.40212e-05
Training F1: 1.0
Test F1: 0.9926
Epoch #409: Batch  90/90  -- Loss: 5.4951e-05
Average Loss: 5.05255e-05
Training F1: 1.0
Test F1: 0.9926
Epoch #410: Batch  90/90  -- Loss: 4.1333e-05
Average Loss: 4.79143e-05
Training F1: 1.0
Test F1: 0.9926
Epoch #411: Batch  90/90  -- Loss: 4.6012e-05
Average Loss: 4.55171e-05
Training F1: 1.0
Test F1: 0.9926
Epoch #412: Batch  90/90  -- Loss: 5.4836e-06
Average Loss: 4.31582e-05
Training F1: 1.0
Test F1: 0.9926
Epoch #413: Batch  90/90  -- Loss: 1.8715e-05
Average L

Epoch #482: Batch  90/90  -- Loss: 5.9605e-07
Average Loss: 1.55187e-06
Training F1: 1.0
Test F1: 0.9926
Epoch #483: Batch  90/90  -- Loss: 1.1325e-06
Average Loss: 1.48548e-06
Training F1: 1.0
Test F1: 0.9926
Epoch #484: Batch  90/90  -- Loss: 3.5763e-06
Average Loss: 1.4315e-06
Training F1: 1.0
Test F1: 0.9926
Epoch #485: Batch  90/90  -- Loss: 1.0729e-06
Average Loss: 1.35335e-06
Training F1: 1.0
Test F1: 0.9926
Epoch #486: Batch  90/90  -- Loss: 1.3709e-06
Average Loss: 1.2964e-06
Training F1: 1.0
Test F1: 0.9926
Epoch #487: Batch  90/90  -- Loss: 2.9802e-08
Average Loss: 1.22885e-06
Training F1: 1.0
Test F1: 0.9926
Epoch #488: Batch  90/90  -- Loss: 5.9605e-08
Average Loss: 1.1747e-06
Training F1: 1.0
Test F1: 0.9926
Epoch #489: Batch  90/90  -- Loss: 2.0862e-07
Average Loss: 1.12603e-06
Training F1: 1.0
Test F1: 0.9926
Epoch #490: Batch  90/90  -- Loss: 0.0838e-06
Average Loss: 1.07305e-06
Training F1: 1.0
Test F1: 0.9853
Epoch #491: Batch  90/90  -- Loss: 1.3709e-06
Average Loss

LSTMSeqLabel(
  (lstm): LSTM(
    (model): ModuleList(
      (0): LSTMCell(
        (g1): Sigmoid()
        (g2): Tanh()
      )
    )
    (model_rev): ModuleList(
      (0): LSTMCell(
        (g1): Sigmoid()
        (g2): Tanh()
      )
    )
  )
  (fc): Linear(in_features=32, out_features=1, bias=True)
)

## PyTorch baseline

In [44]:
# using PyTorch LSTM module
class PyTorchBaseline(nn.Module):
    """Baseline classifier built on torch.nn.LSTM (many-to-one)

    The LSTM output at the last timestep is optionally layer-normalised and
    then reduced to n_output by a fully connected layer.

    Parameters
    ==========
    n_input: dimension of the input at every timestep
    n_hidden: number of hidden nodes
    n_output: number of output nodes
    layers: number of stacked LSTM layers
    bidirectional: boolean
    layernorm: boolean, apply LayerNorm to the LSTM output before the
        fully connected layer

    """

    def __init__(self, n_input, n_hidden, n_output,
                 layers=1, bidirectional=False, layernorm=False):
        super().__init__()

        self.hidden_dim = n_hidden
        self.bidirectional = bidirectional
        self.layers = layers
        self.layernorm = layernorm

        self.lstm = nn.LSTM(n_input, n_hidden, bidirectional=self.bidirectional, num_layers=layers)
        # A bidirectional LSTM concatenates both directions along the feature dim
        feature_dim = 2 * n_hidden if self.bidirectional else n_hidden
        self.fc = nn.Linear(feature_dim, n_output)
        if self.layernorm:
            self.ln = nn.LayerNorm(feature_dim)

    def forward(self, x, h, c):
        """Return the prediction for each sequence in the batch.

        x: [sequence_length, batch_size, n_input]
        h, c: initial hidden/cell states as expected by torch.nn.LSTM
        The result keeps a leading dimension of size 1.
        """
        o, (_, _) = self.lstm(x, (h, c))
        o = o[-1].unsqueeze(0)
        if self.layernorm:
            output = self.fc(self.ln(o))
        else:
            output = self.fc(o)
        return output

    def save(self, file_path='./model.pkl'):
        """Write the model's state_dict to `file_path`."""
        torch.save(self.state_dict(), file_path)

    def load(self, file_path):
        """Load a state_dict written by `save` into this model.

        Tensors are mapped to the CPU while reading, so a checkpoint saved on
        a GPU also loads on a CPU-only machine; load_state_dict then copies
        the values into the model's existing parameters.
        """
        self.load_state_dict(torch.load(file_path, map_location='cpu'))

    def count_parameters(self):
        """Return the number of trainable parameters (LSTM, output layer and,
        when enabled, LayerNorm)."""
        return sum(p.numel() for p in self.parameters() if p.requires_grad)

In [45]:
# Build the torch.nn.LSTM baseline from the shared configuration.
# This cell binds the names `model`, `loss_fn` and `optimizer`, which the
# cell for our implementation also uses.
model = PyTorchBaseline(input_dim, hidden_dim, output_dim, bidirectional=bidirectional, layers=layers).to(device)
print(model.count_parameters())
# BCEWithLogitsLoss expects raw logits, which is what the final linear layer outputs
loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

2593


In [46]:
# Move the data to the configured device before training
train_x = train_x.to(device)
train_y = train_y.to(device)
test_x = test_x.to(device)
test_y = test_y.to(device)

# NOTE: epochs=500 is passed explicitly; the `epochs` value from the
# configuration cell is not used here.
train(model, train_x, train_y, test_x, test_y, epochs=500, loss_fn=loss_fn, optimizer=optimizer)

Epoch #1  : Batch  90/90  -- Loss: 0.45927
Average Loss: 0.678364
Training F1: 0.0
Test F1: 0.0
Epoch #2  : Batch  90/90  -- Loss: 0.56302
Average Loss: 0.58079
Training F1: 0.0
Test F1: 0.0
Epoch #3  : Batch  90/90  -- Loss: 0.56522
Average Loss: 0.578788
Training F1: 0.0
Test F1: 0.0
Epoch #4  : Batch  90/90  -- Loss: 0.31799
Average Loss: 0.577131
Training F1: 0.0
Test F1: 0.0
Epoch #5  : Batch  90/90  -- Loss: 0.30906
Average Loss: 0.576287
Training F1: 0.0
Test F1: 0.0
Epoch #6  : Batch  90/90  -- Loss: 0.55948
Average Loss: 0.577445
Training F1: 0.0
Test F1: 0.0
Epoch #7  : Batch  90/90  -- Loss: 0.57317
Average Loss: 0.578531
Training F1: 0.0
Test F1: 0.0
Epoch #8  : Batch  90/90  -- Loss: 0.81123
Average Loss: 0.578521
Training F1: 0.0
Test F1: 0.0
Epoch #9  : Batch  90/90  -- Loss: 0.30972
Average Loss: 0.575707
Training F1: 0.0
Test F1: 0.0
Epoch #10 : Batch  90/90  -- Loss: 0.55801
Average Loss: 0.576991
Training F1: 0.0
Test F1: 0.0
Epoch #11 : Batch  90/90  -- Loss: 0.3141

Epoch #86 : Batch  90/90  -- Loss: 0.79364
Average Loss: 0.5783
Training F1: 0.0
Test F1: 0.0
Epoch #87 : Batch  90/90  -- Loss: 0.55259
Average Loss: 0.577766
Training F1: 0.0
Test F1: 0.0
Epoch #88 : Batch  90/90  -- Loss: 0.33411
Average Loss: 0.575577
Training F1: 0.0
Test F1: 0.0
Epoch #89 : Batch  90/90  -- Loss: 0.59335
Average Loss: 0.576981
Training F1: 0.0
Test F1: 0.0
Epoch #90 : Batch  90/90  -- Loss: 0.30669
Average Loss: 0.574895
Training F1: 0.0
Test F1: 0.0
Epoch #91 : Batch  90/90  -- Loss: 0.59173
Average Loss: 0.576351
Training F1: 0.0
Test F1: 0.0
Epoch #92 : Batch  90/90  -- Loss: 0.30928
Average Loss: 0.574995
Training F1: 0.0
Test F1: 0.0
Epoch #93 : Batch  90/90  -- Loss: 0.57807
Average Loss: 0.576923
Training F1: 0.0
Test F1: 0.0
Epoch #94 : Batch  90/90  -- Loss: 0.30678
Average Loss: 0.575203
Training F1: 0.0
Test F1: 0.0
Epoch #95 : Batch  90/90  -- Loss: 0.31104
Average Loss: 0.574977
Training F1: 0.0
Test F1: 0.0
Epoch #96 : Batch  90/90  -- Loss: 0.32276

Epoch #172: Batch  90/90  -- Loss: 0.58198
Average Loss: 0.575588
Training F1: 0.0
Test F1: 0.0
Epoch #173: Batch  90/90  -- Loss: 0.54034
Average Loss: 0.576004
Training F1: 0.0
Test F1: 0.0
Epoch #174: Batch  90/90  -- Loss: 0.34144
Average Loss: 0.574643
Training F1: 0.0
Test F1: 0.0
Epoch #175: Batch  90/90  -- Loss: 0.99764
Average Loss: 0.577806
Training F1: 0.0
Test F1: 0.0
Epoch #176: Batch  90/90  -- Loss: 0.52845
Average Loss: 0.574447
Training F1: 0.0
Test F1: 0.0
Epoch #177: Batch  90/90  -- Loss: 0.52833
Average Loss: 0.574268
Training F1: 0.0
Test F1: 0.0
Epoch #178: Batch  90/90  -- Loss: 0.30635
Average Loss: 0.572904
Training F1: 0.0
Test F1: 0.0
Epoch #179: Batch  90/90  -- Loss: 0.55328
Average Loss: 0.57288
Training F1: 0.0
Test F1: 0.0
Epoch #180: Batch  90/90  -- Loss: 0.30392
Average Loss: 0.572101
Training F1: 0.0
Test F1: 0.0
Epoch #181: Batch  90/90  -- Loss: 0.29576
Average Loss: 0.572817
Training F1: 0.0
Test F1: 0.0
Epoch #182: Batch  90/90  -- Loss: 1.2683

Epoch #253: Batch  90/90  -- Loss: 0.4906465
Average Loss: 0.282823
Training F1: 0.7533
Test F1: 0.6533
Epoch #254: Batch  90/90  -- Loss: 0.185746
Average Loss: 0.279182
Training F1: 0.7519
Test F1: 0.6624
Epoch #255: Batch  90/90  -- Loss: 0.212583
Average Loss: 0.272331
Training F1: 0.7586
Test F1: 0.6497
Epoch #256: Batch  90/90  -- Loss: 0.084369
Average Loss: 0.271796
Training F1: 0.7677
Test F1: 0.6753
Epoch #257: Batch  90/90  -- Loss: 0.209373
Average Loss: 0.268731
Training F1: 0.7912
Test F1: 0.6879
Epoch #258: Batch  90/90  -- Loss: 0.227871
Average Loss: 0.262984
Training F1: 0.7688
Test F1: 0.6667
Epoch #259: Batch  90/90  -- Loss: 0.547391
Average Loss: 0.26169
Training F1: 0.7828
Test F1: 0.6974
Epoch #260: Batch  90/90  -- Loss: 0.068621
Average Loss: 0.25651
Training F1: 0.797
Test F1: 0.6711
Epoch #261: Batch  90/90  -- Loss: 0.272975
Average Loss: 0.260352
Training F1: 0.7868
Test F1: 0.68
Epoch #262: Batch  90/90  -- Loss: 0.453091
Average Loss: 0.252414
Training F

Epoch #332: Batch  90/90  -- Loss: 0.0523134
Average Loss: 0.0408116
Training F1: 0.9974
Test F1: 0.9618
Epoch #333: Batch  90/90  -- Loss: 0.2269419
Average Loss: 0.0409718
Training F1: 1.0
Test F1: 0.9618
Epoch #334: Batch  90/90  -- Loss: 0.0012478
Average Loss: 0.0392897
Training F1: 1.0
Test F1: 0.9697
Epoch #335: Batch  90/90  -- Loss: 0.1988451
Average Loss: 0.0366872
Training F1: 0.9973
Test F1: 0.9692
Epoch #336: Batch  90/90  -- Loss: 0.0276229
Average Loss: 0.0336763
Training F1: 0.9869
Test F1: 0.9706
Epoch #337: Batch  90/90  -- Loss: 0.0486611
Average Loss: 0.0325859
Training F1: 1.0
Test F1: 0.9612
Epoch #338: Batch  90/90  -- Loss: 0.02148911
Average Loss: 0.0297609
Training F1: 0.9974
Test F1: 0.9774
Epoch #339: Batch  90/90  -- Loss: 0.00287683
Average Loss: 0.0290608
Training F1: 1.0
Test F1: 0.9848
Epoch #340: Batch  90/90  -- Loss: 0.0006058
Average Loss: 0.0262582
Training F1: 1.0
Test F1: 0.9848
Epoch #341: Batch  90/90  -- Loss: 0.0086941
Average Loss: 0.0251354

Epoch #411: Batch  90/90  -- Loss: 0.00035372
Average Loss: 0.00158268
Training F1: 1.0
Test F1: 1.0
Epoch #412: Batch  90/90  -- Loss: 0.00099056
Average Loss: 0.00150909
Training F1: 1.0
Test F1: 1.0
Epoch #413: Batch  90/90  -- Loss: 0.00094177
Average Loss: 0.00144537
Training F1: 1.0
Test F1: 1.0
Epoch #414: Batch  90/90  -- Loss: 0.00148846
Average Loss: 0.00139549
Training F1: 1.0
Test F1: 1.0
Epoch #415: Batch  90/90  -- Loss: 0.00176659
Average Loss: 0.00135028
Training F1: 1.0
Test F1: 1.0
Epoch #416: Batch  90/90  -- Loss: 7.3909e-06
Average Loss: 0.0012974
Training F1: 1.0
Test F1: 1.0
Epoch #417: Batch  90/90  -- Loss: 0.00050857
Average Loss: 0.00125849
Training F1: 1.0
Test F1: 1.0
Epoch #418: Batch  90/90  -- Loss: 0.00036858
Average Loss: 0.00122283
Training F1: 1.0
Test F1: 1.0
Epoch #419: Batch  90/90  -- Loss: 0.00046539
Average Loss: 0.0011922
Training F1: 1.0
Test F1: 1.0
Epoch #420: Batch  90/90  -- Loss: 0.00057106
Average Loss: 0.00115765
Training F1: 1.0
Test 

Epoch #490: Batch  90/90  -- Loss: 0.00025536
Average Loss: 0.000122462
Training F1: 1.0
Test F1: 0.9848
Epoch #491: Batch  90/90  -- Loss: 0.00019134
Average Loss: 0.000120058
Training F1: 1.0
Test F1: 0.9848
Epoch #492: Batch  90/90  -- Loss: 6.0198e-05
Average Loss: 0.000114907
Training F1: 1.0
Test F1: 0.9848
Epoch #493: Batch  90/90  -- Loss: 9.1492e-06
Average Loss: 0.000108502
Training F1: 1.0
Test F1: 0.9848
Epoch #494: Batch  90/90  -- Loss: 2.8638e-05
Average Loss: 0.000102724
Training F1: 1.0
Test F1: 0.9848
Epoch #495: Batch  90/90  -- Loss: 3.8861e-05
Average Loss: 9.76e-05
Training F1: 1.0
Test F1: 0.9925
Epoch #496: Batch  90/90  -- Loss: 5.2986e-05
Average Loss: 9.69174e-05
Training F1: 1.0
Test F1: 0.9925
Epoch #497: Batch  90/90  -- Loss: 5.239e-055
Average Loss: 9.57881e-05
Training F1: 1.0
Test F1: 0.9848
Epoch #498: Batch  90/90  -- Loss: 5.8021e-05
Average Loss: 8.77893e-05
Training F1: 1.0
Test F1: 0.9848
Epoch #499: Batch  90/90  -- Loss: 1.4633e-05
Average Loss

PyTorchBaseline(
  (lstm): LSTM(2, 16, bidirectional=True)
  (fc): Linear(in_features=32, out_features=1, bias=True)
)

In [52]:
# Fresh, untrained instances of both models, built from the same configuration
# so that their parameter counts are directly comparable.
our = LSTMSeqLabel(input_dim, hidden_dim, output_dim, bidirectional=bidirectional, layers=layers).to(device)
pytorch = PyTorchBaseline(input_dim, hidden_dim, output_dim, bidirectional=bidirectional, layers=layers).to(device)

In [58]:
# Heading underlined with '=' characters of the same length, followed by the
# total parameter count and the shape of every named parameter tensor.
print("Our implementation\n{}".format("=" * len("Our implementation")))
print("# of parameters: {}".format(our.count_parameters()))
for name, param in our.named_parameters():
    print("{:<25}: {}".format(name, param.shape))

Our implementation
# of parameters: 2465
lstm.model.0.weights     : torch.Size([18, 64])
lstm.model.0.bias        : torch.Size([64])
lstm.model_rev.0.weights : torch.Size([18, 64])
lstm.model_rev.0.bias    : torch.Size([64])
fc.weight                : torch.Size([1, 32])
fc.bias                  : torch.Size([1])


In [59]:
# Heading underlined with '=' characters of the same length, followed by the
# total parameter count and the shape of every named parameter tensor.
print("PyTorch implementation\n{}".format("=" * len("PyTorch implementation")))
print("# of parameters: {}".format(pytorch.count_parameters()))
for name, param in pytorch.named_parameters():
    print("{:<25}: {}".format(name, param.shape))

PyTorch implementation
# of parameters: 2593
lstm.weight_ih_l0        : torch.Size([64, 2])
lstm.weight_hh_l0        : torch.Size([64, 16])
lstm.bias_ih_l0          : torch.Size([64])
lstm.bias_hh_l0          : torch.Size([64])
lstm.weight_ih_l0_reverse: torch.Size([64, 2])
lstm.weight_hh_l0_reverse: torch.Size([64, 16])
lstm.bias_ih_l0_reverse  : torch.Size([64])
lstm.bias_hh_l0_reverse  : torch.Size([64])
fc.weight                : torch.Size([1, 32])
fc.bias                  : torch.Size([1])


PyTorch computes the gates as $W_{ih} x + b_{ih} + W_{hh} h + b_{hh}$, whereas we compute $W x' + b$, where $x'$ is $h$ and $x$ concatenated. Therefore PyTorch has an extra set of biases for each direction.

- Extra biases for the forward direction: 64
- Extra biases for the reverse direction: 64

Our model has $2465$ parameters while PyTorch model has $2465 + 64 + 64 = 2593$ parameters.