In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

import os
import json
import time
import random
import numpy as np
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix, f1_score, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score

from IPython.display import Image

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
seed = 42

## Reverse sequence dataset

In [1040]:
for i in range(train_size):
    for j in range(sequence_length):
        target[j, i, train_x[j, i,].item()] = 1
# target = torch.flip(target, [0]).long()
train_x = train_x.float()

In [1039]:
sequence_length = 5
train_size = 100
train_x = torch.randint(0, 2, (sequence_length, train_size, input_dim))
target = torch.zeros(sequence_length, train_size, output_dim)

In [1]:
import torch
import torch.nn as nn
import numpy as np
import os
import json
import time
import random
from matplotlib import pyplot as plt

from lstm import LSTM

from sklearn.metrics import confusion_matrix, f1_score, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score

class LSTMSeq2SeqDifferent(nn.Module):
    """ LSTM Class for Sequence Labelling (many-to-many-different)

    The class creates the LSTM architecture as specified by the parameters.
    A fully connected layer is added to reduce the last hidden state to output_dim.

    Parameters
    ==========
    vocab_len: int from imdb dataset
    embed_dim: dimensions of the embeddings
    hidden_dim: number of hidden nodes required
    output_dim: numer of output nodes required (1 for sentiment analysis)
    pretrained_vec: weights from imdb object
    layers: number of LSTM cells to be stacked for depth
    bidirectional: boolean
    layernorm: boolean

    """
    def __init__(self, input_dim, hidden_dim, output_dim, layers=1,
                 bidirectional=False, layernorm=False):
        super().__init__()

        self.input_dim = input_dim
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.layers = layers
        self.bidirectional = bidirectional
        self.layernorm = layernorm

        self.encoder = LSTM(input_dim=input_dim, hidden_dim=hidden_dim, layers=layers,
                         bidirectional=bidirectional, layernorm=layernorm)
        if self.bidirectional:
            self.decoder = LSTM(input_dim=output_dim, hidden_dim=2 * hidden_dim, layers=layers,
                                bidirectional=False, layernorm=layernorm)
            self.fc = nn.Linear(2 * hidden_dim, output_dim)
        else:
            self.decoder = LSTM(input_dim=output_dim, hidden_dim=hidden_dim, layers=layers,
                                bidirectional=False, layernorm=layernorm)
            self.fc = nn.Linear(hidden_dim, output_dim)
        self.softmax = nn.Softmax(dim=2)

    def forward(self, x, target, hidden_state, cell_state, teacher_forcing=0.5):
        # encoding
        _, (hidden_state, cell_state) = self.encoder(x, hidden_state, cell_state)
        batch_size = x.shape[1]
        timesteps = x.shape[0]
        x = torch.zeros(1, batch_size, self.output_dim).to(device)
        output = torch.tensor([]).to(device)
        if self.bidirectional:
            # concatenating hidden states from two directions
            hidden_state = torch.cat((hidden_state[:,0,:,:], hidden_state[:,1,:,:]), dim=2)
            cell_state = torch.cat((cell_state[:,0,:,:], cell_state[:,1,:,:]), dim=2)
#         else:
#             hidden_state = hidden_state[-1].unsqueeze(0)
#             cell_state = cell_state[-1].unsqueeze(0)
        # decoding
        for t in range(timesteps):
            # taking hidden state from last layer
#             if self.bidirectional:
#                 hidden_state = hidden_state[-1,:,:].unsqueeze(0)
#                 cell_state = cell_state[-1,:,:].unsqueeze(0)
#             else:
            hidden_state = hidden_state[-1].unsqueeze(0)
            cell_state = cell_state[-1].unsqueeze(0)            
            x, (hidden_state, cell_state) = self.decoder(x, hidden_state, cell_state)            
            x = self.softmax(self.fc(x))
            output = torch.cat((output, x), dim=0)
            choice = random.random() 
            if choice < teacher_forcing:
                x = target[t].float().to(device)
                x = x.unsqueeze(0)
            else:
                # converting x to a one-hot encoding
                x = torch.zeros(x.shape).to(device).scatter_(2, torch.argmax(x, -1, keepdim=True), 1)
        return output

    def save(self, file_path='./model.pkl'):
        torch.save(self.state_dict(), file_path)

    def load(self, file_path):
        self.load_state_dict(torch.load(file_path))

    def count_parameters(self):
        tot_sum = sum(p.numel() for p in self.encoder.parameters() if p.requires_grad)
        tot_sum += sum(p.numel() for p in self.decoder.parameters() if p.requires_grad)
        tot_sum += sum(p.numel() for p in self.fc.parameters() if p.requires_grad)
        return tot_sum


In [2]:
input_dim = 1
hidden_dim = 256
output_dim = 2
batch_size = 8
layers = 1
bidirectional = True
layernorm = True
device = 'cuda'

model = LSTMSeq2SeqDifferent(input_dim=input_dim, hidden_dim=hidden_dim, 
                             output_dim=output_dim, layers=layers,
                             bidirectional=bidirectional, layernorm=layernorm).to(device)
model.count_parameters()

1594370

#### Training

In [1041]:
# print(train_x.shape, target.shape)
# print(train_x[:,0,:])
# print(target[:,0,:])

#### Testing

In [1042]:
sequence_length = 5
test_size = 50
test_x = torch.randint(0, 2, (sequence_length, test_size, input_dim))
test_target = torch.zeros(sequence_length, test_size, output_dim)

In [1043]:
for i in range(test_size):
    for j in range(sequence_length):
        test_target[j, i, test_x[j, i,].item()] = 1
# target = torch.flip(target, [0]).long()
test_x = test_x.float()

In [1044]:
# print(test_x.shape, test_target.shape)
# print(test_x[:,0,:])
# print(test_target[:,0,:])

In [1045]:
## flattening softmax output sample for preds in loss_fn
# print(target.view(-1, target.shape[-1])[:20])
## flattening softmax output sample for gt in loss_fn
# print(torch.argmax(target, 2).view(-1)[:20])
# loss_fn = nn.CrossEntropyLoss()
# loss = loss_fn(target.view(-1, target.shape[-1])[:20], torch.argmax(target, 2).view(-1)[:20])
# loss.item()

In [1046]:
# h = torch.zeros(1, batch_size, hidden_dim).to(device)
# c = torch.zeros(1, batch_size, hidden_dim).to(device)
# o = model(train_x[:,:batch_size,:].to(device), target[:,:batch_size,:].to(device), h, c, 0)

In [1061]:
print(torch.argmax(o.view(-1, o.shape[-1]), 1).shape)
print(torch.argmax(target[:,:batch_size,:], 2).view(-1).shape)
f1_score(torch.argmax(o.view(-1, o.shape[-1]), 1).cpu(), 
         torch.argmax(target[:,:batch_size,:], 2).view(-1))

torch.Size([40])
torch.Size([40])


0.5454545454545454

## Training

In [1050]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
# print(target[:,0,:])

In [1051]:
hidden_state = torch.zeros(1, batch_size, hidden_dim).to(device)
cell_state = torch.zeros(1, batch_size, hidden_dim).to(device)
hidden_state.shape, cell_state.shape

(torch.Size([1, 8, 256]), torch.Size([1, 8, 256]))

In [1104]:
def train(model, x, y, batch_size, epochs, loss_fn, optimizer, 
          teacher_forcing=0.5, test_x=None, test_y=None):
    model.train()
    stats = {'train':[], 'valid':[]}
    train_size = x.shape[1]
    for i in range(1, epochs + 1):
        loss_tracker = []
        ordering = torch.randperm(train_size)
        x = x[:, ordering, :]
        y = y[:, ordering, :]
        for j in range(int(train_size/batch_size) + 1):
            optimizer.zero_grad()
            start = j*batch_size
            end = min((j+1)*batch_size, train_size)
            batch = end - start
            if batch == 0:
                continue
            hidden_state = torch.zeros(1, batch, hidden_dim).to(device)
            cell_state = torch.zeros(1, batch, hidden_dim).to(device)
            o = model(x[:,start:end,:], y[:,start:end,:], hidden_state, cell_state, teacher_forcing)
            loss = loss_fn(o.view(-1, o.shape[-1]), torch.argmax(y[:,start:end,:], 2).view(-1))
            loss_tracker.append(loss.item())
            optimizer.step()
            print("Epoch #{:<3d}: Batch {:>3d}/{:<3d} -- "
                  "Loss: {:2.5}".format(i, j+1, int(train_size/batch_size) + 1, 
                                        loss_tracker[-1]), end='\r')
        print()
        print("Average Loss: {:2.6}".format(np.mean(loss_tracker)))
        f1_train = evaluate(model, x, y)
        print("Training Acc.: {:3.4}".format(f1_train))
        if test_x is not None and test_y is not None:
            f1_test = evaluate(model, test_x, test_y)
            print("Test Acc.: {:3.4}".format(f1_test))
        print("="*42)
    return model
        
def evaluate(model, x, y):
    model.eval()
    test_size = x.shape[1]
    labels = []
    preds = []
    for j in range(int(test_size/batch_size) + 1):
        optimizer.zero_grad()
        start = j*batch_size
        end = min((j+1)*batch_size, test_size)
        batch = end - start
        if batch == 0:
            continue
        hidden_state = torch.zeros(1, batch, hidden_dim).to(device)
        cell_state = torch.zeros(1, batch, hidden_dim).to(device)
        with torch.no_grad():
            o = model(x[:,start:end,:], y[:,start:end,:], hidden_state, cell_state, teacher_forcing=0)
        preds.extend(torch.argmax(o.view(-1, o.shape[-1]), 1).cpu().numpy())
        labels.extend(torch.argmax(y[:,start:end,:], 2).view(-1).cpu().numpy())
    return accuracy_score(labels, preds)
#         print(torch.argmax(o.view(-1, o.shape[-1]), 1).cpu().numpy().shape, 
#               torch.argmax(y[:,start:end,:], 2).view(-1).cpu().numpy().shape, len(preds), len(labels))
#     from sklearn.metrics import accuracy_score, precision_score, recall_score
#     return confusion_matrix(labels, preds), accuracy_score(labels, preds), \
#            precision_score(labels, preds), recall_score(labels, preds)
# #     return f1_score(labels, preds), labels, preds

In [1118]:
model = train(model, train_x.to(device), target.to(device), batch_size, 10, loss_fn, optimizer,
              test_x=test_x.to(device), test_y=test_target.to(device))

Epoch #1  : Batch  13/13  -- Loss: 0.68828
Average Loss: 0.69258
Training Acc.: 0.516
Test Acc.: 0.484
Epoch #2  : Batch  13/13  -- Loss: 0.69329
Average Loss: 0.692773
Training Acc.: 0.516
Test Acc.: 0.484
Epoch #3  : Batch  13/13  -- Loss: 0.69329
Average Loss: 0.692773
Training Acc.: 0.516
Test Acc.: 0.484
Epoch #4  : Batch  13/13  -- Loss: 0.69329
Average Loss: 0.692773
Training Acc.: 0.516
Test Acc.: 0.484
Epoch #5  : Batch  13/13  -- Loss: 0.68828
Average Loss: 0.69258
Training Acc.: 0.516
Test Acc.: 0.484
Epoch #6  : Batch  13/13  -- Loss: 0.69329
Average Loss: 0.692773
Training Acc.: 0.516
Test Acc.: 0.484
Epoch #7  : Batch  13/13  -- Loss: 0.69162
Average Loss: 0.692708
Training Acc.: 0.516
Test Acc.: 0.484
Epoch #8  : Batch  13/13  -- Loss: 0.69829
Average Loss: 0.692965
Training Acc.: 0.516
Test Acc.: 0.484
Epoch #9  : Batch  13/13  -- Loss: 0.68661
Average Loss: 0.692516
Training Acc.: 0.516
Test Acc.: 0.484
Epoch #10 : Batch  13/13  -- Loss: 0.68995
Average Loss: 0.692644


In [1106]:
evaluate(model, test_x.to(device), test_target.to(device))

0.484

## PyTorch

In [1003]:
class PyTorchBaseline(nn.Module):
    """ LSTM Class for Sequence Labelling (many-to-many-different)

    The class creates the LSTM architecture as specified by the parameters.
    A fully connected layer is added to reduce the last hidden state to output_dim.

    Parameters
    ==========
    vocab_len: int from imdb dataset
    embed_dim: dimensions of the embeddings
    hidden_dim: number of hidden nodes required
    output_dim: numer of output nodes required (1 for sentiment analysis)
    pretrained_vec: weights from imdb object
    layers: number of LSTM cells to be stacked for depth
    bidirectional: boolean
    layernorm: boolean

    """
    def __init__(self, input_dim, hidden_dim, output_dim, layers=1,
                 bidirectional=False, layernorm=False):
        super().__init__()

        self.input_dim = input_dim
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.layers = layers
        self.bidirectional = bidirectional
        self.layernorm = layernorm

        self.encoder = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim, num_layers=layers,
                         bidirectional=bidirectional) #, layernorm=layernorm)
        if self.bidirectional:
            self.decoder = nn.LSTM(input_size=output_dim, hidden_size=2 * hidden_dim, num_layers=layers,
                                bidirectional=False) #, layernorm=layernorm)
            self.fc = nn.Linear(2 * hidden_dim, output_dim)
        else:
            self.decoder = nn.LSTM(input_size=output_dim, hidden_size=hidden_dim, num_layers=layers,
                                bidirectional=False) #, layernorm=layernorm)
            self.fc = nn.Linear(hidden_dim, output_dim)
        self.softmax = nn.Softmax(dim=2)
    
    def forward(self, x, target, hidden_state, cell_state, teacher_forcing=0.5):
        # encoding
        _, (hidden_state, cell_state) = self.encoder(x, (hidden_state, cell_state))
        batch_size = x.shape[1]
        timesteps = x.shape[0]
        x = torch.zeros(1, batch_size, self.output_dim).to(device)
        output = torch.tensor([]).to(device)
        if self.bidirectional:
            # concatenating hidden states from two directions
            hidden_state = torch.cat((hidden_state[:self.layers,:,:], 
                                      hidden_state[self.layers:,:,:]), dim=2).to(device)
            cell_state = torch.cat((cell_state[:self.layers,:,:], 
                                    cell_state[self.layers:,:,:]), dim=2).to(device)
#         else:
#             hidden_state = hidden_state[-1].unsqueeze(0)
#             cell_state = cell_state[-1].unsqueeze(0)
        # decoding
        for t in range(timesteps):          
            x, (hidden_state, cell_state) = self.decoder(x, (hidden_state, cell_state))            
            x = self.softmax(self.fc(x))
            output = torch.cat((output, x), dim=0)
            choice = random.random() 
            if choice < teacher_forcing:
                x = target[t].float().to(device)
                x = x.unsqueeze(0)
            else:
                # converting x to a one-hot encoding
                x = torch.zeros(x.shape).to(device).scatter_(2, torch.argmax(x, -1, keepdim=True), 1)
        return output

In [1111]:
layers = 1
bidirectional = True

pytorch = PyTorchBaseline(input_dim, hidden_dim, output_dim, layers, bidirectional).to(device)

In [1115]:
def train_pytorch(model, x, y, batch_size, epochs, loss_fn, optimizer, 
                  teacher_forcing=0.5, test_x=None, test_y=None):
    model.train()
    stats = {'train':[], 'valid':[]}
    train_size = x.shape[1]
    for i in range(1, epochs + 1):
        loss_tracker = []
        ordering = torch.randperm(train_size)
        x = x[:, ordering, :]
        y = y[:, ordering, :]
        for j in range(int(train_size/batch_size) + 1):
            optimizer.zero_grad()
            start = j*batch_size
            end = min((j+1)*batch_size, train_size)
            batch = end - start
            if batch == 0:
                continue
            if bidirectional:
                hidden_state = torch.zeros(2*layers, batch, hidden_dim).to(device)
                cell_state = torch.zeros(2*layers, batch, hidden_dim).to(device)
            else:
                hidden_state = torch.zeros(layers, batch, hidden_dim).to(device)
                cell_state = torch.zeros(layers, batch, hidden_dim).to(device)
#             print(hidden_state.shape, cell_state.shape)
            o = model(x[:,start:end,:], y[:,start:end,:], hidden_state, cell_state, teacher_forcing)
            loss = loss_fn(o.view(-1, o.shape[-1]), torch.argmax(y[:,start:end,:], 2).view(-1))
            loss_tracker.append(loss.item())
            optimizer.step()
            print("Epoch #{:<3d}: Batch {:>3d}/{:<3d} -- "
                  "Loss: {:2.5}".format(i, j+1, int(train_size/batch_size) + 1, loss_tracker[-1]), end='\r')
        print()
        print("Average Loss: {:2.6}".format(np.mean(loss_tracker)))
        f1_train = evaluate_pytorch(model, x, y)
        print("Training Acc.: {:3.4}".format(f1_train))
        if test_x is not None and test_y is not None:
            f1_test = evaluate_pytorch(model, test_x, test_y)
            print("Test Acc.: {:3.4}".format(f1_test))
        print("="*42)
        print("="*42)
    return model

        
def evaluate_pytorch(model, x, y):
    model.eval()
    test_size = x.shape[1]
    labels = []
    preds = []
    for j in range(int(test_size/batch_size) + 1):
        optimizer.zero_grad()
        start = j*batch_size
        end = min((j+1)*batch_size, test_size)
        batch = end - start
        if batch == 0:
            continue
        if bidirectional:
            hidden_state = torch.zeros(2*layers, batch, hidden_dim).to(device)
            cell_state = torch.zeros(2*layers, batch, hidden_dim).to(device)
        else:
            hidden_state = torch.zeros(layers, batch, hidden_dim).to(device)
            cell_state = torch.zeros(layers, batch, hidden_dim).to(device)
        with torch.no_grad():
            o = model(x[:,start:end,:], y[:,start:end,:], hidden_state, cell_state, teacher_forcing=0)
        preds.extend(torch.argmax(o.view(-1, o.shape[-1]), 1).cpu().numpy())
        labels.extend(torch.argmax(y[:,start:end,:], 2).view(-1).cpu().numpy())
    return accuracy_score(labels, preds)

In [1117]:
pytorch = train_pytorch(pytorch, train_x.to(device), target.to(device), batch_size, 10, loss_fn, optimizer,
                        test_x=test_x.to(device), test_y=test_target.to(device))

Epoch #1  : Batch  13/13  -- Loss: 0.69443
Average Loss: 0.693386
Training Acc.: 0.43
Test Acc.: 0.416
Epoch #2  : Batch  13/13  -- Loss: 0.69331
Average Loss: 0.693343
Training Acc.: 0.43
Test Acc.: 0.416
Epoch #3  : Batch  13/13  -- Loss: 0.69357
Average Loss: 0.693359
Training Acc.: 0.43
Test Acc.: 0.416
Epoch #4  : Batch  13/13  -- Loss: 0.69306
Average Loss: 0.693332
Training Acc.: 0.43
Test Acc.: 0.416
Epoch #5  : Batch  13/13  -- Loss: 0.69342
Average Loss: 0.69336
Training Acc.: 0.43
Test Acc.: 0.416
Epoch #6  : Batch  13/13  -- Loss: 0.69287
Average Loss: 0.693332
Training Acc.: 0.43
Test Acc.: 0.416
Epoch #7  : Batch  13/13  -- Loss: 0.69411
Average Loss: 0.693389
Training Acc.: 0.43
Test Acc.: 0.416
Epoch #8  : Batch  13/13  -- Loss: 0.69361
Average Loss: 0.693351
Training Acc.: 0.43
Test Acc.: 0.416
Epoch #9  : Batch  13/13  -- Loss: 0.69341
Average Loss: 0.69335
Training Acc.: 0.43
Test Acc.: 0.416
Epoch #10 : Batch  13/13  -- Loss: 0.69238
Average Loss: 0.69331
Training A