In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam

# Pack padded sequence [Batching in RNNs]

In [2]:
# Given the sentences are of different length, we have to pad them
# Specifically useful for batch training

In [3]:
from nltk.tokenize import word_tokenize

In [4]:
import pickle

In [5]:
# Load the pickled training sentences (each entry is one raw sentence string).
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only open pickle files that come from a trusted source.
with open('train_X.p', 'rb') as fs:
    train_X = pickle.load(fs)

In [6]:
# Load the pickled labels that go with train_X.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only open pickle files that come from a trusted source.
with open('train_y.p', 'rb') as fs:
    train_y = pickle.load(fs)

In [7]:
train_X[0]

"The Rock is destined to be the 21st Century 's new `` Conan '' and that he 's going to make a splash even greater than Arnold Schwarzenegger , Jean-Claud Van Damme or Steven Segal ."

In [8]:
train_y[0]

1

In [9]:
# Build the vocabulary in both directions: token -> index and index -> token.
# Numbering starts at 1 so that index 0 stays free for padding.
word2index = {}
index2word = {}
i = 1
for sent in train_X:
    for w in word_tokenize(sent):
        if w in word2index:
            continue  # token already has an index
        word2index[w] = i
        index2word[i] = w
        i += 1

In [10]:
# Give the out-of-vocabulary marker the next free index (i is left one past
# the last real token by the vocabulary loop).
index2word[i] = '<UNK>'
word2index['<UNK>'] = i

In [11]:
# Use the first four training sentences as a small demo batch.
sentence_batch = train_X[:4]

In [12]:
# Labels for the same four sentences.
labels_batch = train_y[:4]

In [13]:
# Convert every sentence in the batch to a list of vocabulary indices;
# tokens missing from the vocabulary map to the '<UNK>' index.
indexed_batch = [
    [word2index.get(w, word2index['<UNK>']) for w in word_tokenize(sent)]
    for sent in sentence_batch
]

In [14]:
# Number of tokens in each sentence of the batch (taken before any padding is added).
batch_lengths = list(map(len, indexed_batch))

In [15]:
batch_lengths

[36, 38, 39, 19]

In [16]:
# Bring every sentence in the batch to exactly max_length tokens, in place:
# longer sentences are truncated, shorter ones are right-padded with index 0.
max_length = 50
for sent in indexed_batch:
    if len(sent) > max_length:
        # Mutate the list itself. Rebinding the loop variable
        # (sent = sent[:max_length]) would leave indexed_batch untouched.
        del sent[max_length:]
    else:
        sent.extend([0] * (max_length - len(sent)))

# A truncated sentence can no longer be longer than the padded width;
# pack_padded_sequence rejects lengths that exceed the sequence dimension.
batch_lengths = [min(length, max_length) for length in batch_lengths]

In [17]:
[len(sent) for sent in indexed_batch]

[50, 50, 50, 50]

In [18]:
from torch.nn.utils import rnn

In [77]:
class Clf_model(nn.Module):
    """RNN sentence classifier for padded batches of variable-length sentences.

    Token indices are embedded, packed so that the RNN skips the padding
    positions, run through a two-layer ``nn.RNN``, and the final hidden state
    of the top layer is mapped to two class logits.

    Args:
        vocab_size: Number of rows in the embedding table, counting the
            padding index 0. When omitted it falls back to
            ``len(word2index) + 1`` from the notebook-level vocabulary.
    """

    def __init__(self, vocab_size=None):
        super().__init__()
        if vocab_size is None:
            vocab_size = len(word2index) + 1
        self.emb = nn.Embedding(vocab_size, 300, padding_idx=0)
        # batch_first=True -> batch_size x seq_len x embedding
        # (the default layout is seq_len x batch_size x embedding)
        self.rnn = nn.RNN(300, 512, num_layers=2, batch_first=True)
        self.relu = nn.ReLU()
        self.lin1 = nn.Linear(512, 128)
        self.lin2 = nn.Linear(128, 2)

    def forward(self, inp, inp_length):
        """Compute class logits for a padded batch.

        Args:
            inp: Integer tensor, batch_size x seq_len, of token indices with
                0 used for padding.
            inp_length: True (unpadded) length of each sequence, as a tensor
                or a list of ints, one entry per batch element.

        Returns:
            Tensor of shape batch_size x 2 with the unnormalised class scores.
        """
        X = self.emb(inp)  # batch_size x seq_len x embedding
        # pack_padded_sequence needs the lengths on the CPU.
        lengths = torch.as_tensor(inp_length).cpu()
        # Use the fully qualified path so a notebook variable named `rnn`
        # cannot shadow the torch.nn.utils.rnn module.
        packed = torch.nn.utils.rnn.pack_padded_sequence(
            X, lengths, batch_first=True, enforce_sorted=False
        )
        # No explicit h_0: nn.RNN then starts from zeros on the right device,
        # which keeps the forward pass deterministic.
        _, last_hidden = self.rnn(packed)  # num_layers x batch_size x hidden
        # Classify from the top layer's final hidden state only.
        out = self.lin1(last_hidden[-1])
        out = self.relu(out)
        out = self.lin2(out)
        return out

In [75]:
# Padded token indices (batch_size x max_length) and the unpadded sentence lengths.
inp_tensor = torch.tensor(indexed_batch, dtype=torch.long)
inp_lengths = torch.tensor(batch_lengths, dtype=torch.long)

In [76]:
# Instantiate the classifier and run a single forward pass on the padded batch.
model = Clf_model()
out = model(inp_tensor, inp_lengths)

torch.Size([2, 4, 512])
torch.Size([4, 39, 512])


In [32]:
out

tensor([[ 0.1295,  0.0682],
        [ 0.1697, -0.0148],
        [ 0.1156,  0.0618],
        [ 0.1538,  0.0127]], grad_fn=<AddmmBackward0>)

# Multi-layered RNNs

In [33]:
# Dimensions of the random toy batch used by the RNN demos below.
batch_size, seq_len, input_size = 4, 10, 20
inp = torch.randn(batch_size, seq_len, input_size)  # batch_size x seq_len x input_size

In [34]:
hidden_size = 25
num_layers = 2
# Stacked RNN: the first layer's outputs are the second layer's inputs.
model = nn.RNN(
    input_size=input_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    batch_first=True,
)

In [35]:
# One initial hidden state per layer and per batch element.
h_0 = torch.randn((num_layers, batch_size, hidden_size))

In [36]:
# Run the stacked RNN over the toy batch, starting from h_0.
all_outputs, last_hidden = model(inp, h_0)

In [37]:
all_outputs.shape, last_hidden.shape

(torch.Size([4, 10, 25]), torch.Size([2, 4, 25]))

In [38]:
# all_outputs -> batch_size x seq_len x hidden_size || last layer's hidden state corresponding to each element in the sequence
# last_hidden -> layers x batch x hidden_size || the last hidden state of each layer for each element in the batch

In [39]:
all_outputs[0,-1,:]

tensor([-0.1308, -0.2248,  0.1781,  0.1670,  0.1150, -0.2178,  0.3064,  0.0305,
         0.1996, -0.1727,  0.4197, -0.3187, -0.1914, -0.0441, -0.5463,  0.1929,
        -0.2758, -0.4946, -0.1328, -0.6834, -0.0140, -0.0068,  0.1660,  0.3116,
         0.7350], grad_fn=<SliceBackward0>)

In [41]:
last_hidden[1,0,:]

tensor([-0.1308, -0.2248,  0.1781,  0.1670,  0.1150, -0.2178,  0.3064,  0.0305,
         0.1996, -0.1727,  0.4197, -0.3187, -0.1914, -0.0441, -0.5463,  0.1929,
        -0.2758, -0.4946, -0.1328, -0.6834, -0.0140, -0.0068,  0.1660,  0.3116,
         0.7350], grad_fn=<SliceBackward0>)

# Bidirectional RNNs

In [56]:
num_layers = 1
# A single layer that reads the sequence in both directions.
model = nn.RNN(
    input_size=input_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    batch_first=True,
    bidirectional=True,
)

In [57]:
# Two directions per layer, hence 2 * num_layers initial hidden states.
h_0 = torch.zeros((num_layers * 2, batch_size, hidden_size))

In [58]:
# Run the bidirectional RNN over the toy batch, starting from the zero state h_0.
all_outputs, last_hidden = model(inp, h_0)

In [59]:
all_outputs.shape, last_hidden.shape

(torch.Size([4, 10, 50]), torch.Size([2, 4, 25]))

In [60]:
# all_outputs -> batch_size x seq_len x hidden_size*2 || concatenated hidden states of the two directions
# last_hidden -> 2*layers x batch_size x hidden_size

In [67]:
all_outputs[0, -1, :25]

tensor([-0.0591, -0.2167,  0.3445,  0.2151,  0.6941, -0.7000,  0.1784,  0.5229,
        -0.0287, -0.2525,  0.5011,  0.5533,  0.2229,  0.6401,  0.7517, -0.0487,
         0.1078, -0.1037, -0.4245, -0.6141, -0.8736,  0.4709,  0.7587,  0.3324,
        -0.6094], grad_fn=<SliceBackward0>)

In [68]:
last_hidden[0, 0, :]

tensor([-0.0591, -0.2167,  0.3445,  0.2151,  0.6941, -0.7000,  0.1784,  0.5229,
        -0.0287, -0.2525,  0.5011,  0.5533,  0.2229,  0.6401,  0.7517, -0.0487,
         0.1078, -0.1037, -0.4245, -0.6141, -0.8736,  0.4709,  0.7587,  0.3324,
        -0.6094], grad_fn=<SliceBackward0>)

In [69]:
all_outputs[0, 0, 25:]

tensor([-0.6717,  0.7311, -0.5096,  0.2763,  0.1644, -0.5523,  0.6719,  0.4401,
        -0.3511, -0.3034,  0.2972, -0.0869,  0.0013, -0.2227, -0.0914, -0.3052,
         0.1803,  0.6271,  0.6227, -0.2678,  0.2000, -0.1552, -0.6472,  0.5953,
        -0.1103], grad_fn=<SliceBackward0>)

In [70]:
last_hidden[1, 0, :]

tensor([-0.6717,  0.7311, -0.5096,  0.2763,  0.1644, -0.5523,  0.6719,  0.4401,
        -0.3511, -0.3034,  0.2972, -0.0869,  0.0013, -0.2227, -0.0914, -0.3052,
         0.1803,  0.6271,  0.6227, -0.2678,  0.2000, -0.1552, -0.6472,  0.5953,
        -0.1103], grad_fn=<SliceBackward0>)

# LSTM

In [80]:
# For an LSTM, you additionally need to define an initial cell state c_0 (alongside h_0)

In [44]:
# Minimal LSTM example. The names `lstm` and `lstm_input` are used on purpose:
# calling them `rnn` / `input` would shadow the `rnn` module imported from
# torch.nn.utils (used for pack_padded_sequence) and the `input` builtin.
lstm = nn.LSTM(input_size=10, hidden_size=20, batch_first=True)
lstm_input = torch.randn(3, 5, 10)  # batch_size x seq_len x embedding
h0 = torch.randn(1, 3, 20)  # layers x batch_size x hidden_size
c0 = torch.randn(1, 3, 20)  # layers x batch_size x hidden_size
# An LSTM takes and returns the pair (hidden state, cell state).
output, (hn, cn) = lstm(lstm_input, (h0, c0))

In [45]:
output.shape # batch_size x seq_len x hidden_size || hidden states corresponding to each element in the sequence

torch.Size([3, 5, 20])

In [46]:
hn.shape # last hidden state

torch.Size([1, 3, 20])

In [47]:
cn.shape # last cell state

torch.Size([1, 3, 20])

In [48]:
output[0,-1,:]

tensor([ 0.0723, -0.1376,  0.2080, -0.1957,  0.0174, -0.1336, -0.1078,  0.0292,
        -0.0376, -0.0027, -0.0177,  0.0217,  0.0013,  0.0073,  0.0323, -0.1544,
         0.1165,  0.1479,  0.1139,  0.3052], grad_fn=<SliceBackward0>)

In [49]:
hn[0,0,:]

tensor([ 0.0723, -0.1376,  0.2080, -0.1957,  0.0174, -0.1336, -0.1078,  0.0292,
        -0.0376, -0.0027, -0.0177,  0.0217,  0.0013,  0.0073,  0.0323, -0.1544,
         0.1165,  0.1479,  0.1139,  0.3052], grad_fn=<SliceBackward0>)

# Tasks

Design a GRU model (utilize multiple layers and bidirectionality) for sentiment classification on the SST dataset. First, create a dataset class for the SST dataset. Train your model on the training set. At the end of each epoch, measure the validation accuracy and save the model with the best validation accuracy. Finally, evaluate your model and print its accuracy on the test set.