In [1]:
import torch
import torch.nn as nn

In [2]:
import nltk

In [3]:
mma = nltk.corpus.gutenberg.sents('austen-emma.txt')

In [4]:
len(mma)

7717

In [5]:
mma[50]

['But',
 'James',
 'will',
 'not',
 'like',
 'to',
 'put',
 'the',
 'horses',
 'to',
 'for',
 'such',
 'a',
 'little',
 'way',
 ';--',
 'and',
 'where',
 'are',
 'the',
 'poor',
 'horses',
 'to',
 'be',
 'while',
 'we',
 'are',
 'paying',
 'our',
 'visit',
 '?"']

In [6]:
from collections import Counter

In [7]:
vocab = Counter()

In [8]:
for sent in mma:
    for w in sent:
        vocab[w]+=1

In [9]:
len(vocab)

7806

In [10]:
i = 1 # index 0 will be kept for padding
word2index = {}
index2word = {}
for w in vocab:
    if vocab[w]>5:
        word2index[w] = i
        index2word[i] = w
        i+=1

In [11]:
len(word2index)

2169

# RNN Language model

In [12]:
class RNN_model(nn.Module):
    def __init__(self):
        super().__init__()
        self.hidden_size = 256
        self.vocab_size = 2170
        self.emb_dim = 100
        self.emb = nn.Embedding(self.vocab_size, self.emb_dim, padding_idx=0)
        self.rnn = nn.RNN(self.emb_dim, self.hidden_size) # input_dimension, hidden_dimension
        self.lin = nn.Linear(self.hidden_size, self.vocab_size)
        self.relu = nn.ReLU()
    
    def forward(self, inp_seq):
        inp = self.emb(inp_seq)
        h_0 = torch.rand(1, self.hidden_size)
        all_hidden_states, last_hidden_state = self.rnn(inp, h_0)
        out = self.lin(all_hidden_states)
        out = self.relu(out)
        return out

In [13]:
LM = RNN_model()

In [14]:
seq = mma[50]

In [15]:
seq

['But',
 'James',
 'will',
 'not',
 'like',
 'to',
 'put',
 'the',
 'horses',
 'to',
 'for',
 'such',
 'a',
 'little',
 'way',
 ';--',
 'and',
 'where',
 'are',
 'the',
 'poor',
 'horses',
 'to',
 'be',
 'while',
 'we',
 'are',
 'paying',
 'our',
 'visit',
 '?"']

In [16]:
seq_ind = [word2index[w] for w in seq]

In [17]:
seq_ind

[418,
 452,
 453,
 146,
 454,
 19,
 455,
 22,
 456,
 19,
 66,
 247,
 13,
 37,
 132,
 411,
 10,
 419,
 457,
 22,
 458,
 456,
 19,
 212,
 459,
 429,
 457,
 460,
 461,
 322,
 417]

In [18]:
seq_ind = torch.LongTensor(seq_ind)

In [19]:
output = LM(seq_ind) # generates hidden states corresponding to each input in the sequence....

In [20]:
output.shape

torch.Size([31, 2170])

In [21]:
len(seq_ind)

31

In [69]:
# Computing the loss --

# Output distribution from the first token should be able to predict the second one

# Output distribution from the second token should be able to predict the third one

# ...and so on

In [70]:
# second token is the ground-truth for the first

# third token is the ground-truth for the second

# ...and so on

In [71]:
# ground truth would be the sequence starting from the second token... we ignore the first...

# similarly the output from the last token in the sequence is not needeed as there is no token following it

In [22]:
criterion = nn.NLLLoss()
output = LM(seq_ind)

In [23]:
output_lm = output[:-1,:]

In [24]:
output_lm.shape

torch.Size([30, 2170])

In [25]:
labels = seq_ind[1:]

In [26]:
labels.shape

torch.Size([30])

In [27]:
softmax = nn.LogSoftmax(dim=1)

In [28]:
output_lm = softmax(output_lm)

In [29]:
criterion(output_lm.squeeze(0), labels.squeeze(0))

tensor(7.7225, grad_fn=<NllLossBackward0>)

# Generate new sequence

In [37]:
seq_len = 10
seq = [418] # index corresponding to 'But'
for _ in range(seq_len):
    out = LM(torch.IntTensor(seq))
    #next_token = torch.argmax(out[], dim=1)
    #seq.append
    out = out[-1,:]
    next_token = torch.argmax(out)
    seq.append(next_token.item())

In [38]:
seq

[418, 1676, 1609, 1989, 1701, 284, 1854, 1076, 558, 634, 81]

In [40]:
' '.join([index2word[i] for i in seq])

'But unpleasant gradually resumed intention suffering escaped inferior dreadfully should Miss'

# RNN based classifier

In [45]:
class RNN_Classifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.hidden_size = 256
        self.vocab_size = 2170
        self.emb_dim = 100
        self.out_size = 2
        self.emb = nn.Embedding(self.vocab_size, self.emb_dim, padding_idx=0)
        self.rnn = nn.RNN(self.emb_dim, self.hidden_size) # input_dimension, hidden_dimension
        self.lin = nn.Linear(self.hidden_size, self.out_size)
        self.relu = nn.ReLU()
    
    def forward(self, inp_seq):
        inp = self.emb(inp_seq)
        h_0 = torch.rand(1, self.hidden_size)
        all_hidden_states, last_hidden_state = self.rnn(inp, h_0)
        out = self.lin(last_hidden_state)
        out = self.relu(out)
        return out

In [46]:
clf = RNN_Classifier()

In [47]:
clf(seq_ind)

tensor([[0.3353, 0.1334]], grad_fn=<ReluBackward0>)

# Tasks

We will design a language model with this vanilla RNN

1) Train the language model considering all the sentences in the corpus. Each training step will include one sentence. Ignore all sentences with length 1. 

In [31]:
from torch.optim import Adam

In [35]:
from tqdm.notebook import tqdm

In [39]:
epochs = 1
criterion = nn.NLLLoss()
optimizer = Adam(LM.parameters(), lr=0.001)
for _ in range(epochs):
    for seq in tqdm(mma):
        seq_ind = [word2index[w] for w in seq if w in word2index]
        if len(seq_ind)>1:
            seq_ind = torch.LongTensor(seq_ind)
            output = LM(seq_ind)
            output_lm = output[:-1,:]
            labels = seq_ind[1:]
            output_lm = torch.log_softmax(output_lm, dim=1)
            loss = criterion(output_lm.squeeze(0), labels.squeeze(0))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

  0%|          | 0/7717 [00:00<?, ?it/s]

2) Use the model to generate some example sequences

In [40]:
seq_len = 10
seq = [418] # index corresponding to 'But'
for _ in range(seq_len):
    out = LM(torch.IntTensor(seq))
    #next_token = torch.argmax(out[], dim=1)
    #seq.append
    out = out[-1,:]
    next_token = torch.argmax(out)
    seq.append(next_token.item())

In [41]:
' '.join([index2word[i] for i in seq])

'But , and the of the of her , and ,'

3) Design a RNN based classifier model for the task of sentiment classification. Use the sentiment classification dataset from last time.

In [42]:
# The architecture is already provided. Use the code from previous weeks to train/evaluate the model.