# RNN

Ok, so I'm gonna try to make an RNN learn to speak like Hofstadter, trying to Google as little as possible :P

## Loading and processing the data

In [16]:
# Load GEB: read the whole text file into a list with one entry per line
# (each entry keeps its trailing newline).
# The context manager closes the file handle as soon as the lines are read.
with open("GEB.txt", "r") as f:
    lines = list(f)

In [17]:
# Keep every line except those that consist of a bare newline.
lines = [line for line in lines if line != "\n"]

In [18]:
# Show the first ten lines that remain after bare-newline lines were removed.
lines[:10]

['Contents \n',
 'Overview viii \n',
 'List of Illustrations xiv \n',
 'Words of Thanks xix \n',
 'Part I: GEB \n',
 'Introduction: A Musico-Logical Offering 3 \n',
 'Three-Part Invention 29 \n',
 'Chapter I: The MU-puzzle 33 \n',
 'Two-Part Invention 43 \n',
 'Chapter II: Meaning and Form in Mathematics 46 \n']

In [19]:
# Turn a Unicode string to plain ASCII, thanks to http://stackoverflow.com/a/518232/2809427
import unicodedata
import string

# The character vocabulary: ASCII letters plus a handful of punctuation marks.
all_letters = string.ascii_letters + " .,;'-"


def unicodeToAscii(s):
    """Return ``s`` reduced to the characters found in ``all_letters``.

    The string is first NFD-normalised so accented letters split into a base
    letter plus combining marks; the combining marks (category 'Mn') are
    dropped, as is every character that is not part of ``all_letters``
    (digits, newlines, other punctuation, ...).
    """
    kept = []
    for ch in unicodedata.normalize('NFD', s):
        if unicodedata.category(ch) == 'Mn':
            # Combining accent left over from the decomposition.
            continue
        if ch not in all_letters:
            continue
        kept.append(ch)
    return ''.join(kept)

In [20]:
# Reduce every line to the characters of the model's vocabulary.
lines = list(map(unicodeToAscii, lines))

In [21]:
# Show the first ten lines again, now that unicodeToAscii has been applied to them.
lines[:10]

['Contents ',
 'Overview viii ',
 'List of Illustrations xiv ',
 'Words of Thanks xix ',
 'Part I GEB ',
 'Introduction A Musico-Logical Offering  ',
 'Three-Part Invention  ',
 'Chapter I The MU-puzzle  ',
 'Two-Part Invention  ',
 'Chapter II Meaning and Form in Mathematics  ']

In [22]:
import torch

In [23]:
# Vocabulary size; this is the width of every one-hot character vector.
n_letters = len(all_letters)
# Display the vocabulary itself.
all_letters

"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ .,;'-"

In [24]:
# One-hot matrix of first to last letters (not including EOS) for input
def inputTensor(line):
    """One-hot encode ``line`` for use as model input.

    Returns a float tensor of shape ``(len(line), 1, n_letters)`` in which,
    for each character position, the entry at that character's index in
    ``all_letters`` is set to 1. ``str.find`` yields -1 for a character that
    is not in ``all_letters``, which would set the last column instead.
    """
    one_hot = torch.zeros(len(line), 1, n_letters)
    for position, letter in enumerate(line):
        one_hot[position][0][all_letters.find(letter)] = 1
    return one_hot

In [25]:
# import pandas as pd
# Position of 'a' in all_letters, i.e. the column inputTensor sets to 1 for an 'a'.
all_letters.find('a')

0

*Below are the inputs with which we train our model, which are sentences converted to one-hot vectors*

In [26]:
# line = inputTensor(lines[0])

# One one-hot tensor per line of text.
data = list(map(inputTensor, lines))

In [27]:
# Shape of one encoded line: (number of characters, 1, n_letters).
data[1201].shape
# torch.utils.data.DataLoader

torch.Size([81, 1, 58])

*Below we make the targets for our model: the same sentences, shifted by 1 character and stored as character indices (a `LongTensor`, not one-hot vectors), with an end-of-sentence index appended*

In [28]:
# LongTensor of second letter to end (EOS) for target
def targetTensor(line):
    """Build the training target for ``line`` as a LongTensor of letter indexes.

    The target at position ``i`` is the index (in ``all_letters``) of the
    character at position ``i + 1``, so the first character is skipped and a
    final end-of-sentence index is appended. The result has ``len(line)``
    entries for a non-empty line and one entry (just EOS) for an empty one.
    """
    next_letter_indexes = [all_letters.find(ch) for ch in line[1:]]
    # NOTE(review): n_letters is len(all_letters) in this notebook, so
    # n_letters - 1 is also the index of '-', the last vocabulary character;
    # EOS and '-' therefore share one class. Confirm whether a dedicated EOS
    # slot was intended.
    eos_index = n_letters - 1
    return torch.LongTensor(next_letter_indexes + [eos_index])

In [29]:
# One next-character index tensor per line of text, aligned with `data`.
targets = list(map(targetTensor, lines))

## Defining the model
Ok, how do I make an RNN? Hmm

In [30]:
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F


In [31]:
class RNN(nn.Module):
    """Single-step character RNN cell.

    Each call consumes one input vector and the previous hidden state and
    produces the log-probabilities and probabilities over the output classes
    together with the next hidden state.
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Create the three linear layers.

        input_size:  width of one input vector.
        hidden_size: width of the hidden state.
        output_size: number of output classes.
        """
        super().__init__()

        self.hidden_size = hidden_size
        joined_size = input_size + hidden_size

        # Shared first layer over the concatenated (input, hidden) vector,
        # followed by separate heads for the next hidden state and the output.
        self.i2i = nn.Linear(joined_size, joined_size)
        self.i2h = nn.Linear(joined_size, hidden_size)
        self.i2o = nn.Linear(joined_size, output_size)
        self.softmax = nn.Softmax(dim=1)
        self.logsoftmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Run one time step.

        Returns a tuple ``(log_probs, probs, next_hidden)``; ``log_probs`` and
        ``probs`` are the log-softmax and softmax of the same output scores.
        """
        features = F.relu(self.i2i(torch.cat([input, hidden], 1)))
        next_hidden = F.tanh(self.i2h(features))
        scores = self.i2o(features)
        probs = self.softmax(scores)
        log_probs = self.logsoftmax(scores)
        return log_probs, probs, next_hidden

    def initHidden(self):
        """Return an all-zero hidden state of shape ``(1, hidden_size)``."""
        zeros = torch.zeros(1, self.hidden_size)
        return Variable(zeros)


# Width of the hidden state used for the model in this notebook.
n_hidden = 128


In [37]:
# The model reads one one-hot character and predicts a distribution over the same vocabulary.
rnn = RNN(input_size=n_letters, hidden_size=n_hidden, output_size=n_letters)

*Function to get a minibatch randomly sampled from data*

In [32]:
import random
# Number of encoded sentences available for sampling (captured once, here).
n = len(data)


def get_minibatch(batch_size):
    """Return ``batch_size`` distinct ``(input, target)`` pairs chosen at random.

    Indices are drawn without replacement from ``range(n)``; each pair is the
    entry of ``data`` and the entry of ``targets`` at the same index.
    """
    batch = []
    for idx in random.sample(range(n), batch_size):
        batch.append((data[idx], targets[idx]))
    return batch

In [33]:
# get_minibatch(2)[0]

### Testing the model

In [72]:
def sample_sentence():
    """Generate a sentence from the model, seeded with a random training line.

    A random line is drawn from the data; its first character starts the
    sentence and the model then samples one character per character of that
    line, feeding each sampled character back in as the next input.

    Returns the seed character followed by the sampled characters as a string,
    or an empty string when the drawn line has no characters.
    """
    # A single random (input, target) pair is enough; only its input tensor is used.
    seed_chars = get_minibatch(1)[0][0]
    n_chars = seed_chars.shape[0]
    if n_chars == 0:
        # inputTensor produces a tensor with zero rows for an empty line;
        # indexing its first character would raise an IndexError.
        return ""

    h = Variable(torch.zeros(1, n_hidden))
    let = Variable(seed_chars[0, :, :])
    # Track plain integer indices so decoding is a simple lookup, rather than
    # re-deriving each index from a one-hot vector and indexing a str with tensors.
    letter_indexes = [int(torch.max(seed_chars[0], dim=1)[1][0])]
    for _step in range(n_chars):
        _log_probs, letter_distribution, h = rnn(let, h)
        # Sample the next character from the predicted distribution (not argmax).
        let_index = int(torch.multinomial(letter_distribution, 1).data[0][0])
        let = Variable(torch.zeros(1, n_letters))
        let[0, let_index] = 1
        letter_indexes.append(let_index)
    return "".join(all_letters[idx] for idx in letter_indexes)

In [84]:
sample_sentence()

'Acharilice ang yme faveral bope -l'

In [96]:
#don't mind me, just checking what targets is
# lets[0].shape
# targets[0]

torch.Size([1, 58])

### Training

`NLLLoss` stands for negative log-likelihood loss: the negative of the log of the probability (likelihood) that the distribution output by the `rnn` assigns to the correct next character, at each point in the sequence.

We work with the log of the probability for convenience (the numbers are more manageable, which the computer thanks us for, as it has to store something like -20, instead of 0.0000000000000000001 or something; this helps avoid floating-point precision errors). For us, it means that instead of multiplying the likelihoods of the characters to get the likelihood of the sentence, we add the log-likelihoods.

We are going to use the Adam optimizer, instead of the simpler SGD, because it tends to work better (really because god Karpathy says so)

In [79]:
# Negative log-likelihood loss; it is fed the log-probabilities returned by the rnn.
criterion = nn.NLLLoss()
# Adam with its default hyper-parameters, updating all parameters of the rnn.
optim = torch.optim.Adam(params=rnn.parameters())

In [None]:
from math import ceil
num_iters = 18000
# learning_rate = 0.001
batch_size = 100
# I had some problems with very long sentences before, not sure why. Limiting it to 100 chars for now
# but it's something to check out later.
max_chars = 100
# Print progress about 1000 times over the run; computed once instead of on every iteration.
print_every = ceil(num_iters / 1000)

for iteration in range(num_iters):
    # Accumulates the per-character losses of the whole minibatch.
    loss = 0
    rnn.zero_grad()
    for sentence_chars, target_next_chars in get_minibatch(batch_size):
        # Every sentence starts from a fresh, all-zero hidden state.
        h = Variable(torch.zeros(1, n_hidden))
        len_sentence = sentence_chars.shape[0]
        for i in range(min(len_sentence, max_chars)):
            input_char = Variable(sentence_chars[i, :, :])
            loglet, _, h = rnn(input_char, h)
            # Slice instead of index so the target is a 1-element LongTensor
            # without rebuilding a tensor from a scalar.
            target_char = Variable(target_next_chars[i:i + 1])
            loss += criterion(loglet, target_char)
    if isinstance(loss, int):
        # No character was processed (every sampled sentence was empty), so
        # `loss` is still the integer 0 and there is nothing to backpropagate.
        continue
    loss = loss / batch_size
    loss.backward()
    optim.step()
    # below is if we wanted to use SGD
    #     for p in rnn.parameters():
    #         p.data.add_(-learning_rate, p.grad.data)
    if iteration % print_every == 0:
        print(iteration)
        # ravel() makes this work whether the loss is a 1-element tensor or a
        # 0-dim scalar; indexing a 0-dim array with [0] would raise IndexError.
        print(loss.data.numpy().ravel()[0])

0
116.73786
18
111.70514
36
110.96231
54
111.41884
72
115.752106
90
117.673454
108
116.097855
126
108.972
144
108.51685


In [485]:
# Show the value of `loss` as left behind by the training loop.
loss

Variable containing:
 191.1607
[torch.FloatTensor of size (1,)]

In [235]:
#code graveyard
# targets[0]
# let = Variable(d[0,:,:])
# h = Variable(torch.zeros(1,n_hidden))
# loss.backward()
# let,h = rnn(let,h)
# let,h