# RNN

Ok, so I'm gonna try to make an RNN learn to speak like Hofstadter, trying to Google as little as possible :P

## Loading and processing the data

In [2]:
# Load GEB: read the raw text into a list with one entry per line of the file.
# The `with` block closes the file handle as soon as the lines are read.
# NOTE(review): this relies on the platform's default text encoding — confirm
# it matches how GEB.txt was saved.
with open("GEB.txt", "r") as f:
    lines = list(f)

In [3]:
# Drop the lines that consist of nothing but a newline.
lines = [line for line in lines if line != "\n"]

In [4]:
# Peek at the first ten lines that survived the blank-line filter.
lines[:10]

['Contents \n',
 'Overview viii \n',
 'List of Illustrations xiv \n',
 'Words of Thanks xix \n',
 'Part I: GEB \n',
 'Introduction: A Musico-Logical Offering 3 \n',
 'Three-Part Invention 29 \n',
 'Chapter I: The MU-puzzle 33 \n',
 'Two-Part Invention 43 \n',
 'Chapter II: Meaning and Form in Mathematics 46 \n']

In [5]:
# Turn a Unicode string to plain ASCII, thanks to http://stackoverflow.com/a/518232/2809427
import unicodedata
import string

# The character vocabulary: ASCII letters plus a space and a little punctuation.
all_letters = string.ascii_letters + " .,;'-"
def unicodeToAscii(s):
    """Reduce `s` to the characters listed in `all_letters`.

    The string is NFD-normalised so accented letters split into a base letter
    plus combining marks; combining marks (category 'Mn') are dropped, and so
    is every remaining character that is not in `all_letters` (digits, colons,
    newlines, ...).
    """
    kept = []
    for ch in unicodedata.normalize('NFD', s):
        if unicodedata.category(ch) == 'Mn':
            continue
        if ch in all_letters:
            kept.append(ch)
    return ''.join(kept)

In [6]:
# Strip every line down to the characters in `all_letters`.
lines = list(map(unicodeToAscii, lines))

In [7]:
# The same ten lines after unicodeToAscii: anything outside `all_letters` is gone.
lines[:10]

['Contents ',
 'Overview viii ',
 'List of Illustrations xiv ',
 'Words of Thanks xix ',
 'Part I GEB ',
 'Introduction A Musico-Logical Offering  ',
 'Three-Part Invention  ',
 'Chapter I The MU-puzzle  ',
 'Two-Part Invention  ',
 'Chapter II Meaning and Form in Mathematics  ']

In [8]:
import torch

In [9]:
# Size of the character vocabulary (one slot per character in `all_letters`).
n_letters=len(all_letters)
all_letters

"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ .,;'-"

In [10]:
def inputTensor(line):
    """One-hot encode `line`, first to last letter, for use as network input.

    No EOS marker is included.

    Args:
        line: string whose characters must all appear in `all_letters`.

    Returns:
        Float tensor of shape (len(line), 1, n_letters); row `li` holds a
        single 1 at the position of `line[li]` in `all_letters`.

    Raises:
        ValueError: if `line` contains a character that is not in
            `all_letters`.
    """
    tensor = torch.zeros(len(line), 1, n_letters)
    for li, letter in enumerate(line):
        letter_index = all_letters.find(letter)
        # str.find returns -1 for an unknown character; used as an index that
        # would silently light up the last column instead of failing.
        if letter_index < 0:
            raise ValueError(
                "character %r at position %d is not in all_letters" % (letter, li)
            )
        tensor[li][0][letter_index] = 1
    return tensor

In [11]:
# Sanity check: str.find gives a letter's position in all_letters ('a' is first).
all_letters.find('a')

0

*Below are the inputs with which we train our model, which are sentences converted to one-hot vectors*

In [12]:
# One-hot encode every sentence; data[i] is the input tensor built from lines[i].
data = list(map(inputTensor, lines))

In [13]:
# Shape of one encoded sentence: (number of characters, 1, n_letters).
data[1201].shape
# TODO(review): the original note here only said `torch.utils.data.DataLoader` —
# presumably a reminder to try it for batching; confirm or remove.

torch.Size([81, 1, 58])

*Below we make the targets for our model: each sentence shifted by 1 character and encoded as a tensor of letter indices (not one-hot vectors), with an end-of-sentence (EOS) index appended*

In [14]:
def targetTensor(line):
    """Build the next-character targets for `line`.

    Element `i` is the index in `all_letters` of `line[i + 1]`, i.e. the
    letter the network should predict after seeing `line[i]`; the final
    element is the EOS index.

    NOTE: the EOS index is `n_letters - 1`, which is also the index of the
    last character of `all_letters` ('-'), so EOS and a literal hyphen share
    one class. Separating them means growing the output layer by one, which
    would no longer match previously saved weights.

    Args:
        line: string; every character after the first must be in `all_letters`.

    Returns:
        torch.LongTensor with max(len(line) - 1, 0) + 1 elements.

    Raises:
        ValueError: if a target character is not in `all_letters`.
    """
    letter_indexes = []
    for li in range(1, len(line)):
        letter_index = all_letters.find(line[li])
        # str.find returns -1 for an unknown character, which is not a valid
        # class index for the loss; fail here with a readable message instead.
        if letter_index < 0:
            raise ValueError(
                "character %r at position %d is not in all_letters" % (line[li], li)
            )
        letter_indexes.append(letter_index)
    letter_indexes.append(n_letters - 1) # EOS
    return torch.LongTensor(letter_indexes)

In [15]:
# Next-character targets for every sentence; targets[i] pairs with data[i].
targets = list(map(targetTensor, lines))

## Defining the model
Ok, how do I make an RNN? Hmm

In [16]:
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F


In [17]:
class RNN(nn.Module):
    """One step of a simple recurrent cell for character prediction.

    Each call mixes the current input with the previous hidden state through a
    shared ReLU layer (`i2i`), then derives the next hidden state (`i2h`, tanh)
    and the output scores (`i2o`) from that shared representation.
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Create the layers.

        Args:
            input_size: width of one input row.
            hidden_size: width of the hidden state.
            output_size: number of output classes.
        """
        super().__init__()

        joint_size = input_size + hidden_size
        self.hidden_size = hidden_size

        # Layers are created in the order i2i, i2h, i2o.
        self.i2i = nn.Linear(joint_size, joint_size)
        self.i2h = nn.Linear(joint_size, hidden_size)
        self.i2o = nn.Linear(joint_size, output_size)
        self.softmax = nn.Softmax(dim=1)
        self.logsoftmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Advance the cell by one step.

        Args:
            input: 2-D tensor of inputs (the notebook passes one one-hot row).
            hidden: 2-D tensor holding the previous hidden state; it is
                concatenated with `input` along dimension 1.

        Returns:
            Tuple `(log_probs, probs, next_hidden)`: the log-softmax and the
            softmax of the output scores, and the new hidden state.
        """
        features = F.relu(self.i2i(torch.cat((input, hidden), 1)))
        next_hidden = F.tanh(self.i2h(features))
        logits = self.i2o(features)
        return self.logsoftmax(logits), self.softmax(logits), next_hidden

    def initHidden(self):
        """Return an all-zero hidden state of shape (1, hidden_size)."""
        zeros = torch.zeros(1, self.hidden_size)
        return Variable(zeros)

# Width of the hidden state used throughout the notebook.
n_hidden = 128


In [18]:
# The network reads one one-hot letter and scores every letter as the next one.
rnn = RNN(input_size=n_letters, hidden_size=n_hidden, output_size=n_letters)

*Function to get a minibatch randomly sampled from data*

In [19]:
import random
# Number of encoded sentences available for sampling.
n = len(data)
def get_minibatch(batch_size):
    """Pick `batch_size` distinct sentences at random.

    Returns a list of `(data[i], targets[i])` tuples, one per sampled index.
    Sampling is without replacement (random.sample), so `batch_size` may not
    exceed `n`.
    """
    chosen = random.sample(range(n),batch_size)
    batch = []
    for idx in chosen:
        batch.append((data[idx], targets[idx]))
    return batch

In [20]:
# get_minibatch(2)[0]

### Testing the model

In [21]:
def sample_sentence():
    """Generate text from `rnn`, seeded with the first letter of a random sentence.

    A training sentence is drawn at random; its first letter starts the
    output, and then one letter per character of that sentence is sampled
    from the network's softmax output, each sampled letter being fed back in
    as the next input.

    Returns:
        The seed letter followed by the sampled letters, as a string. An
        empty string is returned if the drawn sentence has no characters
        (there is then no letter to seed with).
    """
    # Only one sentence is needed, so draw a minibatch of exactly one.
    seed_sentence = get_minibatch(1)[0][0]
    n_steps = seed_sentence.shape[0]
    if n_steps == 0:
        return ""

    h = Variable(torch.zeros(1,n_hidden))
    let = Variable(seed_sentence[0,:,:])
    # Track letter indices directly rather than recovering them from the
    # one-hot vectors afterwards; int() copes with both plain numbers and
    # single-element tensors.
    letter_indices = [int(torch.max(seed_sentence[0],dim=1)[1][0])]
    for step in range(n_steps):
        _,letter_distribution,h = rnn(let,h)
        let_index = int(torch.multinomial(letter_distribution,1).data[0][0])
        # Feed the sampled letter back in as the next one-hot input.
        let = Variable(torch.zeros(1,n_letters))
        let[0,let_index]=1
        letter_indices.append(let_index)
    return "".join(all_letters[i] for i in letter_indices)

In [45]:
# Generate one sample from the current state of `rnn`.
sample_sentence()

'the Mashiom a does man proof as explain in the program has been the Proofic Paired gal'

In [96]:
#don't mind me, just checking what targets is
# lets[0].shape
# targets[0]

torch.Size([1, 58])

### Training

`NLLLoss` stands for negative log-likelihood loss: it is the negative of the log of the probability (likelihood) that the softmax distribution output by the `rnn` assigns to the correct next character at each point in the sequence.

We work with the log of the probability for convenience: the numbers are more manageable, which the computer thanks us for, as it has to store something like -20 instead of 0.0000000000000000001 (this helps avoid precision errors). For us, it means that instead of multiplying the likelihoods of the characters to get the likelihood of the sentence, we add the log-likelihoods.

We are going to use the Adam optimizer instead of the simpler SGD, because it tends to work better (really because [god Karpathy says so](http://karpathy.github.io/2015/05/21/rnn-effectiveness/))

In [51]:
# NLLLoss expects log-probabilities, which is what the first output of `rnn` is.
criterion = nn.NLLLoss()
# Adam with its default settings, updating every parameter of `rnn`.
optim = torch.optim.Adam(rnn.parameters())
# optim = torch.optim.RMSprop(rnn.parameters())

In [None]:
from math import ceil

num_iters = 10000
# learning_rate = 0.001
batch_size = 500
# I had some problems with very long sentences before, not sure why. Limiting it
# to 100 chars for now, but it's something to check out later.
# NOTE(review): backpropagating per sentence (below) keeps memory bounded and may
# make this cap unnecessary — unverified.
max_chars = 100
# Report progress roughly 1000 times over the whole run.
log_every = ceil(num_iters/1000)

for iteration in range(num_iters):
    rnn.zero_grad()
    # Running mean-per-sentence loss of this batch, kept as a plain tensor
    # (no autograd history) purely for reporting.
    loss = torch.zeros(1)
    for sentence_chars, target_next_chars in get_minibatch(batch_size):
        n_steps = min(sentence_chars.shape[0], max_chars)
        if n_steps == 0:
            # An empty sentence contributes nothing and has no graph to backprop.
            continue
        h = Variable(torch.zeros(1,n_hidden))
        sentence_loss = 0
        for i in range(n_steps):
            input_char = Variable(sentence_chars[i,:,:])
            loglet,_,h = rnn(input_char,h)
            # Slicing keeps a 1-element LongTensor without rebuilding it from
            # a scalar element.
            target_char = Variable(target_next_chars[i:i + 1])
            sentence_loss = sentence_loss + criterion(loglet,target_char)
        # Backpropagate one sentence at a time: gradients accumulate in the
        # parameters exactly as for one big summed loss, but each sentence's
        # graph is freed immediately instead of holding the whole batch.
        sentence_loss = sentence_loss / batch_size
        sentence_loss.backward()
        loss += sentence_loss.data
    optim.step()
# below is if we wanted to use SGD
#     for p in rnn.parameters():
#         p.data.add_(-learning_rate, p.grad.data)
    if iteration % log_every == 0:
        print(iteration)
        # `loss` always has shape (1,), so this indexing works whether the
        # criterion returns a 1-element or a 0-dimensional tensor.
        print(loss.numpy()[0])

0
80.49661
10
77.69018
20
79.78951
30
74.845024
40
82.25401
50
79.24931
60
79.22946
70
77.94086
80
81.32197
90
79.94307
100
77.467224
110
81.90501
120
78.57027
130
82.19566
140
82.321724
150
78.31985
160
77.19246
170
78.538635
180
78.44568
190
79.77867
200
78.18631
210
79.288826
220
76.40116
230
78.14143
240
75.48411
250
75.86159
260
79.45234
270
74.76865
280
75.23619
290
79.43416
300
77.186615
310
79.22845
320
78.879845
330
77.11562
340
78.25997
350
78.07991
360
78.06127
370
75.80647
380
76.499725
390
80.51418
400
76.425
410
77.14709
420
78.39279
430
79.62222
440
76.35435
450
79.27382
460
74.0003
470
79.40046
480
81.59311
490
74.797295
500
79.38137
510
77.05853
520
78.85457
530
76.55849
540
80.79766
550
78.41511
560
75.72512
570
78.11564
580
76.194016
590
76.21995
600
75.579185
610
76.376854
620
76.7463
630
79.5023
640
77.78413
650
81.6432
660
79.70269
670
78.53166
680
78.053474
690
81.41713
700
75.95135
710
77.263695
720
75.85729
730
76.71313
740
78.66514
750
77.28721
760
78.252464
7

In [485]:
# Inspect the loss value left over from the most recent training iteration.
loss

Variable containing:
 191.1607
[torch.FloatTensor of size (1,)]

In [53]:
# Saving / restoring the trained net
import pickle

# To save the current weights instead, run:
#     with open("trained_simple_rnn2.pkl", "wb") as weights_file:
#         pickle.dump(rnn.state_dict(), weights_file)

# SECURITY: pickle.load can execute arbitrary code from the file it reads, so
# only load weight files you created yourself or otherwise trust.
# The `with` block closes the file handle once the weights are read.
with open("trained_simple_rnn.pkl", "rb") as weights_file:
    rnn.load_state_dict(pickle.load(weights_file))

In [235]:
#code graveyard
# targets[0]
# let = Variable(d[0,:,:])
# h = Variable(torch.zeros(1,n_hidden))
# loss.backward()
# let,h = rnn(let,h)
# let,h