Making recurrent neural networks from scratch. Note that we will only use the fastai library on occasion. Otherwise, we will just use PyTorch

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai.io import *
from fastai.conv_learner import *
from fastai.column_data import *

## Data setup

In [2]:
PATH = 'data/lotr'

In [3]:
os.makedirs(PATH, exist_ok=True)

In [4]:
url = 'https://archive.org/download/TheLordOfTheRing1TheFellowshipOfTheRing/The%20Lord%20Of%20The%20Ring%201-The%20Fellowship%20Of%20The%20Ring_djvu.txt'
filename = 'lotrfotr.txt'
filepath = os.path.join(PATH, filename)
get_data(url, filepath)

In [5]:
text = open(filepath).read()
print(f'corpus length: {len(text)}')

corpus length: 1018701


Lets see what the first 400 characters of lord of the rings looks like

In [6]:
text[:400]

"\n“THE LORD OF THE RINGS' \n\nV*art One \n\nTHE FELLOWSHIP \nOF THE RING \n\nJ.R.R.ToIkien \n\n\n\nComplete Table of Contents \n\n\n\nForeword \n\nPrologue \n\n1 . Concerning Hobbits \n\n2. Concerning Pipe-weed \n\n3. Of the Ordering of the Shire \n\n4. Of the Finding of the Ring \n\nnote on the shire records \n\n\n\n\nBook I \n\nChapter 1 \nChapter 2 \nChapter 3 \nChapter 4 \nChapter 5 \nChapter 6 \nChapter 7 \nChapter 8 \nChapter 9 \nChap"

Now create a list of all unique characters that are seen somewhere in the book

In [7]:
chars = sorted(list(set(text)))
n_vocab = len(chars) + 1
print(f'num chars: {n_vocab + 1}')

num chars: 85


In [8]:
chars.insert(0, '\0')
' '.join(chars)

'\x00 \n   ! " \' ( ) * , - . / 0 1 2 3 4 5 6 7 8 9 : ; = ? A B C D E F G H I J K L M N O P Q R S T U V W X Y Z _ a b c d e f g h i j k l m n o p q r s t u v w x y z ® — ’ “'

Create maps from:  
* chars to indices
* indices to chars

In [9]:
char_indices = {c:i for i, c in enumerate(chars)}
indices_char = {i:c for i, c in enumerate(chars)}

idx is the index equivalent of each character in text

In [10]:
idx = [char_indices[c] for c in text]
print(idx[:10])

[1, 83, 46, 34, 31, 2, 38, 41, 44, 30]


we can convert idx back to character format using indices_char, this is exactly the same as the text variable

In [11]:
''.join(indices_char[i] for i in idx[:100])

"\n“THE LORD OF THE RINGS' \n\nV*art One \n\nTHE FELLOWSHIP \nOF THE RING \n\nJ.R.R.ToIkien \n\n\n\nComplete Tabl"

## Three char model

### Create and format inputs

Create a list of every 4th character, starting at the 0th, 2nd, and 3rd characters

In [12]:
cs = 3
c1 = [idx[i]   for i in range(0, len(idx)-cs, cs)]
c2 = [idx[i+1] for i in range(0, len(idx)-cs, cs)]
c3 = [idx[i+2] for i in range(0, len(idx)-cs, cs)]
c4 = [idx[i+3] for i in range(0, len(idx)-cs, cs)]

Create the arrays that will contain the training data for the three char model.

In [13]:
x1 = np.array(c1)
x2 = np.array(c2)
x3 = np.array(c3)

In [14]:
y = np.array(c4)

In [15]:
x1[:5],  x2[:5], x3[:5], y[:5]

(array([ 1, 34, 38, 30, 32]),
 array([83, 31, 41,  2,  2]),
 array([46,  2, 44, 41, 46]),
 array([34, 38, 30, 32, 34]))

All x1, x2, x3, and y must have the same number of elements.

In [16]:
assert(x1.shape == x2.shape and x3.shape == y.shape)

### Create and train model

In [17]:
n_hidden = 256
n_embed  = 40

In [18]:
class ThreeChar(nn.Module):
    def __init__(self, n_vocab, n_embed, n_hidden):
        super().__init__()
        self.e = nn.Embedding(n_vocab, n_embed)
        
        # apply this linear layer to each new character
        self.l_in = nn.Linear(n_embed, n_hidden)
        
        # this linear layer updates our hidden state. Note that 
        # it is a square matrix so it never changes dimension
        self.l_hidden = nn.Linear(n_hidden, n_hidden)
        
        # "final" linear layer that predicts the next character
        self.l_out = nn.Linear(n_hidden, n_vocab)
        
    def forward(self, c1, c2, c3):
        # compute l_in with each embedding for character
        in1 = F.relu(self.l_in(self.e(c1)))
        in2 = F.relu(self.l_in(self.e(c2)))
        in3 = F.relu(self.l_in(self.e(c3)))
        
        # initialize the hidden state, then update for each character
        h = V(torch.zeros(in1.size()).cuda())
        h = F.tanh(self.l_hidden(h+in1))
        h = F.tanh(self.l_hidden(h+in2))
        h = F.tanh(self.l_hidden(h+in3))
        
        return F.log_softmax(self.l_out(h))

In [19]:
xs = np.stack([x1, x2, x3], axis=1)
xs.shape

(339566, 3)

In [20]:
md = ColumnarModelData.from_arrays('.', [-1], xs, y, bs=512)

In [21]:
m = ThreeChar(n_vocab, n_embed, n_hidden).cuda()

In [22]:
it = iter(md.trn_dl)

Pack x1, x2, and x3 into *xs

In [23]:
*xs, yt = next(it)

In [24]:
len(xs)

3

Compute predictions

In [25]:
t = m(*V(xs))

In [26]:
len(V(xs))

3

Log probabilities of for each character

In [27]:
t

Variable containing:
-4.4123 -4.2664 -4.6312  ...  -4.6053 -4.4809 -4.5260
-4.4233 -4.5043 -4.1888  ...  -4.4875 -4.5065 -4.7225
-4.4609 -4.4149 -4.3325  ...  -4.6667 -4.4499 -4.6087
          ...             ⋱             ...          
-4.2953 -4.3401 -4.2761  ...  -4.4025 -4.5608 -4.6122
-4.3743 -4.3975 -4.4475  ...  -4.4979 -4.5918 -4.6197
-4.3936 -4.4370 -4.4310  ...  -4.4650 -4.5840 -4.6731
[torch.cuda.FloatTensor of size 512x84 (GPU 0)]

Now that we know the model works, let's train it. Note that we only have one three letter sequence in the validation set, so this val_loss is essentially useless.

In [28]:
opt = optim.Adam(m.parameters(), 1e-2)

In [29]:
fit(m, md, 1, opt, F.nll_loss)

epoch      trn_loss   val_loss   
    0      1.931394   2.467796  



[2.4677958]

In [30]:
set_lrs(opt, 0.001)

In [31]:
fit(m, md, 1, opt, F.nll_loss)

epoch      trn_loss   val_loss   
    0      1.680476   1.846977  



[1.8469772]

### Try out model

In [32]:
inp = 'rin'
idxs = T(np.array([char_indices[c] for c in inp]))
idxs


 71
 62
 67
[torch.cuda.LongTensor of size 3 (GPU 0)]

In [33]:
p = m(*V(idxs))

In [34]:
p.size()

torch.Size([1, 84])

In [35]:
i = np.argmax(to_np(p))
i

60

In [36]:
def get_next(inp):
    """ inp: three letter string """
    idxs = T(np.array([char_indices[c] for c in inp]))
    p = m(*V(idxs))
    i = np.argmax(to_np(p))
    return chars[i]

Going to try swo(rd), arr(ow),  th(e), rin(g), fel(lowship)

In [37]:
get_next('swo')

'r'

In [38]:
get_next('arr')

'o'

In [39]:
get_next(' th')

'e'

In [40]:
get_next('rin')

'g'

In [41]:
get_next('and')

' '

In [42]:
get_next('fel')

'l'

In [43]:
def get_next_n(inp, n):
    """ computes get_next n times """
    s = inp
    for _ in range(n):
        c = get_next(inp)
        s = s+c
        inp = s[-3:]
    return s

In [44]:
get_next_n('swo', 2)

'sword'

In [45]:
get_next_n('arr', 2)

'arrow'

Note that our ThreeChar model has no long term dependence. It is unable to complete long words like fel(lowship)

In [46]:
get_next_n('fel', 10)

'fell the stre'

## First RNN

### Format inputs

In [47]:
cs = 8

For each o through 7, create a list of every cs character with that starting point. These are 8 inputs to model

In [48]:
c_in = [[idx[i+j] for i in range(cs)] for j in range(len(idx)-cs)]

In [49]:
c_out = [idx[cs+j] for j in range(len(idx)-cs)]

each element in c_in corresponds to an element in c_out, so they should have equal length

In [50]:
assert(len(c_in) == len(c_out))

now convert them into numpy arrays

In [51]:
xs = np.stack(c_in, axis=0)

In [52]:
y = np.stack(c_out)

In [53]:
xs.shape

(1018693, 8)

In [54]:
y.shape

(1018693,)

Each row is a series of 8 characters from the text. Notice that each subsequent row starts one character later and goes one character further in the text

In [55]:
xs[:5,:cs]

array([[ 1, 83, 46, 34, 31,  2, 38, 41],
       [83, 46, 34, 31,  2, 38, 41, 44],
       [46, 34, 31,  2, 38, 41, 44, 30],
       [34, 31,  2, 38, 41, 44, 30,  2],
       [31,  2, 38, 41, 44, 30,  2, 41]])

y is the character that comes just after the row

In [56]:
y[:cs]

array([44, 30,  2, 41, 32,  2, 46, 34])

### Create and train RNN

Create validation indices

In [67]:
val_idx = get_cv_idxs(len(idx)-cs-1)

In [68]:
md = ColumnarModelData.from_arrays('.', val_idx, xs, y, bs=512)

In [69]:
class CharLoop(nn.Module):
    def __init__(self, n_vocab, n_embed, n_hidden):
        super().__init__()
        self.e = nn.Embedding(n_vocab, n_embed)
        self.l_in = nn.Linear(n_embed, n_hidden)
        self.l_hidden = nn.Linear(n_hidden, n_hidden)
        self.l_out = nn.Linear(n_hidden, n_vocab)
        
    def forward(self, *cs):
        bs = cs[0].size(0)
        h = V(torch.zeros(bs, n_hidden).cuda())
        
        for c in cs:
            inp = F.relu(self.l_in(self.e(c)))
            h = F.tanh(self.l_hidden(h+inp))
            
        return F.log_softmax(self.l_out(h), dim=-1)

In [70]:
m = CharLoop(n_vocab, n_embed, n_hidden).cuda()
opt = optim.Adam(m.parameters(), 1e-2)

In [71]:
fit(m, md, 1, opt, F.nll_loss)
set_lrs(opt, 0.001)
fit(m, md, 1, opt, F.nll_loss)

epoch      trn_loss   val_loss   
    0      1.868543   1.873667  



epoch      trn_loss   val_loss   
    0      1.600789   1.606765  



[1.6067652]

Because adding new characters (after applying l_in) to the hidden state is lossy, we will try concatenating in input to the hidden state instead. This requires that we change the size of l_in.

In [75]:
class CharLoopConcat(nn.Module):
    def __init__(self, n_vocab, n_embed, n_hidden):
        super().__init__()
        self.e = nn.Embedding(n_vocab, n_embed)
        self.l_in = nn.Linear(n_embed+n_hidden, n_hidden)
        self.l_hidden = nn.Linear(n_hidden, n_hidden)
        self.l_out = nn.Linear(n_hidden, n_vocab)
        
    def forward(self, *cs):
        bs = cs[0].size(0)
        h = V(torch.zeros(bs, n_hidden).cuda())
        
        for c in cs:
            # concat h and char embed together
            inp = torch.cat((h, self.e(c)), dim=1)
            
            # pass through l_in
            inp = F.relu(self.l_in(inp))
            
            # recompute hidden state
            h = F.tanh(self.l_hidden(inp))
            
        return F.log_softmax(self.l_out(h), dim=-1)

In [78]:
m = CharLoopConcat(n_vocab, n_embed, n_hidden).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [None]:
fit(m, md, 1, opt, F.nll_loss)
set_lrs(opt, 1e-4)
fit(m, md, 1, opt, F.nll_loss)

  5%|▍         | 73/1592 [00:03<01:15, 20.00it/s, loss=2.77]