In [1]:
#hide
from utils import *

# Making our RNN state of the art

In [2]:
#hide
from fastai2.text.all import *
# Fetch the HUMAN_NUMBERS dataset and gather the raw lines of both split files.
path = untar_data(URLs.HUMAN_NUMBERS)
lines = L()
for fname in ('train.txt', 'valid.txt'):
    with open(path/fname) as f: lines += L(*f.readlines())

# One long token stream: stripped lines joined by ' . ', then split on single spaces.
text = ' . '.join(raw.strip() for raw in lines)
tokens = text.split(' ')

# Unique tokens, the token -> index lookup, and the whole corpus as indices.
vocab = L(*tokens).unique()
word2idx = {word: idx for idx, word in enumerate(vocab)}
nums = L(word2idx[tok] for tok in tokens)

def group_chunks(ds, bs):
    """Reorder `ds` so that sequential batches of size `bs` continue one another.

    With `m = len(ds) // bs`, the result lists `ds[i + m*j]` for `j` in
    `range(bs)`, for each `i` in `range(m)`. Read in consecutive groups of
    `bs`, position `j` of group `i` is therefore followed by `ds[i + 1 + m*j]`
    in the next group. Items beyond the first `m * bs` are left out.
    """
    n_batches = len(ds) // bs
    reordered = L()
    for start in range(n_batches):
        # One batch worth of items, each taken `n_batches` apart.
        reordered += L(ds[start + n_batches*row] for row in range(bs))
    return reordered

In [3]:
# Sequence length and batch size; `bs` is also read by the model classes further down.
sl, bs = 16, 64

# Windows of `sl` token indices taken every `sl` positions, each paired with
# the same window shifted one position ahead (the targets).
seqs = L(
    (tensor(nums[start:start+sl]), tensor(nums[start+1:start+sl+1]))
    for start in range(0, len(nums)-sl-1, sl)
)

# First 80% of the windows for training, the remainder for validation.
cut = int(len(seqs) * 0.8)
train_ds = group_chunks(seqs[:cut], bs)
valid_ds = group_chunks(seqs[cut:], bs)
dls = DataLoaders.from_dsets(train_ds, valid_ds, bs=bs, drop_last=True, shuffle=False)

## Multilayer RNNs

### The model

In [None]:
class LMModel5(Module):
    "Language model: embedding -> stacked `nn.RNN` -> linear, keeping its hidden state between calls."
    def __init__(self, vocab_sz, n_hidden, n_layers):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.rnn = nn.RNN(n_hidden, n_hidden, n_layers, batch_first=True)
        self.h_o = nn.Linear(n_hidden, vocab_sz)
        # Starting hidden state; the batch dimension is taken from the notebook-level `bs`.
        self.h = torch.zeros(n_layers, bs, n_hidden)

    def forward(self, x):
        "Embed `x`, run the RNN from the stored hidden state, and project every RNN output to vocabulary size."
        embedded = self.i_h(x)
        outputs, hidden = self.rnn(embedded, self.h)
        # Keep the new state for the next call, detached from the autograd graph.
        self.h = hidden.detach()
        return self.h_o(outputs)

    def reset(self):
        "Zero the stored hidden state in place."
        self.h.zero_()

In [None]:
# Two-layer LMModel5 with 64 hidden units, trained with the one-cycle policy.
learn = Learner(
    dls,
    LMModel5(len(vocab), 64, 2),
    loss_func=CrossEntropyLossFlat(),
    metrics=accuracy,
    cbs=ModelReseter,
)
learn.fit_one_cycle(15, 3e-3)

epoch,train_loss,valid_loss,accuracy,time
0,3.048115,2.622384,0.434001,00:02
1,2.136388,1.763967,0.471191,00:02
2,1.689246,1.898718,0.364746,00:02
3,1.443545,1.74744,0.480387,00:01
4,1.271023,1.870939,0.47998,00:02
5,1.101259,1.794428,0.495361,00:02
6,0.94838,1.769644,0.511149,00:02
7,0.822373,1.800406,0.5354,00:01
8,0.731188,1.914065,0.522461,00:01
9,0.662659,1.987547,0.525798,00:02


### Handling exploding or disappearing activations

## LSTM

### Building an LSTM from scratch

In [None]:
class LSTMCell(Module):
    """LSTM cell written out gate by gate.

    Each of the four gates is a linear layer over the concatenation of the
    previous hidden state and the current input (`ni + nh` features in,
    `nh` features out).
    """
    def __init__(self, ni, nh):
        self.forget_gate = nn.Linear(ni + nh, nh)
        self.input_gate  = nn.Linear(ni + nh, nh)
        self.cell_gate   = nn.Linear(ni + nh, nh)
        self.output_gate = nn.Linear(ni + nh, nh)

    def forward(self, input, state):
        """Advance the cell by one step.

        `state` is the pair `(h, c)` of previous hidden and cell states.
        Returns `(h, (h, c))` with the updated states.
        """
        h,c = state
        # Concatenate along the feature dimension so the gates see `nh + ni`
        # features; `torch.stack` would add a new dimension instead.
        h = torch.cat([h, input], dim=1)
        # Forget gate scales down the old cell state.
        forget = torch.sigmoid(self.forget_gate(h))
        c = c * forget
        # Input gate decides how much of the candidate values is written.
        inp = torch.sigmoid(self.input_gate(h))
        cell = torch.tanh(self.cell_gate(h))
        c = c + inp * cell
        # Output gate selects what part of the cell state becomes the new hidden state.
        out = torch.sigmoid(self.output_gate(h))
        h = out * torch.tanh(c)
        return h, (h,c)

In [None]:
class LSTMCell(Module):
    "LSTM cell that produces all four gate pre-activations from two fused linear layers."
    def __init__(self, ni, nh):
        self.ih = nn.Linear(ni,4*nh)
        self.hh = nn.Linear(nh,4*nh)

    def forward(self, input, state):
        "Advance one step from `state = (h, c)`; returns `(h, (h, c))` with the updated states."
        h_prev, c_prev = state
        # A single wide projection per source replaces four narrow ones;
        # the result is split into four equal parts along dim 1.
        fused = self.ih(input) + self.hh(h_prev)
        in_raw, forget_raw, out_raw, cell_raw = fused.chunk(4, 1)

        in_gate = torch.sigmoid(in_raw)
        forget_gate = torch.sigmoid(forget_raw)
        out_gate = torch.sigmoid(out_raw)
        candidate = torch.tanh(cell_raw)

        c_next = (forget_gate*c_prev) + (in_gate*candidate)
        h_next = out_gate * torch.tanh(c_next)
        return h_next, (h_next, c_next)

### Training a language model using LSTMs

In [None]:
class LMModel6(Module):
    "Language model: embedding -> stacked `nn.LSTM` -> linear, keeping its (hidden, cell) state between calls."
    def __init__(self, vocab_sz, n_hidden, n_layers):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.rnn = nn.LSTM(n_hidden, n_hidden, n_layers, batch_first=True)
        self.h_o = nn.Linear(n_hidden, vocab_sz)
        # nn.LSTM takes exactly two state tensors (hidden and cell), each of
        # shape (n_layers, batch, n_hidden). The batch size comes from the
        # notebook-level `bs`.
        # NOTE(review): these are plain attributes created on the default
        # device — confirm they match the model's device when training on GPU.
        self.h = [torch.zeros(n_layers, bs, n_hidden) for _ in range(2)]

    def forward(self, x):
        "Embed `x`, run the LSTM from the stored state, and project every LSTM output to vocabulary size."
        res,h = self.rnn(self.i_h(x), self.h)
        # Keep the new state for the next call, detached from the autograd graph.
        self.h = [h_.detach() for h_ in h]
        return self.h_o(res)

    def reset(self):
        "Zero both stored state tensors in place."
        for h in self.h: h.zero_()

In [None]:
# Two-layer LMModel6 with 64 hidden units, trained with the one-cycle policy.
learn = Learner(
    dls,
    LMModel6(len(vocab), 64, 2),
    loss_func=CrossEntropyLossFlat(),
    metrics=accuracy,
    cbs=ModelReseter,
)
learn.fit_one_cycle(15, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,3.031346,2.749381,0.279215,00:03
1,2.219651,2.08445,0.204753,00:03
2,1.659518,1.685639,0.479574,00:03
3,1.41055,1.666663,0.50944,00:03
4,1.204062,1.606485,0.541829,00:03
5,1.021459,1.529109,0.592448,00:03
6,0.785871,1.34028,0.642008,00:03
7,0.547519,1.27171,0.688802,00:03
8,0.339775,1.216605,0.753825,00:03
9,0.19755,1.218557,0.743652,00:02


## Regularizing an LSTM

### Dropout

In [None]:
class Dropout(Module):
    """Dropout layer: in training mode, zero each element with probability `p`.

    Surviving elements are divided by `1 - p`. Outside training mode the
    input is returned unchanged.
    """
    def __init__(self, p): self.p = p
    def forward(self, x):
        # Dropout is only active while training.
        if not self.training: return x
        # With p == 1 nothing survives; return zeros rather than dividing by zero.
        if self.p == 1: return x * 0
        keep = 1 - self.p
        # Fresh tensor with x's shape, dtype and device, filled with 1 (keep) or 0 (drop).
        mask = x.new_empty(x.shape).bernoulli_(keep)
        return x * mask.div_(keep)

### AR and TAR regularization

### Training a regularized LSTM

In [12]:
class LMModel7(Module):
    "LSTM language model with dropout on the LSTM outputs; also returns the activations before and after dropout."
    def __init__(self, vocab_sz, n_hidden, n_layers, p):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.rnn = nn.LSTM(n_hidden, n_hidden, n_layers, batch_first=True)
        self.drop = nn.Dropout(p)
        self.h_o = nn.Linear(n_hidden, vocab_sz)
        # nn.LSTM takes exactly two state tensors (hidden and cell), each of
        # shape (n_layers, batch, n_hidden). The batch size comes from the
        # notebook-level `bs`.
        # NOTE(review): these are plain attributes created on the default
        # device — confirm they match the model's device when training on GPU.
        self.h = [torch.zeros(n_layers, bs, n_hidden) for _ in range(2)]

    def forward(self, x):
        "Return `(predictions, raw, out)`: the projected scores, the LSTM outputs, and those outputs after dropout."
        raw,h = self.rnn(self.i_h(x), self.h)
        out = self.drop(raw)
        # Keep the new state for the next call, detached from the autograd graph.
        self.h = [h_.detach() for h_ in h]
        return self.h_o(out),raw,out

    def reset(self):
        "Zero both stored state tensors in place."
        for h in self.h: h.zero_()

In [15]:
# Plain Learner for LMModel7 with both callbacks listed explicitly.
learn = Learner(
    dls,
    LMModel7(len(vocab), 64, 2, 0.4),
    loss_func=CrossEntropyLossFlat(),
    metrics=accuracy,
    cbs=[ModelReseter, RNNRegularizer(alpha=2, beta=1)],
)

In [14]:
# Same model configuration through TextLearner, with no callbacks passed explicitly.
learn = TextLearner(
    dls,
    LMModel7(len(vocab), 64, 2, 0.4),
    loss_func=CrossEntropyLossFlat(),
    metrics=accuracy,
)
learn.fit_one_cycle(15, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,3.145553,2.495994,0.437581,00:03
1,2.333189,1.674463,0.491862,00:03
2,1.678753,1.500536,0.553955,00:03
3,1.111904,1.040109,0.748779,00:03
4,0.707829,0.773369,0.807699,00:02
5,0.465899,0.621159,0.829346,00:03
6,0.335249,0.649926,0.839193,00:03
7,0.254418,0.586989,0.841064,00:03
8,0.205191,0.527288,0.850179,00:02
9,0.172876,0.460011,0.868652,00:02


## Conclusion