# 12. A Language Model from Scratch 

In [1]:
from fastai.text.all import * 

## The Data

In [2]:
print(URLs.HUMAN_NUMBERS)
path = untar_data(URLs.HUMAN_NUMBERS); path.ls()

https://s3.amazonaws.com/fast-ai-sample/human_numbers.tgz


(#2) [Path('/home/jupyter/.fastai/data/human_numbers/valid.txt'),Path('/home/jupyter/.fastai/data/human_numbers/train.txt')]

In [3]:
lines = L()
with open(path/'train.txt') as f: lines += L(*f.readlines())
with open(path/'valid.txt') as f: lines += L(*f.readlines())
lines

(#9998) ['one \n','two \n','three \n','four \n','five \n','six \n','seven \n','eight \n','nine \n','ten \n'...]

In [4]:
text = ' . '.join([l.strip() for l in lines]); 
text[:500], text[-500:]

('one . two . three . four . five . six . seven . eight . nine . ten . eleven . twelve . thirteen . fourteen . fifteen . sixteen . seventeen . eighteen . nineteen . twenty . twenty one . twenty two . twenty three . twenty four . twenty five . twenty six . twenty seven . twenty eight . twenty nine . thirty . thirty one . thirty two . thirty three . thirty four . thirty five . thirty six . thirty seven . thirty eight . thirty nine . forty . forty one . forty two . forty three . forty four . forty fi',
 'eighty seven . nine thousand nine hundred eighty eight . nine thousand nine hundred eighty nine . nine thousand nine hundred ninety . nine thousand nine hundred ninety one . nine thousand nine hundred ninety two . nine thousand nine hundred ninety three . nine thousand nine hundred ninety four . nine thousand nine hundred ninety five . nine thousand nine hundred ninety six . nine thousand nine hundred ninety seven . nine thousand nine hundred ninety eight . nine thousand nine hundred nine

In [5]:
tokens = text.split(' ')
tokens[:10], L(tokens[-10:])

(['one', '.', 'two', '.', 'three', '.', 'four', '.', 'five', '.'],
 (#10) ['hundred','ninety','eight','.','nine','thousand','nine','hundred','ninety','nine'])

In [6]:
vocab = L(*tokens).unique()
print(vocab)

['one', '.', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen', 'sixteen', 'seventeen', 'eighteen', 'nineteen', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety', 'hundred', 'thousand']


the whole vocab is pretty short! just 30 elements 

In [7]:
# convert our dataset of tokens into numbers 
word2idx = {w: i for i,w in enumerate(vocab)}
nums = L(word2idx[t] for t in tokens)
nums 

(#63095) [0,1,2,1,3,1,4,1,5,1...]

## Our First Language Model from Scratch 

In [8]:
L((tokens[i:i+3],tokens[i+3]) for i in range(0,len(tokens)-4,3))

(#21031) [(['one', '.', 'two'], '.'),(['.', 'three', '.'], 'four'),(['four', '.', 'five'], '.'),(['.', 'six', '.'], 'seven'),(['seven', '.', 'eight'], '.'),(['.', 'nine', '.'], 'ten'),(['ten', '.', 'eleven'], '.'),(['.', 'twelve', '.'], 'thirteen'),(['thirteen', '.', 'fourteen'], '.'),(['.', 'fifteen', '.'], 'sixteen')...]

In [9]:
# its important that the input variable is a tensor of ints, otherwise fastai fails with an obscure error later on 
seqs = L((tensor(nums[i:i+3]),nums[i+3]) for i in range(0,len(nums)-4,3)); seqs

(#21031) [(tensor([0, 1, 2]), 1),(tensor([1, 3, 1]), 4),(tensor([4, 1, 5]), 1),(tensor([1, 6, 1]), 7),(tensor([7, 1, 8]), 1),(tensor([1, 9, 1]), 10),(tensor([10,  1, 11]), 1),(tensor([ 1, 12,  1]), 13),(tensor([13,  1, 14]), 1),(tensor([ 1, 15,  1]), 16)...]

In [10]:
bs = 64 
cut = int(len(seqs)*0.8)
train, valid = seqs[:cut], seqs[cut:]
train, valid

((#16824) [(tensor([0, 1, 2]), 1),(tensor([1, 3, 1]), 4),(tensor([4, 1, 5]), 1),(tensor([1, 6, 1]), 7),(tensor([7, 1, 8]), 1),(tensor([1, 9, 1]), 10),(tensor([10,  1, 11]), 1),(tensor([ 1, 12,  1]), 13),(tensor([13,  1, 14]), 1),(tensor([ 1, 15,  1]), 16)...],
 (#4207) [(tensor([ 1,  8, 29]), 26),(tensor([26,  5,  1]), 8),(tensor([ 8, 29, 26]), 6),(tensor([6, 1, 8]), 29),(tensor([29, 26,  7]), 1),(tensor([ 1,  8, 29]), 26),(tensor([26,  8,  1]), 8),(tensor([ 8, 29, 26]), 9),(tensor([9, 1, 8]), 29),(tensor([29, 27,  1]), 8)...])

In [11]:
dls = DataLoaders.from_dsets(train,valid,bs=bs,shuffle=False)

In [12]:
dls.train_ds[2]

(tensor([4, 1, 5]), 1)

## Our Language Model in PyTorch 

In [13]:
class LMModel1(Module):
    def __init__(self, vocab_sz, n_hidden):
        # input -> hidden 
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        # hidden -> hidden
        self.h_h = nn.Linear(n_hidden, n_hidden)
        self.h_o = nn.Linear(n_hidden, vocab_sz)
    
    def forward(self, x):
        h = F.relu(self.h_h(self.i_h(x[:,0])))
        h = h + self.i_h(x[:,1])
        h = h + F.relu(self.h_h(h))
        h = h + self.i_h(x[:,2])
        h = h + F.relu(self.h_h(h))
        return self.h_o(h)

In [14]:
dls.train_ds[:5], dls.valid_ds[:5]

((#5) [(tensor([0, 1, 2]), 1),(tensor([1, 3, 1]), 4),(tensor([4, 1, 5]), 1),(tensor([1, 6, 1]), 7),(tensor([7, 1, 8]), 1)],
 (#5) [(tensor([ 1,  8, 29]), 26),(tensor([26,  5,  1]), 8),(tensor([ 8, 29, 26]), 6),(tensor([6, 1, 8]), 29),(tensor([29, 26,  7]), 1)])

In [15]:
learn = Learner(
    dls, 
    LMModel1(vocab_sz=len(vocab), n_hidden=64),
    # our model outputs a tensor of the size of vocab - the objective being to guess (classify) the next word. 
    # so this is essentially a classification problem, hence cross entropy loss      
    loss_func=F.cross_entropy, 
    metrics=accuracy
)
learn.fit_one_cycle(4,lr_max=1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,1.969537,2.078561,0.452817,00:02
1,1.483093,1.971367,0.462087,00:01
2,1.470783,1.734976,0.484668,00:01
3,1.452546,1.68631,0.49798,00:01


Let's find the most common token in the validation set, so that we can benchmark against that 

In [16]:
n,counts = 0,torch.zeros(len(vocab))
# x and y are batches! 
for x,y in dls.valid:
    n += y.shape[0]
    for i in y:
        counts[i] += 1
counts, L(tokens)

(tensor([106., 637., 159., 107., 106., 159., 108., 106., 464., 442.,   6.,   7.,
           6.,   6.,   7.,   6.,   6.,   7.,   6.,   6.,  64.,  63.,  63.,  64.,
          63.,  63.,  66.,  66., 600., 638.]),
 (#63095) ['one','.','two','.','three','.','four','.','five','.'...])

In [17]:
# the most common token is 'thousand', closely followed by '.'
i_max = torch.argmax(counts)
vocab[i_max], counts[i_max]/n

('thousand', tensor(0.1517))

In [18]:
class LMModel2(Module):
    def __init__(self, vocab_sz, n_hidden):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.h_h = nn.Linear(n_hidden, n_hidden)
        self.h_o = nn.Linear(n_hidden, vocab_sz)
    
    def forward(self, x):
        h = 0 
        for i in range(3):
            h = h + self.i_h(x[:,i])
            h = F.relu(self.h_h(h))
        return self.h_o(h)

In [19]:
learn = Learner(dls, LMModel2(len(vocab), n_hidden=64), loss_func=F.cross_entropy,metrics=accuracy)
learn.fit_one_cycle(4,1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,1.841273,1.99093,0.462325,00:01
1,1.400708,1.818004,0.466366,00:01
2,1.418396,1.655742,0.489898,00:01
3,1.397689,1.639273,0.492988,00:01


### Maintaining the State of the RNN

In [20]:
class LMModel3(Module):
    def __init__(self, vocab_sz, n_hidden):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)  
        self.h_h = nn.Linear(n_hidden, n_hidden)     
        self.h_o = nn.Linear(n_hidden,vocab_sz)
        self.h = 0
        
    def forward(self, x):
        # x represents a sequence of tokens (ints), of length 3 in this case
        # the other dimension is the batch size
        for i in range(3):
            self.h = self.h + self.i_h(x[:,i])
            self.h = F.relu(self.h_h(self.h))
        out = self.h_o(self.h)
        self.h = self.h.detach()
        return out
    
    def reset(self): self.h = 0

We need to set up the batches nicely so that 'dls.train_ds[0]' is the first item in the first batch, 'dls.train_ds[1]' is the first item in the second batch, and so on. This was the sequence is correct for a given index in the batch as we move across batches 

In [21]:
m = len(seqs)//bs
m,bs,len(seqs)

(328, 64, 21031)

64 * 328 = 20992 , a bit short of 21031

batches should go:

(0, m, 2m, (bs-1)*m) 

(1, m + 1, 2m +1, ... )

In [22]:
def group_chunks(ds, bs):
    """
    Reorders a dataset so that sequences line up at every group of length bs. 
    Returns a dataset where: 0, bs, 2*bs, forms a contiguous sequence from the original ds 
    """
    m = len(ds)//bs
    new_ds = L()
    for i in range(m): new_ds += L(ds[i+m*j] for j in range(bs))
    return new_ds

The first element from each batch should form a sequence (and likewise with the nth) 

In [23]:
train_dset = group_chunks(seqs[:cut],bs=bs)
train_dset[:bs][:2], train_dset[bs:2*bs][:2], train_dset[2*bs:3*bs][:2] 

((#2) [(tensor([0, 1, 2]), 1),(tensor([11,  1,  2]), 28)],
 (#2) [(tensor([1, 3, 1]), 4),(tensor([28, 12,  1]), 2)],
 (#2) [(tensor([4, 1, 5]), 1),(tensor([ 2, 28, 13]), 1)])

In [24]:
cut = int(len(seqs) * 0.8)
dls = DataLoaders.from_dsets(
    group_chunks(seqs[:cut], bs),
    group_chunks(seqs[cut:], bs),
    bs=bs,drop_last=True, shuffle=False
)

In [25]:
i = 0
dls.train_ds[0 + i], dls.train_ds[bs + i], dls.train_ds[2*bs + i] 

((tensor([0, 1, 2]), 1), (tensor([1, 3, 1]), 4), (tensor([4, 1, 5]), 1))

In [26]:
learn = Learner(dls, LMModel3(len(vocab),n_hidden=64), loss_func=F.cross_entropy, metrics=accuracy, cbs=ModelResetter)

In [27]:
learn.fit_one_cycle(10,3e-3)

epoch,train_loss,valid_loss,accuracy,time
0,1.688712,1.809808,0.485337,00:01
1,1.23861,1.779414,0.477163,00:01
2,1.0882,1.659898,0.49976,00:02
3,1.026641,1.786369,0.526683,00:01
4,0.967236,1.68678,0.541346,00:01
5,0.933995,1.81305,0.576923,00:01
6,0.8912,1.790356,0.553365,00:01
7,0.827113,1.87953,0.567308,00:01
8,0.794008,1.918127,0.575721,00:01
9,0.786994,1.914164,0.576442,00:01


### Creating More Signal 

Instead of only predicting the next word eery 3 words, we could try to predict the next word at every iteration in the input sequence 


Modify the model so output a prediction after every word in the sequence 

In [28]:
# how the old data looked
seqs

(#21031) [(tensor([0, 1, 2]), 1),(tensor([1, 3, 1]), 4),(tensor([4, 1, 5]), 1),(tensor([1, 6, 1]), 7),(tensor([7, 1, 8]), 1),(tensor([1, 9, 1]), 10),(tensor([10,  1, 11]), 1),(tensor([ 1, 12,  1]), 13),(tensor([13,  1, 14]), 1),(tensor([ 1, 15,  1]), 16)...]

In [29]:
# how the "raw" dataset looks (after numericalization)
nums

(#63095) [0,1,2,1,3,1,4,1,5,1...]

In [60]:
sl = 16
bs = 64 
seqs = L((tensor(nums[i:i+sl]), tensor(nums[i+1:i+sl+1]))
         for i in range(0,len(nums)-sl-1,sl))
cut = int(len(seqs) * 0.8)
dls = DataLoaders.from_dsets(group_chunks(seqs[:cut], bs),
                             group_chunks(seqs[cut:], bs),
                             bs=bs, drop_last=True, shuffle=False)

In [61]:
[L(vocab[o] for o in s) for s in seqs[0]]

[(#16) ['one','.','two','.','three','.','four','.','five','.'...],
 (#16) ['.','two','.','three','.','four','.','five','.','six'...]]

In [62]:
dls.train_ds[0]

(tensor([0, 1, 2, 1, 3, 1, 4, 1, 5, 1, 6, 1, 7, 1, 8, 1]),
 tensor([1, 2, 1, 3, 1, 4, 1, 5, 1, 6, 1, 7, 1, 8, 1, 9]))

In [63]:
class LMModel4(Module):
    def __init__(self, vocab_sz, n_hidden):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)  
        self.h_h = nn.Linear(n_hidden, n_hidden)     
        self.h_o = nn.Linear(n_hidden,vocab_sz)
        # actually, the dimensions of h are (bs, n_hidden)
        # i.e. it stored n_hidden number for each input in the batch
        # we just don't know the batch size in advance
        self.h = 0 # or torch.zeros(bs, n_hidden)
        
    def forward(self, x):
        outs = []
        for i in range(sl):
            self.h = self.h + self.i_h(x[:,i])
            self.h = F.relu(self.h_h(self.h))
            outs.append(self.h_o(self.h))
        self.h = self.h.detach()
        return torch.stack(outs, dim=1)
    
    def reset(self): self.h = 0

This model now returns outputs of shape bs x sl x vocab_sz

In [64]:
def loss_func(inp, targ):
    return F.cross_entropy(inp.view(-1, len(vocab)), targ.view(-1))

In [65]:
dls.bs

64

In [66]:
learn = Learner(dls, LMModel4(len(vocab), 64), loss_func=loss_func,
                metrics=accuracy, cbs=ModelResetter)
learn.fit_one_cycle(30, 3e-3)

epoch,train_loss,valid_loss,accuracy,time
0,3.332706,3.218198,0.135498,00:00
1,2.876686,2.283478,0.281494,00:00
2,2.067094,1.835472,0.469564,00:00
3,1.67891,1.784485,0.470215,00:01
4,1.470798,1.818645,0.483398,00:00
5,1.335899,1.721651,0.466553,00:00
6,1.206056,1.725563,0.547689,00:00
7,1.110127,1.68615,0.5507,00:00
8,0.993419,1.820921,0.591146,00:00
9,0.898694,1.713631,0.634196,00:00


## Multilayer RNN 

In [76]:
class LMModel5(Module):
    def __init__(self, vocab_sz, n_hidden, n_layers):
        """
        vocab_sz = number of tokens in the vocab, i.e. input dimension, and also the output dimension, 
                   since its a classification problem to predict the next word. 
        n_hidden = dimension of the hidden vector(s) 
        n_layers = how deep the inner RNNs are stacked 
        """
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.rnn = nn.RNN(
            input_size=n_hidden, 
            hidden_size=n_hidden,
            num_layers=n_layers,
            # If ``True``, then the input and output tensors are provided as `(batch, seq, feature)` instead of `(seq, batch, feature)`
            batch_first=True
        )
        self.h_o = nn.Linear(n_hidden, vocab_sz)
        # NOTE dependence on batch size :D 
        self.h = torch.zeros(n_layers, bs, n_hidden)
        
    def forward(self, x):
        res, h = self.rnn(self.i_h(x),self.h)
        self.h = h.detach()
        return self.h_o(res)
    def reset(self): self.h.zero_()

In [77]:
learn = Learner(dls, LMModel5(len(vocab), 64, 2), loss_func=CrossEntropyLossFlat(), metrics=accuracy,cbs=ModelResetter)
learn.fit_one_cycle(15,3e-3)

epoch,train_loss,valid_loss,accuracy,time
0,2.983044,2.494934,0.459066,00:01
1,2.114896,1.734919,0.469564,00:00
2,1.693022,1.843952,0.372314,00:00
3,1.508971,1.784252,0.460042,00:00
4,1.341549,1.899744,0.503418,00:00
5,1.17162,1.981504,0.507324,00:00
6,1.038476,2.054833,0.516602,00:00
7,0.938852,2.111845,0.51123,00:00
8,0.848796,2.143358,0.5118,00:00
9,0.770825,2.168335,0.512614,00:00


We've now introduced loads more weights to train because we've added another layer of depth - making it harder to train this thing well. 

## LSTM 

We want the hidden state to have a far enough memory backwards to keep taking in account context, for example the gender someone a few tokens back in the sentence. 

But we also want a hidden state that helps us to make a prediction for the next word. 

<img src="../images/LSTM.png" id="lstm" caption="Architecture of an LSTM" alt="A graph showing the inner architecture of an LSTM" width="700">

In [89]:
class LSTMCell(Module):
    def __init__(self, ni, nh):
        # by using sigmoid activation, can "forget" elements of the cell state by multiplying them 
        # by near-zero values, or keep by using near-one values. 
        self.forget_gate = nn.Linear(ni + nh, nh)
        # works with the cell gate to update the cell state (note using + not *)
        self.input_gate = nn.Linear(ni + nh, nh)
        self.cell_gate = nn.Linear(ni + nh, nh)
        # determines the next hidden state (together with the cell state)         
        self.output_gate = nn.Linear(ni + nh, nh)
    # interesting that the hidden & cell states are passed to forward      
    def forward(self, input, state):
        h,c = state
        h = torch.cat([h,input],dim=1)
        forget = torch.sigmoid(self.forget_gate(h))
        c = c * forget 
        inp = torch.sigmoid(self.input_gate(h))
        cell = torch.tanh(self.cell_gate(h))
        c = c + (inp * cell)
        out = torch.sigmoid(self.output_gate(h))
        h = out * torch.tanh(c) 
        return h, (h,c)

More optimized (vectorized) version looks like this

In [91]:
class LSTMCell(Module):
    def __init__(self, ni,nh):
        self.ih = nn.Linear(ni + nh, nh)
        self.hh = nn.Linear(nh, 4*nh)
    
    def forward(self, input, state):
        h,c = state
        # one big matrix version of the 4 individual ones 
        gates = (self.ih(input) + self.hh(h)).chunk(4,1)
        ingate,forgetgate,outgate = map(torch.sigmoid, gates[:3])
        cellgate = gates[3].tanh()
        
        c = (forgetgate*c) + (ingate  * cellgate)
        h = outgate * c.tanh()
        return h, (h,c)

In [95]:
# this is how .chunk() works
t = torch.arange(0,11)
print(t)
t.chunk(2)

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10])


(tensor([0, 1, 2, 3, 4, 5]), tensor([ 6,  7,  8,  9, 10]))

In [102]:
class LMModel6(Module):
    def __init__(self, vocab_sz, n_hidden, n_layers):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.rnn = nn.LSTM(n_hidden, n_hidden, n_layers, batch_first=True)
        self.h_o = nn.Linear(n_hidden, vocab_sz)
        # using 2 here since we keep 2 hidden states: the hidden state and the cell state (of the same size)
        # NOTE dependence on batch size ;) - we keep a whole hidden state for every index in the batch 
        # (this is why we arranged the batches to line up contiguously)
        self.h = [torch.zeros(n_layers, bs, n_hidden) for _ in range(2)]
    
    def forward(self, x):
        res,h = self.rnn(self.i_h(x), self.h)
        self.h = [h_.detach() for h_ in h]
        return self.h_o(res)
    
    def reset(self):
        for h in self.h:
            h.zero_()

In [103]:
learn = Learner(
    dls, 
    LMModel6(len(vocab), n_hidden=64, n_layers=2),
    # we need to "flatten" the output and targets for cross entropy loss, because remember the model outputs a whole sequence, 
    # and compares to the target sequence 
    loss_func=CrossEntropyLossFlat(),
    metrics=accuracy, 
    # calls the reset() function of the model on every epoch to reset the hidden state (the sequence has now)     
    cbs=ModelResetter
)
learn.fit_one_cycle(15,1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,3.019561,2.709355,0.425049,00:01
1,2.129148,2.073267,0.27946,00:01
2,1.602866,1.881813,0.474691,00:01
3,1.3095,2.159304,0.505534,00:01
4,1.09477,2.244438,0.509928,00:01
5,0.849964,1.833465,0.583659,00:01
6,0.620447,1.891089,0.656982,00:01
7,0.433281,1.754252,0.651286,00:01
8,0.319305,1.793122,0.686686,00:01
9,0.248205,1.894193,0.708008,00:01


## Regularizing an LSTM 

In [None]:
class Dropout(Module):
    # a given neuron is dropped during training forward pass with probability p      
    def __init__(self, p): self.p = p
    def forward(self, x):
        if not self.training: return x
        mask = x.new(*x.shape).bernoulli_(1-p)
        return x * mask.div_(1-p)

In [120]:
x = torch.arange(1,10)
x.new(*x.shape).bernoulli_(0.5)

tensor([1, 0, 0, 0, 0, 1, 0, 1, 1])

In [132]:
torch.zeros(3).bernoulli_(0.5)

tensor([1., 0., 1.])

In [147]:
3*torch.ones(3).new(4,3)

tensor([[-1.5592e-37,  9.2851e-41,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00]])

In [151]:
class LMModel7(Module):
    def __init__(self, vocab_sz, n_hidden, n_layers, p_drop):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.rnn = nn.LSTM(n_hidden, n_hidden, n_layers, batch_first=True)
        self.drop = nn.Dropout(p_drop)
        self.h_o = nn.Linear(n_hidden, vocab_sz)
        # this is interesting... surely they are inverse rather than equal 
        self.h_o.weight = self.i_h.weight
        # hidden and cell states for the LSTM 
        self.h = [torch.zeros(n_layers, bs, n_hidden) for _ in range(2)]
    
    def forward(self, x):
        # refactored to make more intuitive for myself
        output, (h, c) = self.rnn(self.i_h(x), self.h)
        dropped = self.drop(output)
        # reassign hidden state as LSTM state output, but detaching gradients 
        self.h = [h.detach(), c.detach()]
        # return "actual" output, raw activations, and dropped-out activations 
        return self.h_o(output), output, dropped

    def reset(self):
        # re-zero the hidden and cell LSTM states          
        h, c = self.h
        h.zero_()
        c.zero_()

In [152]:
learn = Learner(dls, LMModel7(len(vocab), 64, 2, 0.5), loss_func=CrossEntropyLossFlat(), metrics=accuracy,
               cbs=[ModelResetter, RNNRegularizer(alpha=2,beta=1)])

In [153]:
# equivalently, since TextLearner adds these by default 
learn = TextLearner(dls, LMModel7(len(vocab), 64, 2, 0.5), loss_func=CrossEntropyLossFlat(), metrics=accuracy)

In [154]:
learn.fit_one_cycle(15, 1e-2, wd=0.1)

epoch,train_loss,valid_loss,accuracy,time
0,2.215128,1.940104,0.52181,00:01
1,1.28485,1.240784,0.667155,00:01
2,0.592072,1.004446,0.794515,00:01
3,0.260202,0.799046,0.822998,00:01
4,0.119107,0.661496,0.84196,00:01
5,0.14752,0.594176,0.833496,00:01
6,0.074556,0.625001,0.840007,00:01
7,0.039842,0.611085,0.844157,00:01
8,0.024227,0.64057,0.844808,00:01
9,0.016854,0.655162,0.845866,00:01


In [213]:
x = dls.train_ds[2][0]; x

tensor([ 6,  1,  3, 28, 25,  7,  1,  3, 28, 25,  8,  1,  3, 28, 25,  9])

In [220]:
[vocab[i] for i in x]

['six', '.', 'three', 'hundred', 'seventy', 'seven', '.', 'three', 'hundred', 'seventy', 'eight', '.', 'three', 'hundred', 'seventy', 'nine']

In [229]:
btch=torch.stack(64*[x]) 

In [240]:
a,_,__ = learn.model(btch)

In [245]:
a.shape

torch.Size([64, 16, 30])

In [243]:
# %pprint

In [244]:
[vocab[i] for i in torch.argmax(a[0,:,:], axis=1)]

['.', 'three', 'thousand', 'seventy', 'seven', '.', 'three', 'hundred', 'seventy', 'eight', '.', 'three', 'hundred', 'seventy', 'nine', '.']

In [None]:
torch.stack()