In [6]:
import fastbook
fastbook.setup_book()

In [8]:
from fastai.text import *
from fastai.callback import *
from fastai.basics import *
from utils import *

In [9]:
# Resolve the HUMAN_NUMBERS dataset location with `untar_data` and list its contents.
path = untar_data(URLs.HUMAN_NUMBERS)
Path.BASE_PATH=path                # Show paths relative to the dataset root, so only the file name is displayed
path.ls()
# Not used: get_text_files cannot be applied here because the data is not split
# into separate text files; the whole text of each file is read directly instead.
#file=get_text_files(path/'train.txt')
#file

[Path('train.txt'), Path('valid.txt')]

In [10]:
# Collect every line of the training file, then of the validation file, into one list.
lines = L()
for fname in ('train.txt', 'valid.txt'):
    with open(path/fname) as f: lines += L(*f.readlines())
lines

(#9998) ['one \n','two \n','three \n','four \n','five \n','six \n','seven \n','eight \n','nine \n','ten \n'...]

In [11]:
# Strip the surrounding whitespace (including the trailing '\n') from every line,
# e.g. lines[0].strip(), then join the numbers with ' . ' as the separator.
text = ' . '.join(map(str.strip, lines))
text[:100]

'one . two . three . four . five . six . seven . eight . nine . ten . eleven . twelve . thirteen . fo'

In [12]:
# Discarded alternative, kept for reference: joining the raw lines without
# l.strip() leaves an extra space at the end of each token.
#text = '.' .join(l for l in lines)
#text[:30]
#text = text.strip(' ')
#tokens = text.split('\n.')
#tokens[0:3]

In [13]:
# Tokenise by splitting on single spaces; because `text` was joined with ' . ',
# the periods come out as tokens of their own.
tokens = text.split(' ')
tokens[:10]

['one', '.', 'two', '.', 'three', '.', 'four', '.', 'five', '.']

In [14]:
# Vocabulary: the distinct tokens of the corpus, as returned by L.unique().
vocab = L(*tokens).unique()
vocab

(#30) ['one','.','two','three','four','five','six','seven','eight','nine'...]

In [15]:
# Lookup table from each vocabulary word to its position in `vocab`.
word2idx = {word: idx for idx, word in enumerate(vocab)}
# Numericalise the corpus; every token comes from `vocab`, so each lookup succeeds.
nums = L(word2idx[tok] for tok in tokens)
nums

(#63095) [0,1,2,1,3,1,4,1,5,1...]

In [16]:
# Preview the samples as words: three consecutive tokens are the input and the
# token right after them is the target; windows start every 3 tokens.
L((tokens[start:start+3], tokens[start+3])
  for start in range(0, len(tokens)-4, 3))

(#21031) [(['one', '.', 'two'], '.'),(['.', 'three', '.'], 'four'),(['four', '.', 'five'], '.'),(['.', 'six', '.'], 'seven'),(['seven', '.', 'eight'], '.'),(['.', 'nine', '.'], 'ten'),(['ten', '.', 'eleven'], '.'),(['.', 'twelve', '.'], 'thirteen'),(['thirteen', '.', 'fourteen'], '.'),(['.', 'fifteen', '.'], 'sixteen')...]

In [18]:
# Same windows over the numericalised corpus: a tensor of three token ids as the
# input and the id of the following token as the target.
seqs = L((tensor(nums[start:start+3]), nums[start+3])
         for start in range(0, len(nums)-4, 3))
seqs

(#21031) [(tensor([0, 1, 2]), 1),(tensor([1, 3, 1]), 4),(tensor([4, 1, 5]), 1),(tensor([1, 6, 1]), 7),(tensor([7, 1, 8]), 1),(tensor([1, 9, 1]), 10),(tensor([10,  1, 11]), 1),(tensor([ 1, 12,  1]), 13),(tensor([13,  1, 14]), 1),(tensor([ 1, 15,  1]), 16)...]

In [19]:
bs = 64
# Split in order: the first 80% of the samples train, the remaining 20% validate.
cut = int(len(seqs) * 0.8)
# Pass `bs` itself rather than repeating the literal, so the batch size is set in one place.
dls = DataLoaders.from_dsets(seqs[:cut], seqs[cut:], bs=bs, shuffle=False)

In [20]:
class LMModel1(Module):
    "Three-token language model with the recurrence written out by hand, one step per input token."
    def __init__(self, vocab_sz, n_hidden):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)   # input  -> hidden (token embedding)
        self.h_h = nn.Linear(n_hidden, n_hidden)      # hidden -> hidden, reused at every step
        self.h_o = nn.Linear(n_hidden, vocab_sz)      # hidden -> output, one score per vocab entry

    def forward(self, x):
        "Return the `h_o` scores computed from columns 0, 1 and 2 of `x` (one column per input token)."
        first, second, third = (self.i_h(x[:, pos]) for pos in range(3))
        h = F.relu(self.h_h(first))
        h = F.relu(self.h_h(h + second))
        h = F.relu(self.h_h(h + third))
        return self.h_o(h)

In [21]:
# Train LMModel1 with 64 hidden units: cross-entropy loss, accuracy as the metric,
# 4 epochs of fit_one_cycle with 1e-3 as the learning-rate argument.
learn = Learner(dls, LMModel1(len(vocab), 64), loss_func=F.cross_entropy, 
                metrics=accuracy)
learn.fit_one_cycle(4, 1e-3)                                     

epoch,train_loss,valid_loss,accuracy,time
0,1.824297,1.970941,0.467554,00:03
1,1.386973,1.823242,0.467554,00:02
2,1.417556,1.654497,0.494414,00:02
3,1.37644,1.650849,0.494414,00:02


In [22]:
# Baseline: find the most frequent target token in the validation set and the
# fraction of targets it accounts for (the accuracy of always predicting it).
n,counts = 0,torch.zeros(len(vocab))
for x,y in dls.valid:
    n+=y.shape[0]
    # Tally how many targets in this batch equal each vocab index.
    for i in range_of(vocab): counts[i]+= (y==i).long().sum()
idx=torch.argmax(counts)
idx,vocab[idx.item()],counts[idx].item()/n

(tensor(29), 'thousand', 0.15165200855716662)

In [23]:
class LMModel2(Module):
    "Three-token language model with the same layers as LMModel1, the per-token steps folded into a loop."
    def __init__(self, vocab_sz,n_hidden):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)   # input  -> hidden (token embedding)
        self.h_h = nn.Linear(n_hidden, n_hidden)      # hidden -> hidden, reused at every step
        self.h_o = nn.Linear(n_hidden, vocab_sz)      # hidden -> output, one score per vocab entry

    def forward(self,x):
        "Fold columns 0..2 of `x` into the hidden state one at a time, then return the `h_o` scores."
        h = 0   # the first step adds the embedding to 0, i.e. starts from the embedding alone
        for pos in range(3):
            h = F.relu(self.h_h(h + self.i_h(x[:, pos])))
        return self.h_o(h)

In [24]:
# Train LMModel2 with the same settings used for LMModel1 (64 hidden units, 4 epochs, 1e-3).
learn = Learner(dls, LMModel2(len(vocab),64), loss_func=F.cross_entropy, metrics=accuracy)
learn.fit_one_cycle(4,1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,1.816274,1.964143,0.460185,00:02
1,1.423805,1.739964,0.473259,00:02
2,1.430327,1.685172,0.485382,00:02
3,1.38839,1.657033,0.470406,00:02


In [25]:
class LMModel3(Module):
    "Three-token recurrent model whose hidden state `self.h` is carried over between calls to `forward`."
    def __init__(self, vocab_sz, n_hidden):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.h_h = nn.Linear(n_hidden, n_hidden)
        self.h_o = nn.Linear(n_hidden, vocab_sz)
        self.h = 0   # carried-over hidden state; 0 until the first forward pass or after `reset`

    def forward(self, x):
        "Continue from the stored state over columns 0..2 of `x`; return the `h_o` scores."
        state = self.h
        for pos in range(3):
            state = F.relu(self.h_h(state + self.i_h(x[:, pos])))
        scores = self.h_o(state)
        # Keep the values for the next call but cut the autograd history.
        self.h = state.detach()
        return scores

    def reset(self):
        "Forget the carried-over hidden state."
        self.h = 0

In [26]:
# Number of whole batches of size `bs` that fit in `seqs`.
m= len(seqs)//bs
m, bs, len(seqs)

(328, 64, 21031)

In [27]:
def group_chunks(ds,bs):
    """Reorder `ds` so that position `j` of consecutive batches walks through one contiguous stretch of `ds`.

    With `m = len(ds)//bs`, the result lists `ds[i + m*j]` for `j` in `0..bs-1`,
    for each `i` in `0..m-1`: batch `i` holds item `i` of each of the `bs`
    stretches of length `m`. Items beyond `m*bs` are left out.
    """
    n_batches = len(ds)//bs
    return L(ds[batch + n_batches*row]
             for batch in range(n_batches)
             for row in range(bs))

In [28]:
# Same 80/20 split, but each part is reordered with group_chunks, and any
# incomplete final batch is dropped (drop_last=True).
cut = int(len(seqs)*0.8)
dls = DataLoaders.from_dsets(
    group_chunks(seqs[:cut], bs),
    group_chunks(seqs[cut:], bs),
    bs=bs, drop_last=True, shuffle=False)

In [29]:
# Train the stateful LMModel3 for 10 epochs; the ModelResetter callback is attached
# because the model keeps its hidden state between batches and exposes `reset()`.
learn = Learner(dls, LMModel3(len(vocab),64), loss_func=F.cross_entropy, metrics=accuracy, cbs=ModelResetter)
learn.fit_one_cycle(10, 3e-3)

epoch,train_loss,valid_loss,accuracy,time
0,1.677074,1.827367,0.467548,00:02
1,1.282722,1.870913,0.388942,00:02
2,1.090705,1.651794,0.4625,00:02
3,1.005216,1.61599,0.515144,00:02
4,0.963933,1.532558,0.541106,00:02
5,0.91149,1.649536,0.536779,00:02
6,0.905956,1.540368,0.562981,00:02
7,0.854337,1.645498,0.572356,00:02
8,0.815532,1.685226,0.575481,00:02
9,0.804688,1.682077,0.576442,00:02


In [30]:
sl = 16   # number of tokens per sample
# Each sample pairs `sl` consecutive token ids with the same window shifted one
# position to the right, so every input position has a target (compare with the
# 3-token `seqs` built earlier, which had a single target per sample).
seqs = L((tensor(nums[start:start+sl]), tensor(nums[start+1:start+sl+1]))
         for start in range(0, len(nums)-sl-1, sl))
cut = int(len(seqs) * 0.8)
dls = DataLoaders.from_dsets(
    group_chunks(seqs[:cut], bs), group_chunks(seqs[cut:], bs),
    bs=bs, drop_last=True, shuffle=False)

In [31]:
# Decode the first sample back into words: the input window first, then the target window.
[L(vocab[idx] for idx in part) for part in seqs[0]]

[(#16) ['one','.','two','.','three','.','four','.','five','.'...],
 (#16) ['.','two','.','three','.','four','.','five','.','six'...]]

In [32]:
class LMModel4(Module):
    "Stateful recurrent model that emits a prediction after every input token, not just the last one."
    def __init__(self, vocab_sz, n_hidden):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)   # input  -> hidden (token embedding)
        self.h_h = nn.Linear(n_hidden, n_hidden)      # hidden -> hidden, reused at every step
        self.h_o = nn.Linear(n_hidden,vocab_sz)       # hidden -> output, one score per vocab entry
        self.h = 0                                    # carried-over hidden state

    def forward(self, x):
        "Return the `h_o` scores for every column of `x`, stacked along dim 1."
        outs = []
        # Take the number of steps from the input itself rather than from the
        # notebook-level `sl`, so the model always covers the whole batch.
        for i in range(x.shape[1]):
            self.h = self.h + self.i_h(x[:,i])
            self.h = F.relu(self.h_h(self.h))
            outs.append(self.h_o(self.h))
        # Keep the state values for the next batch but cut the autograd history.
        self.h = self.h.detach()
        return torch.stack(outs, dim=1)

    def reset(self):
        "Forget the carried-over hidden state."
        self.h = 0

In [33]:
def loss_func(inp, targ):
    """Cross-entropy over every position of a batch of sequences.

    `inp` is flattened to rows of `inp.shape[-1]` scores and `targ` to a flat
    vector of class indices before calling `F.cross_entropy`. The class count
    is read from `inp` itself instead of the notebook-level `vocab`.
    """
    return F.cross_entropy(inp.view(-1, inp.shape[-1]), targ.view(-1))

In [34]:
# Train LMModel4 with the flattening `loss_func`, since the model returns scores for every position.
# NOTE(review): 15 epochs are requested here, but the output recorded below only
# lists epochs 0-9 — re-run the cell to refresh it.
learn = Learner(dls, LMModel4(len(vocab),64), loss_func=loss_func, metrics=accuracy, cbs=ModelResetter)
learn.fit_one_cycle(15, 3e-3)

epoch,train_loss,valid_loss,accuracy,time
0,3.285931,3.072032,0.212565,00:01
1,2.330371,1.969522,0.425781,00:01
2,1.742317,1.841378,0.441488,00:01
3,1.47012,1.810856,0.494303,00:01
4,1.296829,1.827492,0.498942,00:01
5,1.177568,1.773365,0.491211,00:01
6,1.070489,1.761416,0.54012,00:01
7,0.979859,1.643734,0.558919,00:01
8,0.891755,1.675408,0.56901,00:01
9,0.830193,1.694036,0.565837,00:01


In [35]:
class LMModel5(Module):
    "Stateful language model built on a multi-layer `nn.RNN`."
    def __init__(self, vocab_sz, n_hidden, n_layers):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.rnn = nn.RNN(n_hidden, n_hidden, n_layers, batch_first=True)
        self.h_o = nn.Linear(n_hidden, vocab_sz)
        # Carried-over hidden state, sized with the notebook-level batch size `bs`.
        self.h = torch.zeros(n_layers, bs, n_hidden)
    
    def forward(self,x):
        "Run the RNN over `x` from the stored hidden state and return `h_o` applied to its outputs."
        # `self.h` is a plain attribute, not a parameter or buffer, so it does not
        # move with the model; bring it to the input's device (a no-op if already there).
        res, h = self.rnn(self.i_h(x), self.h.to(x.device))
        # Keep the state values for the next batch but cut the autograd history.
        self.h = h.detach()
        return self.h_o(res)
    
    def reset(self):
        "Zero the carried-over hidden state in place."
        self.h.zero_()

In [36]:
# Train the 2-layer nn.RNN model (LMModel5) with 64 hidden units.
# NOTE(review): 15 epochs are requested here, but the output recorded below only
# lists epochs 0-9 — re-run the cell to refresh it.
learn = Learner(dls, LMModel5(len(vocab),64,2), loss_func=CrossEntropyLossFlat(), metrics=accuracy, cbs=ModelResetter)
learn.fit_one_cycle(15, 3e-3)

epoch,train_loss,valid_loss,accuracy,time
0,3.04179,2.548714,0.455811,00:01
1,2.128514,1.708763,0.471029,00:01
2,1.699163,1.86605,0.340576,00:01
3,1.499681,1.738478,0.471517,00:01
4,1.33909,1.729538,0.494792,00:01
5,1.206317,1.835859,0.502848,00:01
6,1.08824,1.845548,0.520101,00:01
7,0.982787,1.856244,0.522624,00:01
8,0.890791,1.940332,0.525716,00:01
9,0.809585,2.028805,0.529785,00:01


In [37]:
class LSTMCell(Module):
    "One LSTM step written out with a separate linear layer per gate."
    def __init__(self,ni,nh):
        # Every gate reads the previous hidden state concatenated with the input,
        # hence `ni+nh` input features.
        self.forget_gate = nn.Linear(ni+nh, nh)
        self.input_gate = nn.Linear(ni+nh, nh)
        self.cell_gate = nn.Linear(ni+nh, nh)
        self.output_gate = nn.Linear(ni+nh, nh)
        
    def forward(self, input, state):
        """Advance the cell by one step.

        `input` carries `ni` features along dim 1; `state` is the pair `(h, c)`
        of hidden and cell state, each with `nh` features along dim 1.
        Returns `(h, (h, c))` with the updated states.
        """
        h, c = state
        # Concatenate along the feature dim to get the `ni+nh` features the gates expect.
        h = torch.cat([h, input], dim=1)
        forget = torch.sigmoid(self.forget_gate(h))
        c= c*forget                                # drop part of the old cell state
        inp = torch.sigmoid(self.input_gate(h))
        cell = torch.tanh(self.cell_gate(h))
        c = c+inp*cell                             # add the gated candidate values
        out = torch.sigmoid(self.output_gate(h))
        h=out*torch.tanh(c)
        return h, (h,c)   

In [38]:
class LSTMCell(Module):
    "LSTM step with the four gates fused into two linear layers (replaces the per-gate LSTMCell defined earlier)."
    def __init__(self, ni, nh):
        self.ih=nn.Linear(ni,4*nh)   # input  -> pre-activations of all four gates
        self.hh=nn.Linear(nh,4*nh)   # hidden -> pre-activations of all four gates

    def forward(self, input, state):
        "Advance one step from `state = (h, c)`; returns `(h, (h, c))` with the updated states."
        h_prev, c_prev = state
        fused = self.ih(input) + self.hh(h_prev)
        # Split the fused pre-activations along dim 1: input, forget, output, cell.
        in_raw, forget_raw, out_raw, cell_raw = fused.chunk(4, 1)
        ingate = torch.sigmoid(in_raw)
        forgetgate = torch.sigmoid(forget_raw)
        outgate = torch.sigmoid(out_raw)
        cellgate = cell_raw.tanh()

        c_next = (c_prev*forgetgate) + (cellgate*ingate)
        h_next = outgate*c_next.tanh()
        return h_next, (h_next, c_next)

In [39]:
# Demo of Tensor.chunk: ten elements split into four chunks gives sizes 3, 3, 3 and 1.
t = torch.arange(10)
t.chunk(4)

(tensor([0, 1, 2]), tensor([3, 4, 5]), tensor([6, 7, 8]), tensor([9]))

In [40]:
class LMModel6(Module):
    "Stateful language model built on a multi-layer `nn.LSTM`."
    def __init__(self, vocab_sz, n_hidden, n_layers):
        self.i_h = nn.Embedding(vocab_sz,n_hidden)
        self.rnn = nn.LSTM(n_hidden, n_hidden, n_layers, batch_first=True)
        self.h_o = nn.Linear(n_hidden, vocab_sz)
        # Two carried-over state tensors (hidden and cell), sized with the notebook-level `bs`.
        self.h = [torch.zeros(n_layers, bs, n_hidden) for _ in range(2)]
        
    def forward(self,x):
        "Run the LSTM over `x` from the stored state and return `h_o` applied to its outputs."
        # The state tensors are plain attributes, not parameters or buffers, so they
        # do not move with the model; bring them to the input's device (no-op if already there).
        state = [h_.to(x.device) for h_ in self.h]
        res,h = self.rnn(self.i_h(x), state)
        # Keep the state values for the next batch but cut the autograd history.
        self.h = [h_.detach() for h_ in h]
        return self.h_o(res)
    
    def reset(self):
        "Zero both carried-over state tensors in place."
        for h in self.h: h.zero_()

In [41]:
# Train the 2-layer nn.LSTM model (LMModel6) with 64 hidden units.
# NOTE(review): 15 epochs are requested here, but the output recorded below only
# lists epochs 0-9 — re-run the cell to refresh it.
learn = Learner(dls, LMModel6(len(vocab),64,2), loss_func=CrossEntropyLossFlat(), metrics=accuracy, cbs=ModelResetter )
learn.fit_one_cycle(15, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,3.026113,2.772102,0.153076,00:02
1,2.216184,2.089064,0.269124,00:01
2,1.613939,1.826081,0.47876,00:01
3,1.315253,2.050628,0.503011,00:01
4,1.085567,1.965356,0.585531,00:01
5,0.856308,1.85614,0.640544,00:01
6,0.622123,2.036759,0.681071,00:02
7,0.427603,1.90388,0.7264,00:01
8,0.273226,1.749582,0.742269,00:01
9,0.167057,1.670335,0.766439,00:01


In [42]:
class Dropout(Module):
    "Dropout layer: during training, zero each element with probability `p` and rescale the rest by 1/(1-p)."
    def __init__(self,p): self.p=p
    def forward(self,x):
        # Outside training the input is passed through untouched.
        if not self.training: return x
        # Use the probability stored on the instance (the bare name `p` is not defined here).
        # The mask holds 1 with probability 1-p and 0 otherwise.
        mask = x.new(*x.shape).bernoulli_(1-self.p)
        # Rescale the kept elements by 1/(1-p).
        return x*mask.div_(1-self.p)        

In [43]:
class LMModel7(Module):
    "Stateful LSTM language model with dropout on the LSTM output and output weights shared with the embedding."
    def __init__(self, vocab_sz, n_hidden, n_layers, p):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.rnn = nn.LSTM(n_hidden, n_hidden, n_layers, batch_first=True)
        self.drop = nn.Dropout(p)
        self.h_o = nn.Linear(n_hidden, vocab_sz)
        # Weight tying: the output layer reuses the embedding matrix as its weight.
        self.h_o.weight = self.i_h.weight
        # Two carried-over state tensors (hidden and cell), sized with the notebook-level `bs`.
        self.h = [torch.zeros(n_layers, bs, n_hidden) for _ in range(2)]
        
    def forward(self, x):
        "Return `(scores, raw, out)`: `h_o` scores plus the LSTM output before and after dropout."
        # The state tensors are plain attributes, not parameters or buffers, so they
        # do not move with the model; bring them to the input's device (no-op if already there).
        state = [h_.to(x.device) for h_ in self.h]
        raw,h = self.rnn(self.i_h(x), state)
        out = self.drop(raw)
        # Keep the state values for the next batch but cut the autograd history.
        self.h = [h_.detach() for h_ in h]
        return self.h_o(out),raw,out
    
    def reset(self): 
        "Zero both carried-over state tensors in place."
        for h in self.h: h.zero_()

In [47]:
# Build the learner for LMModel7 (64 hidden units, 2 layers, dropout p=0.5).
# LMModel7.forward returns (scores, raw, out); RNNRegularizer(alpha=2, beta=1) is
# attached alongside ModelResetter — presumably it consumes the two extra outputs
# so that the loss only sees the scores; verify against the fastai callback docs.
learn = Learner(dls, LMModel7(len(vocab), 64, 2, 0.5),
                loss_func=CrossEntropyLossFlat(), metrics=accuracy,
                cbs=[ModelResetter, RNNRegularizer(alpha=2, beta=1)])

In [48]:
#learn = TextLearner(dls, LMModel7(len(vocab), 64, 2, 0.4), loss_func=CrossEntropyLossFlat(), metrics=accuracy)

In [49]:
# Train with weight decay 0.1.
# NOTE(review): 15 epochs are requested here, but the output recorded below only
# lists epochs 0-9 — re-run the cell to refresh it.
learn.fit_one_cycle(15, 1e-2, wd=0.1)

epoch,train_loss,valid_loss,accuracy,time
0,2.685102,2.207286,0.443278,00:02
1,1.905883,1.712825,0.542887,00:01
2,1.201828,0.953411,0.713053,00:02
3,0.699839,0.615397,0.795736,00:02
4,0.43525,0.616063,0.80542,00:02
5,0.301749,0.529414,0.839844,00:02
6,0.231002,0.48729,0.850505,00:02
7,0.192021,0.50561,0.846761,00:01
8,0.166744,0.535498,0.838135,00:01
9,0.148056,0.483445,0.856608,00:02
