In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai.learner import *

import torchtext
from torchtext import vocab, data
from torchtext.datasets import language_modeling

from fastai.rnn_reg import *
from fastai.rnn_train import *
from fastai.nlp import *
from fastai.lm_rnn import *

import dill as pickle
import spacy

In [2]:
# Root of the extracted IMDB dataset. The trailing slash matters: the paths
# below are built by plain string concatenation.
PATH = 'data/aclImdb/'

# Corpus sub-directories (relative to PATH) and their full paths.
TRN_PATH, VAL_PATH = 'train/all/', 'test/all/'
TRN = PATH + TRN_PATH
VAL = PATH + VAL_PATH

# List the contents of the dataset root directory.
%ls {PATH}

imdbEr.txt  imdb.vocab  models/  README  test/  TEXT.pkl  tmp/  train/


In [33]:
# Capture the shell listing of the training directory as a list of file names,
# then preview the first ten.
trn_files = !ls {TRN}
trn_files[:10]

['0_0.txt',
 '0_3.txt',
 '0_9.txt',
 '10000_0.txt',
 '10000_4.txt',
 '10000_8.txt',
 '1000_0.txt',
 '10001_0.txt',
 '10001_10.txt',
 '10001_4.txt']

In [34]:
# Read one training review through the shell; the captured output is a list of
# lines, so the first element is displayed.
review = !cat {TRN}{trn_files[6]}
review[0]

"I have to say when a name like Zombiegeddon and an atom bomb on the front cover I was expecting a flat out chop-socky fung-ku, but what I got instead was a comedy. So, it wasn't quite was I was expecting, but I really liked it anyway! The best scene ever was the main cop dude pulling those kids over and pulling a Bad Lieutenant on them!! I was laughing my ass off. I mean, the cops were just so bad! And when I say bad, I mean The Shield Vic Macky bad. But unlike that show I was laughing when they shot people and smoked dope.<br /><br />Felissa Rose...man, oh man. What can you say about that hottie. She was great and put those other actresses to shame. She should work more often!!!!! I also really liked the fight scene outside of the building. That was done really well. Lots of fighting and people getting their heads banged up. FUN! Last, but not least Joe Estevez and William Smith were great as the...well, I wasn't sure what they were, but they seemed to be having fun and throwing out 

In [35]:
# Count the whitespace-separated words (`wc -w`) across all training .txt files.
!find {TRN} -name '*.txt' | xargs cat | wc -w

17486581


In [36]:
# Same word count, this time over the validation .txt files.
!find {VAL} -name '*.txt' | xargs cat | wc -w

5686719


Tokenize text

In [37]:
# Load spaCy's English pipeline; it is called on raw strings below to tokenize them.
spacy_tok = spacy.load('en')

In [38]:
# Iterating over the spaCy result yields tokens (not sentences); re-join them
# with single spaces to make the token boundaries visible.
' '.join(tok.string.strip() for tok in spacy_tok(review[0]))

"I have to say when a name like Zombiegeddon and an atom bomb on the front cover I was expecting a flat out chop - socky fung - ku , but what I got instead was a comedy . So , it was n't quite was I was expecting , but I really liked it anyway ! The best scene ever was the main cop dude pulling those kids over and pulling a Bad Lieutenant on them ! ! I was laughing my ass off . I mean , the cops were just so bad ! And when I say bad , I mean The Shield Vic Macky bad . But unlike that show I was laughing when they shot people and smoked dope.<br /><br />Felissa Rose ... man , oh man . What can you say about that hottie . She was great and put those other actresses to shame . She should work more often ! ! ! ! ! I also really liked the fight scene outside of the building . That was done really well . Lots of fighting and people getting their heads banged up . FUN ! Last , but not least Joe Estevez and William Smith were great as the ... well , I was n't sure what they were , but they see

In [7]:
# torchtext field describing how raw text is preprocessed: lowercased
# (lower=True) and tokenized with spaCy (tokenize="spacy").
TEXT = data.Field(lower=True, tokenize="spacy")

In [8]:
# Batch size used by the data loaders and restored after single-sequence inference.
bs = 64
# Back-propagation-through-time length passed to the model data objects.
bptt = 70

In [14]:
# Map each split name to its directory (relative to PATH). The validation
# directory is also used as the test split here.
FILES = dict(train=TRN_PATH, validation=VAL_PATH, test=VAL_PATH)
# `**FILES` unpacks the dict into keyword arguments, i.e. this call receives
# train=TRN_PATH, validation=VAL_PATH, test=VAL_PATH alongside bs/bptt/min_freq.
# NOTE(review): min_freq=10 presumably drops tokens seen fewer than 10 times
# from the vocabulary - confirm against the fastai/torchtext docs.
md = LanguageModelData.from_text_files(PATH, TEXT, **FILES, bs=bs, bptt=bptt, min_freq=10)

In [13]:
# List the dataset root again to see which files and directories now exist.
!ls {PATH}

imdbEr.txt  imdb.vocab	models	README	test  TEXT.pkl	tmp  train


In [12]:
# Persist the fitted TEXT field (including its vocabulary) so it can be reloaded
# later without rebuilding it. `pickle` is dill here (see the imports cell).
# The `with` block guarantees the file is flushed and closed even if dumping fails.
with open(f'{PATH}/TEXT.pkl', 'wb') as text_file:
    pickle.dump(TEXT, text_file)

In [None]:
#pickle.load()??

In [13]:
# In order: number of batches; number of unique tokens in the vocab; number of
# items in the training dataset (the whole corpus is a single item, hence 1);
# number of tokens in that single item's text.
len(md.trn_dl), md.nt, len(md.trn_ds), len(md.trn_ds[0].text)

(4583, 37392, 1, 20540756)

In [14]:
# 'stoi': 'string-to-int' - look up the integer id of a token.
# (The reverse mapping, 'itos' / 'int-to-string', is used further down.)
TEXT.vocab.stoi['the']

2

In [15]:
# First 12 tokens of the (single) training item, after lowercasing and tokenizing.
md.trn_ds[0].text[:12]

['at',
 'first',
 ',',
 'i',
 'thought',
 'this',
 'was',
 'a',
 'sequel',
 'to',
 'entre',
 'nous']

In [16]:
# Convert those 12 tokens to their vocabulary ids. In the output below the last
# token ('nous') comes back as id 0.
# NOTE(review): id 0 is presumably the out-of-vocabulary '<unk>' entry - confirm.
TEXT.numericalize([md.trn_ds[0].text[:12]])

Variable containing:
    40
   102
     3
    12
   213
    13
    19
     6
   701
     8
 36172
     0
[torch.cuda.LongTensor of size 12x1 (GPU 0)]

In [17]:
# Pull one batch from the training loader: a pair of (inputs, flattened targets).
# In the output below the target vector starts with the second row of the inputs,
# i.e. the targets are the inputs shifted by one token.
next(iter(md.trn_dl))

(Variable containing:
     40     20     11  ...      20     11   2519
    102      6     16  ...    9324     27      4
      3   8852     31  ...      20      2      8
         ...            ⋱           ...         
    101     76     27  ...    3859     20      0
      7  13402    108  ...      68     22     18
     13     18   2026  ...      23     74  13003
 [torch.cuda.LongTensor of size 72x64 (GPU 0)], Variable containing:
    102
      6
     16
   ⋮   
      3
    125
   2439
 [torch.cuda.LongTensor of size 4608 (GPU 0)])

## Train

In [17]:
em_sz = 200  # size of each embedding vector
nh = 500     # number of hidden activations per layer
nl = 3       # number of layers

In [18]:
# Create a version of Adam with less momentum than the default 0.9
# (the first beta is lowered to 0.7).
opt_fn = partial(optim.Adam, betas=(0.7, 0.99))

fastai uses a variant of the state-of-the-art AWD-LSTM language model developed by Stephen Merity. A key feature of this model is that it provides excellent regularization through dropout. There is no simple way known (yet!) to find the best values of the dropout parameters below - you just have to experiment...

However, the other parameters (alpha, beta, and clip) shouldn't generally need tuning.

In [19]:
# Build the language-model learner. All dropout rates are kept low here; they
# are the parameters that need experimentation.
learner = md.get_model(
    opt_fn, em_sz, nh, nl,
    dropouti=0.05, dropout=0.05, wdrop=0.1, dropoute=0.02, dropouth=0.05,
)
# Regularisation function and gradient-clipping threshold; these generally
# do not need tuning.
learner.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)
learner.clip = 0.3

As you can see below, I gradually tuned the language model in a few stages. I possibly could have trained it further (it wasn't yet overfitting), but I didn't have time to experiment more. Maybe you can see if you can train it to a better accuracy! (I used lr_find to find a good learning rate, but didn't save the output in this notebook. Feel free to try running it yourself now.)

In [21]:
# Four restart cycles whose length doubles each time (1 + 2 + 4 + 8 = 15 epochs).
learner.fit(3e-3, 4, wds=1e-6, cycle_len=1, cycle_mult=2)

HBox(children=(IntProgress(value=0, description='Epoch', max=15), HTML(value='')))

epoch      trn_loss   val_loss                                
    0      4.842665   4.716607  
    1      4.65098    4.519583                                
    2      4.546429   4.436825                                
    3      4.602143   4.462685                                
    4      4.515887   4.389134                                
    5      4.432896   4.335367                                
    6      4.409452   4.318304                                
    7      4.541228   4.410428                                
    8      4.508051   4.382419                                
    9      4.465801   4.351516                                
    10     4.427629   4.321759                                
    11     4.388611   4.292726                                
    12     4.347843   4.268147                                
    13     4.348232   4.255565                                
    14     4.319096   4.251805                                



[array([4.2518])]

In [22]:
# Checkpoint the encoder part of the language model under the name 'adam1_enc'.
learner.save_encoder('adam1_enc')

In [20]:
# Restore the encoder checkpoint saved above (e.g. to resume in a fresh session).
learner.load_encoder('adam1_enc')

In [21]:
# One further cycle of 10 epochs at the same learning rate and weight decay.
learner.fit(3e-3, 1, wds=1e-6, cycle_len=10)

HBox(children=(IntProgress(value=0, description='Epoch', max=10), HTML(value='')))

epoch      trn_loss   val_loss                                
    0      4.473084   4.366554  
    1      4.473076   4.352812                                
    2      4.443891   4.335332                                
 54%|█████▍    | 2497/4583 [11:11<09:21,  3.72it/s, loss=4.42]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



    4      4.391959   4.290392                                
    5      4.357839   4.267294                                
    6      4.341933   4.248016                                
    7      4.295313   4.233097                                
    8      4.28796    4.225477                                
    9      4.293096   4.223716                                



[array([4.22372])]

In [22]:
# Checkpoint the further-trained encoder; the classifier below loads it by this name.
learner.save_encoder('adam3_10_enc')

In [23]:
# Restore the 'adam3_10_enc' encoder checkpoint (e.g. to resume in a fresh session).
learner.load_encoder('adam3_10_enc')

Language modeling accuracy is generally measured using the metric perplexity, which is simply `exp()` of the loss function we used.

In [25]:
# 4.223 is the final validation loss reported by the training run above
# (4.223716), so this is the validation perplexity.
math.exp(4.223)

68.23789119934766

In [None]:
# Save a copy of the TEXT field next to the model checkpoints.
# The `with` block guarantees the file is flushed and closed even if dumping fails.
with open(f'{PATH}models/TEXT.pkl', 'wb') as text_file:
    pickle.dump(TEXT, text_file)

## Test

In [54]:
# Underlying PyTorch model of the language-model learner.
m = learner.model
# Short prompt (a fragment of the review shown earlier) to continue.
ss = """. So, it wasn't quite was I was expecting, but I really liked it anyway! The best"""
# Tokenize the prompt with spaCy and wrap it in a list: a "batch" of one sequence.
s = [spacy_tok(ss)]
# Convert the tokens to vocabulary ids.
t = TEXT.numericalize(s)

In [55]:
# Display the prompt as spaCy tokenized it (the elements of s[0] are tokens).
' '.join(tok.string.strip() for tok in s[0])

". So , it was n't quite was I was expecting , but I really liked it anyway ! The best"

In [56]:
# Run the prompt through the model as a single sequence.
# Set batch size to 1 for the duration of the forward pass.
m[0].bs = 1
try:
    # Turn off dropout
    m.eval()
    # Reset hidden state
    m.reset()
    # Get predictions from model (only the first returned value is needed)
    res, *_ = m(t)
finally:
    # Put the batch size back to what it was, even if the forward pass raised,
    # so later training cells are not silently left with a batch size of 1.
    m[0].bs = bs

Let's see what the top 10 predictions were for the next word after our short text:

In [57]:
# res[-1] holds the predictions for the position after the last prompt token.
# topk returns (values, indices); keep the indices of the 10 highest scores.
nexts = torch.topk(res[-1], 10)[1]
# Map each index back to its token string via the int-to-string table.
[TEXT.vocab.itos[o] for o in to_np(nexts)]

['<unk>', '"', ')', ',', '.', '-', '!', '(', 'de', "'"]

In [58]:
# Greedily generate 50 tokens that continue the prompt.
print(ss,"\n")
for i in range(50):
    # Indices of the two highest-scoring candidates for the next token.
    n=res[-1].topk(2)[1]
    # If the best candidate is index 0, fall back to the runner-up.
    # NOTE(review): index 0 is presumably the '<unk>' token - confirm.
    n = n[1] if n.data[0]==0 else n[0]
    print(TEXT.vocab.itos[n.data[0]], end=' ')
    # Feed the chosen token back in; the model keeps its hidden state between calls.
    res,*_ = m(n[0].unsqueeze(0))
print('...')

. So, it wasn't quite was I was expecting, but I really liked it anyway! The best 

" the man who was the best " . the movie is a bit of a mess , but it 's a good movie . <eos> i saw this movie at the toronto film festival and i was very impressed . i was very impressed with the acting and the ...


In [59]:
# Reload the TEXT field (with its vocabulary) that was pickled earlier in this notebook.
# NOTE(review): unpickling executes arbitrary code - only load files you created yourself.
# The `with` block closes the file handle once loading is done.
with open(f'{PATH}/TEXT.pkl', 'rb') as text_file:
    TEXT = pickle.load(text_file)

`sequential=False` tells torchtext that a field should *not* be tokenized (in this case, we just want to store the 'positive' or 'negative' single label).

splits is a torchtext method that creates train, test, and validation sets. The IMDB dataset is built into torchtext, so we can take advantage of that. Take a look at lang_model-arxiv.ipynb to see how to define your own fastai/torchtext datasets.

In [60]:
# Label field: a single non-sequential value per example.
IMDB_LABEL = data.Field(sequential=False)
# Build the IMDB splits with the pretrained TEXT field and the label field.
# The output below shows the archive being downloaded under 'data/'.
splits = torchtext.datasets.IMDB.splits(TEXT, IMDB_LABEL, 'data/')

downloading aclImdb_v1.tar.gz


In [61]:
# First example of the first split.
# Note: this rebinds `t`, which previously held the numericalized test prompt.
t = splits[0].examples[0]

In [62]:
# Show the example's label together with its first 16 tokens.
t.label, ' '.join(t.text[:16])

('pos',
 "fantastic documentary of 1924 . this early 20th century geography of today 's iraq was powerful")

fastai can create a ModelData object directly from torchtext splits.

In [63]:
# Wrap the torchtext splits in a fastai model-data object, reusing batch size `bs`.
md2 = TextData.from_splits(PATH, splits, bs)

In [64]:
# Build the sentiment classifier with the same embedding/hidden/layer sizes as
# the language model, but with much stronger dropout.
# NOTE(review): the positional 1500 and bptt arguments are passed through as in
# the original call - confirm their meaning against TextData.get_model.
m3 = md2.get_model(
    opt_fn, 1500, bptt,
    emb_sz=em_sz, n_hid=nh, n_layers=nl,
    dropout=0.1, dropouti=0.4, wdrop=0.5, dropoute=0.05, dropouth=0.3,
)
m3.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)
# Start from the encoder weights saved at the end of language-model training.
m3.load_encoder('adam3_10_enc')

Because we're fine-tuning a pretrained model, we'll use differential learning rates, and also increase the max gradient for clipping, to allow the SGDR to work better.

In [65]:
# Larger gradient-clipping threshold than the 0.3 used for the language model.
m3.clip = 25.0
# Differential learning rates: one entry per layer group, smallest first.
lrs = np.array([1e-4, 1e-4, 1e-4, 1e-3, 1e-2])

In [68]:
# Stage 1: train for one epoch at half the learning rates with the model frozen
# up to the last layer group.
# NOTE(review): freeze_to(-1) presumably leaves only the final layer group trainable - confirm.
m3.freeze_to(-1)
m3.fit(lrs/2, 1, metrics=[accuracy])
# Stage 2: unfreeze everything and train one more epoch at the full rates.
m3.unfreeze()
m3.fit(lrs, 1, metrics=[accuracy], cycle_len=1)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss   accuracy                    
    0      0.430745   0.27625    0.886217  



HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss   accuracy                    
    0      0.371038   0.254581   0.895113  



[array([0.25458]), 0.8951127031704881]

In [None]:
# Seven cycles of two epochs each (14 epochs in total).
# NOTE(review): cycle_save_name presumably checkpoints the weights at the end of
# each cycle under the 'imdb2' prefix - confirm against the fastai docs.
m3.fit(lrs, 7, metrics=[accuracy], cycle_len=2, cycle_save_name='imdb2')

HBox(children=(IntProgress(value=0, description='Epoch', max=14), HTML(value='')))

epoch      trn_loss   val_loss   accuracy                    
    0      0.367549   0.29389    0.896895  
    1      0.350346   0.263809   0.900843                    
    2      0.314626   0.237032   0.910373                    
    3      0.313352   0.226121   0.912936                    
    4      0.314341   0.208058   0.922734                    
    5      0.288357   0.214124   0.916495                    
    6      0.299939   0.204892   0.921347                    
    7      0.275409   0.209294   0.920063                    
    8      0.280992   0.202105   0.924654                    
    9      0.274338   0.196927   0.925499                    
    10     0.265389   0.200687   0.926864                    
 66%|██████▌   | 258/391 [02:07<01:05,  2.02it/s, loss=0.259]

In [None]:
# Reload the weights saved for cycle 4 of the 'imdb2' run above.
# NOTE(review): confirm whether the cycle index is 0- or 1-based.
m3.load_cycle('imdb2', 4)

In [None]:
# Compute accuracy from the model's predictions and their targets.
# NOTE(review): predict_with_targs() presumably runs on the validation set - confirm.
accuracy_np(*m3.predict_with_targs())

In [None]:
# Save the fine-tuned classifier weights under the name 'm3'.
m3.save('m3')