# ULMFit

In [None]:
# Reload imported modules automatically before each cell executes.
%load_ext autoreload
%autoreload 2

# Render matplotlib figures inside the notebook.
%matplotlib inline

In [None]:
#export
from exp.nb_12a import *

## Data

In [None]:
# Local root folder of the IMDB dataset (untar_data presumably downloads/extracts it on first use — confirm).
path = datasets.untar_data(datasets.URLs.IMDB)

In [None]:
# `ld.pkl` is expected to hold the processed IMDB data saved by an earlier notebook — TODO confirm which one.
# NOTE(review): unpickling can execute arbitrary code; only load files you produced yourself.
# The context manager closes the file handle as soon as loading finishes.
with open(path/'ld.pkl', 'rb') as f:
    ll = pickle.load(f)

In [None]:
# Batching hyper-parameters handed to lm_databunchify
# (presumably batch size and sequence length per batch — confirm in exp.nb_12a).
bs = 64
bptt = 70
data = lm_databunchify(ll, bs, bptt)

In [None]:
# Vocabulary of the IMDB data, read from the second processor attached to the training inputs
# (presumably the numericalizer — confirm against the processor list built upstream).
vocab = ll.train.x.processors[1].vocab

## Finetuning the LM

Before tackling the classification task, we have to fine-tune our language model on the IMDB corpus. Make sure you have the `pretrained.pth` and `vocab.pkl` files in your IMDB data folder.

In [None]:
# List the dataset folder to check that pretrained.pth and vocab.pkl are present.
path.ls()

[PosixPath('/home/ubuntu/.fastai/data/imdb/vocab.pkl'),
 PosixPath('/home/ubuntu/.fastai/data/imdb/test'),
 PosixPath('/home/ubuntu/.fastai/data/imdb/tmp_clas'),
 PosixPath('/home/ubuntu/.fastai/data/imdb/README'),
 PosixPath('/home/ubuntu/.fastai/data/imdb/unsup'),
 PosixPath('/home/ubuntu/.fastai/data/imdb/train'),
 PosixPath('/home/ubuntu/.fastai/data/imdb/tmp_lm'),
 PosixPath('/home/ubuntu/.fastai/data/imdb/pretrained.pth'),
 PosixPath('/home/ubuntu/.fastai/data/imdb/imdb.vocab'),
 PosixPath('/home/ubuntu/.fastai/data/imdb/ld.pkl')]

In [None]:
# Dropout probabilities in the order they are passed to get_language_model in the next cell:
# input_p, output_p, weight_p, embed_p, hidden_p. The whole set is scaled by 0.25.
dps = tensor([0.25, 0.1, 0.2, 0.02, 0.15]) * 0.25

In [None]:
# Sizes passed to get_language_model: embedding size, hidden size, number of layers
# (meaning inferred from the names; the printed model shows 300-wide embeddings and two LSTM(300, 300)).
emb_sz = 300
nh = 300
nl = 2
# The positional 0 is presumably the padding token index — confirm in get_language_model.
model = get_language_model(
    len(vocab), emb_sz, nh, nl, 0,
    input_p=dps[0],
    output_p=dps[1],
    weight_p=dps[2],
    embed_p=dps[3],
    hidden_p=dps[4],
)

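Match embeddings

The pretrained weights are indexed by the vocabulary used for pretraining, which differs from the IMDB vocab. Each embedding row and decoder bias therefore has to be moved to the index its token has in the IMDB vocab; tokens that were not in the pretraining vocab get the mean embedding and the mean bias.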

In [None]:
# Pretrained language-model weights (a dict keyed by parameter name) ...
old_wgts  = torch.load(path/'pretrained.pth')
# ... and the vocabulary those weights are indexed by.
# NOTE(review): unpickling can execute arbitrary code; only load files you trust.
# The context manager closes the file handle instead of leaking it.
with open(path/'vocab.pkl', 'rb') as f:
    old_vocab = pickle.load(f)

In [None]:
# Index of the same token in the IMDB vocab and in the pretraining vocab: the two do not line up.
vocab.index('house'),old_vocab.index('house')

(349, 230)

In [None]:
# Keep the pretrained embedding row and decoder bias of 'house' so the remapping can be verified later.
# The index is looked up rather than hard-coded, so the check stays valid with a different vocab file.
house_wgt  = old_wgts['0.encoder.weight'][old_vocab.index('house')]
house_bias = old_wgts['1.decoder.bias'][old_vocab.index('house')]

In [None]:
def match_embeds(old_wgts, old_vocab, new_vocab):
    """Re-index pretrained embedding weights and decoder bias to follow `new_vocab`.

    Args:
        old_wgts: dict of pretrained tensors; must contain '0.encoder.weight'
            (one row per old token) and '1.decoder.bias' (one entry per old token).
        old_vocab: sequence of tokens giving the row order of the pretrained tensors.
        new_vocab: sequence of tokens giving the desired row order.

    Returns:
        The same `old_wgts` dict, modified in place: '0.encoder.weight',
        '0.encoder_dp.emb.weight' and '1.decoder.weight' all point to the new
        embedding matrix and '1.decoder.bias' to the new bias vector.

    Tokens of `new_vocab` that are missing from `old_vocab` receive the mean
    pretrained embedding row and the mean pretrained bias.
    """
    wgts = old_wgts['0.encoder.weight']
    bias = old_wgts['1.decoder.bias']
    wgts_m,bias_m = wgts.mean(dim=0),bias.mean()
    new_wgts = wgts.new_zeros(len(new_vocab), wgts.size(1))
    new_bias = bias.new_zeros(len(new_vocab))
    # Token -> index table built once, so each lookup is O(1) instead of a list scan.
    # setdefault keeps the first occurrence of a repeated token, exactly like list.index.
    old_stoi = {}
    for idx,w in enumerate(old_vocab): old_stoi.setdefault(w, idx)
    for i,w in enumerate(new_vocab):
        idx = old_stoi.get(w)
        if idx is None: new_wgts[i],new_bias[i] = wgts_m,bias_m
        else:           new_wgts[i],new_bias[i] = wgts[idx],bias[idx]
    old_wgts['0.encoder.weight']    = new_wgts
    old_wgts['0.encoder_dp.emb.weight'] = new_wgts
    old_wgts['1.decoder.weight']    = new_wgts
    old_wgts['1.decoder.bias']      = new_bias
    return old_wgts

In [None]:
# Re-index the pretrained weights for the IMDB vocab. match_embeds updates `old_wgts` in place
# and returns it, so `wgts` and `old_wgts` are the same dict from here on.
wgts = match_embeds(old_wgts, old_vocab, vocab)

In [None]:
# Sanity check: after remapping, the entries at 'house's position in the new vocab must equal
# the pretrained values saved before the remapping. The position is looked up rather than
# hard-coded so the check does not depend on one particular vocab file.
assert torch.allclose(wgts['0.encoder.weight'][vocab.index('house')],house_wgt)
assert torch.allclose(wgts['1.decoder.bias'][vocab.index('house')],house_bias)

In [None]:
# Copy the re-indexed pretrained weights into the freshly built model.
model.load_state_dict(wgts)

In [None]:
# Save the weights themselves: state_dict has to be *called*. Passing the bound method
# would pickle the method (and through it the whole model object) rather than the
# name -> tensor dict that load_state_dict expects.
torch.save(model.state_dict(), path/'tmp_clas'/'init.pth')

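Split

We split the model's parameters into two groups (the LSTMs, and everything else) so that each group can be trained separately or with its own learning rate.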

In [None]:
# Display the module tree to see which sub-modules can be placed in separate parameter groups.
model

SequentialRNN(
  (0): AWD_LSTM(
    (encoder): Embedding(60003, 300, padding_idx=0)
    (encoder_dp): EmbeddingDropout(
      (emb): Embedding(60003, 300, padding_idx=0)
    )
    (rnns): ModuleList(
      (0): WeightDropout(
        (module): LSTM(300, 300, batch_first=True)
      )
      (1): WeightDropout(
        (module): LSTM(300, 300, batch_first=True)
      )
    )
    (input_dp): RNNDropout()
    (hidden_dps): ModuleList(
      (0): RNNDropout()
      (1): RNNDropout()
    )
  )
  (1): LinearDecoder(
    (decoder): Linear(in_features=300, out_features=60003, bias=True)
    (output_dp): RNNDropout()
  )
)

In [None]:
def _unique_params(modules):
    "Parameters of `modules` in order, listing each Parameter object once even when modules share it."
    seen,params = set(),[]
    for m in modules:
        for p in m.parameters():
            # Compare by identity: tied weights are the very same Parameter object.
            if id(p) in seen: continue
            seen.add(id(p))
            params.append(p)
    return params

# Group 1: parameters of the stacked LSTMs.
rnns = _unique_params(model[0].rnns)
# Group 2: embedding, embedding dropout and decoder. If these modules share (tie) their weight,
# a plain concatenation of their parameter lists would hand the same Parameter to the
# optimizer several times; the helper keeps a single entry per Parameter.
others = _unique_params([model[0].encoder, model[0].encoder_dp, model[1]])
pgs = [rnns, others]

In [None]:
# Stage 1: only the `others` parameters are handed to the optimizer; the LSTM parameters are left out.
opt = adam_opt(others, lr=1e-2, mom=0.8)

In [None]:
# Peak learning rate for stage 1.
lr = 1e-2
# Two-part schedules. [0.25,0.75] presumably gives the share of training spent in each part,
# and cos_1cycle_anneal(a, b, c) presumably moves a -> b -> c — confirm both in exp.nb_12a.
# Values passed: lr goes lr/10, lr, 0; momentum goes 0.8, 0.7, 0.8.
sched_lr  = combine_scheds([0.25,0.75], cos_1cycle_anneal(lr/10.,lr, 0))
sched_mom = combine_scheds([0.25,0.75], cos_1cycle_anneal(0.8,0.7, 0.8))

In [None]:
# Callback factories handed to the Learner. Roles are inferred from the names (implementations
# live in the imported exp notebooks): accuracy stats, device placement, recording, gradient
# clipping at 0.1, the lr/mom schedules defined above, RNN-specific training and a progress bar.
cbs = [partial(AvgStatsCallback,accuracy_flat),
       CudaCallback,
       Recorder,
       partial(GradientClipping, clip=0.1),
       partial(ParamScheduler, 'lr', sched_lr),
       partial(ParamScheduler, 'mom', sched_mom),
       partial(RNNTrainer, alpha=2., beta=1.),
       ProgressCallback]

In [None]:
# Stage-1 learner: the model, the LM data, flattened cross-entropy loss, the stage-1 optimizer and callbacks.
learn = Learner(model, data, cross_entropy_flat, opt=opt, cb_funcs=cbs)

In [None]:
# Run stage 1 (the argument is presumably the number of epochs — confirm in Learner.fit).
learn.fit(1)

In [None]:
# Stage 2: optimise both parameter groups; `pgs` is the [rnns, others] list built in the split cell.
opt = adam_opt(
    pgs,
    lr=1e-2,
    mom=0.8,
)

In [None]:
# Peak learning rate for stage 2.
lr = 1e-3
# Same schedule shapes as in stage 1, plus `sched_lr1`, which uses half the values of `sched_lr`
# (lr/20 and lr/2 instead of lr/10 and lr).
sched_lr  = combine_scheds([0.25,0.75], cos_1cycle_anneal(lr/10.,lr, 0))
sched_lr1 = combine_scheds([0.25,0.75], cos_1cycle_anneal(lr/20.,lr/2., 0))
sched_mom = combine_scheds([0.25,0.75], cos_1cycle_anneal(0.8,0.7, 0.8))

In [None]:
# Same callbacks as in stage 1, except that 'lr' now receives a list of two schedules,
# presumably one per parameter group in optimizer order (sched_lr1 for `rnns`, sched_lr for
# `others`) — confirm in ParamScheduler.
cbs = [partial(AvgStatsCallback,accuracy_flat),
       CudaCallback,
       Recorder,
       partial(GradientClipping, clip=0.1),
       partial(ParamScheduler, 'lr', [sched_lr1, sched_lr]),
       partial(ParamScheduler, 'mom', sched_mom),
       partial(RNNTrainer, alpha=2., beta=1.),
       ProgressCallback]

In [None]:
# Stage-2 learner: same model, data and loss, now with the two-group optimizer and the stage-2 callbacks.
learn = Learner(model, data, cross_entropy_flat, opt=opt, cb_funcs=cbs)

In [None]:
# Run stage 2 (the argument is presumably the number of epochs — confirm in Learner.fit).
learn.fit(10)