In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
from nb_005 import *
from collections import Counter

# Wikitext 2

## Data

Download the dataset [here](https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip) and unzip it so that the token files end up in the folder `data/wikitext` (the `PATH` defined below).

In [4]:
# Marker appended to the end of every line of tokens by `read_file`.
EOS = '<eos>'
# Folder that holds the unzipped WikiText-2 token files.
PATH=Path('data/wikitext')

Small helper function to read the tokens.

In [5]:
def read_file(filename):
    """Read `PATH/filename` and return its lines as a 1-D object array of token lists.

    Every line is split on whitespace and terminated with `EOS` (an empty line
    becomes `[EOS]`).

    Returns:
        A numpy array of dtype `object` with one Python list of tokens per line.
    """
    with open(PATH/filename, encoding='utf8') as f:
        lines = [line.split() + [EOS] for line in f]
    # Fill a pre-sized object array one element at a time: `np.array(lines)` on lists of
    # different lengths is rejected by recent NumPy versions, and it would silently build
    # a 2-D array of strings if every line happened to have the same length.
    tokens = np.empty(len(lines), dtype=object)
    for i, toks in enumerate(lines): tokens[i] = toks
    return tokens

In [6]:
# Tokenize the training, validation and test splits (in that order).
train_tok, valid_tok, test_tok = [
    read_file(name) for name in ('wiki.train.tokens', 'wiki.valid.tokens', 'wiki.test.tokens')
]

In [7]:
# Number of lines in each split.
tuple(len(split) for split in (train_tok, valid_tok, test_tok))

(36718, 3760, 4358)

In [8]:
# Peek at the first 20 tokens of one training line.
' '.join(train_tok[4][:20])

'The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II'

In [9]:
# Tally how often each token occurs in the training split, then show the ten most frequent.
cnt = Counter()
cnt.update(tok for line in train_tok for tok in line)
cnt.most_common(10)

[('the', 113161),
 (',', 99913),
 ('.', 73388),
 ('of', 56889),
 ('<unk>', 54625),
 ('and', 50603),
 ('in', 39453),
 ('to', 39190),
 ('<eos>', 36718),
 ('a', 34237)]

Give an id to each token and add the pad token (just in case we need it).

In [10]:
# Id -> token table: '<pad>' takes id 0, the other tokens follow by decreasing frequency.
itos = ['<pad>'] + [tok for tok, _ in cnt.most_common()]

In [11]:
# Vocabulary size: every distinct training token plus the '<pad>' entry.
vocab_size = len(itos); vocab_size

33279

Create the mapping from token to id, then numericalize our datasets.

In [12]:
# Token -> id lookup. Tokens missing from the vocabulary fall back to the id of '<unk>',
# which is looked up in `itos` instead of being hard-coded (it is 5 for this vocabulary).
unk_idx = itos.index('<unk>')
stoi = collections.defaultdict(lambda: unk_idx, {w:i for i,w in enumerate(itos)})

In [108]:
def numericalize(tok_lines):
    """Replace every token of every line by its id in `stoi`.

    Returns a 1-D numpy array of dtype `object` holding one list of ids per line.
    The array is filled element by element because `np.array` on lists of different
    lengths is rejected by recent NumPy versions.
    """
    ids = np.empty(len(tok_lines), dtype=object)
    for i, line in enumerate(tok_lines): ids[i] = [stoi[w] for w in line]
    return ids

train_ids = numericalize(train_tok)
valid_ids = numericalize(valid_tok)
test_ids = numericalize(test_tok)

In [118]:
class LanguageModelLoader():
    """Iterates over one long stream of token ids as `(input, target)` batches of varying length.

    The stream is cut into `bs` equal columns (left-over ids are dropped). After the first
    batch, each batch length is drawn from N(bptt, 5), or from N(bptt/2, 5) about 5% of the
    time, and is never shorter than 5. The first batch always asks for bptt+25 rows, the
    widest batch expected. This is done because of the way that pytorch allocates cuda
    memory, in order to prevent multiple buffers from being created as the batch width grows.

    Args:
        nums: 1-D numpy array of token ids.
        bs: batch size (number of parallel columns).
        bptt: nominal sequence length of a batch.
        backwards: if True, the rows are served in reverse order.
    """
    def __init__(self, nums, bs, bptt, backwards=False):
        self.bs,self.bptt,self.backwards = bs,bptt,backwards
        self.data = self.batchify(nums)
        self.i,self.iter = 0,0
        self.n = len(self.data)

    def __iter__(self):
        "Yield `(input, target)` pairs until the data or `len(self)` batches are exhausted."
        self.i,self.iter = 0,0
        while self.i < self.n-1 and self.iter<len(self):
            if self.i == 0:
                # Widest batch first (see the class docstring).
                seq_len = self.bptt + 5 * 5
            else:
                bptt = self.bptt if np.random.random() < 0.95 else self.bptt / 2.
                seq_len = max(5, int(np.random.normal(bptt, 5)))
            res = self.get_batch(self.i, seq_len)
            self.i += seq_len
            self.iter += 1
            yield res

    def __len__(self): return self.n // self.bptt - 1

    def batchify(self, data):
        "Reshape the flat id stream into a `(rows, bs)` LongTensor, one column per batch item."
        nb = data.shape[0] // self.bs
        data = np.array(data[:nb*self.bs])
        data = data.reshape(self.bs, -1).T
        if self.backwards: data=data[::-1]
        # The transposed (and possibly reversed) array is a strided view; copy it into
        # contiguous memory so that torch can wrap it (negative strides are not supported)
        # and so that `.view(-1)` in `get_batch` is always valid.
        return LongTensor(np.ascontiguousarray(data))

    def get_batch(self, i, seq_len):
        "Return rows `i:i+seq_len` as input and the same rows shifted by one, flattened, as target."
        source = self.data
        # Leave at least one row after the input for the shifted target.
        seq_len = min(seq_len, len(source) - 1 - i)
        return source[i:i+seq_len], source[i+1:i+1+seq_len].view(-1)

In [119]:
# Batch size and nominal sequence length of a batch.
bs,bptt = 20,10
# Each split is flattened into a single stream of ids before being handed to the loader.
train_dl = LanguageModelLoader(np.concatenate(train_ids), bs, bptt)
valid_dl = LanguageModelLoader(np.concatenate(valid_ids), bs, bptt)

In [120]:
class LMDataBunch():
    "Holds a training and a validation loader, each wrapped in a `DeviceDataLoader` on the same device."
    def __init__(self, train_dl, valid_dl, bs=64, device=None):
        # `bs` is accepted but not used by this class.
        self.device = device if device is not None else default_device
        self.train_dl, self.valid_dl = [
            DeviceDataLoader(dl, self.device, progress_func=tqdm) for dl in (train_dl, valid_dl)
        ]

    @property
    def train_ds(self):
        "Dataset behind the wrapped training loader."
        return self.train_dl.dl.dataset

    @property
    def valid_ds(self):
        "Dataset behind the wrapped validation loader."
        return self.valid_dl.dl.dataset

In [121]:
# Bundle the two loaders into one object.
data = LMDataBunch(train_dl, valid_dl, bs)

## Model

### 1. Dropout

We want to use the AWD-LSTM from [Stephen Merity](https://arxiv.org/abs/1708.02182). First, we'll need several different kinds of dropout. Dropout consists of replacing some coefficients by 0 with probability p. To ensure that the average of the weights remains constant, we multiply the weights that aren't nullified by a correction factor of `1/(1-p)`.

In [14]:
def dropout_mask(x, sz, p):
    "Build a mask of size sz and of the same type as x: entries are 0 with probability p, 1/(1-p) otherwise."
    keep_prob = 1-p
    kept = x.new(*sz).bernoulli_(keep_prob)
    return kept/keep_prob

In [15]:
# With p=0.5 every entry of the mask is either 0 or 1/(1-0.5) = 2.
x = torch.randn(10,10)
dropout_mask(x, (10,10), 0.5)

tensor([[0., 2., 0., 2., 0., 0., 0., 2., 0., 2.],
        [2., 0., 0., 0., 0., 2., 2., 0., 2., 2.],
        [0., 2., 2., 0., 0., 2., 0., 0., 0., 2.],
        [2., 0., 2., 2., 0., 2., 0., 2., 2., 0.],
        [2., 2., 2., 2., 2., 0., 2., 2., 2., 2.],
        [2., 2., 0., 2., 0., 2., 0., 0., 0., 0.],
        [0., 2., 2., 0., 0., 2., 0., 2., 0., 0.],
        [0., 2., 2., 0., 0., 0., 0., 0., 0., 2.],
        [0., 2., 0., 0., 0., 2., 2., 2., 2., 0.],
        [0., 0., 0., 0., 0., 2., 2., 2., 0., 0.]])

Once we have a dropout mask `m`, applying the dropout to `x` is simply done by `x = x * m`. We create our own dropout mask and don't rely on pytorch dropout because we want to nullify the coefficients on the batch dimension but not the token dimension (aka the same coefficients are replaced by zero for each word in the sentence). 

Inside an RNN, a tensor x will have three dimensions: seq_len, bs, and a feature dimension (the embedding or hidden size), so we create a dropout mask for the last two dimensions and broadcast it along the first dimension.

In [16]:
class RNNDropout(nn.Module):
    "Dropout that draws one mask over the last two dimensions and reuses it along the first dimension."
    def __init__(self, p=0.5):
        super().__init__()
        self.p=p

    def forward(self, x):
        # Only active in training mode and for a non-zero probability.
        if self.training and self.p:
            mask = dropout_mask(x.data, (1, x.size(1), x.size(2)), self.p)
            return mask * x
        return x

In [17]:
# Show an input next to its dropped-out version.
dp_test = RNNDropout(0.5)
x = torch.randn(2,5,10)
x, dp_test(x)

(tensor([[[-0.7618, -0.6116,  0.3955,  1.3657,  1.4364, -2.2120,  0.2474,
           -0.3893,  0.4495,  1.2473],
          [ 0.2402, -0.4327, -1.4691,  0.4653,  0.4280,  0.3873,  0.9973,
            0.3174,  0.9244, -1.3500],
          [-1.3865,  0.7202, -0.8552,  1.9565,  1.0824, -0.4621, -0.7159,
            0.1479,  0.6398,  0.9142],
          [ 0.4448, -0.9450,  0.0832,  0.0767,  0.9118,  2.2280,  1.0178,
           -0.6625,  0.2360, -0.2408],
          [ 0.3941, -1.6472, -0.5152,  0.4187, -0.1898,  0.8900, -0.7480,
           -0.9121, -1.7869,  1.0283]],
 
         [[ 0.5170,  0.6714, -0.0522, -0.3161,  1.2824, -0.0983, -1.5031,
           -1.8706, -0.1500,  0.9089],
          [ 1.3129,  0.7559, -0.2603,  1.1480, -0.0479, -0.7193, -1.2821,
           -0.7878,  0.5105,  0.9481],
          [ 1.1758, -0.4721, -2.1607, -0.8924,  0.6982,  0.4315, -0.2785,
           -0.1542, -1.1626,  0.0448],
          [ 2.0093, -0.0582, -1.2177,  0.3014,  0.8134,  0.2001,  0.1971,
            0.7769,

In [142]:
class WeightDropout(nn.Module):
    """A module that wraps another layer in which some weights will be replaced by 0 during training.

    For every name in `layer_names`, a trainable copy `<name>_raw` is kept on this wrapper.
    Before each forward pass the wrapped module's weight is replaced by a dropped-out
    version of that raw copy (dropout is only active in training mode).

    Args:
        module: the layer to wrap.
        dropout: probability of zeroing a weight.
        layer_names: names of the weights of `module` to apply dropout to.

    NOTE(review): recent PyTorch RNN modules may cache their weights internally, in which
    case writing to `_parameters` would not reach them - verify on the version in use.
    """

    def __init__(self, module, dropout, layer_names=['weight_hh_l0']):
        super().__init__()
        self.module,self.dropout,self.layer_names = module,dropout,layer_names
        # Register the raw copies right away so that they are part of `parameters()`
        # and `forward` works even if `reset` was never called.
        self._register_raw()

    def _register_raw(self):
        "Create the `<layer>_raw` parameters that do not exist yet."
        for layer in self.layer_names:
            raw_name = f'{layer}_raw'
            # Never overwrite an existing raw parameter: after a forward pass the wrapped
            # module only holds the dropped-out weights, and an optimizer may already
            # reference the existing parameter.
            if raw_name in self._parameters: continue
            #Makes a copy of the weights of the selected layers.
            w = getattr(self.module, layer)
            self.register_parameter(raw_name, nn.Parameter(w.data))

    def _setweights(self):
        "Overwrite the selected weights of the wrapped module with dropped-out raw weights."
        for layer in self.layer_names:
            raw_w = getattr(self, f'{layer}_raw')
            w1 = F.dropout(raw_w, p=self.dropout, training=self.training)
            # Write to the wrapped module itself, not to a global named `module`.
            self.module._parameters[layer] = w1

    def forward(self, *args):
        self._setweights()
        return self.module.forward(*args)

    def reset(self):
        "Make sure the raw parameters exist, then reset the wrapped module if it supports it."
        self._register_raw()
        if hasattr(self.module, 'reset'): self.module.reset()

In [143]:
# Wrap a small LSTM; with the default `layer_names` the dropout targets `weight_hh_l0`.
module = nn.LSTM(20, 20)
dp_module = WeightDropout(module, 0.5)
# reset() makes sure the `weight_hh_l0_raw` parameter exists before the optimizer collects the parameters.
dp_module.reset()
opt = optim.SGD(dp_module.parameters(), 10)
dp_module.train()

WeightDropout(
  (module): LSTM(20, 20)
)

In [145]:
x = torch.randn(2,5,20)
x.requires_grad_(requires_grad=True)
# Initial (hidden, cell) state, all zeros.
h = (torch.zeros(1,5,20), torch.zeros(1,5,20))
# Run the wrapped LSTM five times, feeding its output and state back in each time.
for _ in range(5): x,h = dp_module(x,h)

In [146]:
# Weight currently used by the LSTM (after dropout) next to the raw parameter.
getattr(dp_module.module, 'weight_hh_l0'),getattr(dp_module,'weight_hh_l0_raw')

(tensor([[-0.0966,  0.1848,  0.0000,  ...,  0.0870,  0.0000,  0.0000],
         [-0.0000, -0.0000, -0.3470,  ..., -0.4012,  0.0000, -0.0000],
         [ 0.0000,  0.0000, -0.2064,  ..., -0.0000,  0.1284, -0.1940],
         ...,
         [ 0.0000, -0.0622,  0.2181,  ..., -0.1798, -0.0411,  0.3447],
         [ 0.2740,  0.0000, -0.1230,  ...,  0.0000, -0.0000, -0.3689],
         [-0.0000, -0.0000,  0.0082,  ..., -0.2975,  0.0668, -0.1271]],
        grad_fn=<MulBackward0>), Parameter containing:
 tensor([[-0.0483,  0.0924,  0.0244,  ...,  0.0435,  0.1158,  0.0845],
         [-0.0036, -0.0728, -0.1735,  ..., -0.2006,  0.0105, -0.1592],
         [ 0.0174,  0.2131, -0.1032,  ..., -0.1210,  0.0642, -0.0970],
         ...,
         [ 0.2088, -0.0311,  0.1090,  ..., -0.0899, -0.0205,  0.1723],
         [ 0.1370,  0.0754, -0.0615,  ...,  0.0477, -0.0691, -0.1844],
         [-0.1813, -0.2008,  0.0041,  ..., -0.1487,  0.0334, -0.0635]],
        requires_grad=True))

In [147]:
# Compute a loss against random targets, backpropagate and take one optimizer step.
target = torch.randint(0,20,(10,)).long()
loss = F.nll_loss(x.view(-1,20), target)
loss.backward()
opt.step()

In [148]:
# Gradient of the dropped-out weight and of the raw parameter.
w, w_raw = getattr(dp_module.module, 'weight_hh_l0'),getattr(dp_module,'weight_hh_l0_raw')
w.grad, w_raw.grad

(None, tensor([[-0.0001,  0.0001, -0.0000,  ..., -0.0001,  0.0001,  0.0000],
         [ 0.0001,  0.0003, -0.0000,  ...,  0.0004,  0.0002,  0.0001],
         [-0.0001,  0.0000,  0.0000,  ..., -0.0000,  0.0000, -0.0000],
         ...,
         [ 0.0001,  0.0014, -0.0000,  ..., -0.0008,  0.0002, -0.0003],
         [-0.0005,  0.0000, -0.0000,  ..., -0.0000, -0.0000,  0.0001],
         [-0.0000,  0.0002, -0.0000,  ..., -0.0001,  0.0000, -0.0001]]))

In [149]:
# Same comparison as before, now after the optimizer step.
getattr(dp_module.module, 'weight_hh_l0'),getattr(dp_module,'weight_hh_l0_raw')

(tensor([[-0.0966,  0.1848,  0.0000,  ...,  0.0870,  0.0000,  0.0000],
         [-0.0000, -0.0000, -0.3470,  ..., -0.4012,  0.0000, -0.0000],
         [ 0.0000,  0.0000, -0.2064,  ..., -0.0000,  0.1284, -0.1940],
         ...,
         [ 0.0000, -0.0622,  0.2181,  ..., -0.1798, -0.0411,  0.3447],
         [ 0.2740,  0.0000, -0.1230,  ...,  0.0000, -0.0000, -0.3689],
         [-0.0000, -0.0000,  0.0082,  ..., -0.2975,  0.0668, -0.1271]],
        grad_fn=<MulBackward0>), Parameter containing:
 tensor([[-0.0471,  0.0914,  0.0246,  ...,  0.0449,  0.1147,  0.0845],
         [-0.0043, -0.0756, -0.1734,  ..., -0.2046,  0.0081, -0.1604],
         [ 0.0182,  0.2126, -0.1032,  ..., -0.1206,  0.0640, -0.0966],
         ...,
         [ 0.2082, -0.0453,  0.1092,  ..., -0.0819, -0.0221,  0.1757],
         [ 0.1418,  0.0751, -0.0614,  ...,  0.0478, -0.0686, -0.1859],
         [-0.1811, -0.2028,  0.0045,  ..., -0.1474,  0.0329, -0.0629]],
        requires_grad=True))

In [26]:
class EmbeddingDropout(nn.Module):
    """Applies dropout in the embedding layer by zeroing out whole rows of the embedding matrix.

    The mask has one entry per row of the embedding weight, so a dropped word is zeroed
    everywhere it appears in the batch.

    Args:
        emb: the `nn.Embedding` whose weights are used for the lookup.
    """
    def __init__(self, emb):
        super().__init__()
        self.emb = emb
        # Keep `None` when the embedding has no padding index. A negative index is counted
        # from the end by `F.embedding`, so substituting -1 would treat the last row of the
        # vocabulary as padding and leave it without gradient.
        self.pad_idx = self.emb.padding_idx

    def forward(self, words, dropout=0.1, scale=None):
        """Look up `words` in the (optionally dropped-out and scaled) embedding matrix.

        Args:
            words: tensor of token ids.
            dropout: probability of zeroing a row; a falsy value disables the mask.
            scale: optional factor applied to the embedding matrix.
        """
        if dropout:
            size = (self.emb.weight.size(0),1)
            mask = dropout_mask(self.emb.weight.data, size, dropout)
            masked_emb_weight = mask * self.emb.weight
        else: masked_emb_weight = self.emb.weight
        if scale: masked_emb_weight = scale * masked_emb_weight
        return F.embedding(words, masked_emb_weight, self.pad_idx, self.emb.max_norm,
                           self.emb.norm_type, self.emb.scale_grad_by_freq, self.emb.sparse)

In [27]:
# Embedding of 100 tokens in 20 dimensions, with id 0 as padding, wrapped with embedding dropout.
enc = nn.Embedding(100,20, padding_idx=0)
enc_dp = EmbeddingDropout(enc)

In [28]:
# 25 random token ids.
x = torch.randint(0,100,(25,)).long()

In [29]:
# Dropped tokens come back as all-zero rows.
enc_dp(x, dropout=0.5)

tensor([[ 0.0000, -0.0000, -0.0000,  0.0000, -0.0000, -0.0000,  0.0000,  0.0000,
         -0.0000, -0.0000, -0.0000,  0.0000, -0.0000,  0.0000, -0.0000, -0.0000,
         -0.0000,  0.0000,  0.0000, -0.0000],
        [-3.0746,  0.0532, -0.9969, -1.5401, -2.2785, -3.1142, -5.2356, -1.9817,
         -0.4740,  0.7455, -0.4058,  1.4690,  0.7676,  0.4037, -0.0132, -4.1723,
          0.8027, -1.0026, -1.9151,  2.5202],
        [-2.6915, -1.3094,  2.5535, -2.0829,  1.0566, -2.4929,  0.1980,  2.1521,
         -1.9781,  0.4168, -1.6336,  1.8876,  1.8933,  2.5815,  1.3373,  0.4616,
         -1.5860,  2.5099,  2.6191,  0.9438],
        [ 0.0000, -0.0000, -0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
         -0.0000, -0.0000,  0.0000, -0.0000, -0.0000,  0.0000,  0.0000,  0.0000,
         -0.0000,  0.0000,  0.0000, -0.0000],
        [ 0.2953,  1.9750,  3.3688,  2.1726, -1.6868,  0.7382,  0.7309,  2.3460,
         -3.7565, -0.3633, -0.3705,  1.0253,  2.0195,  0.2571, -1.9339, -2.8782,
      

### 2. AWD-LSTM

In [75]:
def repackage_var(h):
    """Detaches h from its history.

    `h` is either a tensor or a (possibly nested) iterable of tensors. A tensor is returned
    detached, anything else comes back as a tuple of repackaged elements. `isinstance` is
    used so that Tensor subclasses such as `nn.Parameter` are detached as a whole instead
    of being iterated over.
    """
    if isinstance(h, torch.Tensor): return h.detach()
    return tuple(repackage_var(v) for v in h)

In [89]:
class RNNCore(nn.Module):
    """AWD-LSTM/QRNN inspired by https://arxiv.org/abs/1708.02182

    An embedding with embedding dropout, followed by `n_layers` recurrent layers. The last
    layer outputs `emb_sz` features, the others `n_hid`. The hidden state is kept between
    calls (detached from its history) and re-initialised whenever the batch size changes.
    """

    initrange=0.1

    def __init__(self, vocab_sz, emb_sz, n_hid, n_layers, pad_token, bidir=False,
                 dropouth=0.3, dropouti=0.65, dropoute=0.1, wdrop=0.5, qrnn=False):
        """
        Args:
            vocab_sz: number of rows of the embedding.
            emb_sz: embedding size (also the output size of the last layer).
            n_hid: output size of every layer but the last.
            n_layers: number of recurrent layers.
            pad_token: padding index of the embedding.
            bidir: use bidirectional LSTMs (each direction gets half the features).
            dropouth: dropout applied between recurrent layers.
            dropouti: dropout applied to the embedded input.
            dropoute: embedding dropout, only used in training mode.
            wdrop: weight dropout on the recurrent weights; a falsy value disables it.
            qrnn: use QRNN layers instead of LSTMs.
        """
        super().__init__()
        self.bs,self.qrnn,self.ndir = 1, qrnn,(2 if bidir else 1)
        self.emb_sz,self.n_hid,self.n_layers,self.dropoute = emb_sz,n_hid,n_layers,dropoute
        self.encoder = nn.Embedding(vocab_sz, emb_sz, padding_idx=pad_token)
        self.dp_encoder = EmbeddingDropout(self.encoder)
        if self.qrnn:
            #Using QRNN requires cupy: https://github.com/cupy/cupy
            from .torchqrnn.qrnn import QRNNLayer
            self.rnns = [QRNNLayer(emb_sz if l == 0 else n_hid, (n_hid if l != n_layers - 1 else emb_sz)//self.ndir,
                save_prev_x=True, zoneout=0, window=2 if l == 0 else 1, output_gate=True) for l in range(n_layers)]
            if wdrop:
                for rnn in self.rnns:
                    # Use the weight-dropout wrapper defined in this notebook.
                    rnn.linear = WeightDropout(rnn.linear, wdrop, layer_names=['weight'])
        else:
            self.rnns = [nn.LSTM(emb_sz if l == 0 else n_hid, (n_hid if l != n_layers - 1 else emb_sz)//self.ndir,
                1, bidirectional=bidir) for l in range(n_layers)]
            # Use the weight-dropout wrapper defined in this notebook.
            if wdrop: self.rnns = [WeightDropout(rnn, wdrop) for rnn in self.rnns]
        self.rnns = torch.nn.ModuleList(self.rnns)
        self.encoder.weight.data.uniform_(-self.initrange, self.initrange)
        self.dropouti = RNNDropout(dropouti)
        self.dropouths = nn.ModuleList([RNNDropout(dropouth) for l in range(n_layers)])

    def forward(self, input):
        """Run a `(seq_len, bs)` batch of token ids through the network.

        Returns:
            `(raw_outputs, outputs)`: per-layer outputs before and after the between-layer
            dropout (the last layer's output is not dropped out here).
        """
        sl,bs = input.size()
        # Also reset when no hidden state exists yet: `self.bs` starts at 1, so a first
        # batch of batch size 1 would otherwise reach the layers without `self.hidden`.
        if bs!=self.bs or not hasattr(self, 'hidden'):
            self.bs=bs
            self.reset()
        emb = self.dp_encoder(input, dropout=self.dropoute if self.training else 0)
        emb = self.dropouti(emb)
        raw_output = emb
        new_hidden,raw_outputs,outputs = [],[],[]
        for l, (rnn,drop) in enumerate(zip(self.rnns, self.dropouths)):
            raw_output, new_h = rnn(raw_output, self.hidden[l])
            new_hidden.append(new_h)
            raw_outputs.append(raw_output)
            if l != self.n_layers - 1: raw_output = drop(raw_output)
            outputs.append(raw_output)
        # Keep the state for the next call, cut off from this batch's graph.
        self.hidden = repackage_var(new_hidden)
        return raw_outputs, outputs

    def one_hidden(self, l):
        "Zero state for layer `l`, created with the same type as the model's first parameter."
        nh = (self.n_hid if l != self.n_layers - 1 else self.emb_sz)//self.ndir
        return self.weights.new(self.ndir, self.bs, nh).zero_()

    def reset(self):
        "Reset the layers that support it and re-create zero hidden states for the current batch size."
        for r in self.rnns:
            if hasattr(r, 'reset'): r.reset()
        self.weights = next(self.parameters()).data
        if self.qrnn: self.hidden = [self.one_hidden(l) for l in range(self.n_layers)]
        else: self.hidden = [(self.one_hidden(l), self.one_hidden(l)) for l in range(self.n_layers)]

In [95]:
class LinearDecoder(nn.Module):
    "Decoding head for a RNN_Core module: dropout on the last layer's output, then a linear layer."

    initrange=0.1

    def __init__(self, n_out, n_hid, dropout, tie_encoder=None, bias=True):
        super().__init__()
        linear = nn.Linear(n_hid, n_out, bias=bias)
        linear.weight.data.uniform_(-self.initrange, self.initrange)
        if bias: linear.bias.data.zero_()
        self.decoder = linear
        self.dropout = RNNDropout(dropout)
        # Weight tying: share the weight matrix of the given encoder.
        if tie_encoder: self.decoder.weight = tie_encoder.weight

    def forward(self, input):
        raw_outputs, outputs = input
        dropped = self.dropout(outputs[-1])
        # Merge the first two dimensions so the linear layer sees one row per position.
        n_rows, n_feat = dropped.size(0)*dropped.size(1), dropped.size(2)
        decoded = self.decoder(dropped.view(n_rows, n_feat))
        return decoded, raw_outputs, outputs

In [96]:
class SequentialRNN(nn.Sequential):
    "A `nn.Sequential` that passes `reset` on to every direct child that defines it."
    def reset(self):
        for child in filter(lambda m: hasattr(m, 'reset'), self.children()):
            child.reset()

In [97]:
def get_language_model(vocab_sz, emb_sz, n_hid, n_layers, pad_token, tie_weights=True, qrnn=False, bias=True,
                 dropout=0.4, dropouth=0.3, dropouti=0.5, dropoute=0.1, wdrop=0.5):
    "To create a full AWD-LSTM: a `RNNCore` followed by a `LinearDecoder`, optionally sharing the embedding weights."
    core = RNNCore(vocab_sz, emb_sz, n_hid=n_hid, n_layers=n_layers, pad_token=pad_token, qrnn=qrnn,
                 dropouth=dropouth, dropouti=dropouti, dropoute=dropoute, wdrop=wdrop)
    tied = core.encoder if tie_weights else None
    head = LinearDecoder(vocab_sz, emb_sz, dropout, tie_encoder=tied, bias=bias)
    return SequentialRNN(core, head)

In [98]:
# Small test model: vocab of 500, embedding size 20, 100 hidden units, 2 layers, pad token 0, no weight dropout.
tst_model = get_language_model(500, 20, 100, 2, 0, wdrop=0.)

In [99]:
# Random token ids of shape (10, 5), which the model reads as (sequence length, batch size).
x = torch.randint(0, 500, (10,5)).long()
z = tst_model(x)

In [101]:
# The model returns three things: the decoded output, raw_outputs and outputs.
len(z)

3

### 3. Callbacks to train the model

In [127]:
@dataclass
class RNNTrainer(Callback):
    "Callback that unpacks the model's three outputs, rescales the lr by the batch length and adds the AR/TAR penalties."
    learn:Learner
    bptt:int
    alpha:float=0.
    beta:float=0.

    def on_loss_begin(self, last_output, **kwargs):
        # Keep the per-layer outputs for the penalties; only the decoded output goes to the loss.
        decoded, self.raw_out, self.out = last_output[0], last_output[1], last_output[2]
        return decoded

    def on_backward_begin(self, last_loss, last_input, last_output, **kwargs):
        # Scale the learning rate by the ratio of this batch's length to the nominal bptt.
        self.learn.opt.lr *= last_input.size(0) / self.bptt
        # AR: penalty on the last layer's (dropped-out) output.
        if self.alpha != 0.:
            ar_term = self.alpha * self.out[-1].pow(2).mean()
            last_loss += ar_term.sum()
        # TAR: penalty on the change of the last layer's raw output between consecutive steps.
        if self.beta != 0.:
            h = self.raw_out[-1]
            if len(h)>1:
                tar_term = self.beta * (h[1:] - h[:-1]).pow(2).mean()
                last_loss += tar_term.sum()
        return last_loss

In [128]:
# Language model over the full vocabulary (embedding size 20, 100 hidden units, 2 layers, no weight dropout).
model = get_language_model(vocab_size, 20, 100, 2, 0, wdrop=0.)
learn = Learner(data, model)

  "PyTorch was compiled without cuDNN support. To use cuDNN, rebuild "


In [129]:
# alpha and beta weight the two extra penalties that RNNTrainer adds to the loss.
cb = RNNTrainer(learn, bptt, alpha=2, beta=1)
learn.opt_fn = partial(optim.Adam)
# NOTE(review): presumably one epoch at a learning rate of 0.1 - confirm against Learner.fit.
learn.fit(1, 0.1, callbacks=[cb])

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10442), HTML(value='')))

KeyboardInterrupt: 