<a href="https://colab.research.google.com/github/gmum/natural-language-processing-classes/blob/master/lab-9-unsupervised-lm-training/notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Training unsupervised Language Models

## Excercise

Train an Language Model that obtains at least **111** perplexity on the test set.

Remember to:
-  use gradient clipping with value 0.25
-  use hidden state from previous batch in next batch, to keep the information longer. To do this, instead of initializing the hidden state with 0 each batch, we detach the hidden state from how it was previously produced. If we didn't, the model would try backpropagating all the way to start of the dataset. Use repackage_hidden to deal with that problem.

In [4]:
# Use GPU support!

from os.path import exists
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())
cuda_output = !ldconfig -p|grep cudart.so|sed -e 's/.*\.\([0-9]*\)\.\([0-9]*\)$/cu\1\2/'
accelerator = cuda_output[0] if exists('/dev/nvidia0') else 'cpu'

!pip install -q http://download.pytorch.org/whl/{accelerator}/torch-0.4.1-{platform}-linux_x86_64.whl torchvision

import time
import os
import math
import torch
import torch.nn as nn
from torch.autograd import Variable

torch.manual_seed(1)

# Use dataset from https://drive.google.com/drive/folders/1e-BUHYY61Vy9AGNuh2nungslO-mYuVox?usp=sharing
# from google.colab import drive
# drive.mount('/content/gdrive', force_remount=True)

[31mtorch-0.4.1-cp36-cp36m-linux_x86_64.whl is not a supported wheel on this platform.[0m
[33mYou are using pip version 10.0.1, however version 19.2.2 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


<torch._C.Generator at 0x10f820930>

In [6]:
class Dictionary(object):
    """Build word2idx and idx2word from Corpus(train/val/test)"""
    def __init__(self):
        self.word2idx = {} # word: index
        self.idx2word = [] # position(index): word

    def add_word(self, word):
        """Create/Update word2idx and idx2word"""
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1
        return self.word2idx[word]

    def __len__(self):
        return len(self.idx2word)


class Corpus(object):
    """Corpus Tokenizer"""
    def __init__(self, path):
        self.dictionary = Dictionary()
        self.train = self.tokenize(os.path.join(path, 'ptb.train.txt'))
        self.valid = self.tokenize(os.path.join(path, 'ptb.valid.txt'))
        self.test = self.tokenize(os.path.join(path, 'ptb.test.txt'))

    def tokenize(self, path):
        """Tokenizes a text file."""
        assert os.path.exists(path)
        # Add words to the dictionary
        with open(path, 'r') as f:
            tokens = 0
            for line in f:
                # line to list of token + eos
                words = line.split() + ['<eos>']
                tokens += len(words)
                for word in words:
                    self.dictionary.add_word(word)

        # Tokenize file content
        with open(path, 'r') as f:
            ids = torch.LongTensor(tokens)
            token = 0
            for line in f:
                words = line.split() + ['<eos>']
                for word in words:
                    ids[token] = self.dictionary.word2idx[word]
                    token += 1

        return ids
      
      
def batchify(data, bsz, verbose=False):
    # Work out how cleanly we can divide the dataset into bsz parts.
    nbatch = data.size(0) // bsz
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    # See https://pytorch.org/docs/stable/torch.html#torch.narrow for more explaination
    data = data.narrow(0, 0, nbatch * bsz)
    # Evenly divide the data across the bsz batches.
    # .t() is transposition: https://pytorch.org/docs/stable/torch.html#torch.t
    # the contiguous function doesn't affect your target tensor at all, it just 
    # makes sure that it is stored in a contiguous chunk of memory.
    data = data.view(bsz, -1).t().contiguous()
    if verbose:
      print(data.size())
      for el in data[:50,0]:
        print(corpus.dictionary.idx2word[el.item()])
      
#     data = data.cuda()
    return data

# use path to where you store the datasets
corpus = Corpus(os.getcwd()+'/data')

batch_size = 20
eval_batch_size = 10
train_data = batchify(corpus.train, batch_size, verbose=True)
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, eval_batch_size)
ntokens = len(corpus.dictionary)

torch.Size([46479, 20])
aer
banknote
berlitz
calloway
centrust
cluett
fromstein
gitano
guterman
hydro-quebec
ipo
kia
memotec
mlx
nahb
punts
rake
regatta
rubens
sim
snack-food
ssangyong
swapo
wachter
<eos>
pierre
<unk>
N
years
old
will
join
the
board
as
a
nonexecutive
director
nov.
N
<eos>
mr.
<unk>
is
chairman
of
<unk>
n.v.
the
dutch


In [7]:
class LSTMModel(nn.Module):
    """Container module with an encoder, a recurrent module, and a decoder."""

    def __init__(self, ntoken, ninp, nhid, nlayers, dropout=0.5):
        super(LSTMModel, self).__init__()
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp) # Token2Embeddings
        self.lstm = nn.LSTM(ninp, nhid, nlayers, dropout=dropout)
        self.decoder = nn.Linear(nhid, ntoken)

        self.init_weights()

        self.nhid = nhid
        self.nlayers = nlayers

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.fill_(0)
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, input, hidden):
        emb = self.drop(self.encoder(input))
        output, hidden = self.lstm(emb, hidden)
        output = self.drop(output)
        decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2)))
        return decoded.view(output.size(0), output.size(1), decoded.size(1)), hidden

    def init_hidden(self, bsz):
#         return (torch.zeros(self.nlayers, bsz, self.nhid).cuda(),
#                 torch.zeros(self.nlayers, bsz, self.nhid).cuda())
            return (torch.zeros(self.nlayers, bsz, self.nhid),
                torch.zeros(self.nlayers, bsz, self.nhid))

model = LSTMModel(ntokens, 150, 150, 1, 0.2)
# model.cuda()
print(model)

LSTMModel(
  (drop): Dropout(p=0.2)
  (encoder): Embedding(10000, 150)
  (lstm): LSTM(150, 150, dropout=0.2)
  (decoder): Linear(in_features=150, out_features=10000, bias=True)
)


  "num_layers={}".format(dropout, num_layers))


In [14]:
criterion = nn.CrossEntropyLoss()
# criterion.cuda()
seq_len = 30
log_interval = 100

def repackage_hidden(h):
    """Wraps hidden states in new Variables, to detach them from their history."""
    if type(h) == torch.Tensor:
        return Variable(h.data)
    else:
        return tuple(repackage_hidden(v) for v in h)


def get_batch(source, i):
    s_len = min(seq_len, len(source) - 1 - i)
    data = Variable(source[i:i+s_len])
    target = Variable(source[i+1:i+1+s_len].view(-1))
    return data, target


def evaluate(data_source):
    """compute total loss on data_source dataset"""
  
    model.eval() # Turn on evaluation mode which disables dropout.
    total_loss = 0
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(eval_batch_size)
    for i in range(0, data_source.size(0) - 1, seq_len):
      #run model on validation data and count loss
      #lstm return vector of probab which wrod can be next for all words
      #calc loss for lstm output and real probability of REAL next word
      data, targets = get_batch(data_source, i)
      output, hidden = model(data, hidden)
      output_flat = output.view(-1, ntokens)
      total_loss += len(data) * criterion(output_flat, targets).data
      hidden = repackage_hidden(hidden)
    return total_loss.item() / len(data_source)


def train():
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, seq_len)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm(model.parameters(), 0.25)
        for p in model.parameters():
            p.data.add_(-lr, p.grad.data)

        total_loss += loss.data

        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss.item() / log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // seq_len, lr,
                elapsed * 1000 / log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()

# Loop over epochs.
lr = 20
best_val_loss = None
# epochs = 40
epochs = 10
for epoch in range(1, epochs+1):
    epoch_start_time = time.time()
    train()
    val_loss = evaluate(val_data)
    print('-' * 89)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
          'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                     val_loss, math.exp(val_loss)))
    print('-' * 89)

    # Anneal the learning rate if no improvement has been seen in the validation dataset.
    if not best_val_loss or val_loss < best_val_loss:
        best_val_loss = val_loss
    else:
        lr /= 4.0

# Run on test data.
test_loss = evaluate(test_data)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss)))
print('=' * 89)



| epoch   1 |   100/ 1549 batches | lr 20.00 | ms/batch 275.15 | loss  5.06 | ppl   157.26
| epoch   1 |   200/ 1549 batches | lr 20.00 | ms/batch 236.07 | loss  5.07 | ppl   159.61
| epoch   1 |   300/ 1549 batches | lr 20.00 | ms/batch 264.35 | loss  5.17 | ppl   176.31
| epoch   1 |   400/ 1549 batches | lr 20.00 | ms/batch 234.83 | loss  5.08 | ppl   160.34
| epoch   1 |   500/ 1549 batches | lr 20.00 | ms/batch 239.30 | loss  5.08 | ppl   160.89
| epoch   1 |   600/ 1549 batches | lr 20.00 | ms/batch 233.72 | loss  5.10 | ppl   163.74
| epoch   1 |   700/ 1549 batches | lr 20.00 | ms/batch 299.03 | loss  5.08 | ppl   160.66
| epoch   1 |   800/ 1549 batches | lr 20.00 | ms/batch 247.53 | loss  5.05 | ppl   156.77
| epoch   1 |   900/ 1549 batches | lr 20.00 | ms/batch 225.59 | loss  4.95 | ppl   141.15
| epoch   1 |  1000/ 1549 batches | lr 20.00 | ms/batch 257.95 | loss  5.00 | ppl   147.88
| epoch   1 |  1100/ 1549 batches | lr 20.00 | ms/batch 235.32 | loss  5.03 | ppl   152.69

| epoch   6 |   300/ 1549 batches | lr 20.00 | ms/batch 219.48 | loss  4.67 | ppl   106.99
| epoch   6 |   400/ 1549 batches | lr 20.00 | ms/batch 219.83 | loss  4.55 | ppl    94.43
| epoch   6 |   500/ 1549 batches | lr 20.00 | ms/batch 220.95 | loss  4.56 | ppl    95.35
| epoch   6 |   600/ 1549 batches | lr 20.00 | ms/batch 228.65 | loss  4.59 | ppl    98.96
| epoch   6 |   700/ 1549 batches | lr 20.00 | ms/batch 221.69 | loss  4.59 | ppl    98.57
| epoch   6 |   800/ 1549 batches | lr 20.00 | ms/batch 222.23 | loss  4.58 | ppl    97.25
| epoch   6 |   900/ 1549 batches | lr 20.00 | ms/batch 221.24 | loss  4.49 | ppl    88.75
| epoch   6 |  1000/ 1549 batches | lr 20.00 | ms/batch 221.11 | loss  4.55 | ppl    94.78
| epoch   6 |  1100/ 1549 batches | lr 20.00 | ms/batch 220.32 | loss  4.61 | ppl   100.20
| epoch   6 |  1200/ 1549 batches | lr 20.00 | ms/batch 224.00 | loss  4.65 | ppl   104.56
| epoch   6 |  1300/ 1549 batches | lr 20.00 | ms/batch 221.14 | loss  4.43 | ppl    84.15

Now let's generate a sentance of 17 words, first 2 words are `consumers` and `kia`, the rest will be created by model

In [15]:
init_line = [2211,11]
detoken = []
for first_word_indx in init_line:
    detoken.append(corpus.dictionary.idx2word[first_word_indx])
gen_text_len = 15
data_source = torch.LongTensor([[init_line[0]]])
model.eval() # Turn on evaluation mode which disables dropout.
ntokens = len(corpus.dictionary)
hidden = model.init_hidden(1)
output, output_flat = None, list()
for i in range(1, len(init_line)):
    data = Variable(data_source)
    output, hidden = model(data, hidden)
    output_flat.append(output.view(-1, ntokens))
    hidden = repackage_hidden(hidden)
    data_source = torch.LongTensor([[init_line[i]]])
    
for i in range(0, gen_text_len):
#     print(data_source)
    data = Variable(data_source)
#     print(data)
    output, hidden = model(data, hidden)
    output_flat.append(output.view(-1, ntokens))
    hidden = repackage_hidden(hidden)
    
    probs = list(output_flat[-1][0].detach().numpy())
    max_indx = probs.index(max(probs))
    detoken.append(corpus.dictionary.idx2word[max_indx])
#     print(max_indx)
    data_source = torch.LongTensor([[max_indx]])


# for line in output_flat:
#     probs = list(line.detach().numpy())
#     max_indx = probs.index(max(probs))
#     detoken.append(corpus.dictionary.idx2word[max_indx])

print(detoken)

['consumers', 'kia', '<unk>', '<unk>', '<eos>', 'the', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>']


In [16]:
output_flat[-1][0].size(0)

10000

In [22]:
max(output_flat[0].detach().numpy())

array([-1.4186041 , -0.42088753, -0.71642345, ..., -0.43088904,
       -1.8156554 ,  0.11258146], dtype=float32)

In [23]:
probs = list(output_flat[0].detach().numpy())
max_indx = probs.index(max(probs))
max_indx

0