# Download PyTorch

PyTorch does not come with CoLab so every time we restart this notebook we have to redownload it.

In [0]:
import sys
sys.version

'3.6.6 (default, Sep 12 2018, 18:26:19) \n[GCC 8.0.1 20180414 (experimental) [trunk revision 259383]]'

In [0]:
!pip3 install http://download.pytorch.org/whl/cu80/torch-0.4.1-cp36-cp36m-linux_x86_64.whl



In [0]:
!pip3 install pytorch-nlp



In [0]:
!pip3 install torchvision



# Wikipedia data for LM pretraining

Our dad jokes dataset does not have that many samples so its performance is not great as-is. The major problem with having so few examples is that the model needs to learn both how to make jokes and how to compose words, and this is too much to ask of a small dataset. To learn how words are made, we are downloading a corpus based on Wikipedia to pretrain our model.

In [0]:
!wget http://www.marekrei.com/pub/lm-dataset.tar.gz

--2018-11-18 06:01:30--  http://www.marekrei.com/pub/lm-dataset.tar.gz
Resolving www.marekrei.com (www.marekrei.com)... 217.146.69.7, 2a02:29e8:770:0:3::17
Connecting to www.marekrei.com (www.marekrei.com)|217.146.69.7|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 76178112 (73M) [application/x-gzip]
Saving to: ‘lm-dataset.tar.gz.1’


2018-11-18 06:01:35 (15.9 MB/s) - ‘lm-dataset.tar.gz.1’ saved [76178112/76178112]



In [0]:
!tar xvzf lm-dataset.tar.gz

lm-dataset/lm-dataset.train.unk100.txt
lm-dataset/lm-dataset.dev.txt
lm-dataset/lm-dataset.test.unk100.txt
lm-dataset/lm-dataset.test.txt
lm-dataset/lm-dataset.train.unk100.top40K.txt
lm-dataset/README.md
lm-dataset/lm-dataset.dev.unk100.txt
lm-dataset/lm-dataset.dev.unk100.top1K.txt
lm-dataset/
lm-dataset/lm-dataset.train.txt


In [0]:
!ls -lh lm-dataset

total 191M
-rw-r--r-- 1 2544 2544  22M Apr  1  2015 lm-dataset.dev.txt
-rw-r--r-- 1 2544 2544 117K Apr  4  2015 lm-dataset.dev.unk100.top1K.txt
-rw-r--r-- 1 2544 2544  21M Apr  2  2015 lm-dataset.dev.unk100.txt
-rw-r--r-- 1 2544 2544  23M Mar 21  2015 lm-dataset.test.txt
-rw-r--r-- 1 2544 2544  21M Apr  2  2015 lm-dataset.test.unk100.txt
-rw-r--r-- 1 2544 2544  53M Mar 21  2015 lm-dataset.train.txt
-rw-r--r-- 1 2544 2544 4.7M Apr 14  2015 lm-dataset.train.unk100.top40K.txt
-rw-r--r-- 1 2544 2544  49M Apr  2  2015 lm-dataset.train.unk100.txt
-rw-rw-r-- 1 2544 2544  141 Apr  5  2015 README.md


# Define classes

In [0]:
import torch
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import *
from typing import List, Tuple
import os
import csv
import time

In [0]:
from torchnlp.samplers import BucketBatchSampler
from torchnlp.datasets import snli_dataset
from torchnlp.utils import datasets_iterator
from torchnlp.text_encoders import IdentityEncoder, CharacterEncoder, StaticTokenizerEncoder
from torchnlp import word_to_vector

In [0]:
import sys
import tqdm as tq
from tqdm import tqdm_notebook, tnrange


def in_ipynb():
    try:
        cls = get_ipython().__class__.__name__
        return cls == 'ZMQInteractiveShell'
    except NameError:
        return False


def in_notebook():
    try:
        from ipykernel.kernelapp import IPKernelApp
        return IPKernelApp.initialized()
    except ImportError:
        return False


def clear_tqdm():
    inst = getattr(tq.tqdm, '_instances', None)
    if not inst: return
    try:
        for i in range(len(inst)): inst.pop().close()
    except Exception:
        pass

if in_notebook():
    def tqdm(*args, **kwargs):
        clear_tqdm()
        return tq.tqdm(*args, file=sys.stdout, **kwargs)
    def trange(*args, **kwargs):
        clear_tqdm()
        return tq.trange(*args, file=sys.stdout, **kwargs)
else:
    from tqdm import tqdm, trange
    tnrange=trange
    tqdm_notebook=tqdm

In [0]:
class CharacterByteEncoder(StaticTokenizerEncoder):
    """ Encodes text into a tensor by encoding into bytes and splitting the text into individual 
    characters.

    Args:
        sample (list of strings): Sample of data to build dictionary on
        min_occurrences (int, optional): Minimum number of occurrences for a token to be added to
          dictionary.
        append_eos (bool, optional): If `True` append EOS token onto the end to the encoded vector.
    """
    # TODO add start and end token
    def __init__(self, *args, **kwargs):
        if 'tokenize' in kwargs:
            raise TypeError('CharacterEncoder defines a tokenize callable per character')
        super().__init__(*args, tokenize=self._tokenize, **kwargs)

    def decode(self, tensor):
        tokens = [self.itos[index] for index in tensor]
        return ''.join(tokens)
    
    def _tokenize(self, s):
        return [c for c in s.encode()]

In [0]:
import os
os.getcwd()

'/content'

In [0]:
DATASET_PATH = "shortjokes.csv"
DATASET_FILE_ID = "1bplfuUrJEnpi6r78LQtO3IufzCSSVJaC"
DATASET_URL = "https://github.com/amoudgl/short-jokes-dataset/raw/master/shortjokes.csv"

WIKIPEDIA_DATA_PATH = "lm-dataset/lm-dataset.train.txt"

In [0]:
import urllib.request
...
# Download the file from `url` and save it locally under `file_name`:
urllib.request.urlretrieve(DATASET_URL, DATASET_PATH)

('shortjokes.csv', <http.client.HTTPMessage at 0x7fd03161b710>)

In [0]:
class DadJokesDataset(Dataset):
    def __init__(self, path):
        self.data = pd.read_csv(path, sep=',')['Joke'].tolist()
        self.text_encoder = CharacterEncoder(self.data, append_eos=True)
        self.samples = []
        for _ in range(len(self.data)):
            joke = self.data.pop()
            self.samples.append(self.generate_language_model_samples(self.text_encoder.encode(joke)))
        del self.data        
        
    def generate_language_model_samples(self, joke):
        res = {}
        res['text'] = joke[:-1]
        res['next'] = joke[1:]
        return res
        
    def __len__(self):
        return len(self.samples)
    
    def __getitem__(self, i):
        return self.samples[i]
    
    def getitem_readable(self, i):
        return {'text': self.text_encoder.decode(self.samples[i]['text']),
               'next': self.text_encoder.decode(self.samples[i]['next'])}
    

In [0]:
class WikiDataset(Dataset):
    def __init__(self, path):
        with open(path, 'r') as fin:
          self.data = [line.strip() for line in fin.readlines()]
        self.text_encoder = CharacterEncoder(self.data, append_eos=True)
        self.samples = []
        for _ in range(len(self.data)):
            joke = self.data.pop()
            self.samples.append(self.generate_language_model_samples(self.text_encoder.encode(joke)))
        del self.data        
        
    def generate_language_model_samples(self, joke):
        res = {}
        res['text'] = joke[:-1]
        res['next'] = joke[1:]
        return res
        
    def __len__(self):
        return len(self.samples)
    
    def __getitem__(self, i):
        return self.samples[i]
    
    def getitem_readable(self, i):
        return {'text': self.text_encoder.decode(self.samples[i]['text']),
               'next': self.text_encoder.decode(self.samples[i]['next'])}
    

In [0]:
wikipedia_dataset = WikiDataset(WIKIPEDIA_DATA_PATH)

In [0]:
wikipedia_dataset.getitem_readable(18)

{'next': 'e took over from lt gen iqbal khan who proceeded as the vice chief of the army staff ( voas ) , a newly created post .</s>',
 'text': 'he took over from lt gen iqbal khan who proceeded as the vice chief of the army staff ( voas ) , a newly created post .'}

In [0]:
class DadJokesGenerator(nn.Module):
    def __init__(self, n_chars, char_embedding_dim, lstm_dim, n_lstm_layers=1, dropout=0):
        super().__init__()
        self.n_chars = n_chars
        self.char_embedding_dim = char_embedding_dim
        
        # initialize lookup table of fixed dictionary and size, 0 = padding idx
        self.char_embedder = nn.Embedding(self.n_chars, self.char_embedding_dim, padding_idx=0)
        
        # lstm_dim = size of one lstm hidden layer
        # n_lstm_layers = number of stacked lstms
        self.lstm = nn.LSTM(char_embedding_dim, lstm_dim, n_lstm_layers, batch_first=True, dropout=dropout)
        
        # creates a pipeline to transform the output of the lstm into a list where each position is a char idx
        # and its values can be projected by ReLU to be the confidence for each character
        self.projection = nn.Sequential(*[
            nn.Linear(lstm_dim, self.n_chars),
            nn.ReLU(),
        ])
        
    def forward(self, char_ids):
        # char_ids of shape [b_sz, max_sq_len (w/ pads)]
        x = self.char_embedder(char_ids)  # -> shape [b_sz, max_sq_len, char_emb_sz]
        x, _ = self.lstm(x)  # -> shape [b_sz, max_sq_len, lstm_dim]  [b_sz, max_sq_len * lstm_dim]
        
        return self.projection(x)  # -> shape [b_sz, max_sq_len, total_num_characters]
        

In [0]:
from torch.nn.utils.rnn import pad_sequence, pack_sequence, pad_packed_sequence

def padded_collate(batch, padding_idx=0):
    x = pad_sequence([elem['text'] for elem in batch], batch_first=True, padding_value=padding_idx)
    y = pad_sequence([elem['next'] for elem in batch], batch_first=True, padding_value=padding_idx)
    
    return {'text': x, 'next': y}

In [0]:
model = DadJokesGenerator(wikipedia_dataset.text_encoder.vocab_size, 16, 64)

In [0]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [0]:
def train(model, dataset, n_epochs, lr=0.01, batch_size=32, model_checkpoint_folder=None):
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    criterion = nn.CrossEntropyLoss(ignore_index=0, reduction='elementwise_mean')
    
    train_sampler = BucketBatchSampler(dataset, batch_size, True, sort_key=lambda r: len(r['text']))
    data_loader = DataLoader(dataset, batch_sampler=train_sampler, collate_fn=padded_collate)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    
    plateau_scheduler = ReduceLROnPlateau(optimizer, 'min', patience=2, verbose=True)
    
    model = model.to(device).train()
    
    avg_loss = 0.0
    avg_mom=0.98
    losses = []
    for epoch in range(n_epochs):
        t = tqdm(iter(data_loader), leave=False, total=len(data_loader), miniters=0)
        epoch_losses = []
        for i, batch in enumerate(t):
            optimizer.zero_grad()

            label = batch['next'].to(device)
            pred = model(batch['text'].to(device))
            
            batch_sz, seq_len, n_chars = pred.shape
            
            loss = criterion(pred.view(batch_sz * seq_len, -1), label.view(batch_sz * seq_len))
            
            loss_numeric = float(loss)
            epoch_losses.append(loss_numeric)
            avg_loss = avg_loss * avg_mom + loss_numeric * (1-avg_mom)
            debias_loss = avg_loss / (1 - avg_mom**(i+1))
            
            
            lrs = ",".join([str(param_group['lr']) for param_group in optimizer.param_groups])
            
            t.set_postfix(loss=debias_loss, 
                          learning_rate=lrs,
                         )

            loss.backward()
            # update weights using optimizer formula
            optimizer.step()
        
        epoch_loss = sum(epoch_losses) / len(data_loader)
        losses.append(epoch_loss)
        plateau_scheduler.step(epoch_loss)
        epoch_losses = []
        # save model
        if model_checkpoint_folder:
            model_save_path = model_checkpoint_folder + "model_epoch_{}.pt".format(epoch)
            torch.save(model, model_save_path)
    return losses
    
        

In [0]:
MODEL_CHECKPOINT_FOLDER = "checkpoints/"
if not os.path.isdir(MODEL_CHECKPOINT_FOLDER):
  os.makedirs(MODEL_CHECKPOINT_FOLDER)

In [0]:
!ls -lh checkpoints/

total 9.9M
-rw-r--r-- 1 root root 1004K Nov 18 01:54 model_epoch_0.pt
-rw-r--r-- 1 root root 1004K Nov 18 02:20 model_epoch_1.pt
-rw-r--r-- 1 root root 1004K Nov 18 02:46 model_epoch_2.pt
-rw-r--r-- 1 root root 1004K Nov 18 03:11 model_epoch_3.pt
-rw-r--r-- 1 root root 1004K Nov 18 03:35 model_epoch_4.pt
-rw-r--r-- 1 root root 1004K Nov 18 04:00 model_epoch_5.pt
-rw-r--r-- 1 root root 1004K Nov 18 04:26 model_epoch_6.pt
-rw-r--r-- 1 root root 1004K Nov 18 04:52 model_epoch_7.pt
-rw-r--r-- 1 root root 1004K Nov 18 05:16 model_epoch_8.pt
-rw-r--r-- 1 root root 1004K Nov 18 05:40 model_epoch_9.pt
-rw-r--r-- 1 root root   43K Nov 18 01:27 text_enc.pickle


In [0]:
import pickle
from google.colab import files

In [0]:
with open(MODEL_CHECKPOINT_FOLDER + "text_enc.pickle", 'wb') as fout:
    pickle.dump(wikipedia_dataset.text_encoder.stoi, fout)
files.download(MODEL_CHECKPOINT_FOLDER + "text_enc.pickle")

In [0]:
losses = train(model, 
               wikipedia_dataset, 
               30, 
               lr=0.01, 
               batch_size=8, 
               model_checkpoint_folder=MODEL_CHECKPOINT_FOLDER
              )



  "type " + obj.__name__ + ". It won't be checked "


  8%|▊         | 4339/52409 [02:00<21:24, 37.43it/s, learning_rate=0.01, loss=2.72]Buffered data was truncated after reaching the output size limit.

In [0]:
model_save_path = MODEL_CHECKPOINT_FOLDER + "model_checkpoint_{}.pt".format('wikipedia')
torch.save(model, model_save_path)
files.download(model_save_path)

In [0]:
!ls -lh {MODEL_CHECKPOINT_FOLDER}

In [0]:
model = torch.load(MODEL_CHECKPOINT_FOLDER + "model_epoch_19.pt")

In [0]:
model

In [0]:
from matplotlib import pyplot as plt
plt.plot(losses)

## Now train on the actual dataset

In [0]:
dataset = DadJokesDataset(DATASET_PATH)

In [0]:
dataset.text_encoder.stoi['a']

# Generate

In [0]:
# initialize the hidden state.
starting_char = 'h'

def generation_forward(starting_char, model, max_timesteps=1000):
    letter = torch.LongTensor([dataset.text_encoder.stoi[starting_char]]).unsqueeze(0)
    generated = [starting_char]
    i = 0
    while i < max_timesteps:
        # letter of shape [1, sq_len]
        x = model.char_embedder(letter)  # -> [1, sq_len, char_embs_dim]
        if i == 0:
            x, hidden = model.lstm(x)
        # Step through the sequence one element at a time.
        # after each step, hidden contains the hidden state.
        else:
            x, hidden = model.lstm(x, hidden)
        prediction = F.softmax(model.projection(x), -1)

        confidence, letter = prediction.max(-1)

        next_letter = dataset.text_encoder.decode(letter)

        generated.append(next_letter)
        if next_letter == '</s>':
            break
        i += 1
    return generated

In [0]:
def tell_me_a_joke_starting_with(s):
    start = []
    for c in list(s):
        start.append(dataset.text_encoder.stoi[c])
    # transform into tensor
    encoded_joke = get_joke(start)
    joke = dataset.text_encoder.stoi(encoded_joke)
    return joke

def get_joke(start):
    cur = start
    while cur[-1] != tensor char of eos:
        output = model(cur)
        chars_prob = F.softmax(model())
        next_char = get_max_char(all_chars)
        cur.append(next_char)
    return cur

In [0]:
dataset.text_encoder.vocab_size

In [0]:
sample = dataset.text_encoder.encode('a').unsqueeze(0)

logits = model(sample)
#logits.shape

In [0]:
sample

In [0]:
out = F.softmax(logits, -1)
out.shape

In [0]:
confidence, values = out.max(-1)
values

In [0]:
value

In [0]:
b

In [0]:
dataset.text_encoder.encode('a')

In [0]:
dataset.text_encoder.stoi['a']

In [0]:
tell_me_a_joke_starting_with('a')

In [0]:
import numpy as np
start = np.asarray([10])
torch.unsqueeze((torch.FloatTensor([torch.FloatTensor(start)])), 0).shape

In [0]:
model((torch.FloatTensor([torch.FloatTensor(start)])))

In [0]:
dataset.text_encoder.itos[2]

In [0]:
torch.FloatTensor