In [1]:
import torch
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import *
from typing import List, Tuple
import os
import csv

In [2]:
from torchnlp.samplers import BucketBatchSampler
from torchnlp.datasets import snli_dataset
from torchnlp.utils import datasets_iterator
from torchnlp.text_encoders import IdentityEncoder, CharacterEncoder, StaticTokenizerEncoder
from torchnlp import word_to_vector

In [3]:
import sys
import tqdm as tq
from tqdm import tqdm_notebook, tnrange


def in_ipynb():
    try:
        cls = get_ipython().__class__.__name__
        return cls == 'ZMQInteractiveShell'
    except NameError:
        return False


def in_notebook():
    try:
        from ipykernel.kernelapp import IPKernelApp
        return IPKernelApp.initialized()
    except ImportError:
        return False


def clear_tqdm():
    inst = getattr(tq.tqdm, '_instances', None)
    if not inst: return
    try:
        for i in range(len(inst)): inst.pop().close()
    except Exception:
        pass

if in_notebook():
    def tqdm(*args, **kwargs):
        clear_tqdm()
        return tq.tqdm(*args, file=sys.stdout, **kwargs)
    def trange(*args, **kwargs):
        clear_tqdm()
        return tq.trange(*args, file=sys.stdout, **kwargs)
else:
    from tqdm import tqdm, trange
    tnrange=trange
    tqdm_notebook=tqdm

In [4]:
class CharacterByteEncoder(StaticTokenizerEncoder):
    """ Encodes text into a tensor by encoding into bytes and splitting the text into individual 
    characters.

    Args:
        sample (list of strings): Sample of data to build dictionary on
        min_occurrences (int, optional): Minimum number of occurrences for a token to be added to
          dictionary.
        append_eos (bool, optional): If `True` append EOS token onto the end to the encoded vector.
    """
    # TODO add start and end token
    def __init__(self, *args, **kwargs):
        if 'tokenize' in kwargs:
            raise TypeError('CharacterEncoder defines a tokenize callable per character')
        super().__init__(*args, tokenize=self._tokenize, **kwargs)

    def decode(self, tensor):
        tokens = [self.itos[index] for index in tensor]
        return ''.join(tokens)
    
    def _tokenize(self, s):
        return [c for c in s.encode()]

In [5]:
DATASET_PATH = "DadJokes/shortjokes.csv"
# DATASET_PATH = "DadJokes/shortjokes_head.csv"

In [6]:
class DadJokesDataset(Dataset):
    def __init__(self, path):
        self.data = pd.read_csv(path, sep=',')['Joke'].tolist()
        self.text_encoder = CharacterEncoder(self.data, append_eos=True)
        self.samples = []
        for _ in range(len(self.data)):
            joke = self.data.pop()
            self.samples.append(self.generate_language_model_samples(self.text_encoder.encode(joke)))
        del self.data        
        
    def generate_language_model_samples(self, joke):
        res = {}
        res['text'] = joke[:-1]
        res['next'] = joke[1:]
        return res
        
    def __len__(self):
        return len(self.samples)
    
    def __getitem__(self, i):
        return self.samples[i]
    
    def getitem_readable(self, i):
        return {'text': self.text_encoder.decode(self.samples[i]['text']),
               'next': self.text_encoder.decode(self.samples[i]['next'])}
    

In [7]:
dataset = DadJokesDataset(DATASET_PATH)

In [8]:
class DadJokesGenerator(nn.Module):
    def __init__(self, n_chars, char_embedding_dim, lstm_dim, n_lstm_layers=1, dropout=0):
        super().__init__()
        self.n_chars = n_chars
        self.char_embedding_dim = char_embedding_dim
        
        # initialize lookup table of fixed dictionary and size, 0 = padding idx
        self.char_embedder = nn.Embedding(self.n_chars, self.char_embedding_dim, padding_idx=0)
        
        # lstm_dim = size of one lstm hidden layer
        # n_lstm_layers = number of stacked lstms
        self.lstm = nn.LSTM(char_embedding_dim, lstm_dim, n_lstm_layers, batch_first=True, dropout=dropout)
        
        # creates a pipeline to transform the output of the lstm into a list where each position is a char idx
        # and its values can be projected by ReLU to be the confidence for each character
        self.projection = nn.Sequential(*[
            nn.Linear(lstm_dim, self.n_chars),
            nn.ReLU(),
        ])
        
    def forward(self, char_ids):
        # char_ids of shape [b_sz, max_sq_len (w/ pads)]
        x = self.char_embedder(char_ids)  # -> shape [b_sz, max_sq_len, char_emb_sz]
        x, _ = self.lstm(x)  # -> shape [b_sz, max_sq_len, lstm_dim]  [b_sz, max_sq_len * lstm_dim]
        
        return self.projection(x)  # -> shape [b_sz, max_sq_len, total_num_characters]
        

In [9]:
from torch.nn.utils.rnn import pad_sequence, pack_sequence, pad_packed_sequence

def padded_collate(batch, padding_idx=0):
    x = pad_sequence([elem['text'] for elem in batch], batch_first=True, padding_value=padding_idx)
    y = pad_sequence([elem['next'] for elem in batch], batch_first=True, padding_value=padding_idx)
    
    return {'text': x, 'next': y}

In [10]:
model = DadJokesGenerator(dataset.text_encoder.vocab_size, 16, 64)

In [11]:
def train(model, dataset, n_epochs, lr=0.01, batch_size=32, model_checkpoint_folder=None):
    criterion = nn.CrossEntropyLoss(ignore_index=0, reduction='elementwise_mean')
    
    train_sampler = BucketBatchSampler(dataset, batch_size, True, sort_key=lambda r: len(r['text']))
    data_loader = DataLoader(dataset, batch_sampler=train_sampler, collate_fn=padded_collate)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    
    plateau_scheduler = ReduceLROnPlateau(optimizer, 'min', patience=2, verbose=True)
    
    avg_loss = 0.0
    avg_mom=0.98
    losses = []
    for epoch in range(n_epochs):
        t = tqdm(iter(data_loader), leave=False, total=len(data_loader), miniters=0)
        epoch_losses = []
        for i, batch in enumerate(t):
            optimizer.zero_grad()

            label = batch['next']
            pred = model(batch['text'])
            
            batch_sz, seq_len, n_chars = pred.shape
            
            loss = criterion(pred.view(batch_sz * seq_len, -1), label.view(batch_sz * seq_len))
            
            loss_numeric = float(loss)
            epoch_losses.append(loss_numeric)
            avg_loss = avg_loss * avg_mom + loss_numeric * (1-avg_mom)
            debias_loss = avg_loss / (1 - avg_mom**(i+1))
            
            # ??
            lrs = ",".join([str(param_group['lr']) for param_group in optimizer.param_groups])
            # ??
            t.set_postfix(loss=debias_loss, 
                          learning_rate=lrs,
                         )

            loss.backward()
            # update weights using optimizer formula
            optimizer.step()
        
        epoch_loss = sum(epoch_losses) / len(data_loader)
        losses.append(epoch_loss)
        plateau_scheduler.step(epoch_loss)
        epoch_losses = []
        # save model
        if model_checkpoint_folder:
            torch.save(model, model_checkpoint_folder + "model_epoch_{}.pt".format(epoch))
    return losses
    
        

In [12]:
MODEL_CHECKPOINT_FOLDER = "/Users/ludovica/Documents/Training_problems/NN/checkpoints/"

In [13]:
import pickle

In [15]:
with open(MODEL_CHECKPOINT_FOLDER + "text_enc.pickle", 'wb') as fout:
    pickle.dump(dataset.text_encoder.stoi, fout)

In [None]:
with open(MODEL_CHECKPOINT_FOLDER + "text_enc.pickle", 'wb') as fout:
    pickle.dump(dataset.text_encoder.stoi, fout)

In [16]:
losses = train(model, 
               dataset, 
               20, 
               lr=0.01, 
               batch_size=32, 
               model_checkpoint_folder=MODEL_CHECKPOINT_FOLDER
              )

                                                                                  

  "type " + obj.__name__ + ". It won't be checked "


Epoch     8: reducing learning rate of group 0 to 1.0000e-03.                     
                                                                                   

In [17]:
!ls -lh {MODEL_CHECKPOINT_FOLDER}

total 5928
-rw-r--r--  1 ludovica  staff   134K Nov  8 10:41 model_epoch_0.pt
-rw-r--r--  1 ludovica  staff   134K Nov  8 10:50 model_epoch_1.pt
-rw-r--r--  1 ludovica  staff   134K Nov  8 12:13 model_epoch_10.pt
-rw-r--r--  1 ludovica  staff   134K Nov  8 12:23 model_epoch_11.pt
-rw-r--r--  1 ludovica  staff   134K Nov  8 12:34 model_epoch_12.pt
-rw-r--r--  1 ludovica  staff   134K Nov  8 12:43 model_epoch_13.pt
-rw-r--r--  1 ludovica  staff   134K Nov  8 12:53 model_epoch_14.pt
-rw-r--r--  1 ludovica  staff   134K Nov  8 13:04 model_epoch_15.pt
-rw-r--r--  1 ludovica  staff   134K Nov  8 13:14 model_epoch_16.pt
-rw-r--r--  1 ludovica  staff   134K Nov  8 13:24 model_epoch_17.pt
-rw-r--r--  1 ludovica  staff   134K Nov  8 13:34 model_epoch_18.pt
-rw-r--r--  1 ludovica  staff   134K Nov  8 13:45 model_epoch_19.pt
-rw-r--r--  1 ludovica  staff   134K Nov  8 10:59 model_epoch_2.pt
-rw-r--r--  1 ludovica  staff   134K Nov  8 11:07 model_epoch_3.pt
-rw-r--r--  1 ludovica  st

In [14]:
model = torch.load(MODEL_CHECKPOINT_FOLDER + "model_epoch_19.pt")

In [15]:
model

DadJokesGenerator(
  (char_embedder): Embedding(102, 16, padding_idx=0)
  (lstm): LSTM(16, 64, batch_first=True)
  (projection): Sequential(
    (0): Linear(in_features=64, out_features=102, bias=True)
    (1): ReLU()
  )
)

In [16]:
from matplotlib import pyplot as plt
plt.plot(losses)

NameError: name 'losses' is not defined

In [18]:
def generation_forward(starting_char, model, max_timesteps=1000):
    # initialize the hidden state.
    letter = torch.LongTensor([dataset.text_encoder.stoi[starting_char]]).unsqueeze(0)
    generated = [starting_char]
    i = 0
    while i < max_timesteps:
        # letter of shape [1, sq_len]
        x = model.char_embedder(letter)  # -> [1, sq_len, char_embs_dim]
        if i == 0:
            x, hidden = model.lstm(x)
        # Step through the sequence one element at a time.
        # after each step, hidden contains the hidden state.
        else:
            x, hidden = model.lstm(x, hidden)
        prediction = F.softmax(model.projection(x), -1)

        confidence, letter = prediction.max(-1)

        next_letter = dataset.text_encoder.decode(letter)

        generated.append(next_letter)
        if next_letter == '</s>':
            break
        i += 1
    return generated

In [19]:
# TODO add support for start of more than one letter
def tell_me_a_joke_starting_with(start):    
    return ''.join(generation_forward(start, model, 10))

In [20]:
print(tell_me_a_joke_starting_with('a'))
print('Uh...hilarious...')

a mat the s
Uh...hilarious...


As of now the generator does not work. Here are the next steps I intend to implement to improve it:
1 - Find more data
2 - Try dropout and a larger model
3 - Try different sampling scheme
4 - word encoding
5 - pretrain on language modeling on wikipedia data


Things that usually seems to be used in generative models but that I do not think make sense to use here:
1 - byte encoding
