In [1]:
#Libraries

#/usr/bin/python
from __future__ import print_function
import argparse
import pickle 
import numpy as np 
import os 
import math 
import random 
import sys
import pandas as pd
import matplotlib.pyplot as plt 
import scipy.io

In [2]:
from gensim.corpora import Dictionary

In [3]:
#Torch-bearer
import torch
from torch import nn, optim
from torch.nn import functional as F

from etm import ETM
from utils import nearest_neighbors, get_topic_coherence, get_topic_diversity

# 1. Data preprocessing 

In [4]:
abstracts = pd.read_csv("abstracts_eng.csv") #Replace with latest version
collection = list(abstracts['abstract'])
len(collection)

20494

#### Train-test sets

In [None]:
# NOTE: we need the whole collection for the embedding 
# Train and test (70-30)
print(len(documents))
set1_size = int(0.7*len(documents)) 
set2_size = int(0.3*len(documents)) 

random.shuffle(documents)

set1_docs = documents[0:set1_size]
set2_docs = documents[set1_size:]

len(set1_docs) + len(set2_docs) == len(documents)

In [8]:
# TODO: REWRITE THIS ONE for our new representation of the collection which is a list of lists of word ids
def get_batch(corpus, ind, vocab_size, device, emsize=300):
    """
    This function takes as input a corpus and a list of documents 
    and returns as output the torch tensor to feed into the net. 
    The list of documents defines the batch to work on. 
    """
    batch_size = len(ind)
    data_batch = np.zeros((batch_size, vocab_size))
    
    for i, doc_id in enumerate(ind):
        doc = corpus[doc_id]
        L = len(doc)
        if doc_id != -1:
            for unique_word in doc:
                data_batch[i, unique_word[0]] = unique_word[1]
    data_batch = torch.from_numpy(data_batch).float().to(device)
    return data_batch

In [None]:
# testing the function correctly works 
get_batch(corpus1,[0,3,6,8],len(dictionary1))

# 2. The network components

## The embedding layer

---

In [5]:
# For the embeddings we'll use a pre-trained contextual model 
# Here we can play a bit to get a sense of its working 

#!pip install transformers 
from transformers import DistilBertTokenizerFast, DistilBertModel

Why DistilBer? <br>
The DistilBERT model was proposed in the blog post [Smaller, faster, cheaper, lighter: Introducing DistilBERT, a distilled version of BERT](https://medium.com/huggingface/distilbert-8cf3380435b5), and the paper DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter. DistilBERT is a small, fast, cheap and light Transformer model trained by distilling BERT base. It has 40% less parameters than bert-base-uncased, runs 60% faster while preserving over 95% of BERT’s performances as measured on the GLUE language understanding benchmark.

In [6]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased', return_dict=True)
inputs = tokenizer("Hello, we are three cool data scientists", return_tensors="pt")
outputs = model(**inputs)

In [11]:
outputs.last_hidden_state.size() # this is the embedding of the above sentence 

torch.Size([1, 10, 768])

---

Now there's a bit of work to do to prepare the embedding matrix. <br>
We want to use Bert to get the embedding for each word in our corpus. However, being Bert a contextual embedding we could end up with more than one embedding vector for each word. To solve this problem we associate each word a different token for each different embedding. 

In [7]:
# I will work during the developing phase on a subset of the collection because 
# otherwise the memory requirements will be to high.
# We should later apply the same computation that I apply here to all the batches inside the 
# collection.
subset_size = 10 
collection_subset = collection[0:subset_size]

In [8]:
tokenised_collection = tokenizer(collection_subset, return_tensors="pt", truncation=True, padding=True)
tokenised_collection["input_ids"][0] # we have added padding so that we can process the whole collection in a batch - we gain in speed

In [10]:
embedded_collection = model(**tokenised_collection)

Now we can finally start to build our vocabulary. <br>
We'll look at each word and add it to the vocabulary only if there's not already the same embedding_vector in the collection.

In [11]:
size = embedded_collection.last_hidden_state.size()
padding_size = size[1]
size

torch.Size([10, 371, 768])

In [61]:
idx2word = {} # vocabulary in the form (int,word) pairs
idx2bertIdx = {} # each index in our vocabulary is mapped to the Bert vocabulary index 
set_of_embeddings = set()


idx = 0 # initialise dictionary index 
new_token_ids = []

In [47]:
import time 

In [62]:
# the idea of this for loop is the following: 

# input : [[4535,564,2342,...],[423423,32432,...],...] sequence of bert tokens for our collection 
#         [[embedding1, embedding2, ...],[...],...] and respective embedding vectors 


# output: [[0, 1, 2, 1, 4, ...], [0, 4, ...], ...] sequence of our tokens for our collection 
#         [[ embedding of token 0 ]
#          [ embedding of token 1 ]
#          [ embedding of token 2 ]
#          [ embedding of token 3 ]
#          [ embedding of token 4 ]
#                   
#                    ...
#
#          [ embedding of last token ]]

# Note: different tokens in our vocabulary (say 1 and 3) can refer to the same word if the embedding vector is different 

# The model will take as input the new tokens' collection (in some processed form maybe) and the embedding matrix
start = time.time()
for i in range(subset_size):
    t1 = time.time()
    embedded_doc = embedded_collection.last_hidden_state[i][tokenised_collection["attention_mask"][i].bool()] # removing padding using the attention mask 
    tokens_ids = tokenised_collection["input_ids"][i]
    new_token_ids_doc = []
    for j,emb_vector in enumerate(embedded_doc): #TODO: add masking filtering here to avoid checking padding 
        token_id = tokens_ids[j].cpu().numpy() # bert current token 
        word = tokenizer.convert_ids_to_tokens([token_id])[0] # corresponding word
        if not any([torch.all(torch.eq(emb_vector, other)) for other in set_of_embeddings]): 
            # add new embedding to the set 
            set_of_embeddings.add(emb_vector)
            # increase the index and save the word in the dictionary 
            idx2word[idx] = word # save it in our vocabulary
            idx += 1
            new_token_ids_doc += [idx]
        else: # find the right id for the word and add it to our new tokenisation
            # invert the dictionary - no disambiguation problem in the else case 
            word2idx = {v: k for k, v in idx2word.items()}
            word_id = word2idx[word]
            new_token_ids_doc += [word_id]
    new_token_ids += [new_token_ids_doc]
    t2 = time.time()
    print("Document "+str(i)+" done. Time: "+str(round(t2-t1,2))+" s.")
end = time.time()
print("Total time: "+str(round(end-start,2))+" s.")

Document 0 done. Time: 0.07 s.
Document 1 done. Time: 0.8 s.
Document 2 done. Time: 4.17 s.
Document 3 done. Time: 7.89 s.
Document 4 done. Time: 4.96 s.
Document 5 done. Time: 12.99 s.
Document 6 done. Time: 16.36 s.
Document 7 done. Time: 2.05 s.
Document 8 done. Time: 11.96 s.
Document 9 done. Time: 8.27 s.
Total time: 69.53 s.


In [63]:
len(set_of_embeddings) == len(idx2word.keys())

True

In [None]:
# TODO: wrap the above code in an loop that goes through the whole collection of documents 
# in order to get the 'new_token_ids' list and 'set_of_embeddings' set 
# for the whole collection (now it only represents the first few documents)

In [None]:
# TODO: build the embedding matrix from complete 'set_of_embeddings'

In [None]:
# TODO: adjust the get_batch function so as to get the documents x vocabulary matrix 
# required by the model starting from the complete 'new_token_ids' list

## The model!

Hyperparameters

In [None]:
### model-related arguments
num_topics = 50
rho_size = 300 # dimension of rho 
emb_size = 768 # dimension of embeddings 
t_hidden_size = 800 # dimension of hidden space of q(theta)
theta_act = 'relu' # either tanh, softplus, relu, rrelu, leakyrelu, elu, selu, glu
train_embeddings = False
vocab_size = ... # tbd  

### optimization-related arguments
lr = 0.005
lr_factor = 4.0 #divide learning rate by this
epochs = 20 
mode = "train"
seed = 11
enc_drop = 0.0 # dropout rate on encoder
clip = 0.0 # gradient clipping
nonmono = 10 # number of bad hits allowed ...?
weight_decay = 1.2e-6
anneal_lr = False # whether to anneal the learning rate or not
bow_norm = True # normalize the bows or not 
_optimizer = "adam"

### evaluation, visualization, and logging-related arguments
num_docs_train = set1_size
num_docs_test = set2_size
num_words = 10 #number of words for topic viz'
log_interval = 2 #when to log training
visualize_every = 10 #when to visualize results
tc = False # whether to compute topic coherence or not
td = False # whether to compute topic diversity or not

### data and file related arguments
save_path = './results'
batch_size = 1000
eval_batch_size = 1000 #input batch size for evaluation
load_from = "" #the name of the ckpt to run evaluation from


Random seed 

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Setting the random seed 
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)

Checkpoints

In [None]:
if not os.path.exists(save_path):
    os.makedirs(save_path)
if mode == 'eval':
    ckpt = load_from 
else:
    ckpt = os.path.join(save_path, 
        'etm_K_{}_Htheta_{}_Optim_{}_Clip_{}_ThetaAct_{}_Lr_{}_Bsz_{}_RhoSize_{}_trainEmbeddings_{}'.format(
         num_topics, t_hidden_size, optimizer, clip, theta_act, lr, batch_size, rho_size, train_embeddings))


Optimizer 

In [None]:
if _optimizer == 'adam':
    optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.wdecay)
elif _optimizer == 'adagrad':
    optimizer = optim.Adagrad(model.parameters(), lr=args.lr, weight_decay=args.wdecay)
elif _optimizer == 'adadelta':
    optimizer = optim.Adadelta(model.parameters(), lr=args.lr, weight_decay=args.wdecay)
elif _optimizer == 'rmsprop':
    optimizer = optim.RMSprop(model.parameters(), lr=args.lr, weight_decay=args.wdecay)
elif _optimizer == 'asgd':
    optimizer = optim.ASGD(model.parameters(), lr=args.lr, t0=0, lambd=0., weight_decay=args.wdecay)
else:
    print('Defaulting to vanilla SGD')
    optimizer = optim.SGD(model.parameters(), lr=args.lr)

Finally the model 

In [None]:
# define model
model = ETM(num_topics = num_topics, 
            vocab_size = vocab_size, #TODO: remove this 
            t_hidden_size = t_hidden_size, 
            rho_size = rho_size, 
            emb_size = emb_size, 
            theta_act = theta_act, 
            embeddings = embeddings, # TODO remove this 
            train_embeddings = train_embeddings, 
            enc_drop = enc_drop).to(device)

print('model: {}'.format(model))


# 3. Training

In [None]:
def train(epoch, num_docs_train, batch_size, vocab_size, bow_norm, clip, log_interval):
    """ Just the training function ... """
    
    model.train() #setting the model in training mode
    # preparing all the data structures 
    acc_loss = 0
    acc_kl_theta_loss = 0
    cnt = 0
    indices = torch.randperm(num_docs_train)
    indices = torch.split(indices, batch_size)
    
    for idx, ind in enumerate(indices): # all our batches 
        optimizer.zero_grad()
        model.zero_grad()
        # TODO: modify this one --------------
        data_batch = data.get_batch(train_tokens, train_counts, ind, vocab_size, device)
        # --------------------
        sums = data_batch.sum(1).unsqueeze(1) # what are we summing ?? 
        
        # maybe normalising the input 
        if bow_norm: normalized_data_batch = data_batch / sums
        else: normalized_data_batch = data_batch
        # loss on the batch 
        recon_loss, kld_theta = model(data_batch, normalized_data_batch)
        total_loss = recon_loss + kld_theta
        total_loss.backward() # compute backpropagation
        # maybe clip the gradient 
        if clip > 0: torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step() # finally update the weights 
        # accumulate the total loss 
        acc_loss += torch.sum(recon_loss).item()
        acc_kl_theta_loss += torch.sum(kld_theta).item()
        cnt += 1
        
        # visualisation/print time! ('cur' stands for current ...)
        if idx % log_interval == 0 and idx > 0:
            cur_loss = round(acc_loss / cnt, 2) 
            cur_kl_theta = round(acc_kl_theta_loss / cnt, 2) 
            cur_real_loss = round(cur_loss + cur_kl_theta, 2)
            print('Epoch: {} .. batch: {}/{} .. LR: {} .. KL_theta: {} .. Rec_loss: {} .. NELBO: {}'.format(
                epoch, idx, len(indices), optimizer.param_groups[0]['lr'], cur_kl_theta, cur_loss, cur_real_loss))
    
    # Wrapping up the results of the epoch! 
    cur_loss = round(acc_loss / cnt, 2) 
    cur_kl_theta = round(acc_kl_theta_loss / cnt, 2) 
    cur_real_loss = round(cur_loss + cur_kl_theta, 2)
    print('*'*100)
    print('Epoch----->{} .. LR: {} .. KL_theta: {} .. Rec_loss: {} .. NELBO: {}'.format(
            epoch, optimizer.param_groups[0]['lr'], cur_kl_theta, cur_loss, cur_real_loss))
    print('*'*100)

In [None]:
def visualize(m, num_topics, num_words, vocab, show_emb=True):
    """ This is a cool visualisation function. 
    Takes as input the model so far and shows the discovered embeddings! """
    
    # We're going to save our results here 
    # TODO: parametrize this path 
    if not os.path.exists('./results'):
        os.makedirs('./results')

    m.eval() #set the net in evaluation mode 
    # set a few words to query 
    queries = ['andrew', 'computer', 'sports', 'religion', 'man', 'love', 
                'intelligence', 'money', 'politics', 'health', 'people', 'family']

    ## visualize topics using monte carlo (sampling from the posterior I guess)
    with torch.no_grad(): # no gradients computation - makes forward pass lighter 
        print('#'*100)
        print('Visualize topics...')
        topics_words = []
        gammas = m.get_beta() # topics distributions 
        for k in range(num_topics):
            gamma = gammas[k]
            top_words = list(gamma.cpu().numpy().argsort()[-num_words+1:][::-1])
            topic_words = [vocab[a] for a in top_words] 
            topics_words.append(' '.join(topic_words))
            print('Topic {}: {}'.format(k, topic_words))

        if show_emb:
            ## visualize word embeddings by using V to get nearest neighbors
            print('#'*100)
            print('Visualize word embeddings by using output embedding matrix')
            
            # extract the embeddings from the model! 
            try:embeddings = m.rho.weight  # Vocab_size x E
            except:embeddings = m.rho         # Vocab_size x E
            
            for word in queries:
                print('word: {} .. neighbors: {}'.format(
                    word, nearest_neighbors(word, embeddings, vocab))) # utility function 
            print('#'*100)

In [None]:
def evaluate(m, num_docs_test=num_docs_test, tc=tc, td=td, eval_batch_size=eval_batch_size, vocab_size=vocab_size, bow_norm=bow_norm):
    """
    Evaluating the trained model on the test set using either perplexity, or coherence and diversity. 
    Compute perplexity on document completion.
    """
    
    m.eval() # set model in evaluation mode 
    # load the data ... TODO change this 
    with torch.no_grad():
        indices = torch.split(torch.tensor(range(num_docs_test)), eval_batch_size)
        
        ## get \beta here
        beta = m.get_beta()

        ### do dc and tc here
        acc_loss = 0
        cnt = 0
        
        for idx, ind in enumerate(indices):
            data_batch = data.get_batch(ind, vocab_size, device)
            sums = data_batch.sum(1).unsqueeze(1)
            if bow_norm: normalized_data_batch = data_batch / sums
            else: normalized_data_batch = data_batch
                
            ## get theta
            theta, _ = m.get_theta(normalized_data_batch)
            ## get prediction loss
            res = torch.mm(theta, beta)
            preds = torch.log(res)
            recon_loss = -(preds * data_batch).sum(1)
            loss = recon_loss / sums.squeeze()
            loss = loss.mean().item()
            acc_loss += loss
            cnt += 1
        
        # Calculate final loss 
        cur_loss = acc_loss / cnt
        ppl_dc = round(math.exp(cur_loss), 1)
        print('*'*100)
        print('{} Doc Completion PPL: {}'.format(source.upper(), ppl_dc))
        print('*'*100)
        
        
        if tc or td: # calculate topic coherence or topic diversity 
            beta = beta.data.cpu().numpy()
            if tc:
                print('Computing topic coherence...')
                get_topic_coherence(beta, train_tokens, vocab)
            if td:
                print('Computing topic diversity...')
                get_topic_diversity(beta, 25)
        return ppl_dc

Here is the code that actually launches the training 

In [None]:
# Initialising the data structures 
best_epoch = 0
best_val_ppl = 1e9
all_val_ppls = []

# Let's get a sense of how bad the model is before training 
print('\n')
print('Visualizing model quality before training...')
visualize(model)
print('\n')


for epoch in range(1, epochs):
    
    train(epoch) # train 
    val_ppl = evaluate(model, 'val') # evaluate 
    
    # only saving the model if it's the best so far 
    if val_ppl < best_val_ppl: 
        with open(ckpt, 'wb') as f:
            torch.save(model, f)
        best_epoch = epoch 
        best_val_ppl = val_ppl
        
    else:
        ## check whether to anneal lr (aka decreasing it by a constant factor )
        lr = optimizer.param_groups[0]['lr']
        if args.anneal_lr and (len(all_val_ppls) > args.nonmono and val_ppl > min(all_val_ppls[:-args.nonmono]) and lr > 1e-5):
            optimizer.param_groups[0]['lr'] /= args.lr_factor
            
    #maybe visualise 
    if epoch % args.visualize_every == 0:
        visualize(model)
        
    #save perplexities 
    all_val_ppls.append(val_ppl)

And here's the code that launches the final evaluation

In [None]:
# load trained model and evaluate it  
with open(ckpt, 'rb') as f:
    model = torch.load(f)
model = model.to(device)
model.eval()

with torch.no_grad():
    ## ---------------
    ## Idea : get document completion perplexities
    test_ppl = evaluate(model) 

    ## ----------------
    ## Idea : get most used topics
    indices = torch.tensor(range(num_docs_train)) # training documents indices 
    indices = torch.split(indices, batch_size)
    #just initialising data structures 
    thetaAvg = torch.zeros(1, args.num_topics).to(device)
    thetaWeightedAvg = torch.zeros(1, args.num_topics).to(device)
    cnt = 0
    for idx, ind in enumerate(indices):
        data_batch = data.get_batch(ind, args.vocab_size, device) # TODO: fix here 
        sums = data_batch.sum(1).unsqueeze(1) 
        cnt += sums.sum(0).squeeze().cpu().numpy()
        # maybe normalise 
        if args.bow_norm:normalized_data_batch = data_batch / sums
        else: normalized_data_batch = data_batch
        # get the theta 
        theta, _ = model.get_theta(normalized_data_batch)
        thetaAvg += theta.sum(0).unsqueeze(0) / args.num_docs_train
        weighed_theta = sums * theta
        thetaWeightedAvg += weighed_theta.sum(0).unsqueeze(0)
        # let's print the progress as we go 
        if idx % 100 == 0 and idx > 0:
            print('batch: {}/{}'.format(idx, len(indices)))
    # finally the results are in 
    thetaWeightedAvg = thetaWeightedAvg.squeeze().cpu().numpy() / cnt
    print('\nThe 10 most used topics are {}'.format(thetaWeightedAvg.argsort()[::-1][:10]))

    # Now we show the topics
    # A nice visualisation is always welcome 
    beta = model.get_beta()
    topic_indices = list(np.random.choice(num_topics, 10)) # 10 random topics
    print('\n')
    for k in range(num_topics):#topic_indices:
        gamma = beta[k]
        top_words = list(gamma.cpu().numpy().argsort()[-num_words+1:][::-1])
        topic_words = [vocab[a] for a in top_words]
        print('Topic {}: {}'.format(k, topic_words))

    # Why not, also showing a few embeddings 
    if train_embeddings:
        # get embeddings from the model 
        try:rho_etm = model.rho.weight.cpu()
        except:rho_etm = model.rho.cpu()
        queries = ['andrew', 'woman', 'computer', 'sports', 'religion', 'man', 'love', 
                        'intelligence', 'money', 'politics', 'health', 'people', 'family']
        print('\n')
        print('ETM embeddings...')
        for word in queries:
            print('word: {} .. etm neighbors: {}'.format(word, nearest_neighbors(word, rho_etm, vocab)))
        print('\n')
