In [1]:
#Libraries

#/usr/bin/python
from __future__ import print_function
import time 
import argparse
import pickle 
import numpy as np 
import os 
import math 
import random 
import sys
import pandas as pd
import matplotlib.pyplot as plt 
import scipy.io

In [2]:
#Torch-bearer
import torch
from torch import nn, optim
from torch.nn import functional as F

from etm import ETM
from utils import *
from utils import nearest_neighbors, get_topic_coherence, get_topic_diversity

# 1. Data preprocessing 

In [3]:
abstracts = pd.read_csv("abstracts_eng.csv") #Replace with latest version
collection = list(abstracts['abstract'])
len(collection)

20494

#### Training in batches 
The idea is to simulate the real-time data stream.

In [4]:
seed = 11
random.seed(seed)
#random.shuffle(collection)

print(len(collection))
streaming_batch_size = 20
batch1 = collection[:streaming_batch_size]
batch2 = collection[streaming_batch_size:2*streaming_batch_size]
batch3 = collection[2*streaming_batch_size:3*streaming_batch_size]

20494


# 2. The network components

## The embedding layer

---

In [5]:
# For the embeddings we'll use a pre-trained contextual model 
# Here we can play a bit to get a sense of its working 

#!pip install transformers 
from transformers import DistilBertTokenizerFast, DistilBertModel

Why DistilBer? <br>
The DistilBERT model was proposed in the blog post [Smaller, faster, cheaper, lighter: Introducing DistilBERT, a distilled version of BERT](https://medium.com/huggingface/distilbert-8cf3380435b5), and the paper DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter. DistilBERT is a small, fast, cheap and light Transformer model trained by distilling BERT base. It has 40% less parameters than bert-base-uncased, runs 60% faster while preserving over 95% of BERTâ€™s performances as measured on the GLUE language understanding benchmark.

In [7]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased', return_dict=True, output_hidden_states=True)
inputs = tokenizer("Hello, we are three cool data scientists", return_tensors="pt")
outputs = model(**inputs)

In [8]:
outputs.last_hidden_state[0].data.cpu().numpy().shape

(10, 768)

In [9]:
outputs[1][3:6]

(tensor([[[ 0.1393, -0.5393,  0.1046,  ..., -0.3981,  0.6327,  0.3209],
          [ 0.5595, -0.1247,  0.4549,  ...,  0.5223,  0.4912, -0.2058],
          [ 0.3140, -0.2623,  0.0068,  ...,  0.2120,  0.1021,  0.7718],
          ...,
          [ 0.8004, -0.3209,  0.3904,  ...,  0.1422,  0.7812, -0.7698],
          [-0.0889,  0.3921, -0.5252,  ..., -0.1267,  0.3539,  0.0588],
          [-0.0177, -0.0416,  0.0169,  ..., -0.0171, -0.0211, -0.0194]]],
        grad_fn=<NativeLayerNormBackward>),
 tensor([[[ 0.0403, -0.4845, -0.5697,  ..., -0.3822,  0.4573,  0.4213],
          [ 0.5752,  0.0935,  0.6670,  ...,  0.1831,  0.4715, -0.2758],
          [ 0.2022,  0.2348,  0.5722,  ..., -0.3376,  0.1674,  0.2746],
          ...,
          [ 0.6425, -0.2266,  0.1830,  ...,  0.0890,  0.7730, -0.3740],
          [ 0.1349,  0.5156, -0.4746,  ...,  0.2830,  0.0902, -0.4534],
          [-0.0149, -0.0101,  0.0296,  ..., -0.0533, -0.0553, -0.0175]]],
        grad_fn=<NativeLayerNormBackward>),
 tensor([[[ 0.

In [10]:
torch.sum(torch.stack(outputs[1][3:6], dim=0), dim=0).size()

torch.Size([1, 10, 768])

---

Now there's a bit of work to do to prepare the embedding matrix. <br>
We want to use Bert to get the embedding for each word in our corpus. However, being Bert a contextual embedding we could end up with more than one embedding vector for each word. To solve this problem we associate each word a different token for each different embedding. 

In [22]:
# I will work during the developing phase on a subset of the collection because 
# otherwise the memory requirements will be to high.
# We should later apply the same computation that I apply here to all the batches inside the 
# collection.
subset_size = 10 # len(collection)
collection_subset = collection[0:subset_size]

In [None]:
tokenised_collection = tokenizer(collection_subset, return_tensors="pt", truncation=False, padding=True)
tokenised_collection["input_ids"][0] # we have added padding so that we can process the whole collection in a batch - we gain in speed

In [None]:
embedded_collection = model(**tokenised_collection)

Now we can finally start to build our vocabulary. <br>
We'll look at each word and add it to the vocabulary only if there's not already the same embedding_vector in the collection.

In [None]:
size = embedded_collection.last_hidden_state.size()
padding_size = size[1]
size

In [None]:
idx2word = {} # vocabulary in the form (int,word) pairs
idx2bertIdx = {} # each index in our vocabulary is mapped to the Bert vocabulary index 
set_of_embeddings = set()


idx = 0 # initialise dictionary index 
new_token_ids = []

In [None]:
# the idea of this for loop is the following: 

# input : [[4535,564,2342,...],[423423,32432,...],...] sequence of bert tokens for our collection 
#         [[embedding1, embedding2, ...],[...],...] and respective embedding vectors 


# output: [[0, 1, 2, 1, 4, ...], [0, 4, ...], ...] sequence of our tokens for our collection 
#         [[ embedding of token 0 ]
#          [ embedding of token 1 ]
#          [ embedding of token 2 ]
#          [ embedding of token 3 ]
#          [ embedding of token 4 ]
#                   
#                    ...
#
#          [ embedding of last token ]]

# Note: different tokens in our vocabulary (say 1 and 3) can refer to the same word if the embedding vector is different 

# The model will take as input the new tokens' collection (in some processed form maybe) and the embedding matrix
start = time.time()
for i in range(subset_size):
    t1 = time.time()
    embedded_doc = embedded_collection.last_hidden_state[i][tokenised_collection["attention_mask"][i].bool()] # removing padding using the attention mask 
    tokens_ids = tokenised_collection["input_ids"][i]
    new_token_ids_doc = []
    for j,emb_vector in enumerate(embedded_doc):
        token_id = tokens_ids[j].cpu().numpy() # bert current token 
        word = tokenizer.convert_ids_to_tokens([token_id])[0] # corresponding word
        bool_list = [torch.all(torch.eq(emb_vector, other)) for other in set_of_embeddings]
        if not any(bool_list): 
            # add new embedding to the set 
            set_of_embeddings.add(emb_vector)
            # increase the index and save the word in the dictionary 
            idx2word[idx] = word # save it in our vocabulary
            idx2bertIdx[idx] = token_id # save in id to bert id mapping
            new_token_ids_doc += [idx]
            idx += 1
        else: # find the right id for the word and add it to our new tokenisation
            word_id = list(idx2word.values()).index(word)
            new_token_ids_doc += [word_id]
    new_token_ids += [new_token_ids_doc]
    t2 = time.time()
    print("Document "+str(i)+" done. Time: "+str(round(t2-t1,2))+" s.")
end = time.time()
print("Total time: "+str(round(end-start,2))+" s.")

In [None]:
len(set_of_embeddings) == len(idx2word.keys())

---

Now we write the loop that will process one batch in our collection. 

We also add 2 functionalities here: 
- **Cosine similarity clustering**. We add a filtering of the embedding vectors for each word based on cosine similarity, i.e. we only keep one of the vectors that are "*too close*" to each other. 
- **Stop-words removal**. We avoid to represent the stop-word in our embedding since they wouldn't anyway carry meaning for the topic and they would occupy a lot of memory since for [this](http://ai.stanford.edu/blog/contextual/) article they tend to be the most context dependent words in contextual embeddings.  

In [59]:
from nltk.corpus import stopwords
#TODO: remove punctuation
stop_words = stopwords.words('english') # note: the words in this list are only lower case but distilbert tokenizer incorporates lower casing so we should be fine! read more here: https://huggingface.co/transformers/_modules/transformers/tokenization_distilbert_fast.html

In [283]:
def process_subset_cosine(doc_subset, tokenizer, model, set_of_embeddings, 
                          idx2word, new_token_ids, threshold=0.9):
    """ 
    Processing of a subset of the batch using cosine similarity clustering. 
    
    Parameters 
    ----- 
    doc_subset: list of documents (aka list of strings)
    tokenizer: instance of Bert tokenizer 
    model: instance of Bert model 
    set_of_embeddings: set containing the embedding vectors already in the vocabulary 
    idx2word: vocabulary mapping our token ids to the corresponding word. 
            Notice that each word can be mapped to multiple token ids.
    new_token_ids: representation of the collection with our token ids. 
    threshold: cosine similarity threshold. 
            Vectors with cosine similarity above the threshold are considered equal. 
            
    Returns 
    -----
    Updated versions of set_of_embeddings, idx2word and new_token_ids
    
    """
    
    tokenised_collection = tokenizer(doc_subset, return_tensors="pt", padding=True)
    print("tokenisation done")
    embedded_collection = model(**tokenised_collection)
    embedded_collection.requires_grad = False
    # extract lower layers hidden states
    #lower_hiddens = torch.sum(torch.stack(embedded_collection[1][3:6], dim=0), dim=0)
    lower_hiddens = embedded_collection[1][6]

    print("embeddings done")
    
    
    ##  preparing the variables we need ---------
    if len(idx2word) == 0:idx = 0
    else:idx = len(set_of_embeddings)
        
    cos = torch.nn.CosineSimilarity(dim = 0)
    
    subset_size = len(doc_subset)
    start = time.time()
    
    ## processing the collection document by document ----------
    for i in range(subset_size):
        t1 = time.time()
        embedded_doc = lower_hiddens[i][tokenised_collection["attention_mask"][i].bool()] # removing padding using the attention mask 
        tokens_ids = tokenised_collection["input_ids"][i]
        new_token_ids_doc = []
        
        
        for j,emb_vector in enumerate(embedded_doc):
            
            token_id = tokens_ids[j].cpu().numpy() # bert current token 
            word = tokenizer.convert_ids_to_tokens([token_id])[0] # corresponding word
            # jump to the next token if the word is a stopword 
            if word in stop_words: continue 
            
            if word not in idx2word.values(): # we add the embedding anyway if we haven't encountered that word previously 
                # add new embedding to the set 
                set_of_embeddings.add(emb_vector)
                # increase the index and save the word in the dictionary 
                idx2word[idx] = word # save it in our vocabulary
                new_token_ids_doc += [idx]
                idx += 1
            else: # find the right id for the word and add it to our new tokenisation
                word_occurrences = [position for position, v in enumerate(list(idx2word.values())) if v == word]
                word_embeddings = [list(set_of_embeddings)[occ] for occ in word_occurrences]
                bool_list = [cos(emb_vector, other) >= threshold for other in word_embeddings] 
                if not any(bool_list): 
                    # add new embedding to the set 
                    set_of_embeddings.add(emb_vector)
                    # increase the index and save the word in the dictionary 
                    idx2word[idx] = word # save it in our vocabulary
                    new_token_ids_doc += [idx]
                    idx += 1
                else: 
                    word_id = list(idx2word.values()).index(word)
                    new_token_ids_doc += [word_id]
                
        new_token_ids += [new_token_ids_doc]
        t2 = time.time()
        #if i%(subset_size//3)==0:print("Document "+str(i)+" done. Time: "+str(round(t2-t1,2))+" s.")
            
    end = time.time()
    print("Total time for the subset: "+str(round(end-start,2))+" s.")
    
    return set_of_embeddings, idx2word, new_token_ids

In [284]:
# TESTING on a few example sentences 
idx2word = {} 
set_of_embeddings = set()
idx = 0 
new_token_ids = []

sen1 = "This mouse loves cheese"
sen2 = "I am using this mouse as a pointer"
sen3 = "This mouse is hungry"
doc = [sen1, sen2, sen3]
tokenised_collection = tokenizer(doc, return_tensors="pt", truncation=True, padding=True)
embedded_collection = model(**tokenised_collection)
set_of_embeddings, idx2word, new_token_ids = process_subset_cosine(doc, tokenizer, model, set_of_embeddings, idx2word, new_token_ids)

tokenisation done
embeddings done
Total time for the subset: 0.0 s.


In [285]:
def process_batch(batch, subset_size, tokeniser, model):
    """ Processing of a batch of documents in the collection. """
    
    ## initialisation 
    idx2word = {} 
    set_of_embeddings = set()
    idx = 0 
    new_token_ids = []
    subset_size = 25 
    streaming_batch_size =len(batch)
    
    start = time.time()

    ## processing the batch one subset at a time
    for s in range(0,streaming_batch_size,subset_size):
        print("Processing subset "+str(s//subset_size + 1))
        if s+subset_size < len(batch):batch_subset = batch[s:s+subset_size]
        else: batch_subset = batch[s:]
        set_of_embeddings, idx2word, new_token_ids = process_subset_cosine(batch_subset, tokenizer, model, 
                                                                           set_of_embeddings, idx2word, 
                                                                           new_token_ids, threshold=0.9)
        print("Number of word vectors so far: "+str(len(idx2word)))    
        print()
        
    end = time.time()
    print("Total time: "+str(round(end-start,2))+" s.")
    
    return set_of_embeddings, idx2word, new_token_ids

In [281]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? n
Nothing done.


In [286]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased', return_dict=True, output_hidden_states=True)
set_of_embeddings, idx2word, new_token_ids = process_batch(batch2, 25, tokenizer, model)

Processing subset 1
tokenisation done
embeddings done
Total time for the subset: 2.84 s.
Number of word vectors so far: 2641

Total time: 14.24 s.


In [287]:
# build the embedding matrix from complete 'set_of_embeddings'
embedding = torch.stack(list(set_of_embeddings))
embedding.size()

torch.Size([2641, 768])

In [288]:
vocab_output_path = "vocab_etm"
embedding_output_path = "embedding_etm"
new_collection_output_path = "new_collection_etm"

In [289]:
# Saving to binary 
with open(vocab_output_path, "wb") as fp: 
    pickle.dump(idx2word, fp)
with open(embedding_output_path, "wb") as fp: 
    pickle.dump(embedding, fp)
with open(new_collection_output_path, "wb") as fp: 
    pickle.dump(new_token_ids, fp)

In [290]:
# 8.767.883 (approx (MB)) bytes for 2k words

In [291]:
# Loading from binary 
with open(vocab_output_path, "rb") as fp:  
    idx2word = pickle.load(fp)
with open(embedding_output_path, "rb") as fp: 
    embedding = pickle.load(fp)
with open(new_collection_output_path, "rb") as fp: 
    new_token_ids = pickle.load(fp)

--- 
So now we finally have a vocabulary and an embedding matrix: everything is ready for our model! 

In [292]:
vocab_size = len(idx2word)
vocab_size

2641

In [293]:
# adjust the get_batch function so as to get the documents x vocabulary matrix 
# required by the model starting from the complete 'new_token_ids' list

In [294]:
def get_batch(corpus, ind, vocab_size, device, emsize=300):
    """
    This function takes as input a list of tokenised documents (corpus)
    and the indices of the documents in the batch (ind)
    and returns as output the torch tensor to feed into the net. 
    The list of documents defines the batch to work on. 
    """
    batch_size = len(ind)
    data_batch = np.zeros((batch_size, vocab_size))
    
    for i, doc_id in enumerate(ind):
        doc = corpus[doc_id]
        L = len(doc)
        if doc_id != -1:
            for word_id in doc:
                counts = doc.count(word_id)
                data_batch[i, word_id] = counts
    data_batch = torch.from_numpy(data_batch).float().to(device)
    return data_batch

## The model!

Hyperparameters

In [295]:
### model-related arguments
num_topics = 5
rho_size = 768 # dimension of rho 
emb_size = 768 # dimension of embeddings 
t_hidden_size = 600 # dimension of hidden space of q(theta)
theta_act = 'relu' # either tanh, softplus, relu, rrelu, leakyrelu, elu, selu, glu
train_embeddings = False
seed = 11

### optimization-related arguments
lr = 0.05
lr_factor = 5.0 #divide learning rate by this
epochs = 100 
mode = "train"
enc_drop = 0.0 # dropout rate on encoder
clip = 0.0 # gradient clipping
nonmono = 10 # number of bad hits allowed ...?
weight_decay = 1.2e-6
anneal_lr = False # whether to anneal the learning rate or not
bow_norm = True # normalize the bows or not 
_optimizer = "adam"

### evaluation, visualization, and logging-related arguments
num_docs_train = 18
num_words = 5 #number of words for topic viz'
log_interval = 2 #when to log training
visualize_every = 10 #when to visualize results
tc = False # whether to compute topic coherence or not
td = False # whether to compute topic diversity or not

### data and file related arguments
save_path = './results'
batch_size = 20
eval_batch_size = 20 #input batch size for evaluation
load_from = "" #the name of the ckpt to run evaluation from


Random seed 

In [296]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Setting the random seed 
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)

Checkpoints

In [297]:
if not os.path.exists(save_path):
    os.makedirs(save_path)
if mode == 'eval':
    ckpt = load_from 
else:
    ckpt = os.path.join(save_path, 
        'etm_K_{}_Htheta_{}_Optim_{}_Clip_{}_ThetaAct_{}_Lr_{}_Bsz_{}_RhoSize_{}_trainEmbeddings_{}'.format(
         num_topics, t_hidden_size, _optimizer, clip, theta_act, lr, batch_size, rho_size, train_embeddings))


Optimizer 

Finally the model 

In [298]:
# define model
etm_model = ETM(num_topics = num_topics, 
            vocab_size = vocab_size, 
            t_hidden_size = t_hidden_size, 
            rho_size = rho_size, 
            emsize = emb_size, 
            theta_act = theta_act, 
            embeddings = embedding,
            train_embeddings = train_embeddings, 
            enc_drop = enc_drop).to(device)

print('model: {}'.format(etm_model))


model: ETM(
  (t_drop): Dropout(p=0.0, inplace=False)
  (theta_act): ReLU()
  (alphas): Linear(in_features=768, out_features=5, bias=False)
  (q_theta): Sequential(
    (0): Linear(in_features=2641, out_features=600, bias=True)
    (1): ReLU()
    (2): Linear(in_features=600, out_features=600, bias=True)
    (3): ReLU()
  )
  (mu_q_theta): Linear(in_features=600, out_features=5, bias=True)
  (logsigma_q_theta): Linear(in_features=600, out_features=5, bias=True)
)


In [299]:
if _optimizer == 'adam':
    optimizer = optim.Adam(etm_model.parameters(), lr=lr, weight_decay=lr_factor)
elif _optimizer == 'adagrad':
    optimizer = optim.Adagrad(etm_model.parameters(), lr=lr, weight_decay=lr_factor)
elif _optimizer == 'adadelta':
    optimizer = optim.Adadelta(etm_model.parameters(), lr=lr, weight_decay=lr_factor)
elif _optimizer == 'rmsprop':
    optimizer = optim.RMSprop(etm_model.parameters(), lr=lr, weight_decay=lr_factor)
elif _optimizer == 'asgd':
    optimizer = optim.ASGD(etm_model.parameters(), lr=lr, t0=0, lambd=0., weight_decay=lr_factor)
else:
    print('Defaulting to vanilla SGD')
    optimizer = optim.SGD(etm_model.parameters(), lr=lr)

# 3. Training

In [300]:
train_corpus = new_token_ids[:num_docs_train]
test_corpus = new_token_ids[num_docs_train:]

In [306]:
def train(model, epoch, corpus, num_docs_train=num_docs_train, batch_size=batch_size, vocab_size=vocab_size, 
          bow_norm=bow_norm, clip=clip, log_interval=log_interval):
    """ Just the training function ... """
    
    model.train() #setting the model in training mode
    # preparing all the data structures 
    acc_loss = 0
    acc_kl_theta_loss = 0
    cnt = 0
    indices = torch.randperm(num_docs_train)
    indices = torch.split(indices, batch_size)
    
    for idx, ind in enumerate(indices): # all our batches 
        optimizer.zero_grad()
        data_batch = get_batch(corpus, ind, vocab_size, device)
        sums = data_batch.sum(1).unsqueeze(1) # what are we summing ?? 
        
        # maybe normalising the input 
        if bow_norm: normalized_data_batch = data_batch / sums
        else: normalized_data_batch = data_batch
        # loss on the batch 
        recon_loss, kld_theta = model(data_batch, normalized_data_batch)
        total_loss = recon_loss + kld_theta
        total_loss.backward(retain_graph=True) # compute backpropagation
        # maybe clip the gradient 
        if clip > 0: torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step() # finally update the weights 
        # accumulate the total loss 
        acc_loss += torch.sum(recon_loss).item()
        acc_kl_theta_loss += torch.sum(kld_theta).item()
        cnt += 1
        
        # visualisation/print time! ('cur' stands for current ...)
        if idx % log_interval == 0 and idx > 0:
            cur_loss = round(acc_loss / cnt, 2) 
            cur_kl_theta = round(acc_kl_theta_loss / cnt, 2) 
            cur_real_loss = round(cur_loss + cur_kl_theta, 2)
            print('Epoch: {} .. batch: {}/{} .. LR: {} .. KL_theta: {} .. Rec_loss: {} .. NELBO: {}'.format(
                epoch, idx, len(indices), optimizer.param_groups[0]['lr'], cur_kl_theta, cur_loss, cur_real_loss))
    
    # Wrapping up the results of the epoch! 
    cur_loss = round(acc_loss / cnt, 2) 
    cur_kl_theta = round(acc_kl_theta_loss / cnt, 2) 
    cur_real_loss = round(cur_loss + cur_kl_theta, 2)
    print('-'*50)
    print('Epoch----->{} .. LR: {} .. KL_theta: {} .. Rec_loss: {} .. NELBO: {}'.format(
            epoch, optimizer.param_groups[0]['lr'], cur_kl_theta, cur_loss, cur_real_loss))

In [302]:
def visualize(m, num_topics=num_topics, num_words=num_words, 
              vocab=idx2word, show_emb=True, tokenizer=tokenizer, bert_model=model):
    """ This is a cool visualisation function. 
    Takes as input the model so far and shows the discovered embeddings! """
    
    # We're going to save our results here 
    # TODO: parametrize this path 
    if not os.path.exists('./results'):
        os.makedirs('./results')

    m.eval() #set the net in evaluation mode 
    # set a few words to query 
    queries = ['insurance', 'weather', 'particles', 'religion', 'man', 'love', 
                'intelligence', 'money', 'politics', 'health', 'people', 'family']

    ## visualize topics using monte carlo (sampling from the posterior I guess)
    with torch.no_grad(): # no gradients computation - makes forward pass lighter 
        print('-'*50)
        print('Visualize topics...')
        topics_words = []
        gammas = m.get_beta() # topics distributions 
        for k in range(num_topics):
            gamma = gammas[k]
            top_words = list(gamma.cpu().numpy().argsort()[-num_words+1:][::-1])
            topic_words = [vocab[a] for a in top_words] 
            topics_words.append(' '.join(topic_words))
            print('Topic {}: {}'.format(k, topic_words))

        if show_emb:
            ## visualize word embeddings by using V to get nearest neighbors
            print('-'*50)
            print('Visualize word embeddings by using output embedding matrix')
            
            # extract the embeddings from the model! 
            try:embeddings = m.rho.weight  # Vocab_size x E
            except:embeddings = m.rho         # Vocab_size x E
            
            
            for word in queries:
                # extracting Bert representation of the word
                inputs = tokenizer(word, return_tensors="pt")
                outputs = bert_model(**inputs).last_hidden_state[0]
                outputs.requires_grad = False
                if outputs.size()[0]>1: #aggregate
                    outputs = torch.sum(outputs, dim=0)
                nns = utils.nearest_neighbors(q=outputs, 
                                        embeddings=embeddings, vocab=list(vocab.values()))
                print('word: {} .. neighbors: {}'.format(word, nns)) # utility function 

In [303]:
def evaluate(m, corpus, num_docs_test, tc=tc, td=td, 
             eval_batch_size=eval_batch_size, vocab_size=vocab_size, 
             bow_norm=bow_norm):
    """
    Evaluating the trained model on the test set using either perplexity, or coherence and diversity. 
    Compute perplexity on document completion.
    """
    
    m.eval() # set model in evaluation mode 
    with torch.no_grad():
        indices = torch.split(torch.tensor(range(num_docs_test)), eval_batch_size)
        
        ## get \beta here
        beta = m.get_beta()

        ### do dc and tc here
        acc_loss = 0
        cnt = 0
        
        for idx, ind in enumerate(indices):
            data_batch = get_batch(corpus, ind, vocab_size, device)
            sums = data_batch.sum(1).unsqueeze(1)
            if bow_norm: normalized_data_batch = data_batch / sums
            else: normalized_data_batch = data_batch
                
            ## get theta
            theta, _ = m.get_theta(normalized_data_batch)
            ## get prediction loss
            res = torch.mm(theta, beta)
            preds = torch.log(res)
            recon_loss = -(preds * data_batch).sum(1)
            loss = recon_loss / sums.squeeze()
            loss = loss.mean().item()
            acc_loss += loss
            cnt += 1
        
        # Calculate final loss 
        cur_loss = acc_loss / cnt
        ppl_dc = round(math.exp(cur_loss), 1)
        print('Eval Doc Completion PPL: {}'.format(ppl_dc))
        
        
        if tc or td: # calculate topic coherence or topic diversity 
            beta = beta.data.cpu().numpy()
            if tc:
                print('Computing topic coherence...')
                get_topic_coherence(beta, train_tokens, vocab)
            if td:
                print('Computing topic diversity...')
                get_topic_diversity(beta, 25)
        return ppl_dc

Here is the code that actually launches the training 

In [170]:
import importlib
import utils
utils = importlib.reload(utils)

In [307]:
# Initialising the data structures 
best_epoch = 0
best_val_ppl = 1e9
all_val_ppls = []

# Let's get a sense of how bad the model is before training 
print('\n')
print('Visualizing model quality before training...')
visualize(etm_model)
print('\n')


for epoch in range(1, epochs):
    
    train(etm_model, epoch, train_corpus) # train 
    num_docs_test = len(new_token_ids) - num_docs_train
    val_ppl = evaluate(etm_model, test_corpus, num_docs_test) # evaluate 
    
    # only saving the model if it's the best so far 
    if val_ppl < best_val_ppl: 
        with open(ckpt, 'wb') as f:
            torch.save(etm_model, f)
        best_epoch = epoch 
        best_val_ppl = val_ppl
        
    else:
        ## check whether to anneal lr (aka decreasing it by a constant factor )
        lr = optimizer.param_groups[0]['lr']
        if anneal_lr and (len(all_val_ppls) > nonmono and val_ppl > min(all_val_ppls[:-nonmono]) and lr > 1e-5):
            optimizer.param_groups[0]['lr'] /= lr_factor
            
    #maybe visualise 
    if epoch % visualize_every == 0:
        visualize(etm_model)
        
    #save perplexities 
    all_val_ppls.append(val_ppl)



Visualizing model quality before training...
--------------------------------------------------
Visualize topics...
Topic 0: ['plans', '##de', '##s', '##mis']
Topic 1: ['##ability', 'emerged', '##lizer', 'accepted']
Topic 2: ['##tical', '##gt', 'better', '##die']
Topic 3: ['.', '[CLS]', 'dispatch', '##mb']
Topic 4: ['behavior', ',', 'rhino', 'united']
--------------------------------------------------
Visualize word embeddings by using output embedding matrix
word: insurance .. neighbors: [',', 'behavior', 'individually', 'russell', 'uses', 'exchange', '##g', 'semi', 'case', 'approaches']
word: weather .. neighbors: [',', 'behavior', 'individually', 'approaches', 'uses', '##ens', 'exchange', 'st', 'res', 'remains']
word: particles .. neighbors: [',', 'uses', 'semi', 'res', 'behavior', 'system', 'carried', 'target', 'remains', 'russell']
word: religion .. neighbors: [',', 'behavior', 'uses', 'train', 'target', 'individually', 'exchange', 'united', 'semi', 'real']
word: man .. neighbor

word: politics .. neighbors: [',', 'behavior', 'train', 'united', ',', 'railway', '##e', 'po', 'tv', '-']
word: health .. neighbors: [',', 'behavior', 'train', 'united', '-', 'developed', 'po', 'railway', '##e', ',']
word: people .. neighbors: ['individually', ',', 'approaches', 'behavior', 'exchange', 'uses', '##ens', 'res', 'st', 'semi']
word: family .. neighbors: ['individually', ',', 'behavior', 'approaches', 'exchange', 'st', '##ens', 'uses', 'res', 'remains']
--------------------------------------------------
Epoch----->21 .. LR: 0.05 .. KL_theta: 0.01 .. Rec_loss: 1076.56 .. NELBO: 1076.57
Eval Doc Completion PPL: 2371.9
--------------------------------------------------
Epoch----->22 .. LR: 0.05 .. KL_theta: 0.01 .. Rec_loss: 1076.6 .. NELBO: 1076.61
Eval Doc Completion PPL: 2371.9
--------------------------------------------------
Epoch----->23 .. LR: 0.05 .. KL_theta: 0.01 .. Rec_loss: 1075.89 .. NELBO: 1075.9
Eval Doc Completion PPL: 2371.9
----------------------------------

--------------------------------------------------
Epoch----->46 .. LR: 0.05 .. KL_theta: 0.01 .. Rec_loss: 1076.65 .. NELBO: 1076.66
Eval Doc Completion PPL: 2371.9
--------------------------------------------------
Epoch----->47 .. LR: 0.05 .. KL_theta: 0.01 .. Rec_loss: 1077.21 .. NELBO: 1077.22
Eval Doc Completion PPL: 2371.9
--------------------------------------------------
Epoch----->48 .. LR: 0.05 .. KL_theta: 0.01 .. Rec_loss: 1078.93 .. NELBO: 1078.94
Eval Doc Completion PPL: 2371.9
--------------------------------------------------
Epoch----->49 .. LR: 0.05 .. KL_theta: 0.01 .. Rec_loss: 1074.44 .. NELBO: 1074.45
Eval Doc Completion PPL: 2371.9
--------------------------------------------------
Epoch----->50 .. LR: 0.05 .. KL_theta: 0.01 .. Rec_loss: 1075.76 .. NELBO: 1075.77
Eval Doc Completion PPL: 2371.9
--------------------------------------------------
Visualize topics...
Topic 0: ['plans', '##de', '##s', '##mis']
Topic 1: ['##ability', 'emerged', '##lizer', 'accepted']

word: particles .. neighbors: [',', 'uses', 'semi', 'res', 'behavior', 'system', 'carried', 'target', 'remains', 'russell']
word: religion .. neighbors: [',', 'behavior', 'uses', 'train', 'target', 'individually', 'exchange', 'united', 'semi', 'real']
word: man .. neighbors: ['individually', 'approaches', 'exchange', ',', 'uses', 'st', '##ens', 'behavior', 'countries', 'res']
word: love .. neighbors: ['individually', ',', 'behavior', 'approaches', 'exchange', 'st', 'uses', '##ens', 'res', 'semi']
word: intelligence .. neighbors: [',', 'behavior', 'semi', 'uses', 'propagation', 'individually', '##ens', 'assignment', 'exchange', 'target']
word: money .. neighbors: [',', 'behavior', 'train', 'united', 'individually', 'railway', 'approaches', 'res', 'uses', 'semi']
word: politics .. neighbors: [',', 'behavior', 'train', 'united', ',', 'railway', '##e', 'po', 'tv', '-']
word: health .. neighbors: [',', 'behavior', 'train', 'united', '-', 'developed', 'po', 'railway', '##e', ',']
word: peopl

--------------------------------------------------
Epoch----->93 .. LR: 0.05 .. KL_theta: 0.01 .. Rec_loss: 1076.21 .. NELBO: 1076.22
Eval Doc Completion PPL: 2371.9
--------------------------------------------------
Epoch----->94 .. LR: 0.05 .. KL_theta: 0.01 .. Rec_loss: 1074.89 .. NELBO: 1074.9
Eval Doc Completion PPL: 2371.9
--------------------------------------------------
Epoch----->95 .. LR: 0.05 .. KL_theta: 0.01 .. Rec_loss: 1076.76 .. NELBO: 1076.77
Eval Doc Completion PPL: 2371.9
--------------------------------------------------
Epoch----->96 .. LR: 0.05 .. KL_theta: 0.01 .. Rec_loss: 1076.67 .. NELBO: 1076.68
Eval Doc Completion PPL: 2371.9
--------------------------------------------------
Epoch----->97 .. LR: 0.05 .. KL_theta: 0.01 .. Rec_loss: 1075.23 .. NELBO: 1075.24
Eval Doc Completion PPL: 2371.9
--------------------------------------------------
Epoch----->98 .. LR: 0.05 .. KL_theta: 0.01 .. Rec_loss: 1074.98 .. NELBO: 1074.99
Eval Doc Completion PPL: 2371.9
-----

And here's the code that launches the final evaluation

In [305]:
# load trained model and evaluate it  
with open(ckpt, 'rb') as f:
    etm_model = torch.load(f)
etm_model = etm_model.to(device)
etm_model.eval()

with torch.no_grad():
    ## ---------------
    ## Idea : get document completion perplexities
    test_ppl = evaluate(etm_model, test_corpus, num_docs_test)

    ## ----------------
    ## Idea : get most used topics
    indices = torch.tensor(range(num_docs_test)) # training documents indices 
    indices = torch.split(indices, batch_size)
    #just initialising data structures 
    thetaAvg = torch.zeros(1, num_topics).to(device)
    thetaWeightedAvg = torch.zeros(1, num_topics).to(device)
    cnt = 0
    for idx, ind in enumerate(indices):
        data_batch = get_batch(test_corpus,ind, vocab_size, device) # TODO: fix here 
        sums = data_batch.sum(1).unsqueeze(1) 
        cnt += sums.sum(0).squeeze().cpu().numpy()
        # maybe normalise 
        if bow_norm:normalized_data_batch = data_batch / sums
        else: normalized_data_batch = data_batch
        # get the theta 
        theta, _ = etm_model.get_theta(normalized_data_batch)
        thetaAvg += theta.sum(0).unsqueeze(0) /num_docs_train
        weighed_theta = sums * theta
        thetaWeightedAvg += weighed_theta.sum(0).unsqueeze(0)
        # let's print the progress as we go 
        if idx % 100 == 0 and idx > 0:
            print('batch: {}/{}'.format(idx, len(indices)))
    # finally the results are in 
    thetaWeightedAvg = thetaWeightedAvg.squeeze().cpu().numpy() / cnt
    print('\nThe 10 most used topics are {}'.format(thetaWeightedAvg.argsort()[::-1][:10]))

    # Now we show the topics
    # A nice visualisation is always welcome 
    beta = etm_model.get_beta()
    topic_indices = list(np.random.choice(num_topics, 10)) # 10 random topics
    print('\n')
    for k in range(num_topics):#topic_indices:
        gamma = beta[k]
        top_words = list(gamma.cpu().numpy().argsort()[-num_words+1:][::-1])
        topic_words = [idx2word[a] for a in top_words]
        print('Topic {}: {}'.format(k, topic_words))

    # Why not, also showing a few embeddings 
    if train_embeddings:
        # get embeddings from the model 
        try:rho_etm = etm_model.rho.weight.cpu()
        except:rho_etm = etm_model.rho.cpu()
        queries = ['andrew', 'woman', 'computer', 'sports', 'religion', 'man', 'love', 
                        'intelligence', 'money', 'politics', 'health', 'people', 'family']
        print('\n')
        print('ETM embeddings...')
        for word in queries:
            print('word: {} .. etm neighbors: {}'.format(word, nearest_neighbors(word, rho_etm, vocab)))
        print('\n')


Eval Doc Completion PPL: 2371.9

The 10 most used topics are [3 4 1 0 2]


Topic 0: ['plans', '##de', '##s', '##mis']
Topic 1: ['##ability', 'emerged', '##lizer', 'accepted']
Topic 2: ['##tical', '##gt', 'better', '##die']
Topic 3: ['.', '[CLS]', 'dispatch', '##mb']
Topic 4: ['behavior', ',', 'rhino', 'united']


# Alternative approach: Static BERT embeddings

Idea: Lower hidden layers of BERT are less contextualized. We attempt to build a static embedding where we project the vectors from a hidden layer to the first principal component of PCA for each word in the vocabulary.

In [6]:
import string
from nltk.corpus import stopwords
#TODO: remove punctuation
stop_words = stopwords.words('english') # note: the words in this list are only lower case but distilbert tokenizer incorporates lower casing so we should be fine! read more here: https://huggingface.co/transformers/_modules/transformers/tokenization_distilbert_fast.html

In [7]:
def process_subset_static(doc_subset, tokenizer, model, 
                          idx2word, new_token_ids, word2uniquevec, word2manyvec, selected_layer = 1):
    """ 
    Processing of a subset of the batch, returning a dictionary of the unseen words with all its embedded vectors. 
    
    Parameters 
    ----- 
    doc_subset: list of documents (aka list of strings)
    tokenizer: instance of Bert tokenizer 
    model: instance of Bert model 
    idx2word: vocabulary mapping our token ids to the corresponding word. 
            Notice that each word can be mapped to multiple token ids.
    new_token_ids: representation of the collection with our token ids.
    word2uniquevec: global dictionary mapping words to one vector
    word2manyvec: batch local dictionary mapping (the words not in word2uniquevec) to all vectors found 
        in the embedding space for that word
    selected_layer: hidden layer to be selected for near-static embeddings (should be small) 
            
    Returns 
    -----
    Updated versions of idx2word, new_token_ids, word2manyvec
    
    """
    tokenised_collection = tokenizer(doc_subset, return_tensors="pt", padding=True)
    model.resize_token_embeddings(len(tokenizer))
    print("tokenisation done")
    try:
        with torch.no_grad():
            embedded_collection = model(**tokenised_collection)
    except Exception as e:
        print("There was a problem with this subset, we'll skip it!")
        print(e)
        return idx2word, new_token_ids, word2uniquevec
    embedded_collection.requires_grad = False
    # extract lower layers hidden states
    #lower_hiddens = torch.sum(torch.stack(embedded_collection[1][3:6], dim=0), dim=0)
    lower_hiddens = embedded_collection[1][selected_layer].cpu()

    print("embeddings done")
    
    
    ##  preparing the variables we need ---------
    if len(idx2word) == 0:
        idx = 0
    else:
        idx = len(idx2word)
    subset_size = len(doc_subset)
    start = time.time()
    
    ## processing the collection document by document ----------
    for i in range(subset_size):
        t1 = time.time()
        embedded_doc = lower_hiddens[i][tokenised_collection["attention_mask"][i].bool()] # removing padding using the attention mask 
        tokens_ids = tokenised_collection["input_ids"][i]
        new_token_ids_doc = []
        
        
        for j,emb_vector in enumerate(embedded_doc):
            
            token_id = tokens_ids[j].cpu().numpy() # bert current token 
            word = tokenizer.convert_ids_to_tokens([token_id])[0] # corresponding word
    
            # jump to the next token if the word is a stopword 
            if word in stop_words or word.startswith("##") or word in string.punctuation: continue   
            
            if word not in word2uniquevec.keys(): # only consider words without unique vector representation (i.e. new words)
                if word not in word2manyvec.keys(): # create new entry if we encounter word for the first time
                    word2manyvec[word] = np.array([emb_vector.detach().numpy()])
                    idx2word[idx] = word # save it in our vocabulary
                    new_token_ids_doc += [idx]
                    idx += 1
                else: # append to list if already encountered the word in batch
                    word2manyvec[word] = np.append(word2manyvec[word], [emb_vector.detach().numpy()], axis = 0) # note: using numpy array as lists manipulate inplace
                    word_id = list(idx2word.values()).index(word)
                    new_token_ids_doc += [word_id]
            else:
                word_id = list(idx2word.values()).index(word)
                new_token_ids_doc += [word_id]
                
        new_token_ids += [new_token_ids_doc]
        t2 = time.time()
        try:
            if i % (subset_size // 3) == 0:
                print("Document " + str(i) + " done. Time: " + str(round(t2 - t1, 2)) + " s.")
        except Exception as _:
            pass  # case subset_size//3 = 0 we get a division by 0 (subset_size must be < 3)
       
            
    end = time.time()
    print("Total time for the subset: "+str(round(end-start,2))+" s.")
    
    return idx2word, new_token_ids, word2manyvec

In [8]:
def process_batch_static(batch, subset_size, tokeniser, model, idx2word, word2uniquevec):
    """ Processing of a batch of documents in the collection. """
    from sklearn.decomposition import PCA #, FastICA
    
    ## initialisation 
    new_token_ids = [] 
    word2manyvec = {}
    streaming_batch_size = len(batch)

    
    
    start = time.time()

    ## processing the batch one subset at a time
    for s in range(0,streaming_batch_size,subset_size):
        print("Processing subset "+str(s//subset_size + 1))
        if s+subset_size < len(batch):
            batch_subset = batch[s:s+subset_size]
        else: 
            batch_subset = batch[s:]
        # iteratively update idx2word, new_token_ids, word2manyvec throughout the batch
        idx2word, new_token_ids, word2manyvec = process_subset_static(batch_subset, tokenizer, model, 
                                                                        idx2word, new_token_ids, 
                                                                        word2uniquevec, word2manyvec, selected_layer = 1)
        print("Number of word vectors so far: "+str(len(idx2word)))    
    
    # for every new word discover, find unique vector representation by taking first PCA prinicipal component
    for word, veclist in word2manyvec.items():
        if len(veclist) == 1: # otherwise we would just get the first standard unit vector from PCA
            word2uniquevec[word] = torch.Tensor(veclist[0])
        else: 
            pca = PCA(n_components = 1)
            pca.fit(veclist)
            word2uniquevec[word] = torch.Tensor(pca.components_[0])
            print(type(pca.components_[0]))
            
    
    end = time.time()
    print("Total time: "+str(round(end-start,2))+" s.")
    
    return word2uniquevec, idx2word, new_token_ids

In [9]:
# TESTING on a few example sentences 

word2uniquevec = {}
idx2word = {}
subset_size = 1
sen1 = "This mouse loves cheese"
sen2 = "I am using this mouse as a pointer"
sen3 = "This mouse is hungry"
batch1 = [sen1, sen2, sen3]
sen4 = "The mouse moved out of the house"
sen5 = "I grew up in Italy"
sen6 = "I am giving up on this shit"
batch2 = [sen4, sen5,sen6]

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased', return_dict=True, output_hidden_states=True)
model.eval()
#model.to(device)

word2uniquevec, idx2word, new_token_ids1 = process_batch_static(batch1, subset_size, tokenizer, model, idx2word, 
                                                                          word2uniquevec)


word2uniquevec, idx2word, new_token_ids2 = process_batch_static(batch2, subset_size, tokenizer, model, idx2word, 
                                                                          word2uniquevec)


Processing subset 1
tokenisation done
embeddings done
Total time for the subset: 0.0 s.
Number of word vectors so far: 5
Processing subset 2
tokenisation done
embeddings done
Total time for the subset: 0.0 s.
Number of word vectors so far: 7
Processing subset 3
tokenisation done
embeddings done
Total time for the subset: 0.0 s.
Number of word vectors so far: 8
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
Total time: 0.35 s.
Processing subset 1
tokenisation done
embeddings done
Total time for the subset: 0.0 s.
Number of word vectors so far: 10
Processing subset 2
tokenisation done
embeddings done
Total time for the subset: 0.0 s.
Number of word vectors so far: 12
Processing subset 3
tokenisation done
embeddings done
Total time for the subset: 0.0 s.
Number of word vectors so far: 14
Total time: 0.17 s.


In [10]:
new_token_ids1

[[0, 1, 2, 3, 4], [0, 5, 1, 6, 4], [0, 1, 7, 4]]

In [49]:
word2uniquevec["mouse"]

tensor([ 0.0226, -0.0245, -0.0307, -0.0304,  0.0010,  0.0323, -0.0708, -0.0031,
        -0.0080,  0.0454, -0.0472,  0.0289, -0.0066, -0.0069, -0.0313,  0.0657,
         0.0215,  0.0543, -0.0078, -0.0371,  0.0283,  0.0537,  0.0294, -0.0227,
        -0.0518,  0.0025, -0.0096, -0.0687, -0.0587, -0.0261,  0.0162, -0.0051,
         0.0128, -0.0534, -0.0087, -0.0132, -0.0029,  0.0304,  0.0217,  0.0068,
         0.0065,  0.0318,  0.0305, -0.0221, -0.0145,  0.0317, -0.0072, -0.0226,
        -0.0221,  0.0112,  0.0291, -0.0709,  0.0035,  0.0387,  0.0106,  0.0839,
         0.0559,  0.0428,  0.0206, -0.0099,  0.0119,  0.0264, -0.0367, -0.0473,
         0.0461, -0.0269,  0.0323,  0.0061, -0.0048, -0.0106,  0.0149, -0.0045,
         0.0348, -0.0285, -0.0177, -0.0325, -0.0211, -0.1015, -0.0204,  0.0301,
        -0.0474,  0.0246, -0.0310,  0.0905,  0.0187,  0.0183,  0.0264,  0.0109,
        -0.0482,  0.0280,  0.0055,  0.0553,  0.0247, -0.0692,  0.0196, -0.0042,
         0.0325, -0.0616,  0.0284,  0.04

In [45]:
type(torch.Tensor(set_of_embeddings)[0])

torch.Tensor

In [50]:
set_of_embeddings = list(word2uniquevec.values())
embedding = torch.stack(set_of_embeddings)

In [51]:
embedding

tensor([[-0.0783,  0.0037, -0.0416,  ..., -0.0035, -0.0109,  0.0325],
        [ 0.0226, -0.0245, -0.0307,  ..., -0.0327, -0.0435,  0.0052],
        [ 1.3711, -0.2180,  0.6082,  ..., -0.2962,  0.3602, -0.1579],
        ...,
        [-0.3060,  0.2731, -0.0352,  ..., -0.2040, -1.1930, -0.0599],
        [-0.5460, -0.4722,  1.3046,  ..., -0.8042,  0.3584, -0.4593],
        [ 2.0691,  0.0259,  1.0179,  ..., -0.1770, -0.2215, -0.9847]])

In [21]:
list(word2uniquevec.values())

[array([-7.83041641e-02,  3.68435704e-03, -4.15908396e-02, -6.39180392e-02,
         1.23506766e-02, -2.19464209e-02, -3.65209840e-02,  1.16070476e-03,
         1.01042744e-02, -5.12355305e-02, -1.84954912e-03, -1.51269126e-03,
        -3.12230587e-02,  1.14855105e-02, -8.26475676e-03,  1.81913078e-02,
         3.20828892e-02, -7.57741183e-03,  5.61412191e-03, -1.40703050e-02,
        -5.50648905e-02, -4.13631871e-02,  2.54745148e-02, -2.40750052e-02,
        -6.19810708e-02, -3.43214273e-02,  1.50560951e-02,  6.79481681e-03,
        -2.72180457e-02,  3.50033529e-02,  1.26859071e-02, -6.14599464e-03,
        -7.31483251e-02,  2.64722649e-02,  2.35825721e-02, -2.94118524e-02,
        -2.93706581e-02,  6.34710267e-02,  1.09152431e-02,  1.41946273e-02,
         2.77515631e-02, -2.35104724e-03,  2.66647208e-02,  2.53169797e-02,
        -1.43280374e-02,  5.09072212e-04, -1.50810257e-01,  5.56470361e-03,
        -4.35568616e-02, -4.95804986e-03,  2.11095288e-02,  1.48499748e-02,
         3.2

In [226]:
# Note: to get set_of_embeddings as previously we just take the values of word2uniquevec
word2uniquevec

{'[CLS]': array([-7.83041269e-02,  3.68435960e-03, -4.15908322e-02, -6.39180094e-02,
         1.23506794e-02, -2.19464228e-02, -3.65209728e-02,  1.16070826e-03,
         1.01042790e-02, -5.12355156e-02, -1.84954377e-03, -1.51268719e-03,
        -3.12230512e-02,  1.14855105e-02, -8.26476142e-03,  1.81913022e-02,
         3.20828818e-02, -7.57741788e-03,  5.61412238e-03, -1.40703078e-02,
        -5.50648794e-02, -4.13631871e-02,  2.54745055e-02, -2.40749866e-02,
        -6.19810559e-02, -3.43214199e-02,  1.50560765e-02,  6.79482194e-03,
        -2.72180494e-02,  3.50033492e-02,  1.26859006e-02, -6.14599977e-03,
        -7.31483251e-02,  2.64722612e-02,  2.35825628e-02, -2.94118617e-02,
        -2.93706525e-02,  6.34710118e-02,  1.09152412e-02,  1.41946189e-02,
         2.77515613e-02, -2.35104770e-03,  2.66647153e-02,  2.53169723e-02,
        -1.43280225e-02,  5.09070640e-04, -1.50810227e-01,  5.56470314e-03,
        -4.35568579e-02, -4.95805079e-03,  2.11095270e-02,  1.48499794e-02,
   