In [1]:
import tomotopy as tp
import pandas as pd
import numpy as np
import sys
import nltk
from nltk import word_tokenize, RegexpTokenizer,PunktSentenceTokenizer, sent_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import re

# Topics embeddings pipeline 
Here we implement the embedding pipeline for the topics extracted by the collection. <br>
The results will be the following: 
1. Topic embeddings 
2. Document embeddings 

## 1. Loading topic model and collection

In [20]:
model = tp.LDAModel.load("lda_model150.bin")
data = pd.read_csv("abstracts_eng.csv")
collection = list(data['abstract'])
num_topics = 150

In [15]:
batch = collection[0:500]

Reproducing the preprocessing steps taken to build the model. 

In [14]:
# Step 1
word_stemmer = PorterStemmer()
lemmatiser = WordNetLemmatizer()
stop_words = stopwords.words('english')
def normalisation(document, stemming = True, lemmatising = True, min_word_len = 3):
    tokens = gensim.utils.simple_preprocess(str(document), deacc=True, max_len = sys.maxsize)
    cleaned = [word for word in tokens if word not in stop_words]
    if stemming:
        cleaned = [word_stemmer.stem(word) for word in cleaned]
    if lemmatising:
        cleaned = [lemmatiser.lemmatize(word) for word in cleaned]
    cleaned = [word for word in cleaned if (min_word_len<=len(word))]
    return cleaned

In [16]:
# preprocess the batch 
batch_normalised = [normalisation(doc) for doc in batch]

Extracting the topics. 

In [3]:
def get_top_topics(document, model, min_score=0.8):
    """ 
    Extracting top n topics for each document. 
    Selects the n most likely topics whose p(topic|document) sum to min_score.
    """
    # inserting the document in the model
    new_doc = model.make_doc(document)
    _,_ = model.infer(new_doc)
    # ordering from most probable topic to least one 
    dist = new_doc.get_topic_dist()
    indices = np.flip(np.argsort(dist))
    score = 0
    indices_kept = []
    probs_kept = []
    for index in indices:
        if score > min_score: break
        score += dist[index]
        indices_kept.append(index)
        probs_kept.append(dist[index])
    return list(zip(indices_kept, probs_kept))

In [18]:
# extracting topics for the original collection
batch2topics = [get_top_topics(doc, model) for doc in batch_normalised]

In [22]:
def get_top_words(topic, model, min_score=0.8):
    """
    Extracting top n words for each document. 
    Selects the n most likely words whose p(word|topic) sum to min_score.
    """
    dist = model.get_topic_word_dist(topic)
    indices = np.flip(np.argsort(dist))
    score = 0
    word_kept = []
    word_prob_kept = []
    for index in indices:
        if score > min_score: break
        score += dist[index]
        word_kept.append(model.used_vocabs[index])
        word_prob_kept.append(dist[index])
    return list(zip(word_kept, word_prob_kept))

In [23]:
topics2words = [get_top_words(i, model, min_score=0.25) for i in range(num_topics)]

## 2. Embedding of topics

We will use Glove Embeddings to embed the words in the topics. 

In [70]:
import pickle 
import numpy as np 
import os 
import time
import torch

In [25]:
# Loading from binary the glove vocabulary and embedding 
glove_vocab_path = "glove_vocab"
glove_embedding_path = "glove_embedding"
with open(glove_vocab_path, "rb") as fp:  
    glove_vocab = pickle.load(fp)
with open(glove_embedding_path, "rb") as fp: 
    glove_embedding = pickle.load(fp)

In [43]:
glove_vocab_normalised = {k:lemmatiser.lemmatize(word_stemmer.stem(v)) for k,v in glove_vocab.items()}

In [48]:
def get_list_embeddings_topic(topic, vocab, embedding):
    """ Topic is represented as a list of tuples (word, word weight)"""
    matched = 0 
    total = 0
    topic_embeddings = []
    topic_weights = []
    
    start = time.time()
    
    for item in topic: 
        word, weight = item
        total+=1
        #check if the word appears in vocabulary 
        if word in vocab.values(): 
            matched+=1
            emb = embedding[list(vocab.values()).index(word)]
            topic_embeddings += [emb.numpy()]
            topic_weights += [weight]
    
    end = time.time()
    
    print("Total time: "+str(round(end-start,2))+" s.")
    print("Proportion of matched words: "+str(round(matched/total,2)))
    return topic_embeddings, topic_weights

In [49]:
topic_embeddings, topic_weights = get_list_embeddings_topic(topics2words[0], glove_vocab_normalised, glove_embedding)

Total time: 0.34 s.
Proportion of matched words: 0.92


In [64]:
def get_convex_topic_embedding(topic_weights, topic_embeddings):
    """ Creates a topic embeddings as convex combination of embedding vectors according 
    to the weights provided."""
    weight_vec = np.asarray(topic_weights)
    topic_vec = np.asarray(topic_embeddings)
    normalized_weights = weight_vec / np.sqrt(np.sum(weight_vec**2))
    return normalized_weights.dot(topic_vec)

In [65]:
topic_emb = get_convex_topic_embedding(topic_weights, topic_embeddings)

In [66]:
def get_convex_topics_embeddings(batch_topics, vocab, embedding):
    """ Runs the above 2 functions to get the embedding for each topic in the batch."""
    topics_embs = []
    for topic2word in topics2words:
        topic_embeddings, topic_weights = get_list_embeddings_topic(topic2word, glove_vocab_normalised, glove_embedding)
        topic_emb = get_convex_topic_embedding(topic_weights, topic_embeddings)
        topics_embs += [topic_emb]
    return topics_embs

In [67]:
topics_embs = get_convex_topics_embeddings(topics2words[0], glove_vocab_normalised, glove_embedding)

Total time: 0.35 s.
Proportion of matched words: 0.92
Total time: 0.18 s.
Proportion of matched words: 1.0
Total time: 0.26 s.
Proportion of matched words: 0.94
Total time: 0.51 s.
Proportion of matched words: 0.9
Total time: 0.24 s.
Proportion of matched words: 1.0
Total time: 0.32 s.
Proportion of matched words: 1.0
Total time: 0.35 s.
Proportion of matched words: 0.86
Total time: 0.03 s.
Proportion of matched words: 1.0
Total time: 0.3 s.
Proportion of matched words: 0.94
Total time: 0.33 s.
Proportion of matched words: 0.95
Total time: 0.34 s.
Proportion of matched words: 1.0
Total time: 0.29 s.
Proportion of matched words: 1.0
Total time: 0.43 s.
Proportion of matched words: 1.0
Total time: 0.14 s.
Proportion of matched words: 1.0
Total time: 0.26 s.
Proportion of matched words: 1.0
Total time: 0.58 s.
Proportion of matched words: 1.0
Total time: 0.25 s.
Proportion of matched words: 0.75
Total time: 0.44 s.
Proportion of matched words: 1.0
Total time: 0.42 s.
Proportion of matched

Now let's have a look at the results!

In [72]:
def nearest_neighbors(topic, topics):
    """Returns the most similar topics to the given one in the listo of topics"""
    cos = torch.nn.CosineSimilarity(dim = -1)
    ranks = cos(torch.tensor(topic), torch.tensor(topics))
    mostSimilar = []
    return ranks.numpy().argsort()[::-1] 

In [74]:
# let's look at the first topic 
nns_0 = nearest_neighbors(topics_embs[0], topics_embs)

In [84]:
def visualise_most_similar(topic_id, topics_embs, topics2words, n=10):
    """Prints the words of the topic and its neareast neighbors."""
    nns = nearest_neighbors(topics_embs[topic_id], topics_embs)
    print("-"*10)
    print("Topic "+ str(topic_id))
    print(" ".join(item[0] for item in topics2words[0]))
    print("-"*10)
    print(str(n)+" most similar topics")
    for i in range(n):
        print("Topic "+str(i+1))
        print(" ".join(item[0] for item in topics2words[nns[i+1]]))

In [85]:
visualise_most_similar(0, topics_embs, topics2words)

----------
Topic 0
flow fluid transport pressur veloc measur air turbul channel water liquid bubbl heat_transfer particl viscos two regim ga simul convect capillari porou_medium surfac droplet permeabl
----------
10 most similar topics
Topic 1
simul transport atmospher event region convect wind precipit km surfac vertic trajectori air_mass associ day block local advect air flow weather temperatur variabl strong larg_scale mesoscal near_surfac moistur storm intens forecast winter valley system meteorolog anomali cyclon
Topic 2
diffus water mixtur solvent system liquid diffus_coeffici predict data rh coeffici organ compon calcul describ methanol experiment_data interact
Topic 3
sediment carbon oxygen sourc cr fraction isotop oc methan miner dissolv age deposit organ_carbon releas cycl concentr flux ocean marin water suggest valu organ_matter deriv pool format produc degrad indic signatur lake process potenti terrestri sedimentari transport river seawat
Topic 4
particl temperatur droplet 

### Visualisation

## 3. Documents embeddings