In [1]:
import pandas as pd
import os 
import numpy as np
import re
import random
import nltk
import pickle
from nltk import word_tokenize, RegexpTokenizer,PunktSentenceTokenizer, sent_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
#nltk.download('stopwords')
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

In [4]:
abstracts = pd.read_csv("abstracts_eng.csv")
abs_list = list(abstracts['abstract'])

In [5]:
tokenised = []
count = 0
for abstract in abs_list:
    raw = abstract
    tokens = gensim.utils.simple_preprocess(str(raw), deacc=True)
    tokenised.append(tokens)
    count += len(tokens)
print(str(count)+" tokens created")

3855522 tokens created


In [6]:
c = []
for doc in tokenised: 
    c+=doc
len(set(c))

71429

In [7]:
stop_words = stopwords.words('english')
print(len(stop_words))
stop_words = stop_words + stopwords.words('german')
print(len(stop_words))
stop_words = stop_words + stopwords.words('french')
print(len(stop_words))
cleaned = [[word for word in doc if word not in stop_words] for doc in tokenised]

179
411
568


In [6]:
c = []
for doc in cleaned: 
    c+=doc
len(set(c))

71081

In [7]:
#word_stemmer = PorterStemmer()
lemmatiser = WordNetLemmatizer()
word_stemmer = PorterStemmer()
lemmatized = [[lemmatiser.lemmatize(word_stemmer.stem(word)) for word in doc] for doc in cleaned]

In [8]:
c = []
for doc in lemmatized: 
    c+=doc
len(set(c))

50809

In [9]:
cleaned = lemmatized

In [8]:
cleaned[0]

['goal',
 'note',
 'introduce',
 'new',
 'classes',
 'operator',
 'ideals',
 'moreover',
 'new',
 'way',
 'constructing',
 'classes',
 'application',
 'operators',
 'asymptotic',
 'structure',
 'recently',
 'introduced',
 'maurey',
 'milman',
 'tomczak',
 'jaegermann',
 'op',
 'th',
 'adv',
 'appl']

In [9]:
pickle.dump(cleaned, open( "collection_cleaned_fullwords.p", "wb" ))

In [68]:
pickle.dump(cleaned, open( "collection_cleaned.p", "wb" ))

### Training models

In [10]:
import sys
import tomotopy as tp

In [11]:
SEED = 11
random.seed(SEED)
random.shuffle(cleaned)

In [12]:
# Defining hyperparameters
tw = tp.TermWeight.IDF # term weighting scheme in TermWeight. The default value is TermWeight.ONE
k = 100 # number of topics...
min_cf=3 # minimum collection frequency of words. Words with a smaller collection frequency than min_cf are excluded from the model. The default value is 0, which means no words are excluded.
min_df=0 # minimum document frequency of words. Words with a smaller document frequency than min_df are excluded from the model. The default value is 0, which means no words are excluded
rm_top=5 # the number of top words to be removed. If you want to remove too common words from model, you can set this value to 1 or more. The default value is 0, which means no top words are removed.
alpha = 0.1 # hyperparameter of Dirichlet distribution for document-topic
eta = 0.01 # hyperparameter of Dirichlet distribution for topic-word
seed = 41 # random seed
model_burn_in = 100 
train_updates = 1000
train_iter = 10
save_path = "lda_model150.bin" #.bin format

In [13]:
def train_LDA(documents, k, tw, min_cf=0, min_df=0, rm_top=0, alpha=0.1, eta=0.01, model_burn_in=100, 
              train_updates = 1000, train_iter = 10, seed=41):
    
    # instantiate
    model = tp.LDAModel(tw=tw, min_df=min_df, min_cf=min_cf, rm_top=rm_top, k=k, alpha = alpha, eta = eta, seed=seed)
    
    # add documents to model
    for doc in documents: model.add_doc(doc)
    
    # training**
    model.burn_in = model_burn_in
    # initialising 
    model.train(iter=0)
    print('Num docs:', len(model.docs), ', Vocab size:', len(model.used_vocabs), ', Num words:', model.num_words)
    print('Removed top words:', model.removed_top_words)
    print('Training...', file=sys.stderr, flush=True)
    # actual training 
    time = []
    LLs = []
    for i in range(0, train_updates, train_iter):
        model.train(train_iter)
        if i%100==0:print('Iteration: {}'.format(i))
        time.append(i)
        LLs.append(model.ll_per_word)
    
    return model, LLs, time

### Top Models Topics from gridsearch

In [15]:
# Defining hyperparameters
tw = tp.TermWeight.IDF # term weighting scheme in TermWeight. The default value is TermWeight.ONE
min_cf=3 # minimum collection frequency of words. Words with a smaller collection frequency than min_cf are excluded from the model. The default value is 0, which means no words are excluded.
min_df=0 # minimum document frequency of words. Words with a smaller document frequency than min_df are excluded from the model. The default value is 0, which means no words are excluded
rm_top=8 # the number of top words to be removed. If you want to remove too common words from model, you can set this value to 1 or more. The default value is 0, which means no top words are removed.
seed = 41 # random seed
model_burn_in = 100 
train_updates = 1000
train_iter = 10

In [17]:
Subset = 5000
parameters = [{'k':175, 'alpha':5.71E-05,'eta':2.82E-05},{'k':150, 'alpha':0.000666667,'eta':2.82E-05},
              {'k':125, 'alpha':0.0008,'eta':2.82E-05},{'k':100, 'alpha':0.0001,'eta':2.82E-05},
              {'k':75, 'alpha':0.000133333,'eta':2.82E-05}]
models = []
LLs = []
batch = cleaned[10000:3*Subset]
for dicti in parameters:
    model, loglikes, _ = train_LDA(batch, **dicti, tw=tw, min_cf=min_cf, rm_top=rm_top, 
                                 model_burn_in=model_burn_in, 
                                 train_updates = train_updates, train_iter = train_iter, seed = seed)
    models.append(model)
    LLs.append(loglikes)    

Training...


Num docs: 5000 , Vocab size: 17520 , Num words: 568962
Removed top words: ['data', 'model', 'based', 'results', 'using', 'two', 'high', 'study']
Iteration: 0
Iteration: 100
Iteration: 200
Iteration: 300
Iteration: 400
Iteration: 500
Iteration: 600
Iteration: 700
Iteration: 800
Iteration: 900


Training...


Num docs: 5000 , Vocab size: 17520 , Num words: 568962
Removed top words: ['data', 'model', 'based', 'results', 'using', 'two', 'high', 'study']
Iteration: 0
Iteration: 100
Iteration: 200
Iteration: 300
Iteration: 400
Iteration: 500
Iteration: 600
Iteration: 700
Iteration: 800
Iteration: 900


Training...


Num docs: 5000 , Vocab size: 17520 , Num words: 568962
Removed top words: ['data', 'model', 'based', 'results', 'using', 'two', 'high', 'study']
Iteration: 0
Iteration: 100
Iteration: 200
Iteration: 300
Iteration: 400
Iteration: 500
Iteration: 600
Iteration: 700
Iteration: 800


KeyboardInterrupt: 

### Evaluating manually results from top 5 grid-search models

In [63]:
print(models[2].summary())
#print(LLs[0])

<Basic Info>
| LDAModel (current version: 0.9.1)
| 5000 docs, 568621 words
| Total Vocabs: 24636, Used Vocabs: 11897
| Entropy of words: -7.95917
| Removed Vocabs: use model result data studi base differ system
|
<Training Info>
| Iterations: 1000, Burn-in steps: 100
| Optimization Interval: 10
| Log-likelihood per word: -29.35916
|
<Initial Parameters>
| tw: TermWeight.IDF
| min_cf: 3 (minimum collection frequency of words)
| min_df: 0 (minimum document frequency of words)
| rm_top: 8 (the number of top words to be removed)
| k: 125 (the number of topics between 1 ~ 32767)
| alpha: 0.0008 (hyperparameter of Dirichlet distribution for document-topic)
| eta: 2.82e-05 (hyperparameter of Dirichlet distribution for topic-word)
| seed: 41 (random seed)
| trained in version 0.9.1
|
<Parameters>
| alpha (Dirichlet prior on the per-document topic distributions)
|  [0.01890876 0.03327749 0.02756519 0.03001748 0.30038354 0.08986171
|   0.01580108 0.08982162 0.02185369 0.02927051 0.02233893 0.051

In [37]:
test = cleaned[15000:]
test_inf=[models[3].make_doc(doc) for doc in test]
tpc_dist, ll = models[3].infer(test_inf)

In [39]:
len(tpc_dist)

5494

In [59]:
topic_test = np.array(tpc_dist[1])
np.argsort(topic_test, axis=0)


array([74, 25, 96, 95,  0, 20, 10, 17, 93, 99, 43, 91, 56, 86, 40, 38, 42,
       52, 65, 77, 57, 51, 50, 60,  8, 68, 66, 45, 33, 70, 14, 23, 89, 78,
       48, 31, 22, 84, 36, 73,  4, 16, 81, 54, 41,  7, 71, 94, 63,  6, 19,
       32, 98, 55, 64, 53, 21, 15, 92,  9, 46,  3, 39, 11, 90, 75, 26, 28,
       83, 72, 79, 62, 58,  1, 12, 88, 30, 37, 44, 49, 82, 97, 13, 80, 24,
       67, 35, 34,  2, 61, 69, 29, 85, 59, 27,  5, 18, 76, 87, 47],
      dtype=int64)

In [61]:
models[3].get_topic_words(76)

[('particl', 0.04570361599326134),
 ('aerosol', 0.023621078580617905),
 ('nucleat', 0.0215678159147501),
 ('droplet', 0.01915661059319973),
 ('liquid', 0.018538711592555046),
 ('concentr', 0.01778131164610386),
 ('dust', 0.014575991779565811),
 ('measur', 0.013146561570465565),
 ('chemic', 0.01259393710643053),
 ('phase', 0.012286645360291004)]

In [57]:
' '.join(cleaned[15001])

'atom load optic lattic process gradual turn lattic almost adiabat paper investig temperatur chang go gapless superfluid phase gap mott phase along isentrop line calcul entropi singl band bose hubbard model variou densiti interact strength temperatur one two dimens homogen trap system theori abl reproduc experiment observ visibl therefor strongli support view current experi remain quantum regim consid lattic depth low temperatur minim heat'

### Finally training and storing best models for 4 batches

In [18]:
models = []
LLs = []
num_batches = 4
batches = np.array_split(cleaned, num_batches)
dicti = {'k':125, 'alpha':0.0008,'eta':2.82E-05}
for batch_num in range(0, num_batches):

    batch = batches[batch_num].tolist()
    model, loglikes, _ = train_LDA(batch, **dicti, tw=tw, min_cf=min_cf, rm_top=rm_top, 
                                 model_burn_in=model_burn_in, 
                                 train_updates = train_updates, train_iter = train_iter)
    models.append(model)
    LLs.append(loglikes)
    

Training...


Num docs: 5124 , Vocab size: 17574 , Num words: 575391
Removed top words: ['data', 'model', 'results', 'based', 'using', 'study', 'time', 'two']
Iteration: 0
Iteration: 100
Iteration: 200
Iteration: 300
Iteration: 400
Iteration: 500
Iteration: 600
Iteration: 700
Iteration: 800
Iteration: 900


Training...


Num docs: 5124 , Vocab size: 17506 , Num words: 575061
Removed top words: ['data', 'model', 'results', 'based', 'using', 'two', 'study', 'time']
Iteration: 0
Iteration: 100
Iteration: 200
Iteration: 300
Iteration: 400
Iteration: 500
Iteration: 600
Iteration: 700
Iteration: 800
Iteration: 900


Training...


Num docs: 5123 , Vocab size: 17670 , Num words: 581054
Removed top words: ['data', 'model', 'based', 'results', 'using', 'two', 'high', 'study']
Iteration: 0
Iteration: 100
Iteration: 200
Iteration: 300
Iteration: 400
Iteration: 500
Iteration: 600
Iteration: 700
Iteration: 800
Iteration: 900


Training...


Num docs: 5123 , Vocab size: 17453 , Num words: 570023
Removed top words: ['data', 'model', 'results', 'based', 'using', 'study', 'two', 'high']
Iteration: 0
Iteration: 100
Iteration: 200
Iteration: 300
Iteration: 400
Iteration: 500
Iteration: 600
Iteration: 700
Iteration: 800
Iteration: 900


In [21]:
names = ['LDA1batch1.bin','LDA1batch2.bin','LDA1batch3.bin','LDA1batch4.bin']
for i,model in enumerate(models):
    model.save(names[i])

##### Choice based on ll of 125 topics per 5000

##### Future work: Training on one batch and checking topic on another seems to give coherent results. This is an indication that aggregation of topics through a divergence or clustering technique may be relevant.

##### TODO: Check the function to determine number of topics per documents.

### Topic embeddings and comparisons

In [58]:
def get_top_topics(document, model, min_score=0.8):
    """ 
    Extracting top n topics for each document. 
    Selects the n most likely topics whose p(topic|document) sum to min_score.
    """
    # inserting the document in the model
    new_doc = model.make_doc(document)
    _,_ = model.infer(new_doc)
    # ordering from most probable topic to least one 
    dist = new_doc.get_topic_dist()
    indices = np.flip(np.argsort(dist))
    score = 0
    indices_kept = []
    probs_kept = []
    for index in indices:
        if score > min_score: break
        score += dist[index]
        indices_kept.append(index)
        probs_kept.append(dist[index])
    return list(zip(indices_kept, probs_kept))

In [59]:
def get_top_words(topic, model, min_score=0.8):
    """
    Extracting top n words for each document. 
    Selects the n most likely words whose p(word|topic) sum to min_score.
    """
    dist = model.get_topic_word_dist(topic)
    indices = np.flip(np.argsort(dist))
    score = 0
    word_kept = []
    word_prob_kept = []
    for index in indices:
        if score > min_score: break
        score += dist[index]
        word_kept.append(model.used_vocabs[index])
        word_prob_kept.append(dist[index])
    return list(zip(word_kept, word_prob_kept))

In [63]:
num_topics = k
topics2words_list = []
for batch in range(0, num_batches):
    topics2words_list.append([get_top_words(i, models[batch], min_score=0.25) for i in range(num_topics)])

In [67]:
topics2words_list[2]

[[('urban', 0.050270602),
  ('housing', 0.0266549),
  ('city', 0.024721002),
  ('singapore', 0.014500451),
  ('building', 0.014409492),
  ('semper', 0.012176565),
  ('art', 0.01048656),
  ('architect', 0.009767988),
  ('urbanization', 0.009495137),
  ('architecture', 0.009068691),
  ('park', 0.009034045),
  ('area', 0.008838711),
  ('heat', 0.008206975),
  ('cultural', 0.008192347),
  ('flap', 0.0080487365),
  ('m²', 0.007941464),
  ('island', 0.0079045165),
  ('architectural', 0.007722412),
  ('photograph', 0.007466815)],
 [('de', 0.029385291),
  ('sanitation', 0.026249573),
  ('mf', 0.02443935),
  ('ppm', 0.022096686),
  ('toilet', 0.016968342),
  ('patent', 0.016009055),
  ('con', 0.015961478),
  ('max', 0.014739622),
  ('mine', 0.014156663),
  ('kof', 0.012269286),
  ('fertility', 0.01191171),
  ('mm', 0.011577112),
  ('et', 0.011533953),
  ('wu', 0.010911403),
  ('aggression', 0.009820397),
  ('une', 0.009820397)],
 [('science', 0.013667408),
  ('research', 0.012726501),
  ('scien

In [68]:
import pickle 
import numpy as np 
import os 
import time
import torch

In [72]:
# Loading from binary the glove vocabulary and embedding 
glove_vocab_path = "glove_vocab.uu"
glove_embedding_path = "glove_embedding.uu"
with open(glove_vocab_path, "rb") as fp:  
    glove_vocab = pickle.load(fp)
with open(glove_embedding_path, "rb") as fp: 
    glove_embedding = pickle.load(fp)

In [83]:
glove_vocab_normalised = {k:lemmatiser.lemmatize(v) for k,v in glove_vocab.items()}

In [77]:
def get_list_embeddings_topic(topic, vocab, embedding):
    """ Topic is represented as a list of tuples (word, word weight)"""
    matched = 0 
    total = 0
    topic_embeddings = []
    topic_weights = []
    
    start = time.time()
    
    for item in topic: 
        word, weight = item
        total+=1
        #check if the word appears in vocabulary 
        if word in vocab.values(): 
            matched+=1
            emb = embedding[list(vocab.values()).index(word)]
            topic_embeddings += [emb.numpy()]
            topic_weights += [weight]
    
    end = time.time()
    
    print("Total time: "+str(round(end-start,2))+" s.")
    print("Proportion of matched words: "+str(round(matched/total,2)))
    return topic_embeddings, topic_weights

In [84]:
topic_embeddings, topic_weights = get_list_embeddings_topic(topics2words_list[0][0], glove_vocab_normalised, 
                                                            glove_embedding)

Total time: 0.15 s.
Proportion of matched words: 0.69


In [85]:
def get_convex_topic_embedding(topic_weights, topic_embeddings):
    """ Creates a topic embeddings as convex combination of embedding vectors according 
    to the weights provided."""
    weight_vec = np.asarray(topic_weights)
    topic_vec = np.asarray(topic_embeddings)
    normalized_weights = weight_vec / np.sqrt(np.sum(weight_vec**2))
    return normalized_weights.dot(topic_vec)

In [86]:
topic_emb = get_convex_topic_embedding(topic_weights, topic_embeddings)

In [99]:
def get_convex_topics_embeddings(batch_topics, vocab, embedding):
    """ Runs the above 2 functions to get the embedding for each topic in the batch."""
    topics_embs = []
    for topic2word in batch_topics:
        topic_embeddings, topic_weights = get_list_embeddings_topic(topic2word, glove_vocab_normalised, glove_embedding)
        topic_emb = get_convex_topic_embedding(topic_weights, topic_embeddings)
        topics_embs += [topic_emb]
    return topics_embs

In [101]:
topics_embs1 = get_convex_topics_embeddings(topics2words_list[0], glove_vocab_normalised, glove_embedding)
topics_embs2 = get_convex_topics_embeddings(topics2words_list[0], glove_vocab_normalised, glove_embedding)
topics_embs3 = get_convex_topics_embeddings(topics2words_list[0], glove_vocab_normalised, glove_embedding)
topics_embs = topics_embs1.add(topics_embs2.add(topics_embs3))

Total time: 0.16 s.
Proportion of matched words: 0.69
Total time: 0.21 s.
Proportion of matched words: 0.96
Total time: 0.16 s.
Proportion of matched words: 1.0
Total time: 0.14 s.
Proportion of matched words: 1.0
Total time: 0.21 s.
Proportion of matched words: 1.0
Total time: 0.11 s.
Proportion of matched words: 1.0
Total time: 0.24 s.
Proportion of matched words: 0.67
Total time: 0.05 s.
Proportion of matched words: 1.0
Total time: 0.13 s.
Proportion of matched words: 1.0
Total time: 0.4 s.
Proportion of matched words: 0.89
Total time: 0.12 s.
Proportion of matched words: 1.0
Total time: 0.07 s.
Proportion of matched words: 1.0
Total time: 0.11 s.
Proportion of matched words: 1.0
Total time: 0.1 s.
Proportion of matched words: 1.0
Total time: 0.18 s.
Proportion of matched words: 0.97
Total time: 0.03 s.
Proportion of matched words: 0.67
Total time: 0.08 s.
Proportion of matched words: 0.93
Total time: 0.07 s.
Proportion of matched words: 1.0
Total time: 0.11 s.
Proportion of matched

Total time: 0.07 s.
Proportion of matched words: 1.0
Total time: 0.13 s.
Proportion of matched words: 1.0
Total time: 0.05 s.
Proportion of matched words: 1.0
Total time: 0.08 s.
Proportion of matched words: 1.0
Total time: 0.19 s.
Proportion of matched words: 0.97
Total time: 0.05 s.
Proportion of matched words: 1.0
Total time: 0.46 s.
Proportion of matched words: 0.84
Total time: 0.09 s.
Proportion of matched words: 1.0
Total time: 0.12 s.
Proportion of matched words: 0.9
Total time: 0.11 s.
Proportion of matched words: 0.95
Total time: 0.15 s.
Proportion of matched words: 1.0
Total time: 0.04 s.
Proportion of matched words: 0.88
Total time: 0.08 s.
Proportion of matched words: 0.93
Total time: 0.12 s.
Proportion of matched words: 1.0
Total time: 0.16 s.
Proportion of matched words: 1.0
Total time: 0.11 s.
Proportion of matched words: 1.0
Total time: 0.12 s.
Proportion of matched words: 1.0
Total time: 0.14 s.
Proportion of matched words: 0.74
Total time: 0.19 s.
Proportion of matche

AttributeError: 'list' object has no attribute 'add'

In [89]:
def nearest_neighbors(topic, topics):
    """Returns the most similar topics to the given one in the listo of topics"""
    cos = torch.nn.CosineSimilarity(dim = -1)
    ranks = cos(torch.tensor(topic), torch.tensor(topics))
    mostSimilar = []
    return ranks.numpy().argsort()[::-1]

In [90]:
# let's look at the first topic 
nns_0 = nearest_neighbors(topics_embs[0], topics_embs)

In [91]:
def visualise_most_similar(topic_id, topics_embs, topics2words, n=10):
    """Prints the words of the topic and its neareast neighbors."""
    nns = nearest_neighbors(topics_embs[topic_id], topics_embs)
    print("-"*10)
    print("Topic "+ str(topic_id))
    print(" ".join(item[0] for item in topics2words[0]))
    print("-"*10)
    print(str(n)+" most similar topics")
    for i in range(n):
        print("Topic "+str(i+1))
        print(" ".join(item[0] for item in topics2words[nns[i+1]]))

In [92]:
visualise_most_similar(0, topics_embs, topics2words)

----------
Topic 0
cc dsm dsms covariates pcv worldview cheat gcp cmet ωarag gcps paul envelope stereo nanosilver ph
----------
10 most similar topics
Topic 1
sf jnk malt displacement ip myo ssp midkine bt yield ductility ngo structure pig pacemaker shark
Topic 2
dot nogo pd qds ag icp microdroplets sc cdse nc ma polygyrus rab igg sp hemts hydroxythujone signal endosomes
Topic 3
cluster vapor liquid filter rbr uvi equilibrium diamond photoelectron cation dca metastable cvd ultraviolet alkaloid lattice
Topic 4
np ph wood acid hcl tio composite glass drying pore wt scaffold presence bioactive dental synthesis phosphate polymer material acetic adsorption ion peg formation nano light metal
Topic 5
oc mouse liver hepatic lipid cholesterol adipose fat agnp metabolic hscs hfd pparγ dbt fish creatine endometrial obesity
Topic 6
msc adhesion mirna cell myelin mirnas migration ecm sii grk mmp collagen tf regenerative differentiation tremor axon rkip alg fibronectin extracellular matrix dog cultu