# Topic modeling on abstracts

## 1. Preprocessing

In [None]:
import pandas as pd
import os 
import numpy as np
import re
import random

In [None]:
import nltk
from nltk import word_tokenize, RegexpTokenizer,PunktSentenceTokenizer, sent_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
#nltk.download('stopwords')

In [None]:
#!pip install gensim

In [None]:
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

### 1.1 Load the data 

In [None]:
#abstracts = pd.read_csv("abstracts.csv")
abstracts = pd.read_csv("abstracts_eng.csv")

In [None]:
abstracts.head()

In [None]:
abs_list = list(abstracts['abstract'])

In [None]:
len(abs_list)

**21421** abstracts in total  
**20494** abstracts in english

### 1.2 Step 1 
- tokenization 
- punctuation removal 
- lowercasing

In [None]:
# Tokenise every abstract with gensim's simple_preprocess (deacc=True also strips accents).
# NOTE(review): str() presumably guards against non-string cells (e.g. NaN) — confirm.
tokenised = [
    gensim.utils.simple_preprocess(str(abstract), deacc=True)
    for abstract in abs_list
]
# Total number of tokens over all abstracts.
count = sum(len(tokens) for tokens in tokenised)
print(str(count)+" tokens created")

In [None]:
# Flatten all token lists into `c` and count the distinct tokens.
c = [word for doc in tokenised for word in doc]
len(set(c))

So we have **83831** / **71429** (de/en) unique words in the beginning

### 1.3 Step 2 
- removing stopwords 
- (removing other words based on different strategies - like word length thresholding) 

In [None]:
# English stopword list from NLTK (kept as a list so it can still be extended).
stop_words = stopwords.words('english')
# Test membership against a set: looking a token up in the list scans the
# whole stopword list for every single token of the corpus.
stop_word_set = set(stop_words)
cleaned = [[word for word in doc if word not in stop_word_set] for doc in tokenised]

In [None]:
# Consider extending the stopwords ...
# stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

In [None]:
# Consider removing words with less than [x] characters 

In [None]:
# Flatten the stopword-filtered documents into `c` and count the distinct tokens.
c = [word for doc in cleaned for word in doc]
len(set(c))

Now after stopwords removal we have **83695** / **71293** terms (136 less)

### 1.4 Step 3 
- stemming 
- lemmatizing 

In [None]:
#nltk.download('wordnet')

In [None]:
word_stemmer = PorterStemmer()
lemmatiser = WordNetLemmatizer()
# Every token is stemmed first and the stem is then passed to the lemmatiser.
lemmatized = []
for doc in cleaned:
    lemmatized.append([lemmatiser.lemmatize(word_stemmer.stem(word)) for word in doc])

In [None]:
# Flatten the stemmed/lemmatised documents into `c` and count the distinct tokens.
c = [word for doc in lemmatized for word in doc]
len(set(c))

And after stemming and lemmatization we have **61182** / **50948** terms (22,513 less)

Maybe we could also lemmatise keeping only nouns, adjectives, verbs and adverbs, for example (note that a `lemmatization` helper is not defined in this notebook):

`data_lemmatized = lemmatization(bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])`

In [None]:
# Inspect one pre-processed abstract (output of the stemming/lemmatisation step above).
lemmatized[23]

### 1.5 Build n-grams

This seems to be done before lemmatization and stemming in a lot of tutorials

In [None]:
# n-gram creation hyperparameters, kept together here so they can be tuned in one place.
# The b_ values are passed to the bigram Phrases model, the t_ values to the trigram one.
# min_count (float, optional) – Ignore all words and bigrams with total collected count lower than this value.
b_min_c = 5 
t_min_c = 5
# threshold (float, optional) – Represent a score threshold for forming the phrases (higher means fewer phrases)
b_thre = 50
t_thre = 5
# scoring ({'default', 'npmi', function}, optional) – Specify how potential phrases are scored
# for now we go with the default scoring

In [None]:
bigram = gensim.models.Phrases(lemmatized, min_count=b_min_c, threshold=b_thre) 
bigram_mod = gensim.models.phrases.Phraser(bigram)


Two interesting results from the bigram model : 

In [None]:
# Case 1: no change --> no bigrams found 
lemmatized[0]==bigram_mod[lemmatized[0]]

In [None]:
# Case 2: some change but we actually lose vocabulary ...
len(bigram_mod[lemmatized[110]])-len(lemmatized[110])

In [None]:
# MOREOVER, we have german words inside!!
print(bigram_mod[lemmatized[110]]) 

In [None]:
trigram = gensim.models.Phrases(bigram[lemmatized], min_count=t_min_c, threshold=t_thre)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [None]:
# Take a look at some of the trigrams.
# The trigram model was fitted on bigram-merged text (bigram[lemmatized]), so a
# document has to pass through the bigram model before the trigram model is applied.
print(trigram_mod[bigram_mod[lemmatized[31]]])

In [None]:
def make_bigrams(texts):
    """Return every document of `texts` passed through the fitted bigram model."""
    merged = []
    for doc in texts:
        merged.append(bigram_mod[doc])
    return merged

def make_trigrams(texts):
    """Return every document of `texts` passed through the bigram, then the trigram model."""
    merged = []
    for doc in texts:
        with_bigrams = bigram_mod[doc]
        merged.append(trigram_mod[with_bigrams])
    return merged

In [None]:
c

In [None]:
# Apply the bigram model to the whole corpus (make_bigrams is defined above),
# then count the distinct tokens of the result.
bigrammed = make_bigrams(lemmatized)
c = [token for doc in bigrammed for token in doc]
len(set(c))

**5470** bigrams

In [None]:
# Apply the bigram + trigram models to the whole corpus (make_trigrams is defined
# above), then count the distinct tokens of the result.
trigrammed = make_trigrams(lemmatized)
c = [token for doc in trigrammed for token in doc]
len(set(c))

**12894** trigrams

After grouping words that occur commonly together we have 69312 / **69312** terms (17,741 more)

### 1.6 Analyse the vocabulary

In [None]:
cleaned = trigrammed

In [None]:
def flatten(l):
    """Concatenate the items of every sub-list of `l` into one flat list."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

In [None]:
wordlist = flatten(cleaned)

In [None]:
from collections import Counter
counts = Counter(wordlist)

In [None]:
counts.most_common(20)

In [None]:
# Bar chart of the ten most frequent tokens: most_common() yields (token, count)
# pairs, which are split into labels and bar heights.
import matplotlib.pyplot as plt

top_words, top_counts = zip(*counts.most_common(10))
plt.bar(top_words, top_counts)
plt.xticks(rotation=45, ha='right')
plt.ylabel('Count')

## 2. Modelling

## 2.1 LDA

In [None]:
#!pip3.7 install tomotopy

In [None]:
import sys
import tomotopy as tp

### 2.1.1 Training example

----


In [None]:
# Defining hyperparameters
tw = tp.TermWeight.ONE # term weighting scheme in TermWeight. The default value is TermWeight.ONE
k = 150 # number of topics...
min_cf=3 # minimum collection frequency of words. Words with a smaller collection frequency than min_cf are excluded from the model. The default value is 0, which means no words are excluded.
min_df=0 # minimum document frequency of words. Words with a smaller document frequency than min_df are excluded from the model. The default value is 0, which means no words are excluded
rm_top=5 # the number of top words to be removed. If you want to remove too common words from model, you can set this value to 1 or more. The default value is 0, which means no top words are removed.
alpha = None # hyperparameter of Dirichlet distribution for document-topic
eta = None # hyperparameter of Dirichlet distribution for topic-word
seed = 41 # random seed
model_burn_in = 100 
train_updates = 1000
train_iter = 10
save_path = "lda_model150.bin" #.bin format

In [None]:
# Instantiating the model
# Pass the hyperparameters declared above: `tw`, `min_df` and `seed` would
# otherwise silently fall back to the library defaults (and the run would be unseeded).
# alpha / eta are set to None above, so they are left at the tomotopy defaults.
model = tp.LDAModel(tw=tw, min_cf=min_cf, min_df=min_df, rm_top=rm_top, k=k, seed=seed)

In [None]:
# adding documents to the model 
for doc in cleaned: model.add_doc(doc)

In [None]:
# training: set the burn-in, initialise, then train in chunks of `train_iter`
# iterations while recording the per-word log-likelihood after each chunk
model.burn_in = model_burn_in
# initialising 
model.train(iter=0)
print('Num docs:', len(model.docs), ', Vocab size:', len(model.used_vocabs), ', Num words:', model.num_words)
print('Removed top words:', model.removed_top_words)
print('Training...', file=sys.stderr, flush=True)
# actual training 
# `time` holds the loop counter at which each sample was taken (the value of i
# before the chunk was trained), `LLs` the matching model.ll_per_word values
time = []
LLs = []
for i in range(0, train_updates, train_iter):
    model.train(train_iter)
    if i%100==0:print('Iteration: {}'.format(i))
    time.append(i)
    LLs.append(model.ll_per_word)

In [None]:
import matplotlib.pyplot as plt

In [None]:
plt.plot(time,LLs)
plt.ylabel('Log-likelihood')
plt.xlabel('Iteration')

In [None]:
#loading a model
model1000 = tp.LDAModel.load("./lda1000.bin")

In [None]:
model = tp.LDAModel.load("./lda_model150.bin")

In [None]:
model1000.summary()

In [None]:
model.summary()

In [None]:
print('Saving...', file=sys.stderr, flush=True)
model.save(save_path, full=True) # If full is True, the model with its all documents and state will be saved. If you want to train more after, use full model. If False, only topic parameters of the model will be saved. This model can be only used for inference of an unseen document.

Let's have a better look at the results!

In [None]:
# Use a dedicated loop variable: looping with `k` would overwrite the
# number-of-topics hyperparameter `k` with the last topic index.
for topic_id in range(model.k):
    print('Topic #{}'.format(topic_id))
    for word, prob in model.get_topic_words(topic_id):
        print('\t', word, prob, sep='\t')

### 2.1.2 Hyperparameter tuning by optimizing log-likelihood  

---

Note: log-likelihood is generally not considered a good measure for topic model performance...

In [None]:
def train_LDA(documents, k, min_cf=0, min_df=0, rm_top=0, alpha=0.1, eta=0.01, model_burn_in=100, 
              train_updates = 1000, train_iter = 10):
    """Build a tomotopy LDA model from `documents` and train it in chunks.

    Parameters
    ----------
    documents : iterable of token lists, each one added with `add_doc`.
    k, min_cf, min_df, rm_top, alpha, eta : forwarded to `tp.LDAModel`.
    model_burn_in : value assigned to `model.burn_in`.
    train_updates : upper bound of the training loop counter.
    train_iter : iterations per `train` call; one log-likelihood sample per call.

    Returns
    -------
    (model, LLs, time) : the trained model, the `ll_per_word` value recorded after
    each chunk, and the loop counter value at which each sample was recorded.
    """
    lda = tp.LDAModel(tw=tp.TermWeight.ONE, min_df=min_df, min_cf=min_cf, rm_top=rm_top,
                      k=k, alpha=alpha, eta=eta)

    for tokens in documents:
        lda.add_doc(tokens)

    lda.burn_in = model_burn_in
    lda.train(iter=0)  # initialising
    print('Num docs:', len(lda.docs), ', Vocab size:', len(lda.used_vocabs), ', Num words:', lda.num_words)
    print('Removed top words:', lda.removed_top_words)
    print('Training...', file=sys.stderr, flush=True)

    steps = []
    ll_history = []
    for step in range(0, train_updates, train_iter):
        lda.train(train_iter)
        if step % 100 == 0:
            print('Iteration: {}'.format(step))
        steps.append(step)
        ll_history.append(lda.ll_per_word)

    return lda, ll_history, steps

In [None]:
# Train one LDA model per candidate topic count on the full corpus and keep the
# topic count whose model reports the lowest (training) perplexity.

topics = [10,20,30]
scores = []
for k in topics:
    print("Training for "+str(k)+" topics")
    model, LLs, time = train_LDA(cleaned, k = k, train_updates = 600)
    scores.append(model.perplexity)
    print("Perplexity = "+str(model.perplexity))
perplexity_score = np.array(scores)

topics[np.argmin(perplexity_score)]

In [None]:
plt.plot(topics,perplexity_score)
plt.ylabel('Perplexity')
plt.xlabel('Number of topics')

In [None]:
# split data in train and test set (80% / 20%)

print(len(cleaned))
train_size = int(0.8*len(cleaned))

# Shuffle a copy: random.shuffle(cleaned) would reorder the corpus in place, after
# which cleaned[i] no longer corresponds to row i of `abstracts`.
shuffled_docs = random.sample(cleaned, len(cleaned))
train_docs = shuffled_docs[:train_size]
test_docs = shuffled_docs[train_size:]

assert len(train_docs) + len(test_docs) == len(cleaned)

In [None]:
def get_test_LL(test_docs, model):
    """Return the mean log-likelihood the model assigns to unseen documents.

    Each token list of `test_docs` is converted with `model.make_doc`, the batch is
    passed to `model.infer`, and the log-likelihoods it returns are averaged.
    """
    unseen = [model.make_doc(tokens) for tokens in test_docs]
    # infer() returns the topic distributions and the log-likelihoods; only the latter are used
    _topic_dist, doc_lls = model.infer(unseen)
    return np.mean(doc_lls)

In [None]:
# loop for maximizing mean likelihood of test set

topics = [10,20,30]
log_likelihoods = np.array([])
for k in topics:
    print("Training for "+str(k)+" topics")
    model, LLs, time = train_LDA(train_docs, k = k, train_updates = 800)
    # Run the test-set inference once and reuse the value, so the number printed
    # is exactly the number stored (and the inference cost is not paid twice).
    test_ll = get_test_LL(test_docs, model)
    log_likelihoods = np.append(log_likelihoods, test_ll)
    print("Log likelihood = "+str(test_ll))

topics[np.argmax(log_likelihoods)]

In [None]:
plt.plot(topics,log_likelihoods)
plt.ylabel('Likelihood')
plt.xlabel('Number of topics')

In [None]:
# Test log-likelihoods collected by the search loop above, one per topic count.
print(log_likelihoods)

### 2.1.3 Topic coherence 

---

Topic coherence measures score a single topic by measuring the degree of semantic similarity between its high-scoring words. These measurements help distinguish topics that are semantically interpretable from topics that are artifacts of statistical inference.

In [None]:
num_topics = 10

In [None]:
extract_word = lambda x: x[0] # get_topic_words returns both the word and its probability in the topic
# Top-20 words of the first `num_topics` topics. Topic ids start at 0, so the range
# must start at 0 too; starting at 1 drops topic 0 and yields num_topics - 1 topics.
topics = [[extract_word(tw) for tw in model.get_topic_words(k, 20)] for k in range(num_topics)]

In [None]:
topics[0]

In [None]:
from gensim.corpora import Dictionary

In [None]:
dictionary = Dictionary(cleaned)
# The dictionary was just built from `cleaned`, so every token already has an id.
# allow_update=True is therefore left out: it would count each document a second
# time in the dictionary's corpus statistics.
BoW_corpus = [dictionary.doc2bow(doc) for doc in cleaned] # bag of words corpus

In [None]:
BoW_corpus[0]

In [None]:
# The coherence preprocessing steps, wrapped in reusable helpers.
def extract_word(x):
    """Return the word of a (word, probability) pair as given by get_topic_words."""
    return x[0]


def get_topics(model, num_topics):
    """Return the top-20 words of topics 0 .. num_topics - 1, one word list per topic."""
    topic_words = []
    for k in range(num_topics):
        pairs = model.get_topic_words(k, 20)
        topic_words.append([extract_word(pair) for pair in pairs])
    return topic_words


def get_corpus(dictionary, texts):
    """Return the bag-of-words form of every document in `texts` (the dictionary may be updated)."""
    bows = []
    for doc in texts:
        bows.append(dictionary.doc2bow(doc, allow_update=True))
    return bows

In [None]:
# topics (list of list of str, optional) – List of tokenized topics
# texts (list of list of str, optional) – Tokenized texts, needed for coherence models that use sliding window based (i.e. coherence=`c_something`) probability estimator .
# corpus (iterable of list of (int, number), optional) – Corpus in BoW format.
# dictionary (Dictionary, optional) – Gensim dictionary mapping of id word to create corpus. If model.id2word is present, this is not needed. If both are provided, passed dictionary will be used.
# window_size (int, optional) – Is the size of the window to be used for coherence measures using boolean sliding window as their probability estimator. For ‘u_mass’ this doesn’t matter. If None - the default window sizes are used which are: ‘c_v’ - 110, ‘c_uci’ - 10, ‘c_npmi’ - 10.
# coherence ({'u_mass', 'c_v', 'c_uci', 'c_npmi'}, optional) – Coherence measure to be used. Fastest method - ‘u_mass’, ‘c_uci’ also known as c_pmi. For ‘u_mass’ corpus should be provided, if texts is provided, it will be converted to corpus using the dictionary. For ‘c_v’, ‘c_uci’ and ‘c_npmi’ texts should be provided (corpus isn’t needed)
# topn (int, optional) – Integer corresponding to the number of top words to be extracted from each topic.
cm = CoherenceModel(topics=topics, corpus=BoW_corpus, dictionary=dictionary, texts=cleaned, coherence='c_v')
coherence = cm.get_coherence()  # get coherence value
coherence

In [None]:
# Note: there are different types of coherence measures, we need to decide which to use!

###  2.1.4 Complete grid search 

---

In [None]:
# Seed Python's module-level generator so the shuffle in the next cell is reproducible.
# (An assignment `random.seed = 11` only rebinds the function name and seeds nothing.)
# setstate() is used rather than random.seed(11) so that this cell also works in a
# kernel session where `random.seed` has already been overwritten by such an assignment.
random.setstate(random.Random(11).getstate())

In [None]:
# split data in train and test set (no separate validation set is created here)
print(len(cleaned))
train_size = int(0.7*len(cleaned)) #70% for training
test_size = len(cleaned) - train_size #the remaining ~30% for testing

# Shuffle a copy: random.shuffle(cleaned) would reorder the corpus in place, after
# which cleaned[i] no longer corresponds to row i of `abstracts`.
shuffled_docs = random.sample(cleaned, len(cleaned))

train_docs = shuffled_docs[:train_size]
test_docs = shuffled_docs[train_size:]

len(train_docs) + len(test_docs) == len(cleaned)

In [None]:
# Defining the grid

ks = [50, 100, 150, 200, 300, 350, 450]
#alpha = [1/k, 10/k, 0.1/k, None]
#eta = [1/w, 10/w, 0.1/w, None]

In [None]:
def compute_test_pp(ll, docs):
    """Return the perplexity pp = exp(-ll/ct) of a set of documents.

    Parameters
    ----------
    ll : log-likelihood of `docs`.
        NOTE(review): the grid-search cells pass the result of get_test_LL, which is
        a per-document mean rather than a corpus total — confirm which is intended.
    docs : list of token lists; ct is the total number of tokens they contain.

    Raises
    ------
    ValueError : if `docs` contains no tokens (ct would be 0).
    """
    # Count the tokens of each document; summing len(docs) would add up the
    # number of documents once per document instead.
    ct = sum(len(doc) for doc in docs)
    if ct == 0:
        raise ValueError("cannot compute perplexity: docs contain no tokens")
    return np.exp(-1*ll/ct)

In [None]:
# Grid search over the number of topics k and the priors alpha / eta
# (this needs to run for a while).
# Only the test perplexity of each combination is stored (in `pps`); the test
# log-likelihood `ll` is computed but not kept and no coherence score is computed here.

import time

pps = []
best_models = []  # not filled anywhere in this cell
# number of words in our vocabulary
c = []
for doc in train_docs: 
    c+=doc
w = len(set(c))

# define training parameters 
model_burn_in=250
train_updates = 1000
train_iter = 10

for k in ks:
    
    # `start` is reset once per k, so the elapsed time printed below accumulates
    # over all (alpha, eta) combinations tried for that k
    start = time.time()

    # NOTE(review): None is forwarded unchanged by train_LDA to tp.LDAModel — confirm
    # that tomotopy accepts alpha=None / eta=None as "use the default"
    for alpha in [1/k, 10/k, 0.1/k, None]:      
        for eta in [1/w, 10/w, 0.1/w, None]:
    
            print("K= "+str(k)+", alpha = "+str(alpha)+", eta="+str(eta)+" -----------------------------")
            model, LLs, _ = train_LDA(train_docs, k, 
                                      alpha=alpha,
                                      eta=eta, 
                                      model_burn_in=model_burn_in,
                                      train_updates = train_updates, 
                                      train_iter = train_iter)

            # LL (get_test_LL returns the mean log-likelihood over the test documents)
            ll = get_test_LL(test_docs, model)
            ## PP
            # TODO: obtain perplexity on the test set
            pp = compute_test_pp(ll, test_docs)
            pps += [pp]
            print("Test perplexity = "+str(pp))

            
            end = time.time()

            print("Time elapsed: "+ str(round(end - start,1))+" s")

In [None]:
# plotting the results
# The grid search above only collects test perplexities (`pps`), so that is the only
# curve that can be drawn against `ks`.
# `pps` holds one value per hyperparameter combination with k as the outermost loop,
# so each row of the reshaped grid belongs to one k; the best (lowest) value is plotted.
pp_grid = np.array(pps, dtype=float).reshape(len(ks), -1)
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(ks, pp_grid.min(axis=1))
ax.set_title("Best test perplexity per number of topics")
ax.set_xlabel("Number of topics")
ax.set_ylabel("Test perplexity")

In [None]:
# saving the big model (if the cells are run in order, `model` is the last model
# trained by the grid search above)
model.save("lda1000.bin", full=True)

In [None]:
# Looking at the biggest model a bit

In [None]:
topics = get_topics(model, k)
corpus = get_corpus(dictionary, train_docs)
cm = CoherenceModel(topics=topics, corpus=corpus, dictionary=dictionary, texts=train_docs, coherence='c_v')
cm.get_coherence()

In [None]:
# Print the words of topics 10..19. A dedicated loop variable is used so that the
# global `k` (number of topics) is not overwritten by the topic index.
for topic_id in range(10,20):
    print('Topic #{}'.format(topic_id))
    for word, prob in model.get_topic_words(topic_id):
        print('\t', word, prob, sep='\t')

In [None]:
# TODO: evaluate the model on the validation set 

In [None]:
def get_test_coherence(model, test_docs, dictionary):
    """Get topics over test set and compute coherence.

    Not implemented yet: raises NotImplementedError so that an accidental call
    cannot be mistaken for a coherence score.
    """
    raise NotImplementedError("get_test_coherence is still a stub")

#### Nice visualisation 

--- 

In [None]:
#!pip3.7 install pyLDAvis

In [None]:
import pyLDAvis
pyLDAvis.enable_notebook()

**Note**: you need to have trained a model to use the following code 

In [None]:
# Inputs for pyLDAvis, read from the trained tomotopy model.
topic_term_dists = np.stack([model.get_topic_word_dist(k) for k in range(model.k)])
doc_topic_dists = np.stack([doc.get_topic_dist() for doc in model.docs])
# pyLDAvis validates that every document row sums to 1; renormalise so that
# floating-point rounding in the per-document distributions does not fail that check.
# NOTE(review): presumably the reason the prepare() call below "doesn't work" — confirm.
doc_topic_dists /= doc_topic_dists.sum(axis=1, keepdims=True)
doc_lengths = np.array([len(doc.words) for doc in model.docs])
vocab = list(model.used_vocabs)
term_frequency = model.used_vocab_freq

In [None]:
# doesn't work ...
# Builds the pyLDAvis data from the arrays computed in the previous cell. The result is
# only assigned to `doc`; nothing in this cell displays it.
# NOTE(review): tomotopy numbers topics from 0 — check whether prepare() needs
# start_index=0 / sort_topics=False for the topic ids to line up.
doc = pyLDAvis.prepare(
    topic_term_dists, 
    doc_topic_dists, 
    doc_lengths, 
    vocab, 
    term_frequency)

## 2.2 CTM

### 2.2.1 Training example

In [None]:
# Define the hyperparameters
tw = tp.TermWeight.ONE # term weighting scheme in TermWeight. The default value is TermWeight.ONE
k = 5 # number of topics
min_cf=3 # minimum collection frequency of words. Words with a smaller collection frequency than min_cf are excluded from the model. The default value is 0, which means no words are excluded.
rm_top=5 # the number of top words to be removed. If you want to remove too common words from model, you can set this value to 1 or more. The default value is 0, which means no top words are removed.
alpha = None # hyperparameter of Dirichlet distribution for document-topic
eta = None # hyperparameter of Dirichlet distribution for topic-word
seed = 41 # random seed
model_burn_in = 100 
train_updates = 500
train_iter = 10
save_path = "ctm_model.bin" #.bin format

In [None]:
def train_CTM(documents, k, min_cf=0, rm_top=0, smoothing_alpha=0.1, eta=0.01, model_burn_in=100, 
              train_updates = 1000, train_iter = 10):
    """Build a tomotopy correlated topic model from `documents` and train it in chunks.

    Parameters
    ----------
    documents : iterable of token lists, each one added with `add_doc`.
    k, min_cf, rm_top, smoothing_alpha, eta : forwarded to `tp.CTModel`.
    model_burn_in : value assigned to `model.burn_in`.
    train_updates : upper bound of the training loop counter.
    train_iter : iterations per `train` call; one log-likelihood sample per call.

    Returns
    -------
    (model, LLs, time) : the trained model, the `ll_per_word` value recorded after
    each chunk, and the loop counter value at which each sample was recorded.
    """
    ctm = tp.CTModel(tw=tp.TermWeight.ONE, min_cf=min_cf, rm_top=rm_top, k=k,
                     smoothing_alpha=smoothing_alpha, eta=eta)

    for tokens in documents:
        ctm.add_doc(tokens)

    ctm.burn_in = model_burn_in
    ctm.train(iter=0)  # initialising
    print('Num docs:', len(ctm.docs), ', Vocab size:', len(ctm.used_vocabs), ', Num words:', ctm.num_words)
    print('Removed top words:', ctm.removed_top_words)
    print('Training...', file=sys.stderr, flush=True)

    steps = []
    ll_history = []
    for step in range(0, train_updates, train_iter):
        ctm.train(train_iter)
        if step % 100 == 0:
            print('Iteration: {}'.format(step))
        steps.append(step)
        ll_history.append(ctm.ll_per_word)

    return ctm, ll_history, steps

In [None]:
import time
start = time.time()
model, LLs, _ = train_CTM(cleaned, k=k, min_cf=min_cf, rm_top=rm_top, smoothing_alpha=alpha, 
                                 eta=eta, model_burn_in=model_burn_in, 
                                 train_updates = train_updates, train_iter = train_iter)
end = time.time()
print("Time elapsed: "+ str(round(end - start,1))+" s")

In [None]:
# compare time to LDA with same hyperparameters

start = time.time()
train_LDA(cleaned, k=k, min_cf=min_cf, rm_top=rm_top, alpha=alpha, 
                                 eta=eta, model_burn_in=model_burn_in, 
                                 train_updates = train_updates, train_iter = train_iter)
end = time.time()
print("Time elapsed: "+ str(round(end - start,1))+" s")

In [None]:
# Use a dedicated loop variable: looping with `k` would overwrite the
# number-of-topics hyperparameter `k` with the last topic index.
for topic_id in range(model.k):
    print('Topic #{}'.format(topic_id))
    for word, prob in model.get_topic_words(topic_id):
        print('\t', word, prob, sep='\t')

In [None]:
# Look at topic correlations
model.get_correlations()

In [None]:
#!pip3.7 install pyvis

In [None]:
# visualization tool for topic correlations:
# one node per topic, one edge per sufficiently correlated pair of topics

from pyvis.network import Network

g = Network(width=800, height=800, font_color="#333")
# flatten the correlations and sort them in place
correl = model.get_correlations().reshape([-1])
correl.sort()
# `top_tenth` is first a number of topic pairs, then re-bound to the correlation value
# found that far from the end of the sorted array; it is the edge threshold used below.
# NOTE(review): the extra -model.k offset presumably skips the self-correlations — confirm.
top_tenth = model.k * (model.k - 1) // 10
top_tenth = correl[-model.k - top_tenth]

# NOTE: this loop rebinds the global `k`
for k in range(model.k):
    label = "#{}".format(k)
    # the node tooltip shows the topic's top 6 words
    title= ' '.join(word for word, _ in model.get_topic_words(k, top_n=6))
    print('Topic', label, title)
    g.add_node(k, label=label, title=title, shape='ellipse')
    # NOTE(review): range(k - 1) stops at l = k - 2, so the pair (k, k - 1) never gets
    # an edge — confirm whether range(k) was intended
    for l, correlation in zip(range(k - 1), model.get_correlations(k)):
        # skip pairs whose correlation is below the threshold
        if correlation < top_tenth: continue
        g.add_edge(k, l, value=float(correlation), title='{:.02}'.format(correlation))

g.barnes_hut(gravity=-1000, spring_length=20)
g.show_buttons()
g.show("topic_network.html")

### 2.2.2 Complete grid search

In [None]:
# helper functions 

def get_test_LL(test_docs, model):
    """Return the mean log-likelihood the model assigns to unseen documents.

    Each token list of `test_docs` is converted with `model.make_doc`, the batch is
    passed to `model.infer`, and the log-likelihoods it returns are averaged.
    """
    unseen = [model.make_doc(tokens) for tokens in test_docs]
    # infer() returns the topic distributions and the log-likelihoods; only the latter are used
    _topic_dist, doc_lls = model.infer(unseen)
    return np.mean(doc_lls)

def compute_test_pp(ll, docs):
    """Return the perplexity pp = exp(-ll/ct) of a set of documents.

    Parameters
    ----------
    ll : log-likelihood of `docs`.
        NOTE(review): the grid-search cells pass the result of get_test_LL, which is
        a per-document mean rather than a corpus total — confirm which is intended.
    docs : list of token lists; ct is the total number of tokens they contain.

    Raises
    ------
    ValueError : if `docs` contains no tokens (ct would be 0).
    """
    # Count the tokens of each document; summing len(docs) would add up the
    # number of documents once per document instead.
    ct = sum(len(doc) for doc in docs)
    if ct == 0:
        raise ValueError("cannot compute perplexity: docs contain no tokens")
    return np.exp(-1*ll/ct)

In [None]:
# Seed Python's module-level generator so the shuffle in the next cell is reproducible.
# (An assignment `random.seed = 11` only rebinds the function name and seeds nothing.)
# setstate() is used rather than random.seed(11) so that this cell also works in a
# kernel session where `random.seed` has already been overwritten by such an assignment.
random.setstate(random.Random(11).getstate())

In [None]:
# split data in train and test set (no separate validation set is created here)
print(len(cleaned))
train_size = int(0.7*len(cleaned)) #70% for training
test_size = len(cleaned) - train_size #the remaining ~30% for testing

# Shuffle a copy: random.shuffle(cleaned) would reorder the corpus in place, after
# which cleaned[i] no longer corresponds to row i of `abstracts`.
shuffled_docs = random.sample(cleaned, len(cleaned))

train_docs = shuffled_docs[:train_size]
test_docs = shuffled_docs[train_size:]

len(train_docs) + len(test_docs) == len(cleaned)

In [None]:
# Defining the grid

c = []
for doc in train_docs: 
    c+=doc
w = len(set(c))

ks = [50, 100, 150, 200, 300, 350, 450]
etas = [1/w, 10/w, 0.1/w, None]

In [None]:
# Grid search over the number of topics k and the prior eta for the CTM
# (this needs to run for a while).
# Only the test perplexity of each combination is stored (in `pps`); the test
# log-likelihood `ll` is computed but not kept and no coherence score is computed here.

import time

pps = []
best_models = []  # not filled anywhere in this cell
# (the vocabulary size `w` used for `etas` is computed in the grid-definition cell)

# define training parameters 
model_burn_in=250
train_updates = 1000
train_iter = 10

for k in ks:
    
    # `start` is reset once per k, so the elapsed time printed below accumulates
    # over all eta values tried for that k
    start = time.time()
      
    # NOTE(review): `etas` contains None, which train_CTM forwards unchanged to
    # tp.CTModel — confirm that tomotopy accepts eta=None as "use the default"
    for eta in etas:

        print("K= "+str(k)+", eta="+str(eta)+" -----------------------------")
        model, LLs, _ = train_CTM(train_docs, k,
                                  eta=eta, 
                                  model_burn_in=model_burn_in,
                                  train_updates = train_updates, 
                                  train_iter = train_iter)

        # LL (get_test_LL returns the mean log-likelihood over the test documents)
        ll = get_test_LL(test_docs, model)
        ## PP
        pp = compute_test_pp(ll, test_docs)
        pps += [pp]
        print("Test perplexity = "+str(pp))


        end = time.time()

        print("Time elapsed: "+ str(round(end - start,1))+" s")

## 2.3 Pachinko

### 2.3.1 Training example

In [None]:
# Hyperparameters
tw = tp.TermWeight.ONE # term weighting scheme in TermWeight. The default value is TermWeight.ONE
k1 = 1 # the number of super topics 
k2 = 5 # the number of sub topics
min_cf=3 # minimum collection frequency of words. Words with a smaller collection frequency than min_cf are excluded from the model. The default value is 0, which means no words are excluded.
rm_top=5 # the number of top words to be removed. If you want to remove too common words from model, you can set this value to 1 or more. The default value is 0, which means no top words are removed.
alpha = None # hyperparameter of Dirichlet distribution for document-topic
eta = None # hyperparameter of Dirichlet distribution for topic-word
seed = 41 # random seed
model_burn_in = 100 
train_updates = 500
train_iter = 10
save_path = "pachinko_model.bin" #.bin format

In [None]:
# Note: only possible to have two layers in topic tree??

In [None]:
def train_PA(documents, k1, k2, min_cf=0, rm_top=0, alpha=0.1, eta=0.01, model_burn_in=100, 
              train_updates = 1000, train_iter = 10):
    """Build a tomotopy Pachinko allocation model from `documents` and train it in chunks.

    Parameters
    ----------
    documents : iterable of token lists, each one added with `add_doc`.
    k1, k2, min_cf, rm_top, alpha, eta : forwarded to `tp.PAModel`.
    model_burn_in : value assigned to `model.burn_in`.
    train_updates : upper bound of the training loop counter.
    train_iter : iterations per `train` call; one log-likelihood sample per call.

    Returns
    -------
    (model, LLs, time) : the trained model, the `ll_per_word` value recorded after
    each chunk, and the loop counter value at which each sample was recorded.
    """
    pam = tp.PAModel(tw=tp.TermWeight.ONE, min_cf=min_cf, rm_top=rm_top,
                     k1=k1, k2=k2, alpha=alpha, eta=eta)

    for tokens in documents:
        pam.add_doc(tokens)

    pam.burn_in = model_burn_in
    pam.train(iter=0)  # initialising
    print('Num docs:', len(pam.docs), ', Vocab size:', len(pam.used_vocabs), ', Num words:', pam.num_words)
    print('Removed top words:', pam.removed_top_words)
    print('Training...', file=sys.stderr, flush=True)

    steps = []
    ll_history = []
    for step in range(0, train_updates, train_iter):
        pam.train(train_iter)
        if step % 100 == 0:
            print('Iteration: {}'.format(step))
        steps.append(step)
        ll_history.append(pam.ll_per_word)

    return pam, ll_history, steps

In [None]:
import time
start = time.time()
model, LLs, iters = train_PA(cleaned, k1=k1, k2=k2, min_cf=min_cf, rm_top=rm_top, alpha=alpha, 
                                 eta=eta, model_burn_in=model_burn_in, 
                                 train_updates = train_updates, train_iter = train_iter)
end = time.time()
print("Time elapsed: "+ str(round(end - start,1))+" s")

In [None]:
# In a PAModel, get_topic_words takes a sub-topic id, so iterate over all model.k2 sub
# topics; looping over model.k1 would only print the first k1 of them.
# A dedicated loop variable also keeps the `k1` hyperparameter intact.
for sub_topic_id in range(model.k2):
    print('Topic #{}'.format(sub_topic_id))
    for word, prob in model.get_topic_words(sub_topic_id):
        print('\t', word, prob, sep='\t')

In [None]:
# get_sub_topics returns an ordered list of the probabilities of the sub-topics.
# Iterate over the super topics the model actually has instead of hard-coding the
# ids 0, 1 and 2 (the training example above uses a single super topic).
for super_topic_id in range(model.k1):
    print(model.get_sub_topics(super_topic_id = super_topic_id), "\n")

# super topics have more or less same prob for sub topics => requires tuning

### 2.3.2 Complete grid search

In [None]:
# helper functions 

def get_test_LL(test_docs, model):
    """Return the mean log-likelihood the model assigns to unseen documents.

    Each token list of `test_docs` is converted with `model.make_doc`, the batch is
    passed to `model.infer`, and the log-likelihoods it returns are averaged.
    """
    unseen = [model.make_doc(tokens) for tokens in test_docs]
    # infer() returns the topic distributions and the log-likelihoods; only the latter are used
    _topic_dist, doc_lls = model.infer(unseen)
    return np.mean(doc_lls)

def compute_test_pp(ll, docs):
    """Return the perplexity pp = exp(-ll/ct) of a set of documents.

    Parameters
    ----------
    ll : log-likelihood of `docs`.
        NOTE(review): the grid-search cells pass the result of get_test_LL, which is
        a per-document mean rather than a corpus total — confirm which is intended.
    docs : list of token lists; ct is the total number of tokens they contain.

    Raises
    ------
    ValueError : if `docs` contains no tokens (ct would be 0).
    """
    # Count the tokens of each document; summing len(docs) would add up the
    # number of documents once per document instead.
    ct = sum(len(doc) for doc in docs)
    if ct == 0:
        raise ValueError("cannot compute perplexity: docs contain no tokens")
    return np.exp(-1*ll/ct)

In [None]:
# Seed Python's module-level generator so the shuffle in the next cell is reproducible.
# (An assignment `random.seed = 11` only rebinds the function name and seeds nothing.)
# setstate() is used rather than random.seed(11) so that this cell also works in a
# kernel session where `random.seed` has already been overwritten by such an assignment.
random.setstate(random.Random(11).getstate())

In [None]:
# split data in train and test set (no separate validation set is created here)
print(len(cleaned))
train_size = int(0.7*len(cleaned)) #70% for training
test_size = len(cleaned) - train_size #the remaining ~30% for testing

# Shuffle a copy: random.shuffle(cleaned) would reorder the corpus in place, after
# which cleaned[i] no longer corresponds to row i of `abstracts`.
shuffled_docs = random.sample(cleaned, len(cleaned))

train_docs = shuffled_docs[:train_size]
test_docs = shuffled_docs[train_size:]

len(train_docs) + len(test_docs) == len(cleaned)

In [None]:
# Defining the grid

c = []
for doc in train_docs: 
    c+=doc
w = len(set(c))

k2s = [50, 100, 150, 200, 300, 350, 450]
etas = [1/w, 10/w, 0.1/w, None]

In [None]:
# Grid search over the number of sub topics k2, the number of super topics k1
# (k2/5, k2/10, k2/20) and the priors alpha / eta (this needs to run for a while).
# Only the test perplexity of each combination is stored (in `pps`); the test
# log-likelihood `ll` is computed but not kept and no coherence score is computed here.

import time

pps = []
best_models = []  # not filled anywhere in this cell
# number of words in our vocabulary
c = []
for doc in train_docs: 
    c+=doc
w = len(set(c))

# define training parameters 
model_burn_in=250
train_updates = 1000
train_iter = 10

for k2 in k2s:
    for k1 in [int(k2/5), int(k2/10), int(k2/20)]:
    
        # `start` is reset once per (k2, k1) pair, so the elapsed time printed below
        # accumulates over all (alpha, eta) combinations tried for that pair
        start = time.time()

        for alpha in [1/k1, 0.1/k1, 0.01/k1]:      
            # NOTE(review): `etas` contains None, which train_PA forwards unchanged to
            # tp.PAModel — confirm that tomotopy accepts eta=None as "use the default"
            for eta in etas:

                print("K1= "+str(k1)+ ", K2= " + str(k2) + ", alpha = "+str(alpha)+", eta="+str(eta)+" -----------------------------")
                model, LLs, _ = train_PA(train_docs, k1 = k1, k2 = k2, 
                                          alpha=alpha,
                                          eta=eta, 
                                          model_burn_in=model_burn_in,
                                          train_updates = train_updates, 
                                          train_iter = train_iter)

                # LL (get_test_LL returns the mean log-likelihood over the test documents)
                ll = get_test_LL(test_docs, model)
                ## PP
                pp = compute_test_pp(ll, test_docs)
                pps += [pp]
                print("Test perplexity = "+str(pp))


                end = time.time()

                print("Time elapsed: "+ str(round(end - start,1))+" s")

## 3. Enriching the data

In this section we merge the data extracted from the topic modeling back into the original dataset. 

In [None]:
# Coherence (c_v) of the current model's topics over the whole corpus.
dictionary = Dictionary(cleaned)
topics = get_topics(model, k)
corpus = get_corpus(dictionary, cleaned)
# Pass the bag-of-words corpus built on the line above: `cleaned` holds token
# lists, not the (token id, count) pairs the `corpus` argument expects.
cm = CoherenceModel(topics=topics, corpus=corpus, dictionary=dictionary, texts=cleaned, coherence='c_v')
cm.get_coherence()

In [None]:
topic2words = [model.get_topic_word_dist(k) for k in range(model.k)]
doc2topics = [doc.get_topic_dist() for doc in model.docs]

In [None]:
# Look up the token stored under an id of the gensim dictionary; an index is
# required for the cell to parse. NOTE(review): id 0 is a placeholder — confirm the intended id.
dictionary[0]

In [None]:
model.get_topic_words(2)