# Streaming LDA

This notebook produces the final LDA model used to enrich either the **graph** or the **embedding space**. Two separate models need to be trained, one for each representation. The model for the graph is trained on stemmed and lemmatised tokens, whereas the model for the embedding space is trained on the full words (tokenised, with stop words removed). This is the only difference — the hyperparameters are the same.

In [None]:
import pandas as pd
import os 
import numpy as np
import re
import random
import nltk
import pickle
from nltk import word_tokenize, RegexpTokenizer,PunktSentenceTokenizer, sent_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
#nltk.download('stopwords')
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

In [None]:
# Read the abstracts CSV and pull the 'abstract' column out as a plain Python list.
abstracts = pd.read_csv("abstracts_eng.csv")
abs_list = abstracts['abstract'].tolist()

In [None]:
# Tokenise every abstract with gensim's simple_preprocess; deacc=True strips accent marks.
# NOTE(review): str() turns a missing abstract (NaN) into the literal text "nan" —
# confirm the CSV contains no empty abstracts.
tokenised = [gensim.utils.simple_preprocess(str(abstract), deacc=True) for abstract in abs_list]
# Total number of tokens across the whole collection
count = sum(len(doc_tokens) for doc_tokens in tokenised)
print(str(count)+" tokens created")

In [None]:
# Vocabulary size after tokenisation: flatten all documents, then count the distinct tokens.
c = [token for doc in tokenised for token in doc]
len(set(c))

In [None]:
# Build a combined English + German + French stop-word list from NLTK,
# printing the running size after each language is added.
stop_words = stopwords.words('english')
print(len(stop_words))
stop_words = stop_words + stopwords.words('german')
print(len(stop_words))
stop_words = stop_words + stopwords.words('french')
print(len(stop_words))
# Membership is tested once per token in the corpus, so use a set (O(1) lookup)
# instead of scanning the whole list for every token.
stop_word_set = set(stop_words)
# NOTE(review): tokens were de-accented during tokenisation (deacc=True) while the NLTK
# lists are used as-is, so accented stop words may never match — confirm whether the
# stop-word lists should be de-accented as well.
tokenized_stop = [[word for word in doc if word not in stop_word_set] for doc in tokenised]

In [None]:
# Vocabulary size after stop-word removal: flatten all documents, then count the distinct tokens.
c = [token for doc in tokenized_stop for token in doc]
len(set(c))

In [None]:
# Normalise every token by Porter-stemming it and then passing the stem through the
# WordNet lemmatiser (stem first, lemmatise second — the order is kept as in the original pipeline).
lemmatiser = WordNetLemmatizer()
word_stemmer = PorterStemmer()
# Each distinct word is normalised exactly once and looked up afterwards, instead of
# re-running the stemmer and lemmatiser for every occurrence in the corpus.
vocabulary = {word for doc in tokenized_stop for word in doc}
normalised = {word: lemmatiser.lemmatize(word_stemmer.stem(word)) for word in vocabulary}
lemmatized = [[normalised[word] for word in doc] for doc in tokenized_stop]

In [None]:
# Vocabulary size after stemming + lemmatisation: flatten all documents, then count the distinct tokens.
c = [token for doc in lemmatized for token in doc]
len(set(c))

In [None]:
# Peek at the first fully pre-processed document (a list of stemmed + lemmatised tokens).
lemmatized[0]

In [None]:
# Export the collection with full words, i.e. only tokenised and with stop words removed.
# The context manager guarantees the file is flushed and closed once the dump has finished.
with open("collection_cleaned_fullwords.p", "wb") as fullwords_file:
    pickle.dump(tokenized_stop, fullwords_file)

In [None]:
# Export the fully pre-processed (stemmed + lemmatised) collection.
# The context manager guarantees the file is flushed and closed once the dump has finished.
with open("collection_cleaned.p", "wb") as cleaned_file:
    pickle.dump(lemmatized, cleaned_file)

### Training models

In [None]:
## Important: select `lemmatized` when training the model for the graph and
## `tokenized_stop` when training the model for the embeddings.
# A shallow copy is bound here because the collection is shuffled in place further down;
# copying keeps the pre-processed list itself in its original document order.

cleaned = list(lemmatized)
#cleaned = list(tokenized_stop)

In [None]:
import sys
import tomotopy as tp

In [None]:
# Shuffle the documents reproducibly before they are sliced into training / held-out subsets.
# random.shuffle reorders `cleaned` in place.
# NOTE(review): re-running this cell shuffles the already-shuffled list again and therefore
# yields a different order — run it exactly once per kernel session.
SEED = 11
random.seed(SEED)
random.shuffle(cleaned)

In [None]:
# Defining hyperparameters (baseline values; tw, min_cf, min_df, rm_top, seed, model_burn_in,
# train_updates and train_iter are assigned again in the grid-search section further down)
tw = tp.TermWeight.IDF # term weighting scheme in TermWeight. The default value is TermWeight.ONE
k = 100 # number of topics
min_cf=3 # minimum collection frequency of words. Words with a smaller collection frequency than min_cf are excluded from the model. The default value is 0, which means no words are excluded.
min_df=0 # minimum document frequency of words. Words with a smaller document frequency than min_df are excluded from the model. The default value is 0, which means no words are excluded
rm_top=5 # the number of top words to be removed. If you want to remove too common words from model, you can set this value to 1 or more. The default value is 0, which means no top words are removed.
alpha = 0.1 # hyperparameter of Dirichlet distribution for document-topic
eta = 0.01 # hyperparameter of Dirichlet distribution for topic-word
seed = 41 # random seed
model_burn_in = 100 # assigned to model.burn_in inside train_LDA
train_updates = 1000 # upper bound of the training loop in train_LDA: range(0, train_updates, train_iter)
train_iter = 10 # iterations per model.train() call in train_LDA; the log-likelihood is recorded after each call
# NOTE(review): save_path is not referenced anywhere else in the visible notebook, and the
# "150" in the file name does not match k = 100 — confirm whether it is still needed.
save_path = "lda_model150.bin" #.bin format

In [None]:
def train_LDA(documents, k, tw, min_cf=0, min_df=0, rm_top=0, alpha=0.1, eta=0.01, model_burn_in=100, 
              train_updates = 1000, train_iter = 10, seed=41):
    """Train a tomotopy LDA model and record its per-word log-likelihood during training.

    Training runs in chunks of ``train_iter`` iterations until ``train_updates``
    iterations have been performed. The final chunk is shortened when
    ``train_updates`` is not a multiple of ``train_iter``, so the total never
    exceeds ``train_updates``.

    Args:
        documents: iterable of documents; each one is passed to ``model.add_doc``.
        k: number of topics.
        tw: term weighting scheme handed to ``tp.LDAModel``.
        min_cf: ``min_cf`` argument of ``tp.LDAModel``.
        min_df: ``min_df`` argument of ``tp.LDAModel``.
        rm_top: ``rm_top`` argument of ``tp.LDAModel``.
        alpha: ``alpha`` argument of ``tp.LDAModel``.
        eta: ``eta`` argument of ``tp.LDAModel``.
        model_burn_in: value assigned to ``model.burn_in`` before training.
        train_updates: total number of training iterations.
        train_iter: number of iterations per ``model.train`` call, i.e. the
            interval at which the log-likelihood is sampled.
        seed: random seed handed to ``tp.LDAModel``.

    Returns:
        A tuple ``(model, LLs, time)``: the trained model, the list of
        ``model.ll_per_word`` values sampled after each chunk, and the list of
        iteration counts at which each chunk started (0, train_iter, ...).
    """
    # instantiate
    model = tp.LDAModel(tw=tw, min_df=min_df, min_cf=min_cf, rm_top=rm_top, k=k, alpha=alpha, eta=eta, seed=seed)

    # add documents to model
    for doc in documents:
        model.add_doc(doc)

    model.burn_in = model_burn_in
    # Zero training iterations: only initialises the model before the statistics are printed
    model.train(iter=0)
    print('Num docs:', len(model.docs), ', Vocab size:', len(model.used_vocabs), ', Num words:', model.num_words)
    print('Removed top words:', model.removed_top_words)
    print('Training...', file=sys.stderr, flush=True)

    # actual training
    chunk_starts = []  # not called `time`, to avoid shadowing the standard-library module name
    log_likelihoods = []
    for start in range(0, train_updates, train_iter):
        # Shorten the last chunk so that no more than train_updates iterations are run in total
        model.train(min(train_iter, train_updates - start))
        if start % 100 == 0:
            print('Iteration: {}'.format(start))
        chunk_starts.append(start)
        log_likelihoods.append(model.ll_per_word)

    return model, log_likelihoods, chunk_starts

### Topics of the top models from the grid search

In [None]:
# Defining hyperparameters shared by all grid-search candidates and by the final training run
# (k, alpha and eta are supplied per candidate; note that rm_top is 8 here)
tw = tp.TermWeight.IDF # term weighting scheme in TermWeight. The default value is TermWeight.ONE
min_cf=3 # minimum collection frequency of words. Words with a smaller collection frequency than min_cf are excluded from the model. The default value is 0, which means no words are excluded.
min_df=0 # minimum document frequency of words. Words with a smaller document frequency than min_df are excluded from the model. The default value is 0, which means no words are excluded
rm_top=8 # the number of top words to be removed. If you want to remove too common words from model, you can set this value to 1 or more. The default value is 0, which means no top words are removed.
seed = 41 # random seed
model_burn_in = 100 # assigned to model.burn_in inside train_LDA
train_updates = 1000 # upper bound of the training loop in train_LDA: range(0, train_updates, train_iter)
train_iter = 10 # iterations per model.train() call in train_LDA; the log-likelihood is recorded after each call

In [None]:
# Re-train the five best grid-search configurations, all on the same slice of the collection.
Subset = 5000
parameters = [
    {'k': 175, 'alpha': 5.71E-05, 'eta': 2.82E-05},
    {'k': 150, 'alpha': 0.000666667, 'eta': 2.82E-05},
    {'k': 125, 'alpha': 0.0008, 'eta': 2.82E-05},
    {'k': 100, 'alpha': 0.0001, 'eta': 2.82E-05},
    {'k': 75, 'alpha': 0.000133333, 'eta': 2.82E-05}
]
# Training slice: documents 10000 up to (but excluding) 3*Subset
batch = cleaned[10000:3*Subset]
models, LLs = [], []
for dicti in parameters:
    # dicti supplies k, alpha and eta; all remaining settings are shared between candidates
    model, loglikes, _ = train_LDA(
        batch,
        **dicti,
        tw=tw,
        min_cf=min_cf,
        rm_top=rm_top,
        model_burn_in=model_burn_in,
        train_updates=train_updates,
        train_iter=train_iter,
        seed=seed
    )
    models.append(model)
    LLs.append(loglikes)

### Manually evaluating the results of the top 5 grid-search models

In [None]:
# Inspect the model at index 2 of the grid-search results.
# summary() writes its report itself and returns None, so wrapping it in print()
# would only append a stray "None" line after the report.
models[2].summary()
#print(LLs[0])

In [None]:
# Infer topic distributions for the held-out documents (index 15000 onwards)
# using the grid-search model at index 3.
test = cleaned[15000:]
eval_model = models[3]
# Each held-out document has to be converted into a model document before inference
test_inf = [eval_model.make_doc(doc) for doc in test]
tpc_dist, ll = eval_model.infer(test_inf)

In [None]:
# Number of inferred topic distributions — presumably one per held-out document, i.e. len(test)
len(tpc_dist)

In [None]:
# Topic indices of the second held-out document, sorted in ascending order of inferred weight
# (so the dominant topic comes last).
topic_test = np.array(tpc_dist[1])
topic_test.argsort(axis=0)


In [None]:
# Words of topic 76 in the model at index 3 — presumably a topic picked from the argsort output above
models[3].get_topic_words(76)

In [None]:
# Tokens of cleaned[15001] joined into one string; with test = cleaned[15000:] this is test[1],
# presumably the document behind tpc_dist[1]
' '.join(cleaned[15001])

### Finally, training and storing the best model for each of 4 batches

In [None]:
# Train the selected configuration independently on each of num_batches contiguous
# slices of the collection.
models = []
LLs = []
num_batches = 4
# Split with plain slicing, using the same batch sizes np.array_split would produce
# (the first len(cleaned) % num_batches batches get one extra document).
# np.array_split would first coerce the variable-length token lists into an ndarray,
# which recent NumPy versions reject as an inhomogeneous (ragged) sequence.
base_size, remainder = divmod(len(cleaned), num_batches)
batches = []
start = 0
for batch_num in range(num_batches):
    stop = start + base_size + (1 if batch_num < remainder else 0)
    batches.append(cleaned[start:stop])
    start = stop
dicti = {'k':125, 'alpha':0.0008,'eta':2.82E-05}
for batch_num in range(0, num_batches):

    batch = batches[batch_num]
    # seed is passed explicitly, matching the grid-search call, instead of relying on the default
    model, loglikes, _ = train_LDA(batch, **dicti, tw=tw, min_cf=min_cf, rm_top=rm_top, 
                                 model_burn_in=model_burn_in, 
                                 train_updates = train_updates, train_iter = train_iter, seed = seed)
    models.append(model)
    LLs.append(loglikes)
    

In [None]:
# Persist one model file per trained batch model. The names are derived from the number of
# models (4 models -> LDA1batch1.bin ... LDA1batch4.bin), so this cell keeps working when the
# number of batches changes instead of failing on a hard-coded list of four names.
names = ['LDA1batch{}.bin'.format(batch_no) for batch_no in range(1, len(models) + 1)]
for name, model in zip(names, models):
    model.save(name)