In [3]:
import pandas as pd
import os 
import numpy as np
import re
import random
import tomotopy as tp
import sys
import pickle
import time

In [4]:
# Load the preprocessed abstracts that are fed to the topic model.
# Despite the ".csv" extension, the file is opened in binary mode and read
# with pickle, so it must be a pickle file.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only point this at files produced by a trusted preprocessing step.
input_path = "abstracts_processed.csv"
with open(input_path, mode="rb") as pickled_file:
    documents = pickle.load(pickled_file)

In [13]:
## Setting the hyperparameters
# Values in this cell configure the tomotopy HDP model and the training loop further down.

# term weighting scheme (a member of tp.TermWeight). The default value is TermWeight.ONE
tw = tp.TermWeight.ONE
# initial number of topics; per the model summary, the number of topics is adjusted for the data during training
initial_k = 2
# minimum collection frequency of words. Words with a smaller collection frequency than min_cf
# are excluded from the model. The default value is 0, which means no words are excluded.
min_cf=3
# minimum document frequency of words. Words with a smaller document frequency than min_df
# are excluded from the model. The default value is 0, which means no words are excluded.
min_df=0
# the number of top (most common) words to be removed. Set this to 1 or more to drop overly common words.
# The default value is 0, which means no top words are removed.
rm_top=5
alpha = 0.1 # concentration coefficient of Dirichlet Process for document-table
eta = 0.01 # hyperparameter of Dirichlet distribution for topic-word
gamma = 0.1 # concentration coefficient of Dirichlet Process for table-topic
transform = None # a callable object to manipulate arbitrary keyword arguments for a specific topic model
seed = 41 # random seed
model_burn_in = 500 # burn-in steps; assigned to model.burn_in before training
train_updates = 10000 # total number of training iterations (upper bound of the training loop's range)
train_iter = 10 # iterations per model.train() call; the log-likelihood is recorded once per call
save_path = "hdp_model.bin" # file path for the saved model (.bin format); presumably used by a later save step -- TODO confirm

In [14]:
# Build the HDP topic model from the configured hyperparameters.
# `seed` is passed explicitly: it was configured but never handed to the
# constructor, so the model fell back to a randomly drawn seed and runs were
# not tied to the configured value.
model = tp.HDPModel(
    tw=tw,
    min_cf=min_cf,
    min_df=min_df,
    rm_top=rm_top,
    initial_k=initial_k,
    alpha=alpha,
    eta=eta,
    gamma=gamma,
    seed=seed,
    transform=transform,
)

In [15]:
# adding documents to the model 
# Register every loaded document with the model.
# Assumes each item of `documents` is an iterable of token strings -- TODO confirm.
# NOTE: re-running this cell calls add_doc for every document again on the
# same model object, so recreate the model before re-running it.
for doc in documents:
    model.add_doc(doc)

In [16]:

# Train the model in chunks of `train_iter` iterations, recording the
# per-word log-likelihood after each chunk and timing the whole cell.
start = time.time()

model.burn_in = model_burn_in

# Zero-iteration call: initialises the model so the corpus statistics below
# can be reported before the actual training starts.
model.train(iter=0)
print(
    'Num docs:', len(model.docs),
    ', Vocab size:', len(model.used_vocabs),
    ', Num words:', model.num_words,
)
print('Removed top words:', model.removed_top_words)
print('Training...', file=sys.stderr, flush=True)

# Actual training. `t` holds the iteration count at the start of each chunk;
# `LLs` holds the log-likelihood per word measured right after that chunk.
t = []
LLs = []
for i in range(0, train_updates, train_iter):
    model.train(train_iter)
    # Progress message once every 1000 iterations.
    if i % 1000 == 0:
        print('Iteration: {}'.format(i))
    t.append(i)
    LLs.append(model.ll_per_word)

end = time.time()
print("Time elapsed: {} s".format(round(end - start, 1)))

Training...


Num docs: 20494 , Vocab size: 42219 , Num words: 2045097
Removed top words: ['use', 'model', 'result', 'studi', 'base']
Iteration: 0
Iteration: 100
Iteration: 200
Iteration: 300
Iteration: 400
Iteration: 500
Iteration: 600
Iteration: 700
Iteration: 800
Iteration: 900


In [17]:
# Print tomotopy's built-in text report for the model: basic info, training
# info, the initial parameters it was constructed with, and fitted parameters.
model.summary()

<Basic Info>
| HDPModel (current version: 0.9.1)
| 20494 docs, 2045097 words
| Total Vocabs: 69312, Used Vocabs: 42219
| Entropy of words: -8.71136
| Removed Vocabs: use model result studi base
|
<Training Info>
| Iterations: 1000, Burn-in steps: 100
| Optimization Interval: 10
| Log-likelihood per word: -8.33705
|
<Initial Parameters>
| tw: TermWeight.ONE
| min_cf: 3 (minimum collection frequency of words)
| min_df: 0 (minimum document frequency of words)
| rm_top: 5 (the number of top words to be removed)
| initial_k: 150 (the initial number of topics between 2 ~ 32767 The number of topics will be adjusted for data during training)
| alpha: 0.1 (concentration coeficient of Dirichlet Process for document-table )
| eta: 0.01 (hyperparameter of Dirichlet distribution for topic-word)
| gamma: 0.1 (concentration coeficient of Dirichlet Process for table-topic)
| seed: 205194031 (random seed)
| trained in version 0.9.1
|
<Parameters>
| alpha (concentration coeficient of Dirichlet Process f

In [None]:
def train_HDP()