In [None]:
!pip install tomotopy

In [1]:
import tomotopy as tp
import pickle
import time as timer
import pathlib
import numpy as np
import pandas as pd

In [2]:
def mallet_corpus_to_df(corpusFile: pathlib.Path):
    """Converts a Mallet corpus file (i.e., file required for the Mallet import command) to a pandas DataFrame

    Parameters
    ----------
    corpusFile: pathlib.Path
        Path to the Mallet corpus file

    Returns
    -------
    :   pandas.DataFrame
        DataFrame with the corpus
    """

    corpus = [line.rsplit(' 0 ')[1].strip() for line in open(
        corpusFile, encoding="utf-8").readlines()]
    indexes = [line.rsplit(' 0 ')[0].strip() for line in open(
        corpusFile, encoding="utf-8").readlines()]
    corpus_dict = {
        'id': indexes,
        'text': corpus
    }
    return pd.DataFrame(corpus_dict)

In [3]:
path_corpus = pathlib.Path("/export/usuarios_ml4ds/lbartolome/Datasets/CORDIS/models_preproc/iter_0/corpus.txt")
df = mallet_corpus_to_df(path_corpus)
df

Unnamed: 0,id,text
0,100016,methods processes embedded_systems embed criti...
1,115153,rapid point_care platforms infectious_diseases...
2,115632,recognise adverse drug reactions regulatory_ag...
3,115861,vaccine phase_ii sofia_ref main_objective exte...
4,116030,translational quantitative toxicology medicine...
...,...,...
65571,190119289,early_detection skin_cancer endure detect skin...
65572,190134697,water_vapor turbo compression thermal cool maj...
65573,190151860,geotechnical genetic inverse poor track_record...
65574,190161902,artificial_intelligence musical preserve prese...


In [4]:
df_lemas = df[["text"]].values.tolist()

In [9]:
tw = tp.TermWeight.ONE # term weighting scheme in TermWeight
min_cf = 0             # minimum collection frequency of words.
min_df = 0             # minimum document frequency of words.
rm_top = 0             # the number of top words to be removed. 
depth = 4              # the maximum depth level of hierarchy between 2 ~ 32767
alpha = 10.0           # hyperparameter of Dirichlet distribution for document-depth level
eta = 0.1              # hyperparameter of Dirichlet distribution for topic-word
gamma = 1.0            # concentration coeficient of Dirichlet Process
seed = None            # random seed
mycorpus = df_lemas  
transform = None       # a callable object to manipulate arbitrary keyword arguments for a specific topic model

In [None]:
mdl = tp.HLDAModel(tw = tp.TermWeight.ONE , min_cf= 0, min_df= 0, rm_top = 0, depth = 2, alpha = 10.0, eta = 0.1, gamma = 1.0)

print("LLEGA 1")
for texts in mycorpus:
    mdl.add_doc(texts[0].split())
print("LLEGA 2")
mdl.train(0)
print("LLEGA 3")
print('Num docs:{}, Num Vocabs:{}, Total Words:{}'.format(
    len(mdl.docs), len(mdl.used_vocabs), mdl.num_words))

# Let's train the model
for i in range(0, 1000, 20):
    print('Iteration: {:04}, LL per word: {:.4}'.format(i, mdl.ll_per_word))
    mdl.train(20)
print('Iteration: {:04}, LL per word: {:.4}'.format(1000, mdl.ll_per_word))

mdl.summary()


LLEGA 1
LLEGA 2
LLEGA 3
Num docs:65576, Num Vocabs:20374, Total Words:4497518
Iteration: 0000, LL per word: -9.169
Iteration: 0020, LL per word: -8.102
Iteration: 0040, LL per word: -8.055


In [None]:
for k in range(mdl.k):
    if not mdl.is_live_topic(k): continue
    print('Top 10 words of topic #{}'.format(k))
    print(mdl.get_topic_words(k, top_n=15))

In [None]:
#save lda model for reuse
hlda_save = 'hlda.bin'
mdl.save(hlda_save)

In [None]:
#load and print
#mdl = tp.HLDAModel.load(hlda_save) 
#for k in range(mdl.k):
#    if not mdl.is_live_topic(k): continue
#    print('Top 10 words of topic #{}'.format(k))
#    print(mdl.get_topic_words(k, top_n=10))