In [None]:
!pip install tomotopy

In [1]:
import tomotopy as tp
import pickle
import time as timer
import pathlib
import numpy as np
import pandas as pd

In [2]:
def mallet_corpus_to_df(corpusFile: pathlib.Path):
    """Converts a Mallet corpus file (i.e., file required for the Mallet import command) to a pandas DataFrame

    Every non-blank line is split at the FIRST occurrence of the ``' 0 '``
    field: what precedes it becomes the document id and everything after it
    becomes the document text (so a text that itself contains ``' 0 '`` is
    kept whole). Blank lines are skipped.

    Parameters
    ----------
    corpusFile: pathlib.Path
        Path to the Mallet corpus file

    Returns
    -------
    :   pandas.DataFrame
        DataFrame with the corpus, with string columns ``id`` and ``text``
        and one row per non-blank line of the file

    Raises
    ------
    ValueError
        If a non-blank line does not contain the ``' 0 '`` field
    """

    indexes = []
    corpus = []
    # Single pass over the file; the context manager guarantees it is closed.
    with open(corpusFile, encoding="utf-8") as fin:
        for line_no, line in enumerate(fin, start=1):
            if not line.strip():
                continue  # tolerate empty / trailing blank lines
            doc_id, separator, text = line.partition(' 0 ')
            if not separator:
                raise ValueError(
                    f"Line {line_no} of {corpusFile} has no ' 0 ' field separator")
            indexes.append(doc_id.strip())
            corpus.append(text.strip())
    corpus_dict = {
        'id': indexes,
        'text': corpus
    }
    return pd.DataFrame(corpus_dict)

In [3]:
# Read the corpus file into a DataFrame with `id` and `text` columns
path_corpus = pathlib.Path(
    "/export/usuarios_ml4ds/lbartolome/Datasets/CORDIS/models_preproc/iter_0/corpus.txt"
)
df = mallet_corpus_to_df(corpusFile=path_corpus)
df

Unnamed: 0,id,text
0,100016,methods processes embedded_systems embed criti...
1,115153,rapid point_care platforms infectious_diseases...
2,115632,recognise adverse drug reactions regulatory_ag...
3,115861,vaccine phase_ii sofia_ref main_objective exte...
4,116030,translational quantitative toxicology medicine...
...,...,...
65571,190119289,early_detection skin_cancer endure detect skin...
65572,190134697,water_vapor turbo compression thermal cool maj...
65573,190151860,geotechnical genetic inverse poor track_record...
65574,190161902,artificial_intelligence musical preserve prese...


In [4]:
# One single-element list per document: [[text_of_doc_0], [text_of_doc_1], ...]
df_lemas = [[doc_text] for doc_text in df["text"].tolist()]

In [9]:
# Named hLDA settings and the corpus used for training.
# NOTE(review): the visible model-construction cell passes literal values to
# tp.HLDAModel (with depth=2) instead of these names; only `mycorpus` is read
# there — confirm which values are intended.
tw = tp.TermWeight.ONE # term weighting scheme in TermWeight
min_cf = 0             # minimum collection frequency of words.
min_df = 0             # minimum document frequency of words.
rm_top = 0             # the number of top words to be removed.
depth = 4              # the maximum depth level of hierarchy between 2 ~ 32767
alpha = 10.0           # hyperparameter of Dirichlet distribution for document-depth level
eta = 0.1              # hyperparameter of Dirichlet distribution for topic-word
gamma = 1.0            # concentration coefficient of Dirichlet Process
seed = None            # random seed
mycorpus = df_lemas    # list of single-element lists, one per document
transform = None       # a callable object to manipulate arbitrary keyword arguments for a specific topic model

In [None]:
# Build the hLDA model; the hyperparameters are given here as literals.
mdl = tp.HLDAModel(
    tw=tp.TermWeight.ONE,
    min_cf=0,
    min_df=0,
    rm_top=0,
    depth=2,
    alpha=10.0,
    eta=0.1,
    gamma=1.0,
)

print("LLEGA 1")
# Every corpus entry is a list whose first element is the document text;
# the text is tokenised on whitespace before being added to the model.
for doc_entry in mycorpus:
    doc_tokens = doc_entry[0].split()
    mdl.add_doc(doc_tokens)
print("LLEGA 2")
# Zero-iteration call before reading the corpus statistics below
# (presumably this initialises the model — confirm against tomotopy docs).
mdl.train(0)
print("LLEGA 3")
corpus_stats_msg = 'Num docs:{}, Num Vocabs:{}, Total Words:{}'
print(corpus_stats_msg.format(len(mdl.docs), len(mdl.used_vocabs), mdl.num_words))

# Let's train the model: 1000 iterations in chunks of 20, reporting the
# log-likelihood per word before each chunk and once more at the end.
progress_msg = 'Iteration: {:04}, LL per word: {:.4}'
for done_iters in range(0, 1000, 20):
    print(progress_msg.format(done_iters, mdl.ll_per_word))
    mdl.train(20)
print(progress_msg.format(1000, mdl.ll_per_word))

mdl.summary()


LLEGA 1
LLEGA 2
LLEGA 3
Num docs:65576, Num Vocabs:20374, Total Words:4497518
Iteration: 0000, LL per word: -9.169
Iteration: 0020, LL per word: -8.102
Iteration: 0040, LL per word: -8.055


In [None]:
# Print the top words of every topic id for which `is_live_topic` is true.
# The label is built from the same `top_n` that is passed to the query, so the
# printed header always matches the number of words requested (the header used
# to say "Top 10" while 15 words were requested).
top_n = 15
for k in range(mdl.k):
    if not mdl.is_live_topic(k):
        continue
    print('Top {} words of topic #{}'.format(top_n, k))
    print(mdl.get_topic_words(k, top_n=top_n))

In [None]:
# Save the hLDA model held in `mdl` to disk so it can be reloaded later
hlda_save = 'hlda.bin'
mdl.save(hlda_save)

In [None]:
# Reload the saved model and print the top words of each live topic (currently disabled)
#mdl = tp.HLDAModel.load(hlda_save) 
#for k in range(mdl.k):
#    if not mdl.is_live_topic(k): continue
#    print('Top 10 words of topic #{}'.format(k))
#    print(mdl.get_topic_words(k, top_n=10))

In [119]:
# Load the array stored in betas.npy under another model's TMmodel folder
# (presumably that model's topic-word matrix — confirm). `betas_` is not
# read again in the visible part of the notebook.
betas_ = np.load('/export/usuarios_ml4ds/lbartolome/Datasets/CORDIS/htm_variability_models/htm_6_tpcs_20230922/TMmodel/betas.npy')

In [123]:
# Display the loaded array (NumPy abbreviates long rows in the output)
betas_

array([[8.38666805e-05, 4.13074700e-05, 1.00000000e-12, ...,
        1.87761233e-05, 3.00417966e-05, 1.00000000e-12],
       [9.97873293e-05, 2.49468331e-05, 7.10984724e-05, ...,
        1.00000000e-12, 1.00000000e-12, 1.00000000e-12],
       [1.14584453e-04, 4.86983932e-05, 1.00000000e-12, ...,
        1.00000000e-12, 1.00000000e-12, 3.43753367e-05],
       [1.00000000e-12, 1.00000000e-12, 1.00000000e-12, ...,
        1.00000000e-12, 1.00000000e-12, 1.00000000e-12],
       [3.41553395e-05, 1.30115576e-04, 1.00000000e-12, ...,
        1.00000000e-12, 1.00000000e-12, 1.00000000e-12],
       [3.71241193e-05, 1.00000000e-12, 1.00000000e-12, ...,
        1.00000000e-12, 1.00000000e-12, 1.00000000e-12]])

In [69]:
# Load a previously saved hLDA model from disk; this rebinds `mdl`, so the
# cells below work on the loaded model. Note the path differs from the
# 'hlda.bin' file written by the save cell.
mdl = tp.HLDAModel.load("/export/usuarios_ml4ds/lbartolome/Repos/my_repos/UserInLoopHTM/experiments/hlda/output/hlda_cordis.bin") 

In [71]:
# Number of entries in the loaded model's vocabulary
len(mdl.vocabs)

20374

In [72]:
# Number of topic ids in the loaded model (the range iterated by the cells below)
mdl.k

96

In [137]:
# Shape check: stack one word distribution per topic id and inspect the result
topic_word_rows = [mdl.get_topic_word_dist(topic_id) for topic_id in range(mdl.k)]
np.array(topic_word_rows).shape

(96, 20374)

In [124]:
# Topic-word matrix: row t is the word distribution returned for topic id t
betas = np.array(
    [mdl.get_topic_word_dist(topic_id) for topic_id in range(mdl.k)]
)
betas.shape

(96, 20374)

In [125]:
# Re-weight the topic-word matrix so that words with a similar weight in every
# topic score close to zero:
#     betas_ds[t, w] = betas[t, w] * (log betas[t, w] - mean over t of log betas[t, w])
betas_ds = np.copy(betas)
# Shift every entry away from zero before taking logarithms, only if needed
if betas_ds.min() < 1e-12:
    betas_ds += 1e-12
log_betas = np.log(betas_ds)
# Per-word mean of the log values across the mdl.k topics, as a column vector
deno = np.reshape(sum(log_betas) / mdl.k, (len(mdl.vocabs), 1))
# Repeat that mean once per topic so it lines up element-wise with betas_ds
deno = np.tile(deno.T, (mdl.k, 1))
betas_ds = betas_ds * (log_betas - deno)
betas_ds

array([[ 8.81692984e-04,  2.59725790e-02,  4.59628272e-02, ...,
        -5.19945132e-07, -5.15196468e-07, -5.21960272e-07],
       [-4.89315327e-05, -6.56981603e-05, -5.23421798e-05, ...,
         6.82128010e-05,  7.03226895e-05,  6.73174500e-05],
       [-4.89315327e-05, -6.56981603e-05, -5.23421798e-05, ...,
         6.82128010e-05,  7.03226895e-05,  6.73174500e-05],
       ...,
       [-4.89315327e-05, -6.56981603e-05, -5.23421798e-05, ...,
         6.82128010e-05,  7.03226895e-05,  6.73174500e-05],
       [-4.89315327e-05, -6.56981603e-05, -5.23421798e-05, ...,
         6.82128010e-05,  7.03226895e-05,  6.73174500e-05],
       [-4.89315327e-05, -6.56981603e-05, -5.23421798e-05, ...,
         6.82128010e-05,  7.03226895e-05,  6.73174500e-05]])

In [126]:
# Lookup tables between words and their position in the model vocabulary;
# the id -> word table is keyed by the position converted to a string.
vocab_w2id = {word: position for position, word in enumerate(mdl.vocabs)}
vocab_id2w = {str(position): word for position, word in enumerate(mdl.vocabs)}

In [130]:
# For every topic id, describe the topic by its `n_words` highest-scoring
# words in betas_ds, joined into one comma-separated string.
n_words = 15
tpc_descs = [
    (
        tpc_id,
        ', '.join(
            mdl.vocabs[word_id]
            for word_id in np.argsort(betas_ds[tpc_id])[::-1][:n_words]
        ),
    )
    for tpc_id in range(mdl.k)
]


In [134]:
#for i in range(mdl.k):
#    print(f"{str(i)}: {mdl.get_topic_words(i)}")

In [131]:
# Drop the descriptions of a hand-picked set of topic ids and show the rest
excluded_tpc_ids = {0, 1, 2, 3, 4, 5, 6, 85, 89, 92, 93, 94, 95}
tpc_descs = [desc for desc in tpc_descs if desc[0] not in excluded_tpc_ids]
tpc_descs

[(7,
  'supernova_explosion, place_september, microtubule_mt, supersymmetric_gauge, astrophysical_phenomenon, diagnostic_audit, atlantic_meridional, coach_client, historiographic, nucleosomes, absorption_coefficient, astrophysical_object, resect, calcium_image, science_cafã'),
 (8,
  'cellular, genetic, protein, cancer, deoxyribonucleic_acid, disease, mechanism, molecule, ribonucleic_acid, genomic, signal, control, pathway, tissue, novel'),
 (9,
  'quantum, molecule, electric, optic, spin, control, light, atom, magnetic, novel, interaction, electron, energy, photonic, surface'),
 (10,
  'galaxy, particle, energy, universe, observation, star, evolution, mass, dark_matter, search, detector, measurement, solar, formation, large_hadron_collider'),
 (11,
  'energy, battery, fuel, cellular, electric, efficiency, carbon_dioxide, emission, hydrogen, vehicle, component, solar, plant, heat, reduction'),
 (12,
  'patient, cancer, therapeutic, treatment, cellular, clinical, disease, drug, tumor, d