In [83]:
import sys
sys.path.append('../')

import json
import multiprocessing as mp
import pathlib
import sys
import time
from subprocess import check_output
import warnings
import scipy.sparse as sparse
import random
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

# Import the project's topic-model wrapper with DeprecationWarning and
# FutureWarning muted. The import has to live inside the `with` body:
# `catch_warnings()` restores the previous filters as soon as the block exits,
# so an import placed after it would not be covered by the "ignore" filters.
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    warnings.filterwarnings("ignore", category=FutureWarning)
    from src.topicmodeling.manageModels import TMmodel

**Training parameters**

In [74]:
# Scalars that are referenced by name below (ntopics is also read by the
# coherence-splitting cell further down the notebook).
ntopics, dropout, epochs = 5, 0.1, 1

# Network / optimiser settings, in the same key order as before.
training_params = {
    "activation": "softplus",
    "batch_size": 64,
    "dropout": dropout,
    "hidden_sizes": (50, 50),
    "labels": "",
    "learn_priors": True,
    "lr": 2e-3,
    "momentum": 0.99,
    "num_data_loader_workers": mp.cpu_count(),
    "num_threads": 4,
    "optimize_interval": 10,
    "reduce_on_plateau": False,
    "sbert_model_to_load": "paraphrase-distilroberta-base-v1",
    "solver": "adam",
    "thetas_thr": 0.003,
    "topic_prior_mean": 0.0,
    "topic_prior_variance": None,
}

# Model identity and run-length settings, appended after the keys above.
training_params.update({
    "ctm_model_type": "CombinedTM",
    "model_type": "prodLDA",
    "ntopics": ntopics,
    "num_epochs": epochs,
    "num_samples": 20,
})

In [75]:
def get_model_config(TMparam,
                     hierarchy_level,
                     htm_version,
                     expansion_tpc,
                     thr):
    """Assemble the configuration dictionary for a "ctm" trainer run.

    Parameters
    ----------
    TMparam : mapping
        Training settings. Every key listed in ``wanted_keys`` must be
        present (a missing one raises ``KeyError``); any other key is ignored.
    hierarchy_level, htm_version, expansion_tpc, thr
        Stored unchanged under the keys "hierarchy-level", "htm-version",
        "expansion_tpc" and "thr" respectively.

    Returns
    -------
    dict
        Keys, in order: "trainer", "TMparam", "hierarchy-level",
        "htm-version", "expansion_tpc", "thr".
    """
    # Settings copied from TMparam, in the order they appear in the output.
    wanted_keys = (
        "ntopics", "thetas_thr", "labels", "model_type", "ctm_model_type",
        "hidden_sizes", "activation", "dropout", "learn_priors", "lr",
        "momentum", "solver", "num_epochs", "reduce_on_plateau",
        "batch_size", "topic_prior_mean", "topic_prior_variance",
        "num_samples", "num_data_loader_workers",
    )

    tm_subset = {}
    for key in wanted_keys:
        tm_subset[key] = TMparam[key]

    return {
        "trainer": "ctm",
        "TMparam": tm_subset,
        "hierarchy-level": hierarchy_level,
        "htm-version": htm_version,
        "expansion_tpc": expansion_tpc,
        "thr": thr,
    }

**Training**

In [85]:
def train_automatic(path_corpus: str, path_script: str):
    """Train the root topic model by running ``topicmodeling.py`` in a subprocess.

    Steps:
      1. Check that ``path_corpus`` exists (as a file or a directory).
      2. Build the root-level training configuration from the module-level
         ``training_params`` via ``get_model_config``.
      3. If ``trainconfig.json`` exists next to the corpus, replace its
         "TMparam" entry with the freshly built one and rewrite the file.
      4. Run ``<path_script>/src/topicmodeling/topicmodeling.py --train
         --config <trainconfig.json>`` and print the elapsed wall-clock time.

    Parameters
    ----------
    path_corpus : str
        Path to the training corpus (already preprocessed).
    path_script : str
        Folder that contains ``src/topicmodeling/topicmodeling.py``.

    Raises
    ------
    SystemExit
        If ``path_corpus`` is neither an existing file nor a directory.
    subprocess.CalledProcessError
        If the training script exits with a non-zero status.
    """
    # Get training corpus (already preprocessed)
    corpusFile = pathlib.Path(path_corpus)
    print(corpusFile)
    # `is_file` has to be *called*: the bare bound method is always truthy,
    # which made this existence check impossible to trigger.
    if not corpusFile.is_dir() and not corpusFile.is_file():
        sys.exit(
            "The provided corpus file does not exist.")

    # Train root model
    train_config = get_model_config(
        TMparam=training_params,
        hierarchy_level=0,
        htm_version=None,
        expansion_tpc=None,
        thr=None)

    configFile = corpusFile.parent.joinpath("trainconfig.json")
    # NOTE(review): when trainconfig.json is absent nothing is written and the
    # script is still launched with that path -- confirm this is intended.
    if configFile.is_file():
        with configFile.open('r', encoding='utf8') as fin:
            train_config_txt = json.load(fin)
        # Only the "TMparam" section is refreshed; every other key already in
        # the file is written back as it was read.
        train_config_txt["TMparam"] = train_config["TMparam"]
        with configFile.open("w", encoding="utf-8") as fout:
            json.dump(train_config_txt, fout, ensure_ascii=False,
                      indent=2, default=str)

    t_start = time.perf_counter()
    # Pass the command as an argument list (no shell) so paths containing
    # spaces or shell metacharacters reach the script intact.
    script = pathlib.Path(path_script).joinpath(
        "src", "topicmodeling", "topicmodeling.py")
    cmd_args = ["python", script.as_posix(),
                "--train", "--config", configFile.as_posix()]
    cmd = " ".join(cmd_args)
    print(cmd)

    print(f'-- -- Running command {cmd}')
    # check_output raises CalledProcessError on a non-zero exit status; the
    # captured stdout itself is not used.
    check_output(args=cmd_args)
    t_end = time.perf_counter()

    t_total = t_end - t_start
    print(f"Total training time root model --> {t_total}")

In [86]:
# Launch training of the root model. Both locations are absolute paths on the
# author's machine and need adjusting before running elsewhere.
train_automatic(
    path_corpus="/Users/lbartolome/Documents/GitHub/UserInLoopHTM/data/CORDIS/corpus.parquet",
    path_script="/Users/lbartolome/Documents/GitHub/UserInLoopHTM",
)

/Users/lbartolome/Documents/GitHub/UserInLoopHTM/data/CORDIS/corpus.parquet
python /Users/lbartolome/Documents/GitHub/UserInLoopHTM/src/topicmodeling/topicmodeling.py --train --config /Users/lbartolome/Documents/GitHub/UserInLoopHTM/data/CORDIS/trainconfig.json
-- -- Running command python /Users/lbartolome/Documents/GitHub/UserInLoopHTM/src/topicmodeling/topicmodeling.py --train --config /Users/lbartolome/Documents/GitHub/UserInLoopHTM/data/CORDIS/trainconfig.json


INFO:textPreproc:-- -- Saving lemmas and embeddings in auxiliary files
INFO:textPreproc:ddf read, starting compute
Epoch: [1/1]	 Seen Samples: [45824/45837]	Train Loss: 950.9997712886533	Time: 0:01:03.262346: : 1it [02:50, 170.98s/it]                              
100%|██████████| 717/717 [01:09<00:00, 10.38it/s] 
100%|██████████| 717/717 [01:08<00:00, 10.47it/s] 
INFO:TMmodel:-- -- -- Topic model object (TMmodel) successfully created
INFO:gensim.corpora.dictionary:adding document #0 to Dictionary<0 unique tokens: []>
INFO:gensim.corpora.dictionary:adding document #10000 to Dictionary<19604 unique tokens: ['argument', 'british', 'career', 'cease', 'change']...>
INFO:gensim.corpora.dictionary:adding document #20000 to Dictionary<19880 unique tokens: ['argument', 'british', 'career', 'cease', 'change']...>
INFO:gensim.corpora.dictionary:adding document #30000 to Dictionary<19884 unique tokens: ['argument', 'british', 'career', 'cease', 'change']...>
INFO:gensim.corpora.dictionary:adding 

CalledProcessError: Command 'python /Users/lbartolome/Documents/GitHub/UserInLoopHTM/src/topicmodeling/topicmodeling.py --train --config /Users/lbartolome/Documents/GitHub/UserInLoopHTM/data/CORDIS/trainconfig.json' died with <Signals.SIGSEGV: 11>.

In [2]:
# Folder of the trained topic model (machine-specific absolute path); the
# cells below read thetas.npz and the TMmodel data from it.
path_model = pathlib.Path(
    "/Users/lbartolome/Documents/GitHub/UserInLoopHTM/data/CORDIS/TMmodel"
)

In [33]:
# Document-topic matrix stored in sparse form inside the model folder.
thetas = sparse.load_npz(path_model / "thetas.npz")
# Dense view, ndocs x ntopics: (45837, ntopics)
thetas_dense = thetas.todense()

In [50]:
def sum_up_to(vector, max_sum):
    """Scale a weight vector to integer counts that add up to ``max_sum``.

    Every entry is multiplied by ``max_sum`` and truncated to an integer.
    The gap left by truncation is then closed one unit at a time on randomly
    chosen entries (using the ``random`` module, so results can differ
    between runs unless ``random`` is seeded):

    * total too small -> add 1 to entries that were non-zero after
      truncation (or to any entry when all of them truncated to zero);
    * total too large -> subtract 1 from entries that are still positive.

    Parameters
    ----------
    vector : array-like
        Weights; flattened to 1-D, so a single-row ``np.matrix``, an array
        or a plain list are all accepted.
    max_sum : int
        Total the returned counts must reach.

    Returns
    -------
    numpy.ndarray
        1-D integer array with one count per input entry.

    Raises
    ------
    ValueError
        If ``vector`` is empty while ``max_sum`` is non-zero.
    """
    counts = (np.asarray(vector, dtype=float).ravel() * max_sum).astype(np.int_)
    total = int(counts.sum())
    if total == max_sum:
        return counts
    if counts.size == 0:
        raise ValueError(
            "Cannot distribute a non-zero total over an empty vector.")

    if total < max_sum:
        # Prefer entries that survived truncation; fall back to every position
        # so random.choice never receives an empty list.
        eligible = np.flatnonzero(counts).tolist() or list(range(counts.size))
        while total < max_sum:
            counts[random.choice(eligible)] += 1
            total += 1
    else:
        # The truncated counts overshoot (weights summing to more than 1):
        # walk back down instead of incrementing forever.
        while total > max_sum:
            positive = np.flatnonzero(counts > 0).tolist()
            if not positive:
                break
            counts[random.choice(positive)] -= 1
            total -= 1
    return counts

def get_str_rpr(vector, max_sum):
    """Render a weight vector as a ``"t<index>|<count>"`` string.

    The vector is first converted to integer counts with ``sum_up_to``;
    each count then becomes one ``t<index>|<count>`` token and the tokens are
    joined with single spaces (e.g. ``"t0|808 t1|66 t2|59"``).

    Parameters
    ----------
    vector : array-like
        Weights handed to ``sum_up_to``.
    max_sum : int
        Target total handed to ``sum_up_to``.

    Returns
    -------
    str
        Space-separated tokens, empty when there are no entries.
    """
    counts = sum_up_to(vector, max_sum)
    tokens = (f"t{pos}|{count}" for pos, count in enumerate(counts))
    return " ".join(tokens)

In [41]:
# Read the lemmatised documents. The file is opened in a `with` block so the
# handle is closed once the lines have been consumed.
# NOTE(review): rsplit(' 0 ') has no maxsplit, so [1] keeps only the text
# between the first and second ' 0 ' separators -- confirm the lemma text
# can never contain ' 0 ' itself.
with open(
    pathlib.Path("/Users/lbartolome/Documents/GitHub/UserInLoopHTM/data/CORDIS/modelFiles/corpus.txt"),
    encoding="utf-8",
) as fin:
    corpus_txt = [line.rsplit(' 0 ')[1].strip() for line in fin]
print(len(corpus_txt))

# One "t<k>|<count>" string per document, with counts adding up to 1000.
doc_tpc_rpr = [get_str_rpr(thetas_dense[row, :], 1000) for row in range(len(thetas_dense))]

# Building from a dict (instead of zip) makes pandas raise if the corpus and
# the thetas matrix disagree on the number of documents, rather than silently
# truncating to the shorter one.
df = pd.DataFrame({'lemmas': corpus_txt, 'doc-tpc': doc_tpc_rpr})
df

45837


Unnamed: 0,lemmas,doc-tpc
0,discovery pleasure change italy long west repr...,t0|808 t1|66 t2|59 t3|39 t4|28
1,spectroscopic computing investigation chromium...,t0|227 t1|127 t2|150 t3|116 t4|380
2,enterprise_europe targeting innovation impleme...,t0|81 t1|534 t2|58 t3|251 t4|76
3,network observatories research_infrastructures...,t0|242 t1|437 t2|142 t3|90 t4|89
4,dynamical large_number partial_differential ph...,t0|532 t1|56 t2|208 t3|34 t4|170
...,...,...
45832,causal relations genetic environment interacti...,t0|244 t1|353 t2|243 t3|58 t4|102
45833,theranostic nanomedicines oncology combinatori...,t0|68 t1|108 t2|646 t3|66 t4|112
45834,tame legal politic accountability post_crisis ...,t0|378 t1|413 t2|96 t3|52 t4|61
45835,mathematic methods financial risk management p...,t0|464 t1|146 t2|74 t3|159 t4|157


In [71]:
# Per-topic summary table of the trained model.
df, vocab_id2w = TMmodel(path_model).to_dataframe()

# Built as one chain that returns a new frame at every step, so re-running the
# cell does not depend on earlier in-place edits. The previous version called
# reset_index(drop=True) without keeping its result, which made it a no-op;
# here the reset is actually applied before the "t<k>" ids are attached.
df = (
    df
    # 'topic_coherence' holds two measures back to back: the first `ntopics`
    # values go to cohr_cv, the remainder to cohr_npmi.
    .assign(
        cohr_cv=lambda d: d['topic_coherence'].apply(lambda x: x[0:ntopics]),
        cohr_npmi=lambda d: d['topic_coherence'].apply(lambda x: x[ntopics:]),
    )
    .drop(columns=['topic_coherence'])
    # Expand the list-valued cells into one row per list element.
    .apply(pd.Series.explode)
    .reset_index(drop=True)
    .assign(id=lambda d: [f"t{i}" for i in range(len(d))])
    .set_index("id")
)
df.sort_values(by=['alphas'], ascending=False)

INFO:TMmodel:-- -- -- Topic model object (TMmodel) successfully created


Unnamed: 0_level_0,betas,alphas,topic_entropy,ndocs_active,tpc_descriptions,tpc_labels,cohr_cv,cohr_npmi
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
t0,"[5.199836e-05, 4.97296e-05, 4.988181e-05, 4.98...",0.211105,0.999898,45837,"tradition, mathematic, scholar, history, croat...",,0.393015,-0.174302
t1,"[4.9700728e-05, 5.1815525e-05, 5.125947e-05, 5...",0.208491,0.999909,45837,"stakeholders, policy, network, implementation,...",,0.660844,0.084611
t2,"[5.0690152e-05, 5.0512714e-05, 4.9998544e-05, ...",0.199761,0.99992,45837,"identify, drug, associate, therapy, genome, ge...",,0.761429,0.111183
t3,"[5.0266062e-05, 5.0365707e-05, 4.97803e-05, 5....",0.193079,0.999947,45837,"market, reduce, battery, production, safe, equ...",,0.617074,0.066994
t4,"[5.001027e-05, 5.009509e-05, 5.085136e-05, 5.0...",0.187566,0.999937,45837,"spin, electronic, catalysis, magnetic, frequen...",,0.668901,0.09535


In [72]:
# Topic descriptions as a plain Python list, one string per topic.
df["tpc_descriptions"].tolist()


['tradition, mathematic, scholar, history, croatian, relation, notion, dig, draw, contemporary, neglect, carbohydrate_chemistry, catapult, selection_committee, researcher_night',
 'stakeholders, policy, network, implementation, support, country, innovation, impact, regional, governance, good_practice, facilitate, organisation, management, action',
 'identify, drug, associate, therapy, genome, genetic, mediate, tissue, stem_cell, cancer, biology, regulation, disease, tumor, mouse',
 'market, reduce, battery, production, safe, equipment, company, vehicle, operational, pricing, manufacture, industry, water, demonstration, product',
 'spin, electronic, catalysis, magnetic, frequency, quantum, semiconductor, nanoscale, computing, interface, chemistry, information_processing, surface, coupling, nanostructure']