In [1]:
import topmost
from topmost.data import download_dataset

# Device string handed to the dataset and the model below; "cpu" is the alternative.
device = "cuda"

# Location of the NYT example corpus (note: the dataset cell below rebinds dataset_dir).
dataset_dir = "./datasets/NYT"

# Fetch the NYT archive into the local ./datasets cache directory.
download_dataset(
    'NYT',
    cache_path='./datasets',
)

Downloading https://raw.githubusercontent.com/BobXWu/TopMost/master/data/NYT.zip to ./datasets/NYT.zip


100%|██████████| 15070620/15070620 [00:01<00:00, 11511023.89it/s]


In [7]:
# Load a dataset.
#
# The default location is a machine-specific absolute path, so it only exists on
# the original author's server. Set the TOPMOST_DATASET_DIR environment variable
# to point at another corpus directory (for example the "./datasets/NYT" folder
# referenced in the first cell) to run this notebook elsewhere.
import os

dataset_dir = os.environ.get(
    "TOPMOST_DATASET_DIR",
    '/export/usuarios_ml4ds/lbartolome/Datasets/CORDIS/models_preproc/iter_0/topmost',
)

# read_labels=True so that the label-based clustering/classification
# evaluations further down can use dataset.train_labels / dataset.test_labels.
dataset = topmost.data.BasicDataset(dataset_dir, read_labels=True, device=device)

train_size:  52460
test_size:  13116
vocab_size:  10000
average length: 74.651


In [8]:
# Build the hierarchical topic model and place it on the chosen device.
# Alternatives that take the same arguments: topmost.models.SawETM or
# topmost.models.TraCo, e.g. with num_topics_list=[10, 50, 200].
model = topmost.models.HyperMiner(
    vocab_size=dataset.vocab_size,
    num_topics_list=[10, 50],
    device=device,
).to(device)

# Wrap model and data in a trainer, then fit; train() hands back the top words
# and the training-set theta.
trainer = topmost.trainers.HierarchicalTrainer(model, dataset)
top_words, train_theta = trainer.train()

                                                 

In [9]:
########################### Evaluate ####################################
import json
import numpy as np
from topmost import evaluations

# Doc-topic distributions (theta) for both splits; this rebinds the train_theta
# returned by trainer.train().
train_theta, test_theta = trainer.export_theta()

# Topic coherence is not computed in this notebook;
# refer to https://github.com/BobXWu/ECRTM

# Topic diversity over the top words.
TD = evaluations.multiaspect_topic_diversity(top_words)
print(f"TD: {TD}")

# Clustering quality of the test thetas against the test labels.
print(dict(evaluations.hierarchical_clustering(test_theta, dataset.test_labels)))

# Classification quality: train/test thetas as features, dataset labels as targets.
results = evaluations.hierarchical_classification(
    train_theta,
    test_theta,
    dataset.train_labels,
    dataset.test_labels,
)
print(dict(results))

TD: 0.7986666666666666
{'Purity': 0.3528133577310155, 'NMI': 0.001190953938798616}
{'acc': 0.3331427264409881, 'macro-F1': 0.3259473418185663}


In [10]:
# Evaluate the quality of the learned topic hierarchy.

# Per-level outputs pulled from the trainer, plus the annotated top words.
beta_list = trainer.get_beta()
phi_list = trainer.get_phi()
annoated_top_words = trainer.get_top_words(annotation=True)

# Reference corpus: train and test bag-of-words stacked along axis 0
# (using dataset.train_bow alone is the alternative).
reference_bow = np.concatenate(
    [dataset.train_bow, dataset.test_bow],
    axis=0,
)

results, topic_hierarchy = evaluations.hierarchy_quality(
    dataset.vocab,
    reference_bow,
    annoated_top_words,
    beta_list,
    phi_list,
)

# Show the hierarchy as indented JSON, followed by the quality scores.
print(json.dumps(topic_hierarchy, indent=4))
print(results)

100%|██████████| 1/1 [00:03<00:00,  3.92s/it]

{
    "L-0_K-0 interaction mechanism factor determine imaging know insight play image important behaviour affect response resolution evolution": [
        "L-1_K-38 determine know play insight affect hypothesis basis central suggest live ability consequence little crucial recognition",
        "L-1_K-34 factor imaging image behaviour resolution nuclear pattern stress resonance family temporal translation spatial alzheimer risk",
        "L-1_K-17 brain neuronal cognitive disorder visual motor tomography sensory neural mental synaptic neuroscience plasticity eye cortical",
        "L-1_K-4 important recent remain importance investigation origin occur observe understanding composition reveal advance detail presence arise",
        "L-1_K-35 mechanism response signal functional pathway induce underlie molecule mediate activation molecular circuit physiological characterize elucidate"
    ],
    "L-0_K-1 electric operation robotic transport sensor scalability real mobile flexible embed mai




In [6]:
# Persist the trained model together with every evaluation artefact.
#
# NOTE: this cell reads variables produced by the training, evaluation and
# hierarchy-quality cells above (top_words, train_theta, test_theta, beta_list,
# phi_list, topic_hierarchy, results, TD). Its recorded execution count is lower
# than theirs, so on a fresh kernel run the cells above first.
import pickle

# Move the weights to CPU before pickling so the file can be loaded on a machine
# without a GPU; load_state_dict() copies them onto the target model's device.
model_state_dict_cpu = {
    name: tensor.cpu() for name, tensor in model.state_dict().items()
}

save_data = {
    "model_state_dict": model_state_dict_cpu,
    "trainer_config": {
        "vocab_size": dataset.vocab_size,
        # Must match the num_topics_list the model above was actually built
        # with ([10, 50]); otherwise a model rebuilt from this config will not
        # accept the saved state dict.
        "num_topics_list": [10, 50],
    },
    "top_words": top_words,
    "train_theta": train_theta,
    "test_theta": test_theta,
    "beta_list": beta_list,
    "phi_list": phi_list,
    "annotated_top_words": trainer.get_top_words(annotation=True),
    "topic_hierarchy": topic_hierarchy,
    # `results` holds whatever the most recently run evaluation cell assigned;
    # it is the hierarchy-quality result when the cells are run top to bottom.
    "hierarchy_quality_results": results,
    "topic_diversity": TD,
}

# Save the dataset and preprocessing results for reproducibility
save_data["dataset"] = {
    "vocab": dataset.vocab,
    "train_bow": dataset.train_bow,
    "test_bow": dataset.test_bow,
}

# Save all data to a pickle file
with open("trained_model_full_.pkl", "wb") as f:
    pickle.dump(save_data, f)