In [1]:
import os

# Move the working directory two levels up from where the kernel started.
# The start directory is remembered on first execution so that re-running this
# cell lands in the same place instead of climbing two more levels each time.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_START_DIR, "..", ".."))

In [2]:
from mars.db import collections
from mars.db.db_fields import LANGUAGE, CONTENT
from typing import List
from tqdm import tqdm
import tomotopy as tp

In [3]:
import spacy

# Load the small English pipeline, then switch off the two components whose
# output (dependency parse, named entities) preprocess_text does not read.
nlp = spacy.load("en_core_web_sm")

for _unused_component in ("parser", "ner"):
    nlp.disable_pipe(_unused_component)


def preprocess_text(text:str, user_data=None)->List[str]:
    """Turn raw text into a list of lower-cased lemmas.

    The text is run through the module-level ``nlp`` pipeline. Stop words and
    tokens that are not purely alphabetic are dropped; every remaining token
    contributes its lemma, lower-cased.

    Args:
        text: The raw text to process.
        user_data: Accepted but ignored by this function.
            NOTE(review): presumably kept for compatibility with a
            tokenizer-callback signature; confirm before removing.

    Returns:
        The lower-cased lemmas of the kept tokens, in document order.
    """
    lemmas = []
    for token in nlp(text):
        # Same check order as a chained filter: stop-word test first, then alphabetic test.
        if token.is_stop or not token.is_alpha:
            continue
        lemmas.append(token.lemma_.lower())
    return lemmas

2021-10-04 13:09:45.687721: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-10-04 13:09:45.687770: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [4]:
# Select every segmented text whose language field is "en", fetched in batches of 1000.
query = collections.segmented_texts.fetchByExample(
    {LANGUAGE: "en"},
    batchSize=1000,
)

# Pull the content field out of each returned document; the query's count
# gives the progress bar its total.
texts = []
texts.extend(record[CONTENT] for record in tqdm(query, total=query.count))

100%|████████████████████████████████████████████████████| 68597/68597 [00:37<00:00, 1819.34it/s]


In [5]:
# Tokenise and lemmatise every text, one preprocess_text call per document,
# with a progress bar over the inputs.
processed_texts = list(map(preprocess_text, tqdm(texts)))

100%|█████████████████████████████████████████████████████| 68597/68597 [03:49<00:00, 298.85it/s]


In [9]:
# Wrap the preprocessed documents in a tomotopy Corpus, adding one document
# per token list.
# NOTE(review): the training summary further down reports fewer docs than are
# added here; presumably token lists left empty by preprocessing are dropped
# by tomotopy. Confirm, or filter them out explicitly.
corpus = tp.utils.Corpus()
for t in tqdm(processed_texts):
    corpus.add_doc(words=t)

100%|██████████████████████████████████████████████████| 68597/68597 [00:00<00:00, 169449.04it/s]


In [10]:
# Hyperparameters of the correlated topic model, named so they are easy to find and tune.
NUM_TOPICS = 15
# NOTE(review): 10 iterations is very few for a sampling-based topic model;
# the value is kept as-is here, consider raising it.
TRAIN_ITERATIONS = 10
# Seed reported by summary() for the recorded run. Fixing it keeps re-runs
# comparable instead of drawing a fresh random seed every time.
# NOTE(review): exact reproducibility may also depend on the number of worker
# threads used by train(); confirm against the tomotopy documentation.
MODEL_SEED = 371865896

print("Starting model training...")
model = tp.CTModel(k=NUM_TOPICS, seed=MODEL_SEED, corpus=corpus)
model.train(TRAIN_ITERATIONS)
print("Model trained")
# summary() writes the report to stdout itself and returns None, so wrapping
# it in print() appended a stray "None" line after the report.
model.summary()

Starting model training...
Model trained
<Basic Info>
| CTModel (current version: 0.12.2)
| 67867 docs, 1051632 words
| Total Vocabs: 21918, Used Vocabs: 21918
| Entropy of words: 7.70043
| Entropy of term-weighted words: 7.70043
| Removed Vocabs: <NA>
|
<Training Info>
| Iterations: 10, Burn-in steps: 0
| Optimization Interval: 2
| Log-likelihood per word: -9.21841
|
<Initial Parameters>
| tw: TermWeight.ONE
| min_cf: 0 (minimum collection frequency of words)
| min_df: 0 (minimum document frequency of words)
| rm_top: 0 (the number of top words to be removed)
| k: 15 (the number of topics between 1 ~ 32767)
| smoothing_alpha: [0.1] (small smoothing value for preventing topic counts to be zero, given as a single `float` in case of symmetric and as a list with length `k` of `float` in case of asymmetric.)
| eta: 0.01 (hyperparameter of Dirichlet distribution for topic-word)
| seed: 371865896 (random seed)
| trained in version 0.12.2
|
<Parameters>
| prior_mean (Prior mean of Logit-norma

In [11]:
# Extract candidate labels (n-grams up to 5 words) from the trained model using
# normalised PMI, then score them per topic with first-order relevance.
extractor = tp.label.PMIExtractor(
    min_cf=10,
    min_df=5,
    max_len=5,
    max_cand=10000,
    normalized=True,
)
cands = extractor.extract(model)

labeler = tp.label.FoRelevance(model, cands, min_df=5, smoothing=1e-2, mu=0.25)

# For every topic: a header, its five best labels, then its ten top words
# with their probabilities, followed by a blank separator line.
for k in range(model.k):
    print(f"== Topic #{k} ==")
    print("Labels:", ", ".join(pair[0] for pair in labeler.get_topic_labels(k, top_n=5)))
    for word, prob in model.get_topic_words(k, top_n=10):
        print(f"{word}\t{prob}")
    print()


== Topic #0 ==
Labels: kenyan, immutable, transactional, cryptocurrencie, virtual currency
shall	0.011202641762793064
states	0.008804910816252232
regulation	0.006865763105452061
member	0.006695432588458061
article	0.006459590047597885
directive	0.006171338725835085
commission	0.005856882315129042
person	0.005673449020832777
ensure	0.005450709257274866
eu	0.005306583363562822

== Topic #1 ==
Labels: kenyan, immutable, transactional, cryptocurrency, cryptocurrencie
shall	0.01128973439335823
include	0.006849533878266811
european	0.006415318697690964
states	0.006037131417542696
article	0.00589706189930439
public	0.005630929954349995
system	0.00529476348310709
information	0.005210721865296364
fund	0.005098666530102491
provide	0.005028631538152695

== Topic #2 ==
Labels: transactional, cryptocurrencie, disruptive technology, immutable, cryptocurrency
shall	0.009416715241968632
eu	0.007994499057531357
follow	0.007455554325133562
member	0.007380701135843992
state	0.0071411700919270515
article	