In [5]:
import json
import numpy as np
from pathlib import Path
from collections import Counter
import tqdm
import networkx as nx
from pprint import pprint

from data_reader import JsonDocReader, Document
from wordcloud import WordCloud
import matplotlib.pyplot as plt 
from joblib import Parallel, delayed
import octis
import common
from functools import partial

In [4]:
# Read the documents stored in data/docs.json and materialise them as a list.
# NOTE(review): the following cell reassigns `docs` and `filtered_docs` from
# data/preprocessed_docs.json, so on a top-to-bottom run the values produced
# here are replaced — confirm this cell is still needed.
data_reader = JsonDocReader(Path('data/docs.json')).read_documents()
docs = [*data_reader]
# The two bounds are handed straight to the project helper; presumably they are
# relative frequency limits for dropping words — TODO confirm in common.py.
filtered_docs = common.filter_common_words(
    docs,
    max_freq=0.65,
    min_freq=0,
)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sem.kolesnikov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
100%|██████████| 10/10 [00:00<00:00, 2534.17it/s]


In [3]:
# Load the preprocessed corpus: a JSON object whose keys become document names
# and whose string values are split on whitespace into token lists.
# The encoding is explicit so the read does not depend on the platform's locale
# default (the corpus contains non-ASCII text).
with open('data/preprocessed_docs.json', encoding='utf-8') as f:
    raw_docs = json.load(f)
docs = [Document(name, text.split()) for name, text in raw_docs.items()]

filtered_docs = common.filter_common_words(docs, min_freq=0.1, max_freq=0.65)

# Name -> Document lookup over the unfiltered documents.
doc_name_to_doc = {doc.name: doc for doc in docs}

# Exclude this one document from the filtered set, matching on its name instead of
# relying on how Document defines equality. Builds a new list, so nothing is
# mutated in place and a missing document no longer raises.
# NOTE(review): presumably it is dropped because it has no tokens left after
# filtering — confirm.
filtered_docs = [doc for doc in filtered_docs if doc.name != 'gost_r_54481-2011.txt']

100%|██████████| 1236/1236 [00:02<00:00, 430.21it/s]


In [30]:
# Build the document graph from the filtered corpus.
# NOTE(review): 0.85 is presumably the similarity threshold above which two
# documents are linked — confirm against common.get_graph.
graph = common.get_graph(
    filtered_docs,
    0.85,
    # A fresh dict is bound as the similarity function's word cache for this run.
    partial(common.jaccard_sim, _words_cache={}),
)
# Name -> Document lookup over the unfiltered documents (used later to fetch tokens).
doc_name_to_doc = dict((doc.name, doc) for doc in docs)

# Materialise the connected components; their count is this cell's displayed value.
components = [*nx.connected_components(graph)]
len(components)

1235it [01:22, 15.00it/s] 


22

In [18]:
# Import the project-local `scoring` module and reload it, rebinding the name to
# the module object returned by reload(), so edits made to the module since the
# kernel started take effect without a restart.
import scoring
from importlib import reload
scoring = reload(scoring)

In [67]:
# Merge every connected component into one pseudo-document named by the
# component's index, whose tokens are the concatenated tokens of its members.
# A flattening comprehension is used instead of sum(lists, []), which copies the
# accumulated list on every addition and is quadratic in the number of tokens.
components_docs = [
    Document(
        component_id,
        [
            token
            for doc_name in docs_names
            for token in doc_name_to_doc[doc_name].tokens
        ],
    )
    for component_id, docs_names in enumerate(components)
]

# Extract the top 10 words per component with the project's c-TF-IDF helper.
# NOTE(review): min_df / max_df are presumably document-frequency bounds —
# confirm in scoring.get_topics_ctfidf.
component_to_topics = scoring.get_topics_ctfidf(
    components_docs,
    reduce_frequent_words=True,
    bm25_weighting=False,
    top_k=10,
    min_df=0.1,
    max_df=0.85,
)

# Word lists only, in the mapping's iteration order.
components_topics = list(component_to_topics.values())

22it [00:00, 504.81it/s]


In [69]:
# Display the words extracted for the component with key 0 (the first component).
component_to_topics[0]

['заявление',
 'регистрация',
 'выдача',
 'орган',
 'агентство',
 'заявитель',
 'единица',
 'реестр',
 'обязанность',
 'присвоение']

In [13]:
# OCTIS coherence metrics.
# NOTE(review): `Coherence` is not used in the visible cells — confirm it is needed.
from octis.evaluation_metrics.coherence_metrics import Coherence
from octis.evaluation_metrics.coherence_metrics import WECoherencePairwise

# Word-embedding pairwise coherence scorer configured with topk=10.
# NOTE(review): 'cc.ru.300.vec.gz' is presumably the fastText Russian Common Crawl
# vectors in text format (hence binary=False); the relative path means the file
# must be in the kernel's working directory — confirm.
pairwise = WECoherencePairwise('cc.ru.300.vec.gz', binary=False, topk=10)

In [68]:
# Score the component word lists; they are passed to the metric in a dict under
# the 'topics' key, and the resulting score is this cell's displayed value.
pairwise.score({'topics': components_topics})

0.034275747549654255