In [7]:
import json
import numpy as np
from pathlib import Path
from collections import Counter
import tqdm
import networkx as nx
from pprint import pprint

from data_reader import JsonDocReader, PostPdfDocReader, Document
from wordcloud import WordCloud
import matplotlib.pyplot as plt 
from joblib import Parallel, delayed
import octis
import common
from functools import partial

ГОСТ

In [4]:
# Read the documents stored in data/docs.json and materialise them as a list.
data_reader = JsonDocReader(Path('data/docs.json')).read_documents()
docs = [*data_reader]

# Word filtering with frequency bounds 0 and 0.65; the exact meaning of the
# bounds is defined by common.filter_common_words.
filtered_docs = common.filter_common_words(
    docs,
    max_freq=0.65,
    min_freq=0,
)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sem.kolesnikov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
100%|██████████| 10/10 [00:00<00:00, 2534.17it/s]


In [3]:
# Build Document objects from the preprocessed corpus: a JSON object whose keys
# become the first Document argument and whose values are whitespace-separated
# token strings.
with open('data/preprocessed_docs.json') as f:
    docs = [Document(name, text.split()) for name, text in json.load(f).items()]

filtered_docs = common.filter_common_words(docs, max_freq=0.65, min_freq=0.1)
doc_name_to_doc = dict((doc.name, doc) for doc in docs)

# Drop the entry that compares equal to an empty-token
# 'gost_r_54481-2011.txt' document; .remove raises if no such entry exists.
filtered_docs.remove(Document('gost_r_54481-2011.txt', []))

100%|██████████| 1236/1236 [00:02<00:00, 430.21it/s]


In [30]:
# Build the document graph with Jaccard similarity. 0.85 is the second
# positional argument of common.get_graph (presumably the similarity
# threshold; confirm in common.py). A fresh dict is bound as the word cache.
graph = common.get_graph(
    filtered_docs,
    0.85,
    partial(common.jaccard_sim, _words_cache={}),
)
doc_name_to_doc = dict((doc.name, doc) for doc in docs)

# Each connected component is a set of graph nodes; show how many there are.
components = [*nx.connected_components(graph)]
len(components)

1235it [01:22, 15.00it/s] 


22

In [18]:
import scoring
from importlib import reload
# Re-execute the scoring module so that edits made to its source are picked up
# without restarting the kernel.
scoring = reload(scoring)

In [67]:
# Merge every connected component into one pseudo-document: its name is the
# component index and its tokens are the concatenation of its members' tokens.
# The nested comprehension concatenates in linear time, whereas sum(lists, [])
# re-copies the accumulated list for every member document.
components_docs = [
    Document(
        k,
        [token for doc_name in docs_names for token in doc_name_to_doc[doc_name].tokens],
    )
    for k, docs_names in enumerate(components)
]

# Top-10 terms per component via scoring.get_topics_ctfidf.
component_to_topics = scoring.get_topics_ctfidf(
    components_docs,
    reduce_frequent_words=True,
    bm25_weighting=False,
    top_k=10,
    min_df=0.1,
    max_df=0.85,
)

# The topic word lists alone, in the mapping's iteration order.
components_topics = list(component_to_topics.values())

22it [00:00, 504.81it/s]


In [69]:
# Display the topic terms stored under key 0 (presumably the first connected component).
component_to_topics[0]

['заявление',
 'регистрация',
 'выдача',
 'орган',
 'агентство',
 'заявитель',
 'единица',
 'реестр',
 'обязанность',
 'присвоение']

In [13]:
from octis.evaluation_metrics.coherence_metrics import Coherence, WECoherencePairwise

# OCTIS pairwise word-embedding coherence metric, configured with the
# cc.ru.300.vec.gz vectors in non-binary format and topk=10.
pairwise = WECoherencePairwise(
    'cc.ru.300.vec.gz',
    topk=10,
    binary=False,
)

In [68]:
# Score the per-component topic word lists with the pairwise coherence metric.
pairwise.score({'topics': components_topics})

0.034275747549654255

Статьи

In [8]:
# Materialise the saved article documents read under the "upd" key/location.
articles = [*PostPdfDocReader().read_saved_as_document("upd")]

# Word filtering with frequency bounds 0 and 0.65; the exact meaning of the
# bounds is defined by common.filter_common_words.
filtered_articles = common.filter_common_words(
    articles,
    max_freq=0.65,
    min_freq=0,
)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Alexandra\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


100%|██████████| 2454/2454 [01:16<00:00, 32.01it/s]
100%|██████████| 2454/2454 [00:09<00:00, 270.93it/s]


In [9]:
import pickle

# Reuse the pickled graph instead of rebuilding it; presumably it was produced by:
#graph = common.get_graph(filtered_articles, 0.85, partial(common.jaccard_sim, _words_cache={}))
with open('jac_graph.pickle', mode='rb') as f:
    graph = pickle.load(f)

doc_name_to_doc = dict((doc.name, doc) for doc in articles)

# Each connected component is a set of graph nodes; show how many there are.
components = [*nx.connected_components(graph)]
len(components)

16

In [10]:
# Inspect the connected components (each one is a set of graph nodes).
components

[{'Внутренняя торговля. Туристско-экскурсионное обслуживание-15487913_14760873.csv',
  'Внутренняя торговля. Туристско-экскурсионное обслуживание-15487913_41307961.csv'},
 {'Информатика-36462175_17151772.csv', 'Информатика-36462175_86833932.csv'},
 {'Кибернетика-48968947_50106637.csv', 'Кибернетика-48968947_63817574.csv'},
 {'Космические исследования-46357156_20482003.csv',
  'Космические исследования-46357156_44596993.csv'},
 {'Культура. Культурология-11659345_42976510.csv',
  'Культура. Культурология-11659345_66404855.csv'},
 {'Культура. Культурология-38569997_47758309.csv',
  'Культура. Культурология-38569997_81188332.csv'},
 {'Культура. Культурология-41871153_84879739.csv',
  'Культура. Культурология-41871153_87245919.csv'},
 {'Культура. Культурология-44557483_82171609.csv',
  'Культура. Культурология-44557483_93330972.csv'},
 {'Лесная и деревообрабатывающая промышленность-42446382_51614510.csv',
  'Лесная и деревообрабатывающая промышленность-42446382_56625649.csv'},
 {'Машиностро

In [11]:
import scoring
from importlib import reload
# Re-execute the scoring module so that edits made to its source are picked up
# without restarting the kernel.
scoring = reload(scoring)

In [12]:
# Merge every connected component into one pseudo-document: its name is the
# component index and its tokens are the concatenation of its members' tokens.
# The nested comprehension concatenates in linear time, whereas sum(lists, [])
# re-copies the accumulated list for every member document.
components_docs = [
    Document(
        k,
        [token for doc_name in docs_names for token in doc_name_to_doc[doc_name].tokens],
    )
    for k, docs_names in enumerate(components)
]

# Top-10 terms per component via scoring.get_topics_ctfidf.
component_to_topics = scoring.get_topics_ctfidf(
    components_docs,
    reduce_frequent_words=True,
    bm25_weighting=False,
    top_k=10,
    min_df=0.1,
    max_df=0.85,
)

# The topic word lists alone, in the mapping's iteration order.
components_topics = list(component_to_topics.values())

# Display the terms stored under key 0.
component_to_topics[0]

16it [00:00, 73.90it/s]


['специалист',
 'профессиональный',
 'подготовка',
 'стандарт',
 'обслуживание',
 'образовательный',
 'учебный',
 'предприятие',
 'требование',
 'качество']

In [18]:
from octis.evaluation_metrics.coherence_metrics import WECoherencePairwise

# OCTIS pairwise word-embedding coherence metric, configured with the
# cc.ru.300.vec.gz vectors in non-binary format and topk=10.
pairwise = WECoherencePairwise(
    'cc.ru.300.vec.gz',
    topk=10,
    binary=False,
)

# Score the per-component topic word lists.
pairwise.score(dict(topics=components_topics))

0.00060675549838278

In [None]:
import collections

# Agglomerative clustering of the pickled graph with a nearest-neighbour chain.
# Repeatedly: follow nearest neighbours from some node until two nodes are each
# other's nearest neighbour, merge that pair into a fresh cluster node, and
# record the merge in D. A node with no neighbours left is a finished connected
# component and is moved to `connected_components`.
# NOTE(review): this looks adapted from the "paris" hierarchical graph
# clustering reference code (Bonald et al.) — confirm against that source.
#
# Results left in the namespace:
#   D                     list of [a, b, distance, merged_size] rows, one per merge
#   connected_components  list of (root_node, size) pairs
#   node_names            integer node id -> original node label

#mode_graph = graph.copy()
# NOTE: unpickling can execute arbitrary code; only load pickles created by this project.
with open('jac_graph.pickle', 'rb') as f:
    mode_graph = pickle.load(f)

# The loop orders nodes (`min`, `nearest >= 0`) and mints new cluster ids from
# an integer counter, so nodes must be the integers 0..n-1. The original labels
# are kept in the 'name' node attribute and in `node_names`.
mode_graph = nx.convert_node_labels_to_integers(mode_graph, label_attribute='name')
node_names = nx.get_node_attributes(mode_graph, 'name')

# Weighted degree of every node and total weight of the graph (each
# non-self-loop edge is counted twice in wtot, once per endpoint).
# defaultdict needs a callable factory; passing 0 or 1 raises TypeError.
node_weights = collections.defaultdict(float)
wtot = 0
for u, v, edge_data in mode_graph.edges(data=True):
    # Edges without a 'weight' attribute are given unit weight so that the
    # later mode_graph[a][v]['weight'] lookups cannot fail.
    weight = edge_data.setdefault('weight', 1)
    node_weights[u] += weight
    node_weights[v] += weight
    wtot += weight
    if u != v:
        wtot += weight

cluster_sizes = collections.defaultdict(lambda: 1)  # every original node has size 1
connected_components = []
D = []
u = mode_graph.number_of_nodes()  # id of the next merged cluster
# Every pass removes at least one node (a merge removes two and adds one; a
# finished component removes one), so the loop ends when the graph is empty.
while mode_graph.number_of_nodes() > 0:
    neighbor_chain = [next(iter(mode_graph.nodes()))]
    while neighbor_chain != []:
        a = neighbor_chain.pop()
        # Nearest neighbour of a: minimises w_v * w_a / weight(a, v) / wtot,
        # ties broken towards the smaller node id. -1 means "no neighbour".
        dmin = float("inf")
        nearest = -1
        for v in mode_graph.neighbors(a):
            if v != a:
                d = node_weights[v] * node_weights[a] / float(mode_graph[a][v]['weight']) / float(wtot)
                if d < dmin:
                    nearest = v
                    dmin = d
                elif d == dmin:
                    nearest = min(nearest, v)
        d = dmin
        if neighbor_chain != []:
            c = neighbor_chain.pop()
            if nearest == c:
                # a and c are mutual nearest neighbours: merge them into node u.
                D.append([a, nearest, d, cluster_sizes[a] + cluster_sizes[nearest]])
                # update graph
                mode_graph.add_node(u)
                neighbors_a = list(mode_graph.neighbors(a))
                neighbors_b = list(mode_graph.neighbors(nearest))
                for v in neighbors_a:
                    mode_graph.add_edge(u, v, weight=mode_graph[a][v]['weight'])
                for v in neighbors_b:
                    if mode_graph.has_edge(u, v):
                        mode_graph[u][v]['weight'] += mode_graph[nearest][v]['weight']
                    else:
                        mode_graph.add_edge(u, v, weight=mode_graph[nearest][v]['weight'])
                mode_graph.remove_node(a)
                mode_graph.remove_node(nearest)
                node_weights[u] = node_weights.pop(a) + node_weights.pop(nearest)
                cluster_sizes[u] = cluster_sizes.pop(a) + cluster_sizes.pop(nearest)
                u += 1
            else:
                # Not mutual: restore the chain and extend it with a's nearest neighbour.
                neighbor_chain.append(c)
                neighbor_chain.append(a)
                neighbor_chain.append(nearest)
        elif nearest >= 0:
            neighbor_chain.append(a)
            neighbor_chain.append(nearest)
        else:
            # a has no neighbours left: it is the root of a finished component.
            connected_components.append((a, cluster_sizes[a]))
            mode_graph.remove_node(a)
            # A node that never had an edge was never inserted into node_weights.
            node_weights.pop(a, None)
            cluster_sizes.pop(a, None)