In [2]:
import json
import numpy as np
from pathlib import Path
from collections import Counter
import tqdm
import networkx as nx
from pprint import pprint

from data_reader import JsonDocReader, PostPdfDocReader, Document
from wordcloud import WordCloud
import matplotlib.pyplot as plt 
from joblib import Parallel, delayed
import octis
import common
from functools import partial

ГОСТ

In [4]:
data_reader = JsonDocReader(Path('data/docs.json')).read_documents()
docs = list(data_reader)
filtered_docs = common.filter_common_words(docs, min_freq=0, max_freq=0.65)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sem.kolesnikov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
100%|██████████| 10/10 [00:00<00:00, 2534.17it/s]


In [3]:
with open('data/preprocessed_docs.json') as f:
    docs = json.load(f)
    docs = [Document(k, v.split()) for k, v in docs.items()]
filtered_docs = common.filter_common_words(docs, min_freq=0.1, max_freq=0.65)
doc_name_to_doc = {doc.name: doc for doc in docs}
filtered_docs.remove(Document('gost_r_54481-2011.txt', []))

100%|██████████| 1236/1236 [00:02<00:00, 430.21it/s]


In [30]:
graph = common.get_graph(filtered_docs, 0.85, partial(common.jaccard_sim, _words_cache={}))
doc_name_to_doc = {doc.name: doc for doc in docs}

components = list(nx.connected_components(graph))
len(components)

1235it [01:22, 15.00it/s] 


22

In [18]:
import scoring
from importlib import reload
scoring = reload(scoring)

In [67]:
components_docs = [
    Document(k, sum((doc_name_to_doc[doc_name].tokens for doc_name in docs_names), []))
    for k, docs_names in enumerate(components)
]
component_to_topics = scoring.get_topics_ctfidf(components_docs, reduce_frequent_words=True, bm25_weighting=False, top_k=10, min_df=0.1, max_df=0.85)
components_topics = [v for k, v in component_to_topics.items()]

22it [00:00, 504.81it/s]


In [69]:
component_to_topics[0]

['заявление',
 'регистрация',
 'выдача',
 'орган',
 'агентство',
 'заявитель',
 'единица',
 'реестр',
 'обязанность',
 'присвоение']

In [13]:
from octis.evaluation_metrics.coherence_metrics import Coherence
from octis.evaluation_metrics.coherence_metrics import WECoherencePairwise

pairwise = WECoherencePairwise('cc.ru.300.vec.gz', binary=False, topk=10)

In [68]:
pairwise.score({'topics': components_topics})

0.034275747549654255

Статьи

In [3]:
articles = list(PostPdfDocReader().read_saved_as_document("upd"))
filtered_articles = common.filter_common_words(articles, min_freq=0, max_freq=0.65)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Alexandra\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
100%|██████████| 2454/2454 [01:54<00:00, 21.39it/s]
100%|██████████| 2454/2454 [00:06<00:00, 367.24it/s]


In [4]:
#graph = common.get_graph(filtered_articles, 0.85, partial(common.jaccard_sim, _words_cache={}))

import pickle
with open('jac_graph.pickle', 'rb') as f:
    graph = pickle.load(f)
doc_name_to_doc = {doc.name: doc for doc in articles}

components = list(nx.connected_components(graph))
len(components)

16

In [5]:
components

[{'Внутренняя торговля. Туристско-экскурсионное обслуживание-15487913_14760873.csv',
  'Внутренняя торговля. Туристско-экскурсионное обслуживание-15487913_41307961.csv'},
 {'Информатика-36462175_17151772.csv', 'Информатика-36462175_86833932.csv'},
 {'Кибернетика-48968947_50106637.csv', 'Кибернетика-48968947_63817574.csv'},
 {'Космические исследования-46357156_20482003.csv',
  'Космические исследования-46357156_44596993.csv'},
 {'Культура. Культурология-11659345_42976510.csv',
  'Культура. Культурология-11659345_66404855.csv'},
 {'Культура. Культурология-38569997_47758309.csv',
  'Культура. Культурология-38569997_81188332.csv'},
 {'Культура. Культурология-41871153_84879739.csv',
  'Культура. Культурология-41871153_87245919.csv'},
 {'Культура. Культурология-44557483_82171609.csv',
  'Культура. Культурология-44557483_93330972.csv'},
 {'Лесная и деревообрабатывающая промышленность-42446382_51614510.csv',
  'Лесная и деревообрабатывающая промышленность-42446382_56625649.csv'},
 {'Машиностро

In [6]:
import scoring
from importlib import reload
scoring = reload(scoring)

In [7]:
components_docs = [
    Document(k, sum((doc_name_to_doc[doc_name].tokens for doc_name in docs_names), []))
    for k, docs_names in enumerate(components)
]
component_to_topics = scoring.get_topics_ctfidf(components_docs, reduce_frequent_words=True, bm25_weighting=False, top_k=10, min_df=0.1, max_df=0.85)
components_topics = [v for k, v in component_to_topics.items()]
component_to_topics[0]

16it [00:00, 233.05it/s]


['специалист',
 'профессиональный',
 'подготовка',
 'стандарт',
 'обслуживание',
 'образовательный',
 'учебный',
 'предприятие',
 'требование',
 'качество']

In [8]:
from gensim.models import KeyedVectors
model = KeyedVectors.load_word2vec_format('cc.ru.300.vec.gz', binary=False)

In [None]:
from gensim import corpora
dictionary = corpora.Dictionary()
BoW_corpus = [dictionary.doc2bow(doc, allow_update=True) for doc in doc_tokenized]

In [None]:
from gensim.models.coherencemodel import CoherenceModel

cm = CoherenceModel(topics=components_topics, corpus=common_corpus, dictionary=common_dictionary, coherence='u_mass')
coherence = cm.get_coherence()

In [14]:
common.compute_top_words_sim(components_topics, model, topn=10)

0.21202647737372898

In [18]:
from octis.evaluation_metrics.coherence_metrics import WECoherencePairwise

pairwise = WECoherencePairwise('cc.ru.300.vec.gz', binary=False, topk=10)
pairwise.score({'topics': components_topics})

0.00060675549838278

elib + word2vec

In [15]:
def cosine_sim(first_doc, second_doc, doc_to_vec):
    first = doc_to_vec[first_doc.name]
    second = doc_to_vec[second_doc.name]

    return first @ second / np.linalg.norm(first) / np.linalg.norm(second)

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer


def get_topn_words(docs, top_k=10):
    texts = [' '.join(doc.tokens) for doc in docs]
    tfidf = TfidfVectorizer(max_features=8_000, min_df=0.1, max_df=0.65)
    transformed = tfidf.fit_transform(texts)

    doc_to_top_words = {}
    for doc, row in tqdm.tqdm(zip(docs, transformed)):
        most_scored_idx = np.argsort(-row.toarray())[0, :top_k]
        top_words = tfidf.get_feature_names_out()[most_scored_idx]
        scores = row.toarray()[0, most_scored_idx]
        doc_to_top_words[doc.name] = (list(top_words), scores)

    return doc_to_top_words

In [20]:
doc_to_topn_words = get_topn_words(articles, top_k=100)

2454it [00:31, 78.74it/s] 


In [None]:
doc_to_embed = {}
for doc_name, text in tqdm.tqdm(raw_docs.items()):
    doc_to_embed[doc_name] = model.encode(text)

In [21]:
doc_to_vec = {}
for doc in tqdm.tqdm(articles):
    top_words, scores = doc_to_topn_words[doc.name]
    vectorized_doc = [token for token in top_words if token in model]
    if not vectorized_doc:
        continue
    doc_to_vec[doc.name] = model[vectorized_doc].mean(axis=0)

100%|██████████| 2454/2454 [00:06<00:00, 378.29it/s]


In [22]:
from functools import partial
from common import get_graph


graph = get_graph(filtered_articles, 0.95, partial(cosine_sim, doc_to_vec=doc_to_vec))
doc_name_to_doc = {doc.name: doc for doc in articles}

components = list(nx.connected_components(graph))
len(components)

2454it [03:20, 12.25it/s] 


154

In [23]:
components

[{'Автоматика. Вычислительная техника-28399372_27529733.csv',
  'Автоматика. Вычислительная техника-32660194_43149545.csv',
  'Автоматика. Вычислительная техника-36295616_67734200.csv',
  'Автоматика. Вычислительная техника-36400296_69140450.csv',
  'Автоматика. Вычислительная техника-36807091_67702615.csv',
  'Автоматика. Вычислительная техника-36807099_51617257.csv',
  'Автоматика. Вычислительная техника-39134732_64663522.csv',
  'Автоматика. Вычислительная техника-41496866_22920968.csv',
  'Автоматика. Вычислительная техника-42709510_59686715.csv',
  'Автоматика. Вычислительная техника-44049938_99436623.csv',
  'Автоматика. Вычислительная техника-44776180_40845162.csv',
  'Автоматика. Вычислительная техника-47276625_31090676.csv',
  'Астрономия-28849237_95925118.csv',
  'Астрономия-48530594_99060342.csv',
  'Астрономия-49185567_23584267.csv',
  'Астрономия-49759122_41607339.csv',
  'Астрономия-50169070_64497353.csv',
  'Астрономия-50293049_32307412.csv',
  'Биотехнология-36730729_84

In [24]:
components_docs = [
    Document(k, sum((doc_name_to_doc[doc_name].tokens for doc_name in docs_names), []))
    for k, docs_names in enumerate(components)
]
component_to_topics = scoring.get_topics_ctfidf(components_docs, reduce_frequent_words=True, bm25_weighting=False, top_k=10, min_df=0.1, max_df=0.85)
components_topics = [v for k, v in component_to_topics.items()]
component_to_topics[0]

154it [00:01, 79.48it/s]


['engineering',
 'materials',
 'power',
 'doi',
 'vol',
 'напряжение',
 'org',
 'orcid',
 'reference',
 'ток']

In [25]:
common.compute_top_words_sim(components_topics, model, topn=10)

0.1946195683542109

Bigrams + louvain

In [26]:
with open('full_graph.pickle', 'rb') as f:
    full_graph = pickle.load(f)

In [27]:
import community.community_louvain as community_louvain #python-louvain

partition = community_louvain.best_partition(full_graph)

communities = {}
for node, community_id in partition.items():
    if community_id not in communities:
        communities[community_id] = [node]
    else:
        communities[community_id].append(node)

In [28]:
communities_docs = [
    Document(k, sum((doc_name_to_doc[doc_name].tokens for doc_name in docs_names), []))
    for k, docs_names in communities.items()
]
communities_to_topics = scoring.get_topics_ctfidf(communities_docs, reduce_frequent_words=True, bm25_weighting=False, top_k=10, min_df=0.1, max_df=0.85)
communities_topics = [v for k, v in communities_to_topics.items()]
communities_to_topics[0]

9it [00:17,  1.91s/it]


['салют',
 'станция',
 'тпк',
 'орбита',
 'союз',
 'перелёт',
 'мир',
 'перелёта',
 'экипаж',
 'орбит']

In [29]:
common.compute_top_words_sim(communities_topics, model, topn=10)

0.11399501890395368

bigrams + Markov

In [30]:
import markov_clustering as mc
import networkx as nx

matrix = nx.to_scipy_sparse_matrix(full_graph)
result = mc.run_mcl(matrix)
clusters = mc.get_clusters(result)


The scipy.sparse array containers will be used instead of matrices
in Networkx 3.0. Use `to_scipy_sparse_array` instead.
  matrix = nx.to_scipy_sparse_matrix(full_graph)


In [32]:
communities = []

graph_nodes = list(full_graph.nodes)
for cluster in clusters:
    communities.append([graph_nodes[i] for i in cluster])

communities_docs = [
    Document(k, sum((doc_name_to_doc[doc_name].tokens for doc_name in docs_names), []))
    for k, docs_names in enumerate(communities)
]
communities_to_topics = scoring.get_topics_ctfidf(communities_docs, reduce_frequent_words=True, bm25_weighting=False, top_k=10, min_df=0.1, max_df=0.85)
communities_topics = [v for k, v in communities_to_topics.items()]

58it [00:00, 106.61it/s]


In [33]:
common.compute_top_words_sim(communities_topics, model, topn=10)

0.13408947778346955

elib + bert

In [34]:
from sentence_transformers import SentenceTransformer, util

model_st = SentenceTransformer('multi-qa-mpnet-base-dot-v1')

train_script.py:   0%|          | 0.00/13.9k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [None]:
articles[0]

In [None]:
doc_to_embed = {}
for doc_name, text in tqdm.tqdm(raw_docs.items()):
    doc_to_embed[doc_name] = model_st.encode(text)

In [None]:
import collections

#mode_graph = graph.copy()
with open('jac_graph.pickle', 'rb') as f:
    mode_graph = pickle.load(f)

node_weights = collections.defaultdict(0)
wtot = 0
for (u,v) in mode_graph.edges():
    weight = mode_graph[u][v]['weight']
    node_weights[u] += weight
    node_weights[v] += weight
    wtot += weight
    if u != v:
        wtot += weight

cluster_sizes = collections.defaultdict(1)
connected_components = []
D = []
u = mode_graph.number_of_nodes()
for n in mode_graph.nodes():
    neighbor_chain = [list(mode_graph.nodes())[0]]
    while neighbor_chain != []:
        a = neighbor_chain.pop()
        dmin = float("inf")
        nearest = -1
        for v in neighbor_chain.neighbors(a):
            if v != a:
                d = node_weights[v] * node_weights[a] / float(mode_graph[a][v]['weight']) / float(wtot)
                if d < dmin:
                    nearest = v
                    dmin = d
                elif d == dmin:
                    nearest = min(nearest,v)
        d = dmin
        if neighbor_chain != []:
            c = neighbor_chain.pop()
            if nearest == c:
                D.append([a,nearest,d,cluster_sizes[a] + cluster_sizes[nearest]])
                    # update graph
                mode_graph.add_node(u)
                neighbors_a = list(mode_graph.neighbors(a))
                neighbors_b = list(mode_graph.neighbors(nearest))
                for v in neighbors_a:
                    mode_graph.add_edge(u,v,weight = mode_graph[a][v]['weight'])
                for v in neighbors_b:
                    if mode_graph.has_edge(u,v):
                        mode_graph[u][v]['weight'] += mode_graph[nearest][v]['weight']
                    else:
                        mode_graph.add_edge(u,v,weight = mode_graph[nearest][v]['weight'])
                mode_graph.remove_node(a)
                mode_graph.remove_node(nearest)
                node_weights[u] = node_weights.pop(a) + node_weights.pop(nearest)
                cluster_sizes[u] = cluster_sizes.pop(a) + cluster_sizes.pop(nearest)
                u += 1
            else:
                neighbor_chain.append(c)
                neighbor_chain.append(a)
                neighbor_chain.append(nearest)
        elif nearest >= 0:
            neighbor_chain.append(a)
            neighbor_chain.append(nearest)   
        else:
            connected_components.append((a,cluster_sizes[a]))
            F.remove_node(a)
            w.pop(a)
            s.pop(a)
            n -= 1