In [1]:
import common
from data_reader import PostPdfDocReader, Document
from sentence_transformers import SentenceTransformer, util

#29s

Сборник всех статей

In [2]:
articles = list(PostPdfDocReader().read_saved_as_document("upd"))
filtered_articles = common.filter_common_words(articles, min_freq=0, max_freq=0.65)

#~4m

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Alexandra\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
100%|██████████| 2454/2454 [01:16<00:00, 32.17it/s]
100%|██████████| 2454/2454 [00:09<00:00, 257.12it/s]


Делим на группы по известным категориям

In [3]:
components_known = dict()
for doc in filtered_articles:
    category = doc.name.split('-')[0]
    if not category in components_known:
        components_known[category] = []
    components_known[category].append(doc.tokens)

In [4]:
import scoring

#50s

Для этих разделённых по категориям кластеров ищем топики `components_topics_known`

In [5]:
components_docs_known = [
    Document(k, sum((doc_tokens for doc_tokens in docs_tokens), []))
    for k, docs_tokens in enumerate(components_known.values())
]
component_to_topics_known = scoring.get_topics_ctfidf(components_docs_known, reduce_frequent_words=True, bm25_weighting=True, top_k=10, min_df=0.3, max_df=0.8)
components_topics_known = [v for k, v in component_to_topics_known.items()]

#1m1s

62it [00:02, 25.84it/s]


In [11]:
print(list(components_known.keys())[11])
print("top 10:", components_topics_known[11])
print("similarity:", common.compute_top_words_sim([components_topics_known[11]], w2v_model, topn=10))

Геофизика
top 10: ['землетрясение', 'осадок', 'юг', 'географический', 'геология', 'река', 'лёд', 'климатический', 'тёплый', 'сток']
similarity: 0.19677855153050688


In [7]:
import pickle

with open('w2v_model.pickle', 'rb') as file:
    w2v_model = pickle.load(file)

0.19677855153050688

X -- топик группы, y -- индекс группы

In [16]:
X = []
y = []
i = 0
for gr in components_topics_known:
    for word in gr:
        X.append(gr)
        y.append(i)
    i+=1

Берем модель для подсчёта расстояний между словами

In [None]:
# from gensim.models import KeyedVectors
# w2v_model = KeyedVectors.load_word2vec_format('cc.ru.300.vec.gz', binary=False)

#21m10s

In [12]:
import numpy as np
from numpy import linalg as LA

def calinski_harabasz_index(topics, components, model):
    topic_words_count = 0
    cluster_means = []
    topic_words = []
    components_counts = []
    for i in range(len(topics)):
        topic = topics[i]
        words = []
        for word in topic:
            try:
                words.append(model[word])
                topic_words_count += 1
            except KeyError:
                continue
        if len(words) > 0:
            cluster_means.append(np.array(words).mean(axis=0))
            topic_words.append(words)
            components_counts.append(len(components[i]))
    overall_mean = np.array(cluster_means).mean(axis=0)
    bcss = 0
    wcss = 0
    for i in range(len(topic_words)):
        cluster_norm = LA.norm(cluster_means[i] - overall_mean)
        bcss += components_counts[i] * cluster_norm * cluster_norm
        for word in topic_words[i]:
            word_norm = LA.norm(cluster_means[i] - word)
            wcss += word_norm * word_norm
            
    clusters_count = len(cluster_means)
    return (bcss * (topic_words_count - clusters_count)) / (wcss * (clusters_count - 1))

In [13]:
calinski_harabasz_index(components_topics_known, list(components_known.values()), w2v_model)

6.992782219165669

bigrams-Louvain + Calinski–Harabasz index

In [14]:
import pickle

with open('full_graph.pickle', 'rb') as f:
    full_graph = pickle.load(f)

In [15]:
import community.community_louvain as community_louvain #python-louvain

partition = community_louvain.best_partition(full_graph)

communities = {}
for node, community_id in partition.items():
    if community_id not in communities:
        communities[community_id] = [node]
    else:
        communities[community_id].append(node)

In [16]:
doc_name_to_doc = {doc.name: doc for doc in filtered_articles}

In [17]:
communities_docs = [
    Document(k, sum((doc_name_to_doc[doc_name].tokens for doc_name in docs_names), []))
    for k, docs_names in communities.items()
]
communities_to_topics = scoring.get_topics_ctfidf(communities_docs, reduce_frequent_words=True, bm25_weighting=False, top_k=10, min_df=0.1, max_df=0.85)
communities_topics = [v for k, v in communities_to_topics.items()]

9it [00:16,  1.81s/it]


In [18]:
calinski_harabasz_index(communities_topics, list(communities.values()), w2v_model)

75.10279899210188

elib + word2vec + calinski_harabasz_index

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
import tqdm


def get_topn_words(docs, top_k=10):
    texts = [' '.join(doc.tokens) for doc in docs]
    tfidf = TfidfVectorizer(max_features=8_000, min_df=0.1, max_df=0.65)
    transformed = tfidf.fit_transform(texts)

    doc_to_top_words = {}
    for doc, row in tqdm.tqdm(zip(docs, transformed)):
        most_scored_idx = np.argsort(-row.toarray())[0, :top_k]
        top_words = tfidf.get_feature_names_out()[most_scored_idx]
        scores = row.toarray()[0, most_scored_idx]
        doc_to_top_words[doc.name] = (list(top_words), scores)

    return doc_to_top_words

doc_to_topn_words = get_topn_words(articles, top_k=100)

#25s

2454it [00:17, 138.02it/s]


In [20]:
doc_to_vec = {}
for doc in tqdm.tqdm(articles):
    top_words, scores = doc_to_topn_words[doc.name]
    vectorized_doc = [token for token in top_words if token in w2v_model]
    if not vectorized_doc:
        continue
    doc_to_vec[doc.name] = w2v_model[vectorized_doc].mean(axis=0)

100%|██████████| 2454/2454 [00:05<00:00, 485.83it/s]


In [21]:
from functools import partial
from common import get_graph
import networkx as nx

def cosine_sim(first_doc, second_doc, doc_to_vec):
    first = doc_to_vec[first_doc.name]
    second = doc_to_vec[second_doc.name]

    return first @ second / np.linalg.norm(first) / np.linalg.norm(second)


graph = get_graph(filtered_articles, 0.95, partial(cosine_sim, doc_to_vec=doc_to_vec))
doc_name_to_doc = {doc.name: doc for doc in articles}

components = list(nx.connected_components(graph))

2454it [02:28, 16.51it/s] 


In [22]:
components_docs = [
    Document(k, sum((doc_name_to_doc[doc_name].tokens for doc_name in docs_names), []))
    for k, docs_names in enumerate(components)
]
component_to_topics = scoring.get_topics_ctfidf(components_docs, reduce_frequent_words=True, bm25_weighting=False, top_k=10, min_df=0.1, max_df=0.85)
components_topics = [v for k, v in component_to_topics.items()]

154it [00:03, 49.25it/s]


In [23]:
calinski_harabasz_index(components_topics, components, w2v_model)

2.6375640464664003

elib + jaccard + calinski_harabasz_index

In [24]:
import pickle
with open('jac_graph.pickle', 'rb') as f:
    graph = pickle.load(f)
doc_name_to_doc = {doc.name: doc for doc in articles}

components = list(nx.connected_components(graph))
len(components)

16

In [25]:
components_docs = [
    Document(k, sum((doc_name_to_doc[doc_name].tokens for doc_name in docs_names), []))
    for k, docs_names in enumerate(components)
]
component_to_topics = scoring.get_topics_ctfidf(components_docs, reduce_frequent_words=True, bm25_weighting=False, top_k=10, min_df=0.1, max_df=0.85)
components_topics = [v for k, v in component_to_topics.items()]
component_to_topics[0]

16it [00:00, 77.13it/s]


['специалист',
 'профессиональный',
 'подготовка',
 'стандарт',
 'обслуживание',
 'образовательный',
 'учебный',
 'предприятие',
 'требование',
 'качество']

In [26]:
calinski_harabasz_index(components_topics, components, w2v_model)

0.3190372095349591

elib + bert + ...

In [1]:
from sentence_transformers import SentenceTransformer, util

sent_model = SentenceTransformer('multi-qa-mpnet-base-dot-v1')

In [2]:
import pickle

# with open('elib_tops_bert_doc_to_embed.pickle', 'wb') as file:
#     pickle.dump(doc_to_embed, file)

with open('elib_tops_bert_doc_to_embed.pickle', 'rb') as file:
    doc_to_embed = pickle.load(file)

In [23]:
for (k, v) in doc_to_embed.items():
    print(v.shape)
    break

(100, 768)


In [12]:
from common import cosine_sim, get_graph
from functools import partial
import networkx as nx
import tqdm

In [18]:
def get_graph(docs, threshold, sim_fn):
    graph = nx.Graph()
    for first_ix, first_doc in tqdm.tqdm(enumerate(docs)):
        for second_ix, second_doc in enumerate(docs[first_ix + 1:]):
            sim = sim_fn(first_doc, second_doc)
            if sim < threshold:
                continue
            graph.add_edge(first_doc.name, second_doc.name, weight=sim)
            graph.add_edge(second_doc.name, first_doc.name, weight=sim)

    return graph

graph = get_graph(filtered_articles, 0.8, partial(cosine_sim, doc_to_vec=doc_to_embed))

doc_name_to_doc = {doc.name: doc for doc in filtered_articles}

components = list(nx.connected_components(graph))
len(components)
components_docs = [
    Document(k, sum((doc_name_to_doc[doc_name].tokens for doc_name in docs_names), []))
    for k, docs_names in enumerate(components)
]

0it [00:00, ?it/s]


ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 100 is different from 768)

In [17]:
len(components)

0

In [16]:
component_to_topics = scoring.get_topics_ctfidf(components_docs, reduce_frequent_words=True, bm25_weighting=True, top_k=10, min_df=0.3, max_df=0.8)
components_topics = [v for k, v in component_to_topics.items()]

ValueError: empty vocabulary; perhaps the documents only contain stop words

In [None]:
calinski_harabasz_index(components_topics, components, w2v_model)

ГОСТ + Jaccard 

In [25]:
import os
import zipfile, re, docx
import json
import tqdm
from pathlib import Path

def _read_docx_from_zipfile(path):
    archive = zipfile.ZipFile(path, 'r')

    for name in archive.namelist():
        if not re.fullmatch(r'.*\.docx', name):
            continue

        with archive.open(name) as file:
            doc = docx.Document(file)
            
            text = '.'.join([' '.join(p.text.split()) for p in doc.paragraphs if len(p.text) > 0])

        yield name, text


docs = {}
# set your path to a zipfile
zipfile_path = Path().cwd() / 'data' / 'docx35.zip'
for name, text in tqdm.tqdm(_read_docx_from_zipfile(zipfile_path)):
    doc_name = Path(name).with_suffix('.txt').name
    docs[doc_name] = text

#1162 за 27m


1162it [27:05,  1.40s/it]


KeyboardInterrupt: 

In [27]:
len(docs)

1162

In [28]:
# data_path = Path().cwd() / 'data'
# data_path.mkdir(exist_ok=True)
# with open(data_path / 'docs.json', 'w') as f:
#     json.dump(docs, f)

In [27]:
import json

with open('data/docs.json') as f:
    raw_docs = json.load(f)

In [31]:
from data_reader import JsonDocReader, Document
from pathlib import Path

In [32]:
# data_reader = JsonDocReader(Path('data/docs.json')).read_documents()
# docs = list(data_reader)

# with open('data/preprocessed_docs.json', 'w') as f:
#     docs_dict = {doc.name: ' '.join(doc.tokens) for doc in docs}
#     json.dump(docs_dict, f)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Alexandra\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


1162it [47:04,  2.43s/it]


In [33]:
with open('data/preprocessed_docs.json') as f:
    docs = json.load(f)
    docs = [Document(k, v.split()) for k, v in docs.items()]
filtered_docs = common.filter_common_words(docs, min_freq=0.1, max_freq=0.65)
doc_name_to_doc = {doc.name: doc for doc in docs}
filtered_docs.remove(Document('gost_r_54481-2011.txt', []))

100%|██████████| 1162/1162 [00:08<00:00, 133.70it/s]


In [34]:
graph = common.get_graph(filtered_docs, 0.85, partial(common.jaccard_sim, _words_cache={}))
doc_name_to_doc = {doc.name: doc for doc in docs}

components = list(nx.connected_components(graph))

1161it [03:36,  5.36it/s]


In [35]:
components_docs = [
    Document(k, sum((doc_name_to_doc[doc_name].tokens for doc_name in docs_names), []))
    for k, docs_names in enumerate(components)
]
component_to_topics = scoring.get_topics_ctfidf(components_docs, reduce_frequent_words=True, bm25_weighting=False, top_k=10, min_df=0.1, max_df=0.85)
components_topics = [v for k, v in component_to_topics.items()]

20it [00:00, 51.84it/s]


In [36]:
common.compute_top_words_sim(components_topics, w2v_model, topn=10)

0.19883913752157242

Тестировочный черновик

In [41]:
topic=components_topics_known[11]

In [44]:
a = w2v_model[topic[0], topic[1], topic[2], topic[3], topic[4],
          topic[5], topic[6], topic[7], topic[8], topic[9]].mean(axis=0)
b = [a, a]

In [47]:
from numpy import linalg as LA

LA.norm(np.array(b).mean(axis=0))

0.8206531