In [None]:
# test

In [None]:
# Third-party dependencies used throughout the notebook.
import pandas as pd
# The notebook progress bar is bound to the plain name `tqdm`, so later cells
# simply write tqdm(iterable).
from tqdm import tqdm_notebook as tqdm
import gensim
# Instantiate a bar and call .pandas() on it; presumably this is what makes the
# (currently commented-out) df['text'].progress_apply(...) call available.
tqdm().pandas()
%matplotlib inline

In [None]:
# Load the news dataset. The copy on shared storage is preferred; when it is
# not available the same file is read from the current working directory, so
# the notebook also runs outside the original machine.
import os

SHARED_CSV = '/data/share/TopicModelling/lenta-ru-news_small.csv'
LOCAL_CSV = 'lenta-ru-news_small.csv'
df = pd.read_csv(SHARED_CSV if os.path.exists(SHARED_CSV) else LOCAL_CSV)
df.head()

In [None]:
import re

# Matches any single non-word character (or a literal '+'). The capturing group
# makes re.split keep the separators in its output.
GROUPING_SPACE_REGEX = re.compile(r'([^\w]|[+])', re.UNICODE)


def simple_word_tokenize(text, _split=GROUPING_SPACE_REGEX.split):
    """Lower-case ``text`` and split it into word and punctuation tokens.

    Args:
        text: String to tokenize.
        _split: Splitting callable; bound once at definition time to the
            compiled regex's ``split`` method.

    Returns:
        List of non-empty, non-whitespace tokens in their original order.
        Punctuation characters are kept as separate one-character tokens.
    """
    tokens = []
    for piece in _split(text.lower()):
        # Splitting produces empty strings between adjacent separators and
        # whitespace-only pieces for the separators themselves; drop both.
        if not piece or piece.isspace():
            continue
        tokens.append(piece)
    return tokens

def token_r(text):
    """Tokenize ``text`` and return the normal forms of its longer tokens.

    Tokens shorter than four characters are skipped. For every remaining
    token the first parse returned by the global analyzer ``m`` is used and
    its ``normal_form`` is collected.

    Args:
        text: String to process.

    Returns:
        List of normal forms, in token order.
    """
    lemmas = []
    for token in simple_word_tokenize(text):
        if len(token) < 4:
            continue
        first_parse = m.parse(token)[0]
        lemmas.append(first_parse.normal_form)
    return lemmas

import pymorphy2
# Single analyzer instance shared by the notebook. token_r looks up the global
# name `m` when it is called, so defining it after the function is fine as long
# as this cell runs before the first token_r call.
m= pymorphy2.MorphAnalyzer()

In [None]:
#df['text'] = df['text'].progress_apply(token_r)

In [None]:
# Alternative kept for reference (gensim's own preprocessing, no lemmatization):
#bow_txts = [gensim.utils.simple_preprocess(str(txt), deacc=False) for txt in tqdm.tqdm_notebook(txts)]

# Run every article through token_r; str() coerces any non-string cell first.
bow_txts = list(map(token_r, map(str, tqdm(df.text))))
# Build the token <-> integer id dictionary from the tokenized documents.
wrd_index = gensim.corpora.Dictionary(bow_txts)

In [None]:
# Show the token list produced for the first document.
bow_txts[0]

In [None]:
# Drop overly frequent and overly rare words.
no_above = 0.5
# Before filtering, record every token whose document frequency exceeds the
# no_above share, mapped to the fraction of documents it occurs in.
buzzwords = {
    token: wrd_index.dfs[token_id] / wrd_index.num_docs
    for token_id, token in wrd_index.items()
    if wrd_index.dfs[token_id] > no_above * wrd_index.num_docs
}
# Filters the dictionary in place.
wrd_index.filter_extremes(no_above=no_above, no_below=4, keep_n=None)
buzzwords

In [None]:
# Convert every token list into its bag-of-words representation.
corpus = list(map(wrd_index.doc2bow, tqdm(bow_txts)))
corpus[1]

In [None]:
import logging

# Show INFO-level log messages while the model is being trained.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s : %(levelname)s : %(message)s')

lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=wrd_index,
    # One LDA topic per distinct value of the dataset's `topic` column.
    num_topics=df.topic.nunique(),
    passes=10,
    # Fixed seed so repeated runs train the same model.
    random_state=0,
    # Later cells index the result of lda_model[...] with [0] because of this.
    per_word_topics=True,
)

In [None]:
# Training is done: raise the root logger threshold so only errors are shown.
logging.getLogger().setLevel(logging.ERROR)

Теперь можем получать векторы тематического распределения для различных текстов

In [None]:
# Topic distribution of document 7. The model was built with
# per_word_topics=True, so the first element of the result is taken.
#lda_model.get_document_topics(corpus[7])
lda_model[corpus[7]][0]

Посмотрим на векторы тем

In [None]:
# For every document, count the entries in its topic list.
v_topic = [len(lda_model[doc][0]) for doc in tqdm(corpus)]

In [None]:
import seaborn
import matplotlib.pyplot as plt

# Histogram of the per-document topic counts collected in v_topic.
plt.hist(v_topic, bins=20)

# -----------------------------------------------------------------------------------------------------------------------------------

Как же это интерпретировать? Да очень просто

In [None]:
# The five top words of topic 0.
lda_model.show_topic(0, 5)

In [None]:
# Top words of every topic, with weights renormalised within the shown top.
for topic_id in range(lda_model.num_topics):
    top_words = lda_model.show_topic(topic_id, 2)
    # Sum of the weights of the words being shown for this topic.
    total_weight = sum(weight for _, weight in top_words)
    for word, weight in top_words:
        print (word, weight / total_weight)
    print ('--------------------')

In [None]:
from pprint import pprint
# Pretty-print the model's topic summaries.
pprint(lda_model.print_topics())

Можем взять самую «популярную» тематику

In [None]:
# Dense matrix of the per-document topic lists (element [0] of each model
# result). Later cells sum over axis 1 to get per-topic totals and transpose it
# to compare documents, i.e. rows are used as topics and columns as documents.
topics = gensim.matutils.corpus2dense([x[0] for x in lda_model[corpus]], lda_model.num_topics)

In [None]:
# Row sums of the topic matrix; the next cell treats the largest one as the
# most "popular" topic.
weight = topics.sum(axis=1)
weight

In [None]:
# Index of the largest entry in weight.
max_topic = weight.argmax()
max_topic

In [None]:
# Sanity check of the matrix dimensions.
topics.shape

In [None]:
# Ten top words of the topic selected as max_topic.
words = lda_model.show_topic(max_topic, 10)
words

А можем красиво визуализировать

In [None]:
!pip install pyLDAvis

In [None]:
import pyLDAvis
# The gensim adapter module is called `gensim_models` in current pyLDAvis
# releases and `gensim` in older ones; try the new name first so the cell works
# with whatever version the unpinned install above provided.
try:
    import pyLDAvis.gensim_models as pyLDAvis_gensim
except ImportError:
    from pyLDAvis import gensim as pyLDAvis_gensim

# sort_topics=False keeps the topic order of lda_model in the visualisation.
lda_display = pyLDAvis_gensim.prepare(lda_model, corpus, wrd_index, sort_topics=False)
# Also write a standalone HTML copy next to the notebook.
pyLDAvis.save_html(lda_display, 'lda.html')

In [None]:
# enable_notebook takes configuration flags, not the prepared data: its first
# positional parameter is `local`, so passing lda_display there silently
# switched on local-JS mode. Call it without arguments and hand the prepared
# data to display() only.
pyLDAvis.enable_notebook()
pyLDAvis.display(lda_display)

А можем строить облака тегов

In [None]:
!pip install wordcloud

In [None]:
import wordcloud
# Word-cloud renderer configured for an 800x400 image with a margin of 5.
wc =  wordcloud.WordCloud(width=800, height=400, margin=5)

In [None]:
# Up to 100 top words of the dominant topic as a {word: weight} mapping.
words = dict(lda_model.show_topic(max_topic, 100))

In [None]:
# Build the cloud from the {word: weight} mapping.
wc.generate_from_frequencies(words)

In [None]:
from matplotlib import pyplot as plt
# Draw the generated word cloud on a large figure with the axes hidden.
plt.figure(figsize=(20, 10))
plt.imshow(wc)
plt.axis('off')
plt.show()

<h3>Можем вот так искать похожие документы</h3>

In [None]:
from scipy.spatial import distance
import numpy

In [None]:
# Pairwise distances between documents: transpose the topic matrix so that each
# row is one document, then expand the condensed result to a square matrix.
pair = distance.squareform(distance.pdist(topics.T))

In [None]:
# Sanity check of the distance matrix dimensions.
pair.shape

In [None]:
def close_by_topic(doc_id):
    """Return the index of the document closest to ``doc_id`` by topic vector.

    Reads the module-level distance matrix ``pair``. The document itself is
    always excluded from the search, so the answer does not depend on whether
    the zero diagonal of ``pair`` has been overwritten beforehand.

    Args:
        doc_id: Row index of the query document in ``pair``.

    Returns:
        Index of the nearest other document (the first one on ties).
    """
    # Work on a copy so the shared matrix is never modified here.
    distances = numpy.array(pair[doc_id], dtype=float)
    distances[doc_id] = numpy.inf
    return distances.argmin()

Забьем нули на главной диагонали, чтобы не мешались

In [None]:
# Replace the self-distances on the main diagonal with a value larger than
# every real distance in the matrix.
large = pair.max()
diag = list(range(pair.shape[0]))
pair[diag, diag] = large + 1

In [None]:
# Nearest document to document 5.
close_by_topic(5)

In [None]:
# Topic list of an example document, for comparison with its nearest neighbour.
doc_id = 10
lda_model[corpus[doc_id]][0]

In [None]:
# Topic list of the document that close_by_topic returns for doc_id.
lda_model[corpus[close_by_topic(doc_id)]][0]

А можем и вот такой граф построить

In [None]:
!pip install networkx

In [None]:
import networkx as nx

In [None]:
# Empty undirected graph; topic and term nodes are added by the next cell.
G = nx.Graph()

In [None]:
# Connect every topic node ('topic_<n>') to each of its five top terms.
for topic_id in range(lda_model.num_topics):
    topic_node = 'topic_{}'.format(topic_id)
    for term, _weight in lda_model.show_topic(topic_id, 5):
        G.add_edge(topic_node, term)

In [None]:
# Seed the layout so the figure is identical on every run, in line with the
# fixed random_state used for the LDA model.
pos = nx.spring_layout(G, seed=0)

In [None]:
fig = plt.figure(figsize=(27, 15))
ax = fig.add_subplot(111, xlim=(-1,1), ylim=(-1,1))

# Nodes whose name contains 'topic_' are the topic nodes; the rest are terms.
topic_nodes = [node for node in pos if 'topic_' in node]
term_nodes = [node for node in pos if 'topic_' not in node]

# Topic labels in red, term labels in the default colour, faint edges between.
nx.draw_networkx_labels(G.subgraph(topic_nodes), pos, font_color='r', ax=ax)
nx.draw_networkx_labels(G.subgraph(term_nodes), pos, ax=ax)
nx.draw_networkx_edges(G, pos, edgelist=G.edges(), alpha=0.1, ax=ax)
ax.axis('off')

In [None]:
from math import log
# NOTE(review): math.log with one argument is the natural logarithm, so this
# evaluates 2**ln(N) for the dictionary size N, not N itself. Confirm whether
# log2 was intended.
2**log(len(wrd_index))