In [37]:
import json
import os
import random
import numpy as np
import pandas as pd
import gensim
import nltk

from gensim.models import CoherenceModel
from multiprocess import Pool
from tqdm import tqdm 
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from nltk.stem.snowball import RussianStemmer
from string import punctuation

In [38]:
# Name of the outlet being analysed; interpolated into the data paths used by
# the loading and saving cells of this notebook.
SOCIAL_MEDIA = 'BBC'

## LDA topic modeling
### Preprocessing & stemming 

In [39]:
# Make sure the NLTK stop-word corpus is available locally.
nltk.download("stopwords")

# NLTK's Russian stop words followed by a few extra tokens that should also be
# dropped from this corpus.
russian_stopwords = [
    *stopwords.words("russian"),
    'би', 'си', 'это', 'который', 'которая', 'которые',
]

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ivanhladkyi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [40]:
# Load the dataset of the selected outlet.
dataset_path = f'./data/{SOCIAL_MEDIA}/dataset.csv'
df = pd.read_csv(dataset_path)

In [41]:
rus_stemmer = RussianStemmer()

# Frozen copy of the stop-word list: membership tests become O(1) instead of a
# linear scan of the list for every token. Re-run this cell if
# `russian_stopwords` is changed.
_stopword_lookup = frozenset(russian_stopwords)


def preprocess(text):
    """Tokenise a document, drop stop words, and stem the remaining tokens.

    Stop words are filtered *before* stemming, so they are matched in their
    surface form as produced by ``simple_preprocess``.

    Args:
        text: Raw document text. Anything that is not ``str``/``bytes``
            (e.g. the float NaN pandas uses for an empty CSV cell) is treated
            as an empty document instead of raising, so the number of rows
            stays aligned with the number of processed documents.

    Returns:
        list[str]: Stemmed tokens, in document order.
    """
    if not isinstance(text, (str, bytes)):
        return []
    tokens = gensim.utils.simple_preprocess(text)
    return [rus_stemmer.stem(token) for token in tokens if token not in _stopword_lookup]

In [42]:
# Run the preprocessing pipeline over every document; each row receives its
# own list of stemmed tokens.
df['processed_text'] = [preprocess(raw_text) for raw_text in df['text'].tolist()]

In [43]:
# df.to_csv(f'./data_31.03/{SOCIAL_MEDIA}/dataset_lda.csv', index=False)

### BoW

In [44]:
# Build the token <-> id mapping from the preprocessed documents.
tokenized_docs = df['processed_text'].tolist()
dictionary = gensim.corpora.Dictionary(tokenized_docs)

# Preview the first entries; the loop stops right after the 11th printed pair.
for shown, (token_id, token) in enumerate(dictionary.items(), start=1):
    print(token_id, token)
    if shown > 10:
        break
print(f"\nDictionary len: {len(dictionary)}")

0 адресова
1 аннекс
2 атак
3 бессмыслен
4 близост
5 больш
6 буд
7 будет
8 будущ
9 ве
10 вер

Dictionary len: 22028


In [45]:
# Prune the vocabulary in place: drop tokens seen in fewer than 20 documents
# or in more than 15% of all documents. This mutates `dictionary`, so running
# the cell twice filters an already-filtered vocabulary.
extreme_bounds = {'no_below': 20, 'no_above': 0.15}
dictionary.filter_extremes(**extreme_bounds)
print(f"Dictionary len: {len(dictionary)}")

Dictionary len: 3143


In [46]:
# Convert every tokenised document into its bag-of-words representation using
# the filtered dictionary.
bow_corpus = list(map(dictionary.doc2bow, df['processed_text'].tolist()))

In [47]:
# Inspect the bag-of-words entries of one randomly picked document.
doc_position = random.choice(range(len(df)))
bow_doc = bow_corpus[doc_position]
for token_id, occurrences in bow_doc:
    print("Word {} (\"{}\") appears {} time.".format(token_id,
                                               dictionary[token_id],
                                               occurrences))

Word 60 ("наш") appears 1 time.
Word 94 ("реч") appears 1 time.
Word 95 ("россиян") appears 1 time.
Word 114 ("точн") appears 1 time.
Word 134 ("действ") appears 1 time.
Word 158 ("приня") appears 1 time.
Word 163 ("решен") appears 1 time.
Word 189 ("безопасн") appears 1 time.
Word 202 ("заявлен") appears 1 time.
Word 258 ("мид") appears 1 time.
Word 403 ("нескольк") appears 1 time.
Word 606 ("дан") appears 1 time.
Word 608 ("идет") appears 1 time.
Word 638 ("ряд") appears 1 time.
Word 701 ("уточня") appears 1 time.
Word 735 ("интерес") appears 1 time.
Word 812 ("дипломат") appears 1 time.
Word 830 ("франц") appears 2 time.
Word 923 ("дипломатическ") appears 1 time.
Word 943 ("однак") appears 1 time.
Word 963 ("десятк") appears 1 time.
Word 1033 ("сотрудник") appears 2 time.
Word 1078 ("источник") appears 1 time.
Word 1514 ("имеющ") appears 1 time.
Word 1578 ("количеств") appears 1 time.
Word 1593 ("статус") appears 1 time.


In [48]:
# Summary statistics of a previously saved hyperparameter-tuning grid.
# NOTE(review): the path is hard-coded to NovayaGazeta while SOCIAL_MEDIA is
# set to 'BBC' at the top of the notebook — confirm this is intentional.
pd.read_csv('data/NovayaGazeta/LDA_stemmed_NovayaGazeta_best_params.csv').describe()

Unnamed: 0,n_topics,coherence
count,270.0,270.0
mean,6.0,0.371068
std,2.586784,0.034135
min,2.0,0.294563
25%,4.0,0.351285
50%,6.0,0.371594
75%,8.0,0.392966
max,10.0,0.461482


### LDA model

In [49]:
# Baseline LDA hyperparameters.
num_topics = 5
passes = 30
alpha = 'asymmetric'
eta = 0.1

# Train the baseline model; the fixed random_state seeds the training run.
lda_model = gensim.models.LdaMulticore(
    corpus=bow_corpus,
    id2word=dictionary,
    num_topics=num_topics,
    passes=passes,
    alpha=alpha,
    eta=eta,
    random_state=42,
)

In [50]:
# List every topic of the baseline model with its weighted top terms.
for topic_id, topic_terms in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(topic_id, topic_terms))

Topic: 0 
Words: 0.011*"обстрел" + 0.011*"удар" + 0.009*"район" + 0.008*"сил" + 0.007*"ракет" + 0.007*"арм" + 0.007*"продолжа" + 0.007*"человек" + 0.007*"северодонецк" + 0.007*"миноборон"
Topic: 1 
Words: 0.015*"зеленск" + 0.009*"мариупол" + 0.009*"переговор" + 0.009*"министр" + 0.008*"путин" + 0.007*"нат" + 0.007*"оон" + 0.006*"гуманитарн" + 0.006*"премьер" + 0.006*"мирн"
Topic: 2 
Words: 0.017*"санкц" + 0.014*"компан" + 0.009*"рубл" + 0.008*"сша" + 0.007*"банк" + 0.007*"доллар" + 0.007*"нов" + 0.007*"газ" + 0.007*"прот" + 0.006*"ес"
Topic: 3 
Words: 0.008*"путин" + 0.006*"наш" + 0.006*"русск" + 0.005*"сша" + 0.005*"подкаст" + 0.005*"журналист" + 0.005*"сми" + 0.004*"москв" + 0.004*"перв" + 0.004*"сам"
Topic: 4 
Words: 0.011*"суд" + 0.009*"территор" + 0.008*"дел" + 0.008*"человек" + 0.008*"плен" + 0.006*"задержа" + 0.006*"прав" + 0.005*"днр" + 0.005*"акц" + 0.005*"депутат"


### Inference

In [15]:
# Show one document, its processed tokens, and its topic mixture ranked from
# the highest to the lowest score. `df.loc` is label-based while `bow_corpus`
# is positional; the two agree as long as `df` keeps the default integer index.
idx = 1280
print(f"Input sentence:\n{df.loc[idx, 'text']}")
print(f"\nProcessed input sentence:\n{df.loc[idx, 'processed_text']}")

topic_scores = lda_model[bow_corpus[idx]]
ranked_topics = sorted(topic_scores, key=lambda pair: pair[1], reverse=True)
for topic_id, score in ranked_topics:
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(topic_id, 10)))

Input sentence:
Станислав Белковский в колонке для «Новой»:..— Начинается не календарный — настоящий XXI век. И самое главное — найти в себе силы пережить это мрачное время перемены эпох в компании четырех всадников (чумы, войны, голода, смерти)...Отдельный оптимизм причитается нашей России. Которая-таки своевременно получит новый шанс на покаяние — и потому новое национальное строительство — вместо профуканного старого...

Processed input sentence:
['станисла', 'белковск', 'колонк', 'нов', 'начина', 'календарн', 'настоя', 'xxi', 'век', 'сам', 'главн', 'найт', 'сил', 'переж', 'мрачн', 'врем', 'перем', 'эпох', 'компан', 'четырех', 'всадник', 'чум', 'войн', 'голод', 'смерт', 'отдельн', 'оптимизм', 'причита', 'наш', 'росс', 'так', 'своевремен', 'получ', 'нов', 'шанс', 'покаян', 'нов', 'национальн', 'строительств', 'вмест', 'профука', 'стар']

Score: 0.9261434078216553	 
Topic: 0.027*"санкц" + 0.021*"прот" + 0.020*"дел" + 0.020*"спецоперац" + 0.019*"стран" + 0.019*"действ" + 0.018*"воен" +

In [35]:
# Compact view: the seven top words of each topic, joined with '|'.
for topic_id, weighted_terms in lda_model.show_topics(formatted=False, num_words=7):
    top_words = '|'.join(term for term, _weight in weighted_terms)
    print('Topic: {} \nWords: {}'.format(topic_id, top_words))

Topic: 0 
Words: обстрел|удар|район|сил|ракет|арм|продолжа
Topic: 1 
Words: зеленск|мариупол|переговор|министр|путин|нат|оон
Topic: 2 
Words: санкц|компан|рубл|сша|банк|доллар|нов
Topic: 3 
Words: путин|наш|русск|сша|подкаст|журналист|сми
Topic: 4 
Words: суд|территор|дел|человек|плен|задержа|прав


### Coherence

In [35]:
# Compute Coherence Score
# c_v coherence of the baseline model, computed on the preprocessed documents.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=df['processed_text'].tolist(),
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('Baseline coherence score: ', coherence_lda)

Baseline coherence score:  0.3791984064825126


### Visualisation

In [51]:
# pyLDAvis and its gensim adapter are imported here rather than in the
# notebook's top import cell; this cell must run before any later cell that
# uses `gensimvis`.
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
# Switch pyLDAvis to notebook mode so prepared visualisations display inline.
pyLDAvis.enable_notebook()

# Build the visualisation data from the trained model, the bag-of-words corpus,
# and the dictionary.
lda_viz = gensimvis.prepare(lda_model, bow_corpus, dictionary)

  default_term_info = default_term_info.sort_values(


In [52]:
# Bare expression: displays the prepared pyLDAvis visualisation as the cell output.
lda_viz

  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


### Hyperparameter tuning

In [37]:
def compute_coherence_values(input_params, random_state=42):
    """Train one LDA model for a hyperparameter combination and score it.

    Reads the notebook-level `bow_corpus`, `dictionary`, and `df`.

    Args:
        input_params: ``(k, a, b)`` tuple — number of topics, alpha, and eta,
            forwarded to ``LdaMulticore`` as ``num_topics``, ``alpha``, ``eta``.
        random_state: Seed forwarded to ``LdaMulticore``. Without a fixed seed
            every call trains a differently initialised model, so coherence
            scores could not be compared between runs of the grid search or
            against a later retrain. Defaults to 42, the seed used for the
            baseline model in this notebook.

    Returns:
        The c_v coherence score of the trained model.
    """
    k, a, b = input_params
    # NOTE: uses 15 passes, fewer than the baseline/retrain cells, presumably
    # to keep the grid search affordable.
    candidate_model = gensim.models.LdaMulticore(corpus=bow_corpus,
                                                 id2word=dictionary,
                                                 num_topics=k,
                                                 passes=15,
                                                 alpha=a,
                                                 eta=b,
                                                 random_state=random_state)

    scorer = CoherenceModel(model=candidate_model,
                            texts=df['processed_text'].tolist(),
                            dictionary=dictionary,
                            coherence='c_v')
    return scorer.get_coherence()


def hyperparameters_tuning():
    """Grid-search LDA hyperparameters and collect the coherence of each combination.

    The grid is the Cartesian product of:
      * number of topics: 2..10 inclusive,
      * alpha: 0.01, 0.03, 0.06, 0.1, 'symmetric', 'asymmetric',
      * eta:   0.01, 0.03, 0.06, 0.1, 'symmetric',
    i.e. 9 * 6 * 5 = 270 models, evaluated in parallel worker processes.
    Can take a long time to run.

    Returns:
        pandas.DataFrame with one row per combination (in grid order) and the
        columns ``n_topics``, ``alpha``, ``eta``, ``coherence``.
    """
    # Topics range
    min_topics = 2
    max_topics = 11
    step_size = 1
    topics_range = range(min_topics, max_topics, step_size)

    # Alpha parameter: numeric priors plus gensim's named strategies
    alpha = [0.01, 0.03, 0.06, 0.1, 'symmetric', 'asymmetric']

    # Beta (eta) parameter
    beta = [0.01, 0.03, 0.06, 0.1, 'symmetric']

    # Same ordering as nested loops: topics outermost, then alpha, then beta.
    input_params = [(k, a, b) for k in topics_range for a in alpha for b in beta]

    with Pool() as pool:
        # imap preserves input order, so cv_list[i] belongs to input_params[i].
        cv_list = list(tqdm(pool.imap(compute_coherence_values, input_params), total=len(input_params)))

    # Build the frame straight from the tuples. Routing them through np.array
    # would coerce the mixed int/float/str values to one string dtype, turning
    # n_topics and the numeric alpha/eta values into strings.
    model_results = pd.DataFrame(input_params, columns=['n_topics', 'alpha', 'eta'])
    model_results['coherence'] = cv_list
    return model_results

In [38]:
# Run the full grid search and rank the combinations from best to worst coherence.
model_results = hyperparameters_tuning().sort_values(by='coherence', ascending=False)

  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 270/270 [18:40<00:00,  4.15s/it]


In [39]:
# Bare expression: displays the tuning results, sorted by coherence (descending).
model_results

Unnamed: 0,n_topics,alpha,eta,coherence
119,5,asymmetric,symmetric,0.461482
115,5,asymmetric,0.01,0.453717
117,5,asymmetric,0.06,0.453350
116,5,asymmetric,0.03,0.453350
118,5,asymmetric,0.1,0.451749
...,...,...,...,...
7,2,0.03,0.06,0.299899
2,2,0.01,0.06,0.299899
0,2,0.01,0.01,0.299899
1,2,0.01,0.03,0.299899


In [41]:
# Save the ranked tuning results next to the selected outlet's dataset.
results_path = f'data/{SOCIAL_MEDIA}/LDA_stemmed_{SOCIAL_MEDIA}_best_params.csv'
model_results.to_csv(results_path, index=False)

### Retrain

In [42]:
# Hyperparameters for the retrained model.
num_topics = 5
passes = 50
alpha = 'asymmetric'
eta = 0.03

# random_state is fixed (same seed as the baseline model) so that the retrained
# topics and the coherence score reported below are reproducible across runs;
# without it every execution of this cell yields a different model.
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=num_topics, id2word=dictionary,
                                       passes=passes, alpha=alpha, eta=eta, random_state=42)

In [43]:
# c_v coherence of the retrained model, computed on the preprocessed documents.
tokenized_docs = df['processed_text'].tolist()
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=tokenized_docs,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('Tuned coherence score: ', coherence_lda)

Tuned coherence score:  0.34738493866109976


In [44]:
# List every topic of the retrained model with its weighted top terms.
for topic_id, topic_terms in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(topic_id, topic_terms))

Topic: 0 
Words: 0.018*"сво" + 0.016*"дел" + 0.016*"говор" + 0.015*"год" + 0.015*"котор" + 0.015*"спецоперац" + 0.012*"стран" + 0.011*"лет" + 0.011*"друг" + 0.011*"суд"
Topic: 1 
Words: 0.031*"сайт" + 0.030*"действ" + 0.029*"боев" + 0.027*"компан" + 0.026*"сми" + 0.020*"роскомнадзор" + 0.018*"заблокирова" + 0.018*"минюст" + 0.017*"такж" + 0.017*"суд"
Topic: 2 
Words: 0.038*"президент" + 0.036*"путин" + 0.032*"заяв" + 0.032*"глав" + 0.030*"санкц" + 0.029*"владимир" + 0.025*"прот" + 0.025*"сообщ" + 0.025*"март" + 0.021*"воен"
Topic: 3 
Words: 0.037*"миноборон" + 0.033*"задержа" + 0.030*"газет" + 0.028*"человек" + 0.027*"рф" + 0.021*"украинск" + 0.021*"воен" + 0.019*"сил" + 0.019*"москв" + 0.019*"сообща"
Topic: 4 
Words: 0.056*"рубл" + 0.039*"газет" + 0.031*"наш" + 0.025*"доллар" + 0.021*"кана" + 0.021*"магазин" + 0.020*"нам" + 0.020*"редакц" + 0.019*"отдел" + 0.019*"выпуск"


In [None]:
# Rebuild the pyLDAvis data for the retrained model and display it as the cell
# output. Requires the earlier cell that imports `gensimvis` to have been run.
lda_viz = gensimvis.prepare(lda_model, bow_corpus, dictionary)
lda_viz

In [None]:
# Persist the retrained model.
# NOTE(review): this writes under 'data_31.03/' while the loading and tuning
# cells use 'data/' — confirm which root directory is intended.
model_dir = f'data_31.03/{SOCIAL_MEDIA}/lda_model'
# makedirs(exist_ok=True) creates missing parent directories and does not raise
# when the directory already exists, so the cell can be re-run (os.mkdir fails
# in both situations).
os.makedirs(model_dir, exist_ok=True)
lda_model.save(f'{model_dir}/lda.model')