In [None]:
import json
import os
import random
import numpy as np
import pandas as pd
import gensim
import nltk

from gensim.models import CoherenceModel
from multiprocess import Pool
from tqdm import tqdm 
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from nltk.stem.snowball import RussianStemmer
from string import punctuation

In [1]:
# Outlet under analysis; later cells interpolate it into the data_31.03/<SOCIAL_MEDIA>/ paths.
SOCIAL_MEDIA = "Medusa"

## LDA topic modeling
### Preprocessing & stemming 

In [None]:
# Fetch the NLTK stopword corpus so that `stopwords.words` can be read below.
nltk.download("stopwords")

# NLTK's Russian stopword list extended with a few extra tokens to drop.
russian_stopwords = [
    *stopwords.words("russian"),
    'би', 'си', 'это', 'который', 'которая', 'которые',
]

In [None]:
# Load the dataset of the selected outlet into the working DataFrame.
df = pd.read_csv(
    filepath_or_buffer=f'./data_31.03/{SOCIAL_MEDIA}/dataset.csv',
)

In [None]:
rus_stemmer = RussianStemmer()


def preprocess(text):
    """Tokenize, stopword-filter and stem one raw document.

    Args:
        text: Raw document text. Anything that is not ``str``/``bytes``
            (e.g. the NaN pandas produces for an empty CSV cell) is treated
            as an empty document instead of crashing the tokenizer.

    Returns:
        list[str]: Stemmed tokens in document order. Tokens listed in the
        notebook-level ``russian_stopwords`` are removed *before* stemming.
    """
    if not isinstance(text, (str, bytes)):
        return []

    # Hash-based membership test instead of scanning the stopword list once per token.
    stopword_set = set(russian_stopwords)
    tokens = gensim.utils.simple_preprocess(text)
    return [rus_stemmer.stem(token) for token in tokens if token not in stopword_set]

In [None]:
# Run the preprocessing pipeline over every document and store the resulting
# token lists next to the raw text.
df['processed_text'] = [preprocess(raw_text) for raw_text in df['text'].tolist()]

In [None]:
# Write the frame (now including `processed_text`) back to the same CSV path
# the dataset was loaded from, without the index column.
df.to_csv(
    path_or_buf=f'./data_31.03/{SOCIAL_MEDIA}/dataset.csv',
    index=False,
)

### BoW

In [None]:
# Build the gensim dictionary from the tokenized documents.
dictionary = gensim.corpora.Dictionary(df['processed_text'].tolist())

# Preview the first few entries, then report the vocabulary size.
for shown, (token_id, token) in enumerate(dictionary.items(), start=1):
    print(token_id, token)
    if shown > 10:
        break
print(f"\nDictionary len: {len(dictionary)}")

In [None]:
# Prune the vocabulary in place. Per gensim's `filter_extremes` documentation this
# keeps tokens that occur in at least 20 documents and in no more than 15% of them.
dictionary.filter_extremes(no_above=0.15, no_below=20)
print(f"Dictionary len: {len(dictionary)}")

In [None]:
# Convert every tokenized document into its bag-of-words form: (token_id, count) pairs.
bow_corpus = list(map(dictionary.doc2bow, df['processed_text'].tolist()))

In [None]:
# Spot check: print the bag-of-words content of one randomly chosen document.
bow_doc = bow_corpus[random.choice(range(len(df)))]
for token_id, occurrences in bow_doc:
    print("Word {} (\"{}\") appears {} time.".format(token_id, dictionary[token_id], occurrences))

### LDA model

In [None]:
# Baseline LDA hyperparameters.
num_topics = 5
passes = 30
alpha = 'asymmetric'
eta = 0.1

# Fit the baseline model on the bag-of-words corpus, seeded with random_state=100.
lda_model = gensim.models.LdaMulticore(
    corpus=bow_corpus,
    id2word=dictionary,
    num_topics=num_topics,
    passes=passes,
    alpha=alpha,
    eta=eta,
    random_state=100,
)

In [None]:
# List every topic of the baseline model together with its word weights.
for topic_id, topic_terms in lda_model.print_topics(num_topics=-1):
    print('Topic: {} \nWords: {}'.format(topic_id, topic_terms))

### Inference

In [None]:
# Inspect one document: its raw text, its tokens, and its topic mixture.
idx = 1280
print(f"Input sentence:\n{df.loc[idx, 'text']}")
print(f"\nProcessed input sentence:\n{df.loc[idx, 'processed_text']}")

# Topics assigned to this document, highest score first.
for topic_id, topic_score in sorted(lda_model[bow_corpus[idx]], key=lambda pair: pair[1], reverse=True):
    print("\nScore: {}\t \nTopic: {}".format(topic_score, lda_model.print_topic(topic_id, 10)))

### Coherence

In [None]:
# c_v coherence of the baseline model, measured over the tokenized documents.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=df['processed_text'].tolist(),
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('Baseline coherence score: ', coherence_lda)

### Visualisation

In [None]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Switch pyLDAvis to inline notebook rendering.
pyLDAvis.enable_notebook()

# Feed the baseline LDA model, its corpus and dictionary into pyLDAvis.
lda_viz = gensimvis.prepare(lda_model, bow_corpus, dictionary)

# Leave the prepared object as the cell's last expression so it is actually
# rendered; assigning it alone produces no output.
lda_viz

### Hyperparameter tuning

In [None]:
def compute_coherence_values(input_params):
    """Train one LDA model for a hyperparameter triple and score its coherence.

    Uses the notebook-level ``bow_corpus``, ``dictionary`` and
    ``df['processed_text']``.

    Args:
        input_params: ``(k, a, b)`` tuple holding the number of topics and the
            ``alpha`` and ``eta`` values handed to ``LdaMulticore``.

    Returns:
        The c_v coherence value reported by ``CoherenceModel.get_coherence()``
        for the model trained with these parameters.
    """
    k, a, b = input_params
    # Local names deliberately differ from the notebook-level `lda_model` /
    # `coherence_model_lda` so the candidate is not mistaken for the main model.
    candidate_model = gensim.models.LdaMulticore(corpus=bow_corpus,
                                                 id2word=dictionary,
                                                 num_topics=k,
                                                 passes=15,
                                                 alpha=a,
                                                 eta=b,
                                                 # Same seed as the baseline model, so the
                                                 # grid search is seeded rather than random.
                                                 random_state=100)

    candidate_coherence = CoherenceModel(model=candidate_model,
                                         texts=df['processed_text'].tolist(),
                                         dictionary=dictionary,
                                         coherence='c_v')
    return candidate_coherence.get_coherence()


def hyperparameters_tuning():
    """Grid-search LDA hyperparameters and collect the coherence of each combination.

    Every (n_topics, alpha, eta) combination is scored with
    ``compute_coherence_values`` in a process pool.

    Returns:
        pandas.DataFrame: one row per combination, in grid order, with the
        columns ``n_topics``, ``alpha``, ``eta`` and ``coherence``. Parameter
        values keep their original Python types (int / float / str).
    """
    # Number of topics: 2..10 inclusive.
    topics_range = range(2, 11, 1)

    # Alpha parameter: numeric priors plus gensim's named strategies.
    alphas = [0.01, 0.03, 0.06, 0.1, 'symmetric', 'asymmetric']

    # Beta (eta) parameter.
    betas = [0.01, 0.03, 0.06, 0.1, 'symmetric']

    # Full cartesian grid: topics outermost, then alpha, then beta.
    input_params = [(k, a, b) for k in topics_range for a in alphas for b in betas]

    # Can take a long time to run: one LDA fit plus one coherence computation per combination.
    # `imap` yields results in input order, so `cv_list` lines up with `input_params`.
    with Pool() as pool:
        cv_list = list(tqdm(pool.imap(compute_coherence_values, input_params), total=len(input_params)))

    # Build the frame straight from the tuples. Going through np.array() would
    # coerce the mixed int/float/str values to strings in every column.
    model_results = pd.DataFrame(input_params, columns=['n_topics', 'alpha', 'eta'])
    model_results['coherence'] = cv_list
    return model_results

In [None]:
# Run the grid search and rank the combinations, best coherence first.
model_results = hyperparameters_tuning().sort_values(by='coherence', ascending=False)

In [None]:
# Display the tuning results, sorted by descending coherence in the cell above.
model_results

In [None]:
# Save the ranked tuning results into the outlet's data folder, without the index column.
model_results.to_csv(
    path_or_buf=f'data_31.03/{SOCIAL_MEDIA}/LDA_stemmed_{SOCIAL_MEDIA}_best_params.csv',
    index=False,
)

### Retrain

In [None]:
# Hyperparameters chosen for the final model.
num_topics = 3
passes = 50
alpha = 'asymmetric'
eta = 0.03

# Retrain with the chosen hyperparameters. The seed matches the baseline model:
# without it, the model that gets saved below is unseeded and differs on every run.
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=num_topics, id2word=dictionary,
                                       passes=passes, alpha=alpha, eta=eta, random_state=100)

In [None]:
# c_v coherence of the retrained model, measured over the tokenized documents.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=df['processed_text'].tolist(),
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('Tuned coherence score: ', coherence_lda)

In [None]:
# List every topic of the retrained model together with its word weights.
for topic_id, topic_terms in lda_model.print_topics(num_topics=-1):
    print('Topic: {} \nWords: {}'.format(topic_id, topic_terms))

In [None]:
# Prepare the pyLDAvis view of the retrained model and render it as the cell output.
lda_viz = gensimvis.prepare(
    topic_model=lda_model,
    corpus=bow_corpus,
    dictionary=dictionary,
)
lda_viz

In [None]:
# Target folder for the trained model files.
lda_model_dir = f'data_31.03/{SOCIAL_MEDIA}/lda_model'

# exist_ok=True keeps the cell re-runnable: plain os.mkdir raises FileExistsError
# as soon as the folder exists from a previous run.
os.makedirs(lda_model_dir, exist_ok=True)
lda_model.save(f'{lda_model_dir}/lda.model')