In [14]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd
from datetime import datetime
import csv
from matplotlib import pyplot as plt
from sklearn.feature_extraction import text
import numpy as np
from gensim import matutils, models
import gensim

#### Retrieve the preprocessed text data

In [3]:
import pickle

file_name = "data_preprocessed"

# Load the preprocessed documents written by the preprocessing step.
# The context manager closes the file even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load pickles produced by this project.
with open(file_name, "rb") as open_file:
    data_lemmatized_NA = pickle.load(open_file)

# Document Term Matrix

#### Bag of Words approach

In [4]:
import gensim.corpora as corpora

# Build the token <-> id dictionary from the preprocessed documents, then
# convert every document to its bag-of-words representation.
texts = data_lemmatized_NA
id2word = corpora.Dictionary(texts)
corpus_new = [id2word.doc2bow(tokens) for tokens in texts]  # BOW

# Topic Modeling with LDA

### Latent Dirichlet Allocation (LDA) Base Model

In [5]:
# Base model with 3 topics. The seed is fixed so the printed topics are
# reproducible after a kernel restart.
lda_3 = models.LdaModel(
    corpus=corpus_new,
    num_topics=3,
    id2word=id2word,
    passes=10,
    random_state=100,
)
lda_3.print_topics()

[(0,
  '0.015*"day" + 0.010*"place" + 0.009*"friend" + 0.009*"right" + 0.009*"work" + 0.008*"man" + 0.008*"reason" + 0.008*"guy" + 0.008*"big" + 0.007*"world"'),
 (1,
  '0.037*"people" + 0.032*"time" + 0.024*"thing" + 0.023*"good" + 0.019*"way" + 0.014*"year" + 0.013*"post" + 0.011*"many" + 0.009*"life" + 0.009*"question"'),
 (2,
  '0.023*"game" + 0.014*"lot" + 0.013*"well" + 0.011*"bad" + 0.010*"character" + 0.010*"part" + 0.010*"player" + 0.009*"comment" + 0.009*"point" + 0.008*"money"')]

In [6]:
# Base model with 4 topics. The seed is fixed so the printed topics are
# reproducible after a kernel restart.
lda_4 = models.LdaModel(
    corpus=corpus_new,
    num_topics=4,
    id2word=id2word,
    passes=10,
    random_state=100,
)
lda_4.print_topics()

[(0,
  '0.029*"thing" + 0.018*"year" + 0.016*"lot" + 0.016*"post" + 0.013*"bad" + 0.012*"life" + 0.011*"work" + 0.010*"right" + 0.010*"comment" + 0.010*"man"'),
 (1,
  '0.031*"way" + 0.024*"day" + 0.019*"well" + 0.015*"much" + 0.014*"player" + 0.013*"point" + 0.013*"different" + 0.011*"reddit" + 0.010*"last" + 0.009*"job"'),
 (2,
  '0.050*"people" + 0.015*"many" + 0.012*"part" + 0.010*"place" + 0.010*"big" + 0.010*"person" + 0.009*"woman" + 0.009*"level" + 0.009*"sure" + 0.009*"use"'),
 (3,
  '0.044*"time" + 0.032*"good" + 0.029*"game" + 0.013*"question" + 0.012*"character" + 0.012*"friend" + 0.011*"able" + 0.011*"reason" + 0.011*"money" + 0.010*"new"')]

In [7]:
# Base model with 5 topics. The seed is fixed so the printed topics are
# reproducible after a kernel restart.
lda_5 = models.LdaModel(
    corpus=corpus_new,
    num_topics=5,
    id2word=id2word,
    passes=10,
    random_state=100,
)
lda_5.print_topics()

[(0,
  '0.072*"people" + 0.022*"many" + 0.018*"life" + 0.018*"question" + 0.018*"much" + 0.016*"right" + 0.013*"sure" + 0.013*"action" + 0.012*"problem" + 0.010*"least"'),
 (1,
  '0.028*"way" + 0.019*"lot" + 0.017*"well" + 0.017*"new" + 0.014*"place" + 0.014*"work" + 0.011*"big" + 0.011*"system" + 0.009*"gt" + 0.008*"high"'),
 (2,
  '0.056*"time" + 0.022*"post" + 0.015*"character" + 0.015*"part" + 0.014*"able" + 0.014*"different" + 0.012*"first" + 0.012*"woman" + 0.012*"reddit" + 0.012*"level"'),
 (3,
  '0.046*"thing" + 0.020*"bad" + 0.015*"man" + 0.015*"reason" + 0.015*"money" + 0.013*"issue" + 0.012*"great" + 0.011*"child" + 0.011*"story" + 0.011*"old"'),
 (4,
  '0.041*"good" + 0.037*"game" + 0.025*"day" + 0.025*"year" + 0.015*"player" + 0.015*"friend" + 0.014*"comment" + 0.013*"guy" + 0.013*"person" + 0.012*"world"')]

By human interpretation, the results show the kinds of topics that LDA can extract, and they look decent. However, human judgement alone cannot tell us the optimal number of topics, the number of iterations, or the other settings. It is therefore important to understand the parameters involved in LDA and to fine-tune them.

#### Coherence Score of Base Model
Ideally, we can compare LDA models trained with different settings by computing the coherence score of each model.

In [8]:
from gensim.models import CoherenceModel

# Compute the c_v coherence score of every base model. One loop replaces the
# three copy-pasted stanzas; the printed text is unchanged.
base_models = {3: lda_3, 4: lda_4, 5: lda_5}
for n_topics, base_model in base_models.items():
    coherence_model_lda = CoherenceModel(
        model=base_model,
        texts=data_lemmatized_NA,
        dictionary=id2word,
        coherence='c_v',
    )
    coherence_lda = coherence_model_lda.get_coherence()
    print(f'\nCoherence Score for {n_topics} topics: ', coherence_lda)


Coherence Score:  0.49215920826769033


The scores are not as good as we expected, for multiple reasons:
- We don't know the optimal number of topics that can give the best result
- We don't know the alpha and beta values that give the best result

### Hyperparameter tuning

From a standard LDA model, there are several key parameters that we have to keep in mind and consider programmatically tuning before we invoke the model:
1. k, the number of topics
2. alpha, the parameter that represents the document-topic density
3. beta, the parameter that represents the topic-word density

**Documentation of the LDA model in the Gensim library:** https://radimrehurek.com/gensim/models/ldamodel.html

In [24]:
def compute_coherence_values(corpus, dictionary, k, a, b, texts=None):
    """Train one LDA model and return its c_v coherence score.

    Parameters
    ----------
    corpus
        Bag-of-words corpus handed to ``gensim.models.LdaMulticore``.
    dictionary
        Gensim dictionary used as ``id2word`` for the LDA model and as the
        dictionary of the coherence model.
    k
        Number of topics.
    a
        Value passed as ``alpha`` to the LDA model.
    b
        Value passed as ``eta`` to the LDA model.
    texts
        Tokenised documents used by the coherence model. Defaults to the
        notebook-level ``data_lemmatized_NA``.

    Returns
    -------
    The value returned by ``CoherenceModel.get_coherence()``.
    """
    if texts is None:
        # The original referenced an undefined `data_lemmatized`; the
        # preprocessed documents loaded by this notebook are `data_lemmatized_NA`.
        texts = data_lemmatized_NA

    # LDA model
    lda_model = gensim.models.LdaMulticore(
        corpus=corpus,
        id2word=dictionary,
        num_topics=k,
        random_state=100,
        chunksize=100,
        passes=10,
        alpha=a,
        eta=b,
    )
    # Coherence model: use the dictionary that was passed in rather than the
    # global `id2word`, so the function honours its own parameter.
    coherence_model_lda = CoherenceModel(
        model=lda_model,
        texts=texts,
        dictionary=dictionary,
        coherence='c_v',
    )
    return coherence_model_lda.get_coherence()

def hyperparameter_tuning(corpus_sets, topics_range, alpha, beta, model_results, corpus_title):
    """Grid-search LDA settings and record the coherence of every combination.

    For each corpus in ``corpus_sets`` and each (topics, alpha, beta)
    combination, ``compute_coherence_values`` is called with the
    notebook-level ``id2word`` dictionary, and one row is appended to
    ``model_results``. Results are written to ``lda_tuning_results.csv``.

    Can take a long time to run. If the search is interrupted or fails, the
    rows completed so far are still written to the CSV before the exception
    propagates.

    Parameters
    ----------
    corpus_sets
        Sequence of corpora to evaluate.
    topics_range
        Iterable of topic counts.
    alpha
        Iterable of alpha values.
    beta
        Iterable of beta (eta) values.
    model_results
        Dict of lists with keys 'Validation_Set', 'Topics', 'Alpha', 'Beta'
        and 'Coherence'; mutated in place.
    corpus_title
        Labels for ``corpus_sets``, indexed in parallel.
    """
    # Materialise the grids so the progress-bar total can be derived from the
    # inputs instead of being hard-coded.
    topic_grid = list(topics_range)
    alpha_grid = list(alpha)
    beta_grid = list(beta)
    total_runs = len(corpus_sets) * len(topic_grid) * len(alpha_grid) * len(beta_grid)

    pbar = tqdm.tqdm(total=total_runs)
    try:
        # iterate through validation corpuses
        for i, corpus in enumerate(corpus_sets):
            # iterate through number of topics
            for k in topic_grid:
                # iterate through alpha values
                for a in alpha_grid:
                    # iterate through beta values
                    for b in beta_grid:
                        # get the coherence score for the given parameters
                        cv = compute_coherence_values(corpus=corpus, dictionary=id2word, k=k, a=a, b=b)
                        # Save the model results
                        model_results['Validation_Set'].append(corpus_title[i])
                        model_results['Topics'].append(k)
                        model_results['Alpha'].append(a)
                        model_results['Beta'].append(b)
                        model_results['Coherence'].append(cv)

                        pbar.update(1)
    finally:
        pbar.close()
        # An interrupt can land between two appends; trim every column to the
        # number of fully recorded rows so the DataFrame can always be built.
        n_complete = min((len(column) for column in model_results.values()), default=0)
        completed = {name: column[:n_complete] for name, column in model_results.items()}
        pd.DataFrame(completed).to_csv('lda_tuning_results.csv', index=False)

In [26]:
import numpy as np
import tqdm

# Topics: candidate topic counts, from min_topic up to (not including) max_topic
min_topic = 2
max_topic = 11
step_size = 1
topics_range = range(min_topic, max_topic, step_size)

# Alpha: numeric values plus gensim's named priors
alpha = list(np.arange(.01, 1, .3))
alpha.append('symmetric')
alpha.append('asymmetric')

# Beta: numeric values plus gensim's named prior
beta = list(np.arange(.01, 1, .3))
beta.append('symmetric')

# Validation sets of 75% and 100% of the corpus.
# Smaller subsets can be added with ClippedCorpus(corpus_new, int(doc_num * fraction)).
doc_num = len(corpus_new)
corpus_sets = [gensim.utils.ClippedCorpus(corpus_new, int(doc_num*0.75)),
               corpus_new]
corpus_title = ['75% Corpus', '100% Corpus']
model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

# The original call was misspelled (`hyperparamter_tuning`) and raised NameError.
hyperparameter_tuning(corpus_sets, topics_range, alpha, beta, model_results, corpus_title)


  0%|          | 0/540 [01:17<?, ?it/s][A


KeyboardInterrupt: 

### Evaluation & Metrics

In [None]:
# Load the grid-search results written by hyperparameter_tuning.
# pandas has no module-level `to_csv`; reading a CSV is `pd.read_csv`.
lda_result = pd.read_csv("lda_tuning_results.csv")
lda_result.head()

#### Plot for topic coherence

#### Table for Alpha Beta coherence

#### Insights:

### Final Model

In [None]:
import pyLDAvis.gensim
import pickle 
import pyLDAvis

# Visualize the topics
pyLDAvis.enable_notebook()

# `lda_model` and `corpus` were never defined at notebook level: `lda_model`
# exists only inside compute_coherence_values, and the corpus built above is
# `corpus_new`.
# TODO: replace this placeholder with the model retrained on the best
# (topics, alpha, beta) combination once the tuning results are analysed.
lda_model = lda_3

# NOTE(review): newer pyLDAvis releases expose this module as
# `pyLDAvis.gensim_models` — confirm against the installed version.
LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus_new, id2word)
LDAvis_prepared