# Evaluate the result of topic modeling for unigram, bigram and trigram

In [8]:
import pandas as pd
from datetime import datetime
import csv
from matplotlib import pyplot as plt
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from nltk.corpus import stopwords
import string
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import nltk
import re
# from spellchecker import SpellChecker
import spacy
import seaborn as sns

# N-Gram
- unigram
- bigram
- trigram

### First Attempt - Topic Modeling (Unigram)

In [27]:
import pickle

file_name = "unigrams"

open_file = open(file_name, "rb")
data_unigram = pickle.load(open_file)
open_file.close()

In [28]:
data_unigram[:5]

[['okay',
  'tone',
  'volume',
  'accent',
  'vocabulary',
  'emotions',
  'physical',
  'movement'],
 ['gtyou', 'made', 'snort', 'snorting', 'crack', 'class'],
 ['ah', 'peasant', 'spirit', 'alive', 'well', 'see'],
 ['interesting', 'bet', 'pros', 'would', 'love', 'training'],
 ['think', 'unacceptable', 'every', 'context']]

In [29]:
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])
nlp.max_length = 1400000

# Do lemmatization keeping noun, adj, vb, adv
data_lemmatized = lemmatization(data_unigram, allowed_postags=['NOUN', 'ADJ', 'VB', 'ADV'])
print("NAVA:", data_lemmatized[:20])

NAVA: [['tone', 'volume', 'vocabulary', 'emotion', 'physical', 'movement'], ['snort', 'crack', 'class'], ['peasant', 'spirit', 'alive', 'well'], ['interesting', 'bet', 'pro', 'training'], ['unacceptable', 'context'], ['ever', 'cookie'], ['long', 'feeling', 'wrong'], ['even', 'botw', 'lmao', 'recommendation', 'terrible'], ['never', 'reason', 'use'], ['work', 'holy', 'one'], ['first', 'half'], ['odst', 'proper', 'multiplayer', 'right', 'firefight'], ['star', 'rating'], ['quick', 'game'], ['appreciate'], ['suspect', 'theory', 'already', 'phase'], ['quality', 'game'], ['good', 'world'], ['thing', 'ever'], ['sun', 'radiation', 'outside']]


In [30]:
id2word = corpora.Dictionary(data_lemmatized)
texts = data_lemmatized
corpus_new = [id2word.doc2bow(text) for text in texts] # BOW

In [31]:
import gensim.corpora as corpora

id2word = corpora.Dictionary(data_lemmatized)
texts = data_lemmatized
corpus_new = [id2word.doc2bow(text) for text in texts] # BOW

In [32]:
lda_3 = models.LdaModel(corpus=corpus_new, num_topics=3, id2word=id2word, passes=10)
lda_3.print_topics()

[(0,
  '0.016*"music" + 0.014*"well" + 0.013*"reddit" + 0.012*"post" + 0.012*"place" + 0.012*"community" + 0.011*"new" + 0.010*"feedback" + 0.010*"way" + 0.009*"question"'),
 (1,
  '0.025*"people" + 0.011*"never" + 0.008*"even" + 0.008*"time" + 0.007*"well" + 0.007*"year" + 0.007*"really" + 0.007*"life" + 0.007*"kid" + 0.007*"way"'),
 (2,
  '0.016*"game" + 0.016*"thing" + 0.013*"good" + 0.013*"time" + 0.010*"even" + 0.008*"day" + 0.008*"still" + 0.008*"man" + 0.008*"also" + 0.008*"friend"')]

In [33]:
lda_4 = models.LdaModel(corpus=corpus_new, num_topics=4, id2word=id2word, passes=10)
lda_4.print_topics()

[(0,
  '0.021*"music" + 0.018*"reddit" + 0.013*"user" + 0.012*"comment" + 0.011*"man" + 0.010*"rmusic" + 0.010*"friend" + 0.009*"submit" + 0.008*"place" + 0.008*"help"'),
 (1,
  '0.027*"people" + 0.016*"much" + 0.013*"feedback" + 0.013*"good" + 0.012*"well" + 0.010*"place" + 0.010*"positive" + 0.009*"able" + 0.009*"thing" + 0.009*"even"'),
 (2,
  '0.025*"way" + 0.020*"day" + 0.020*"game" + 0.015*"community" + 0.013*"good" + 0.012*"even" + 0.011*"thing" + 0.011*"share" + 0.009*"life" + 0.009*"character"'),
 (3,
  '0.040*"time" + 0.011*"post" + 0.011*"question" + 0.011*"action" + 0.010*"new" + 0.009*"year" + 0.008*"rule" + 0.008*"subreddit" + 0.008*"well" + 0.007*"even"')]

In [34]:
lda_5 = models.LdaModel(corpus=corpus_new, num_topics=5, id2word=id2word, passes=10)
lda_5.print_topics()

[(0,
  '0.015*"system" + 0.014*"post" + 0.011*"place" + 0.011*"good" + 0.010*"rule" + 0.010*"subreddit" + 0.009*"world" + 0.009*"well" + 0.008*"content" + 0.008*"instead"'),
 (1,
  '0.025*"music" + 0.021*"reddit" + 0.018*"new" + 0.015*"question" + 0.014*"character" + 0.013*"share" + 0.013*"rmusic" + 0.011*"comment" + 0.011*"submit" + 0.010*"alignment"'),
 (2,
  '0.022*"feedback" + 0.016*"place" + 0.010*"person" + 0.009*"concern" + 0.009*"advice" + 0.008*"ampx" + 0.008*"gt" + 0.008*"party" + 0.007*"member" + 0.007*"already"'),
 (3,
  '0.034*"people" + 0.015*"game" + 0.012*"much" + 0.012*"player" + 0.012*"even" + 0.011*"child" + 0.011*"man" + 0.011*"thing" + 0.010*"level" + 0.009*"also"'),
 (4,
  '0.024*"time" + 0.017*"year" + 0.017*"well" + 0.016*"community" + 0.016*"thing" + 0.014*"day" + 0.013*"bad" + 0.013*"really" + 0.013*"good" + 0.012*"user"')]

In [36]:
from gensim.models import CoherenceModel

# Compute Coherence Score for the base model
coherence_model_lda = CoherenceModel(model=lda_3, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score for 3 topics: ', coherence_lda)

coherence_model_lda = CoherenceModel(model=lda_4, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score for 4 topics: ', coherence_lda)

coherence_model_lda = CoherenceModel(model=lda_5, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score for 5 topics: ', coherence_lda)


Coherence Score for 3 topics:  0.65676271077902

Coherence Score for 4 topics:  0.5428848292453103

Coherence Score for 5 topics:  0.4730728000659042


### Second Attempt - Topic Modeling (Bigram)

In [2]:
import pickle

file_name = "bigrams"

open_file = open(file_name, "rb")
data_bigram = pickle.load(open_file)
open_file.close()

In [4]:
data_bigram[:5]

[['okay',
  'tone',
  'volume',
  'accent',
  'vocabulary',
  'emotions',
  'physical',
  'movement'],
 ['gtyou', 'made', 'snort', 'snorting', 'crack', 'class'],
 ['ah', 'peasant', 'spirit', 'alive', 'well', 'see'],
 ['interesting', 'bet', 'pros', 'would', 'love', 'training'],
 ['think', 'unacceptable', 'every', 'context']]

In [37]:
data_lemmatized = lemmatization(data_bigram, allowed_postags=['NOUN', 'ADJ', 'VB', 'ADV'])
print("NAVA:", data_lemmatized[:20])

NAVA: [['tone', 'volume', 'vocabulary', 'emotion', 'physical', 'movement'], ['snort', 'crack', 'class'], ['peasant', 'spirit', 'alive', 'well'], ['interesting', 'bet', 'pro', 'training'], ['unacceptable', 'context'], ['ever', 'cookie'], ['long', 'feeling', 'wrong'], ['even', 'botw', 'lmao', 'recommendation', 'terrible'], ['never', 'reason', 'use'], ['work', 'holy', 'one'], ['first', 'half'], ['odst', 'proper', 'multiplayer', 'right', 'firefight'], ['star', 'rating'], ['quick', 'game'], ['appreciate'], ['suspect', 'theory', 'already', 'phase'], ['quality', 'game'], ['good', 'world'], ['thing', 'ever'], ['sun', 'radiation', 'outside']]


In [38]:
id2word = corpora.Dictionary(data_lemmatized)
texts = data_lemmatized
corpus_new = [id2word.doc2bow(text) for text in texts] # BOW

In [39]:
lda_3 = models.LdaModel(corpus=corpus_new, num_topics=3, id2word=id2word, passes=10)
lda_3.print_topics()

[(0,
  '0.016*"game" + 0.014*"people" + 0.011*"year" + 0.011*"even" + 0.009*"also" + 0.008*"system" + 0.007*"good" + 0.007*"player" + 0.006*"money" + 0.006*"alignment"'),
 (1,
  '0.017*"well" + 0.015*"place" + 0.013*"reddit" + 0.013*"way" + 0.011*"question" + 0.010*"share" + 0.010*"comment" + 0.010*"rmusic" + 0.009*"day" + 0.009*"new"'),
 (2,
  '0.025*"time" + 0.013*"good" + 0.012*"thing" + 0.010*"really" + 0.008*"day" + 0.008*"back" + 0.008*"still" + 0.008*"much" + 0.008*"well" + 0.008*"man"')]

In [40]:
lda_4 = models.LdaModel(corpus=corpus_new, num_topics=4, id2word=id2word, passes=10)
lda_4.print_topics()

[(0,
  '0.020*"people" + 0.011*"good" + 0.011*"actually" + 0.010*"man" + 0.010*"also" + 0.010*"time" + 0.009*"way" + 0.008*"child" + 0.008*"alignment" + 0.008*"much"'),
 (1,
  '0.018*"place" + 0.014*"game" + 0.014*"year" + 0.012*"bad" + 0.012*"people" + 0.012*"thing" + 0.010*"even" + 0.010*"never" + 0.010*"day" + 0.009*"player"'),
 (2,
  '0.020*"still" + 0.018*"way" + 0.018*"well" + 0.014*"new" + 0.012*"share" + 0.012*"rmusic" + 0.011*"time" + 0.010*"music" + 0.008*"day" + 0.007*"feeling"'),
 (3,
  '0.014*"question" + 0.014*"post" + 0.014*"time" + 0.013*"thing" + 0.013*"comment" + 0.012*"good" + 0.010*"issue" + 0.009*"reddit" + 0.009*"help" + 0.009*"action"')]

In [41]:
lda_5 = models.LdaModel(corpus=corpus_new, num_topics=5, id2word=id2word, passes=10)
lda_5.print_topics()

[(0,
  '0.020*"also" + 0.018*"time" + 0.015*"good" + 0.013*"character" + 0.011*"well" + 0.011*"much" + 0.010*"thing" + 0.009*"back" + 0.009*"problem" + 0.008*"really"'),
 (1,
  '0.020*"game" + 0.019*"thing" + 0.015*"even" + 0.014*"people" + 0.012*"share" + 0.011*"way" + 0.011*"good" + 0.011*"music" + 0.010*"really" + 0.008*"bad"'),
 (2,
  '0.023*"year" + 0.019*"people" + 0.017*"well" + 0.015*"community" + 0.014*"player" + 0.013*"time" + 0.013*"system" + 0.013*"alignment" + 0.011*"level" + 0.010*"bit"'),
 (3,
  '0.027*"day" + 0.013*"rmusic" + 0.012*"way" + 0.012*"new" + 0.012*"able" + 0.011*"time" + 0.010*"child" + 0.009*"kid" + 0.009*"help" + 0.009*"subreddit"'),
 (4,
  '0.027*"place" + 0.022*"reddit" + 0.020*"well" + 0.018*"question" + 0.018*"post" + 0.016*"man" + 0.012*"comment" + 0.012*"user" + 0.010*"rule" + 0.010*"also"')]

In [42]:
# Compute Coherence Score for the base model
coherence_model_lda = CoherenceModel(model=lda_3, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score for 3 topics: ', coherence_lda)

coherence_model_lda = CoherenceModel(model=lda_4, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score for 4 topics: ', coherence_lda)

coherence_model_lda = CoherenceModel(model=lda_5, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score for 5 topics: ', coherence_lda)


Coherence Score for 3 topics:  0.6008988395515505

Coherence Score for 4 topics:  0.5848013109391921

Coherence Score for 5 topics:  0.5291158043887213


### Third Attempt - Topic Modeling (Trigram)

In [18]:
import pickle

file_name = "trigrams"

open_file = open(file_name, "rb")
data_trigram = pickle.load(open_file)
open_file.close()

In [19]:
data_trigram[:5]

[['okay',
  'tone',
  'volume',
  'accent',
  'vocabulary',
  'emotions',
  'physical',
  'movement'],
 ['gtyou', 'made', 'snort', 'snorting', 'crack', 'class'],
 ['ah', 'peasant', 'spirit', 'alive', 'well', 'see'],
 ['interesting', 'bet', 'pros', 'would', 'love', 'training'],
 ['think', 'unacceptable', 'every', 'context']]

In [43]:
data_lemmatized = lemmatization(data_trigram, allowed_postags=['NOUN', 'ADJ', 'VB', 'ADV'])
print("NAVA:", data_lemmatized[:20])

NAVA: [['tone', 'volume', 'vocabulary', 'emotion', 'physical', 'movement'], ['snort', 'crack', 'class'], ['peasant', 'spirit', 'alive', 'well'], ['interesting', 'bet', 'pro', 'training'], ['unacceptable', 'context'], ['ever', 'cookie'], ['long', 'feeling', 'wrong'], ['even', 'botw', 'lmao', 'recommendation', 'terrible'], ['never', 'reason', 'use'], ['work', 'holy', 'one'], ['first', 'half'], ['odst', 'proper', 'multiplayer', 'right', 'firefight'], ['star', 'rating'], ['quick', 'game'], ['appreciate'], ['suspect', 'theory', 'already', 'phase'], ['quality', 'game'], ['good', 'world'], ['thing', 'ever'], ['sun', 'radiation', 'outside']]


In [44]:
id2word = corpora.Dictionary(data_lemmatized)
texts = data_lemmatized
corpus_new = [id2word.doc2bow(text) for text in texts] # BOW

In [45]:
lda_3 = models.LdaModel(corpus=corpus_new, num_topics=3, id2word=id2word, passes=10)
lda_3.print_topics()

[(0,
  '0.016*"time" + 0.016*"game" + 0.012*"even" + 0.009*"also" + 0.008*"well" + 0.007*"still" + 0.007*"year" + 0.007*"player" + 0.007*"good" + 0.007*"much"'),
 (1,
  '0.019*"people" + 0.014*"good" + 0.013*"thing" + 0.009*"character" + 0.009*"bad" + 0.008*"really" + 0.008*"life" + 0.007*"friend" + 0.007*"never" + 0.007*"kid"'),
 (2,
  '0.016*"place" + 0.016*"well" + 0.012*"new" + 0.012*"time" + 0.011*"reddit" + 0.011*"question" + 0.011*"post" + 0.010*"day" + 0.010*"comment" + 0.009*"action"')]

In [46]:
lda_4 = models.LdaModel(corpus=corpus_new, num_topics=4, id2word=id2word, passes=10)
lda_4.print_topics()

[(0,
  '0.027*"good" + 0.016*"thing" + 0.015*"new" + 0.013*"character" + 0.012*"question" + 0.012*"post" + 0.012*"part" + 0.011*"time" + 0.010*"action" + 0.009*"community"'),
 (1,
  '0.013*"man" + 0.013*"year" + 0.012*"time" + 0.009*"maybe" + 0.008*"still" + 0.008*"big" + 0.008*"guy" + 0.008*"woman" + 0.007*"month" + 0.006*"mind"'),
 (2,
  '0.019*"people" + 0.018*"way" + 0.014*"game" + 0.013*"even" + 0.013*"also" + 0.012*"never" + 0.011*"time" + 0.010*"thing" + 0.009*"friend" + 0.009*"bad"'),
 (3,
  '0.019*"place" + 0.016*"well" + 0.016*"day" + 0.014*"reddit" + 0.011*"able" + 0.011*"music" + 0.010*"rmusic" + 0.008*"first" + 0.007*"submit" + 0.007*"even"')]

In [47]:
lda_5 = models.LdaModel(corpus=corpus_new, num_topics=5, id2word=id2word, passes=10)
lda_5.print_topics()

[(0,
  '0.030*"time" + 0.023*"year" + 0.014*"reason" + 0.010*"bit" + 0.010*"person" + 0.009*"positive" + 0.008*"high" + 0.007*"rule" + 0.007*"relationship" + 0.007*"album"'),
 (1,
  '0.027*"game" + 0.018*"day" + 0.015*"time" + 0.013*"people" + 0.012*"system" + 0.010*"level" + 0.009*"different" + 0.009*"player" + 0.008*"even" + 0.008*"user"'),
 (2,
  '0.017*"reddit" + 0.016*"question" + 0.016*"place" + 0.016*"post" + 0.013*"new" + 0.013*"able" + 0.012*"kid" + 0.012*"action" + 0.012*"comment" + 0.012*"child"'),
 (3,
  '0.028*"people" + 0.015*"well" + 0.014*"even" + 0.014*"way" + 0.011*"many" + 0.011*"music" + 0.011*"community" + 0.009*"alignment" + 0.009*"also" + 0.009*"money"'),
 (4,
  '0.029*"good" + 0.018*"thing" + 0.017*"right" + 0.014*"character" + 0.012*"man" + 0.012*"really" + 0.011*"much" + 0.009*"world" + 0.009*"never" + 0.009*"also"')]

In [48]:
# Compute Coherence Score for the base model
coherence_model_lda = CoherenceModel(model=lda_3, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score for 3 topics: ', coherence_lda)

coherence_model_lda = CoherenceModel(model=lda_4, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score for 4 topics: ', coherence_lda)

coherence_model_lda = CoherenceModel(model=lda_5, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score for 5 topics: ', coherence_lda)


Coherence Score for 3 topics:  0.633579925051611

Coherence Score for 4 topics:  0.6018338278500548

Coherence Score for 5 topics:  0.5534405169808639


### Insights: