# Import libraries


In [91]:
import re
import numpy as np
import pandas as pd
from pprint import pprint
from tqdm import tqdm

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import spacy
nlp = spacy.load("en_core_web_sm")

import pyLDAvis.gensim_models as gensimvis
import pickle 
import pyLDAvis
import os

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [93]:
df = pd.read_csv('./cleaned_df.csv')

In [94]:
df = df.dropna(subset=['Cleaned_Review'])

In [95]:
df.shape

(5210, 21)

# Topic modeling

In [57]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=False)) 

In [59]:
data = df.Cleaned_Review.values.tolist()
data_words = list(sent_to_words(data))

In [60]:
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [96]:
import gensim.corpora as corpora

tokenized_reviews = [nltk.word_tokenize(review) for review in df['Cleaned_Review']] 
id2word = corpora.Dictionary(tokenized_reviews)
texts = tokenized_reviews
corpus = [id2word.doc2bow(text) for text in texts]

In [76]:
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=5, 
                                       random_state=100,
                                       chunksize=100,
                                       passes=10,
                                       per_word_topics=True)

In [77]:
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.025*"kid" + 0.020*"team" + 0.019*"hotel" + 0.018*"food" + 0.018*"great" + '
  '0.014*"day" + 0.014*"entertainment" + 0.014*"staff" + 0.014*"pool" + '
  '0.013*"holiday"'),
 (1,
  '0.027*"hotel" + 0.020*"room" + 0.020*"pool" + 0.016*"bar" + 0.016*"food" + '
  '0.014*"good" + 0.012*"staff" + 0.012*"day" + 0.011*"beach" + '
  '0.010*"restaurant"'),
 (2,
  '0.034*"hotel" + 0.027*"room" + 0.011*"star" + 0.011*"bad" + '
  '0.010*"reception" + 0.009*"service" + 0.008*"stay" + 0.008*"staff" + '
  '0.008*"food" + 0.008*"restaurant"'),
 (3,
  '0.050*"hotel" + 0.026*"room" + 0.026*"staff" + 0.026*"great" + 0.022*"good" '
  '+ 0.021*"food" + 0.020*"clean" + 0.017*"stay" + 0.016*"nice" + '
  '0.013*"friendly"'),
 (4,
  '0.009*"hotel" + 0.007*"holiday" + 0.007*"staff" + 0.006*"leave" + '
  '0.005*"people" + 0.005*"day" + 0.005*"tell" + 0.005*"guest" + '
  '0.004*"review" + 0.004*"bar"')]


In [90]:
coherence_model_lda = CoherenceModel(model=lda_model, texts=tokenized_reviews, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)
print('\nPerplexity: ', lda_model.log_perplexity(doc_term_matrix,total_docs=10000))  # a measure of how good the model is. lower the better.


Perplexity:  -7.072912570224587
Coherence Score:  0.3409446993644004


In [79]:
def compute_coherence_values(corpus, dictionary, k, a, b):
    
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=a,
                                           eta=b)
    
    coherence_model_lda = CoherenceModel(model=lda_model, texts=tokenized_reviews, dictionary=id2word, coherence='c_v')
    
    return coherence_model_lda.get_coherence()

In [97]:
grid = {}
grid['Validation_Set'] = {}

min_topics = 1
max_topics = 5
step_size = 1
topics_range = range(min_topics, max_topics, step_size)

alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')

beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')

num_of_docs = len(corpus)
corpus_sets = [gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.75)), 
               corpus]

corpus_title = ['75% Corpus', '100% Corpus']

model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

if 1 == 1:
    pbar = tqdm.tqdm(total=(len(beta)*len(alpha)*len(topics_range)*len(corpus_title)))
    
    # iterate through validation corpuses
    for i in range(len(corpus_sets)):
        # iterate through number of topics
        for k in topics_range:
            # iterate through alpha values
            for a in alpha:
                # iterare through beta values
                for b in beta:
                    # get the coherence score for the given parameters
                    cv = compute_coherence_values(corpus=corpus_sets[i], dictionary=id2word, 
                                                  k=k, a=a, b=b)
                    # Save the model results
                    model_results['Validation_Set'].append(corpus_title[i])
                    model_results['Topics'].append(k)
                    model_results['Alpha'].append(a)
                    model_results['Beta'].append(b)
                    model_results['Coherence'].append(cv)
                    
                    pbar.update(1)
    pd.DataFrame(model_results).to_csv('./lda_tuning_results.csv', index=False)
    pbar.close()

100%|█████████████████████████████████████████| 120/120 [39:43<00:00, 19.87s/it]
100%|█████████████████████████████████████████| 240/240 [48:47<00:00, 12.20s/it]


In [98]:
pd.DataFrame(model_results)

Unnamed: 0,Validation_Set,Topics,Alpha,Beta,Coherence
0,75% Corpus,1,0.01,0.01,0.351423
1,75% Corpus,1,0.01,0.31,0.349819
2,75% Corpus,1,0.01,0.61,0.34766
3,75% Corpus,1,0.01,0.91,0.34766
4,75% Corpus,1,0.01,symmetric,0.349819
5,75% Corpus,1,0.31,0.01,0.349819
6,75% Corpus,1,0.31,0.31,0.34766
7,75% Corpus,1,0.31,0.61,0.349819
8,75% Corpus,1,0.31,0.91,0.34766
9,75% Corpus,1,0.31,symmetric,0.34766


In [82]:
pd.DataFrame(model_results).to_csv('./lda_tuning_results.csv', index=False)

In [99]:
num_topics=5
LDAvis_data_filepath = os.path.join('./ldavis_tuned'+str(num_topics))

if 1 == 1:
    LDAvis_prepared = gensimvis.prepare(lda_model, corpus, id2word)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)

with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)

pyLDAvis.save_html(LDAvis_prepared, './ldavis_tuned'+ str(num_topics) +'.html')

LDAvis_prepared

  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (


PreparedData(topic_coordinates=              x         y  topics  cluster       Freq
topic                                                
1      0.056545  0.006022       1        1  47.547192
3      0.178329  0.053056       2        1  17.135607
0      0.078746 -0.168189       3        1  15.046500
2     -0.068675  0.171150       4        1  14.371414
4     -0.244945 -0.062039       5        1   5.899287, topic_info=                Term          Freq         Total Category  logprob  loglift
27             great   3599.000000   3599.000000  Default  30.0000  30.0000
1517             kid   1906.000000   1906.000000  Default  29.0000  29.0000
28             hotel  10378.000000  10378.000000  Default  28.0000  28.0000
110             room   6398.000000   6398.000000  Default  27.0000  27.0000
206             team   1227.000000   1227.000000  Default  26.0000  26.0000
276    entertainment   1450.000000   1450.000000  Default  25.0000  25.0000
1559       animation    711.000000    711.00000