# Downloading the NLTK resources (stop words, Punkt tokenizer, WordNet)

In [None]:
import nltk

# Fetch the NLTK data packages this notebook relies on: the stop-word
# corpus, the Punkt tokenizer data and the WordNet corpus.
for resource_name in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(resource_name)

# Prepare vague words list

In [None]:
from nltk.corpus import wordnet

In [None]:
def get_synonyms(ofword):
    """Return the WordNet lemma names found in every synset of ``ofword``.

    Names are gathered synset by synset and are not de-duplicated, so the
    same name may occur several times. The list is empty when
    ``wordnet.synsets`` yields nothing for the word.
    """
    from nltk.corpus import wordnet

    return [
        lemma.name()
        for synset in wordnet.synsets(ofword)
        for lemma in synset.lemmas()
    ]

In [None]:
def get_antonyms(ofword):
    """Return antonym names for ``ofword`` taken from WordNet.

    For every lemma of every synset of the word that has at least one
    antonym, the name of its first antonym is collected. Duplicates are
    kept.
    """
    from nltk.corpus import wordnet

    collected = []
    for synset in wordnet.synsets(ofword):
        for lemma in synset.lemmas():
            opposites = lemma.antonyms()
            if opposites:
                # Only the first listed antonym of each lemma is used.
                collected.append(opposites[0].name())
    return collected

In [None]:
# Quick manual check of the WordNet lookup on a sample word.
synonyms = get_synonyms("active")
print(synonyms)

In [None]:
# Load the seed vague words, one per line.
with open('vagueWords.txt') as vague_word_list_file:
    vague_word_list = vague_word_list_file.read().splitlines()

# Expand every seed word with its lower-cased WordNet synonyms and antonyms.
vague_word_set = set()
for vagueword in vague_word_list:
    related_words = get_synonyms(vagueword) + get_antonyms(vagueword)
    vague_word_set.update(related.lower() for related in related_words)

# Drop words that should not be treated as vague.
remove_word_list = ['adept']
vague_word_set.difference_update(remove_word_list)

# Add words verbatim, without expanding them through WordNet.
addtional_word_list = ['didnt', 'doesnt']
vague_word_set.update(addtional_word_list)

# From here on both names refer to sorted lists (vague_word_set is no
# longer a set after this line).
vague_word_set = sorted(vague_word_set)
vague_word_list = list(vague_word_set)

print("No. of Vague words", len(vague_word_list))

# Importing the libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd 
import json
import math
import re # We clean text using regex
import csv # To read the csv
from collections import defaultdict # For accumlating values
from nltk.corpus import stopwords # To remove stopwords
from gensim import corpora # To create corpus and dictionary for the LDA model
from gensim.models import LdaModel # To use the LDA model
import pyLDAvis.gensim # To visualise LDA model effectively
import matplotlib.colors as mcolors
from collections import Counter

# Start with loading all necessary libraries
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# % matplotlib inline

In [None]:
import warnings

# Silence all warnings, then register an explicit "ignore" filter for
# DeprecationWarning as well (inserted in front of the catch-all one).
warnings.simplefilter('ignore')
warnings.simplefilter('ignore', category=DeprecationWarning)

# Importing the reviews

In [None]:
# reviews.json holds one JSON document per line; keep only the review text
# stored under ["_source"]["review"].
with open('reviews.json') as file:
    reviews = [json.loads(line)["_source"]["review"] for line in file]
print("Total No. of reviews =", len(reviews))

# Small part of the reviews for fast testing

In [None]:
#reviews = reviews[:1405] 

# Reviews Statistics

In [None]:
# Length of every review as reported by len().
review_lengths = [len(review) for review in reviews]
total_length = sum(review_lengths)

if review_lengths:
    min_length = min(review_lengths)
    max_length = max(review_lengths)
    # Average is rounded up to the next whole number.
    average_review_length = math.ceil(total_length / len(review_lengths))
else:
    # No reviews loaded: report zeros instead of failing on reviews[0]
    # or dividing by zero.
    min_length = max_length = average_review_length = 0

print(
    f"Total Reviews: {len(reviews)}, "
    f"Average Review Length:{average_review_length}, "
    f"Minimum Length: {min_length}, "
    f"Maximum Length: {max_length}"
)

# Tokenize sentences

In [None]:
from nltk.tokenize import sent_tokenize

In [None]:
# Split every review into sentences, lower-case them and drop duplicates
# while keeping first-seen order. Building the list straight from a set made
# its order depend on string hashing, which changes between interpreter runs
# unless PYTHONHASHSEED is fixed, so the document order fed to the topic
# model (and therefore its output) was not reproducible.
unique_sentences = dict.fromkeys(
    sentence.lower()
    for review in reviews
    for sentence in sent_tokenize(review)
)
sentences = list(unique_sentences)
# Kept so code that still refers to the set keeps working.
sentences_set = set(sentences)

#  Cleaning up the data

In [None]:
def get_stopwords():
    """Build the stop-word set used to filter review tokens.

    The set combines NLTK's English stop words, the lower-cased lines of
    ``stopwords.txt`` and a hard-coded list of extra tokens. The NLTK
    'stopwords' resource is (re-)downloaded on every call.
    """
    nltk.download('stopwords')
    from nltk.corpus import stopwords as nltk_stopwords

    stop_word_set = set(nltk_stopwords.words('english'))

    # One stop word per line in the project-specific file.
    with open('stopwords.txt') as stop_word_list_file:
        file_words = stop_word_list_file.read().splitlines()
    stop_word_set.update(entry.lower() for entry in file_words)

    additional_words = ["hi", "ok", "am", "would", "i'm","im","ill","cant","else","youd","otherwise","due"
    ,"youre","ive","havent","hasnt","hadnt","didnt","could","doesnt","may","wouldnt","dont","cant","could"
    ,"every","anyone","say","isnt","arent","also","cannot","itll","lets","youll","aspacingtopmini","hello"
    ,"theres","itthe","shes","hes","another","etc"]
    stop_word_set.update(additional_words)

    return stop_word_set

In [None]:
# From here on "reviews" holds the de-duplicated sentences with every
# character that is neither a word character nor whitespace removed.
reviews = [re.sub(r'[^\w\s]', '', str(sentence)) for sentence in sentences]

# NOTE: this rebinds the module-level name "stopwords" to the set returned
# by get_stopwords().
stopwords = get_stopwords()

# Tokenise on whitespace and drop stop words.
tokenized_documents = []
for document in reviews:
    kept_words = [word for word in document.lower().split() if word not in stopwords]
    tokenized_documents.append(kept_words)

# Count how often each token occurs across all documents.
frequency = defaultdict(int)
for document_tokens in tokenized_documents:
    for token in document_tokens:
        frequency[token] += 1

# Keep only tokens that occur more than once overall.
texts = [
    [token for token in document_tokens if frequency[token] > 1]
    for document_tokens in tokenized_documents
]


# Data Processing

In [None]:
# Build the gensim dictionary from the token lists, then convert every
# document to its bag-of-words representation.
dictionary = corpora.Dictionary(texts)
corpus = list(map(dictionary.doc2bow, texts))

# LDA

In [None]:
# Train the LDA model with a fixed seed, then pull the 20 highest-weighted
# words of every topic as (word, probability) pairs (formatted=False).
NUM_TOPICS = 12
lda_settings = dict(
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    random_state=10,
    passes=15,
    alpha=0.01,
    eta=0.001,
)
ldamodel = LdaModel(corpus, **lda_settings)
topics = ldamodel.show_topics(
    num_topics=NUM_TOPICS,
    num_words=20,
    formatted=False,
)

# Ranking

In [None]:
def sum_of_probability_of_words_in_topic(topic_index, topics):
    """Add up the probabilities of all words listed for one topic.

    ``topics[topic_index][1]`` is iterated and element ``[1]`` of every
    entry is accumulated, starting from 0.
    """
    total = 0
    for entry in topics[topic_index][1]:
        total += entry[1]
    return total

def sum_of_probability_of_vague_words_in_topic(vague_word_list, topic_index, topics):
    """Add up the probabilities of a topic's words that are vague.

    An entry of ``topics[topic_index][1]`` contributes its element ``[1]``
    only when its element ``[0]`` is contained in ``vague_word_list``.
    The sum starts from 0.
    """
    total = 0
    for entry in topics[topic_index][1]:
        if entry[0] not in vague_word_list:
            continue
        total += entry[1]
    return total

def vagueness_degree(vague_word_list):
    """Print one summary line per topic with its share of vague words.

    Reads the module-level ``topics``. For every topic the line shows the
    summed probability of vague words, the summed probability of all
    listed words and the resulting percentage; lines whose percentage is
    at least 10 get a trailing "vague" marker.
    """
    for topic_index in range(len(topics)):
        total_probability = sum_of_probability_of_words_in_topic(topic_index, topics)
        vague_probability = sum_of_probability_of_vague_words_in_topic(vague_word_list, topic_index, topics)
        percentage = round(vague_probability / total_probability * 100, 2)
        percentage_str = "percentage: " + str(percentage) + "%"

        columns = [
            "topic",
            f'{topic_index+1:<2}',
            f'{"vague: " + str(round(vague_probability, 4)):<15}',
            f'{"probability sum: " + str(round(total_probability, 4)):<25}',
            f'{percentage_str:<20}',
        ]
        if percentage >= 10.00:
            columns.append("vague")
        print(*columns)


vagueness_degree(vague_word_list)


In [None]:
def print_vague_topic_words(vague_word_list):
    """Print the words and probabilities of every topic rated as vague.

    Reads the module-level ``topics``. A topic is listed when its
    vague-word share, rounded to two decimals, is at least 10 percent.
    """
    for topic_index, topic in enumerate(topics):
        total_probability = sum_of_probability_of_words_in_topic(topic_index, topics)
        vague_probability = sum_of_probability_of_vague_words_in_topic(vague_word_list, topic_index, topics)
        percentage = round(vague_probability / total_probability * 10000) / 100
        if not percentage >= 10.00:
            continue
        print("\ntopic", topic_index + 1)
        for entry in topic[1]:
            print("             " + f'{entry[0]:<14}', entry[1])


print_vague_topic_words(vague_word_list)

In [None]:
def plot_topic_percentage(topics, vague_words=None):
    """Draw a bar chart of the vague-word percentage of every topic.

    Parameters
    ----------
    topics : sequence
        Topics in the shape consumed by the ``sum_of_probability_*``
        helpers: ``topics[i][1]`` is iterable and each entry holds the word
        at ``[0]`` and its probability at ``[1]``.
    vague_words : collection of str, optional
        Words counted as vague. Defaults to the module-level
        ``vague_word_list``, which is what the function always used before.
    """
    if vague_words is None:
        vague_words = vague_word_list

    topic_numbers = []
    percentages = []
    for topic_index in range(len(topics)):
        total_probability = sum_of_probability_of_words_in_topic(topic_index, topics)
        vague_probability = sum_of_probability_of_vague_words_in_topic(vague_words, topic_index, topics)
        topic_numbers.append(topic_index + 1)
        percentages.append(round(vague_probability / total_probability * 100, 2))

    # Percentage axis labelled from 0 to 100 inclusive (the upper bound was
    # previously cut off at 90).
    plt.yticks(np.arange(0, 101, 10))
    # Tick positions follow the topics actually passed in rather than the
    # global NUM_TOPICS, so the axis always matches the bars.
    plt.xticks(topic_numbers)
    plt.bar(topic_numbers, percentages)
    plt.ylabel('Vagueness Percentage')
    plt.xlabel('Topics')
    plt.show()


plot_topic_percentage(topics)

# Word cloud

In [None]:
import matplotlib.pyplot as plt

# One figure per topic: a word cloud built from the topic's top 200
# (word, weight) pairs.
for t in range(ldamodel.num_topics):
    word_weights = dict(ldamodel.show_topic(t, 200))
    cloud = WordCloud(max_font_size=50, max_words=100, background_color="white")

    plt.figure()
    plt.imshow(cloud.fit_words(word_weights), interpolation="bilinear")
    plt.axis("off")
    plt.title("Topic #" + str(t+1))
    plt.show()


# Bar plot of the top 20 word weights in each topic

In [None]:
from matplotlib.pyplot import figure


def plot_words_in_topic(topics):
    """Show one bar chart per topic with the probability of each listed word.

    Every entry of ``topics[i][1]`` supplies the bar label (element ``[0]``)
    and the bar height (element ``[1]``). Two blank lines are printed after
    each chart.
    """
    for topic_number, topic in enumerate(topics, start=1):
        figure(figsize=(20, 6), dpi=80)

        x_labels = [entry[0] for entry in topic[1]]
        y_values = [entry[1] for entry in topic[1]]
        # Bars sit at positions 1..N, labelled with the words.
        x_ticks = list(range(1, len(x_labels) + 1))

        plt.xticks(x_ticks, x_labels)
        plt.bar(x_ticks, y_values, width=.2)
        plt.ylabel('Word probabilty')
        plt.xlabel('Topic #' + str(topic_number))
        plt.show()
        print()
        print()


plot_words_in_topic(topics)

# Top 20 Words in each topic 

In [None]:
# One column per topic ("Topic # 01", "Topic # 02", ...) holding the words
# of its 20 highest-weighted entries.
word_dict = {
    'Topic # ' + '{:02d}'.format(topic_index + 1): [
        pair[0] for pair in ldamodel.show_topic(topic_index, topn=20)
    ]
    for topic_index in range(NUM_TOPICS)
}
pd.DataFrame(word_dict)

# Visualization

In [None]:
# Prepare the pyLDAvis data for the trained model, its corpus and dictionary.
# NOTE(review): sort_topics=False presumably keeps pyLDAvis's topic numbering
# aligned with the model's own topic order — confirm against the pyLDAvis docs.
lda_display = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary, sort_topics=False)
# Last expression of the cell, so the notebook shows the visualisation.
pyLDAvis.display(lda_display)

# Hyperparameters Tuning 

## Optimum number of topics

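Computing coherence with LDA Mallet (Mallet must already be downloaded; the code below expects it at `/static/mallet-2.0.8/bin/mallet`)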

In [None]:
import gensim
import gensim.models
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=1,
                             mallet_path='/static/mallet-2.0.8/bin/mallet',
                             iterations=1):
    """
    Compute c_v coherence for various number of topics

    One LdaMallet model is trained for every topic count in
    ``range(start, limit, step)`` and scored with a c_v CoherenceModel.

    Parameters:
    ----------
    dictionary : Gensim dictionary; used both as the model's id2word mapping
        and for the coherence computation
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics (exclusive, as in ``range``)
    start : First number of topics to try
    step : Increment between successive numbers of topics
    mallet_path : Path to the Mallet executable (default is the path used on
        Google Colab)
    iterations : Number of Mallet sampling iterations per model. The default
        of 1 keeps the previous behaviour.
        NOTE(review): 1 looks like a quick-test setting - confirm.

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics

    NOTE(review): gensim.models.wrappers is, as far as known, only available
    in gensim 3.x - confirm against the installed version.
    """
    coherence_values = []
    model_list = []
    print('before running ', start, ' ', limit, ' ', step)
    for num_topics in range(start, limit, step):
        print('running')
        # Use the dictionary passed in rather than the global ldamodel's
        # id2word, so the function works with whatever the caller supplies.
        model = gensim.models.wrappers.LdaMallet(
            mallet_path,
            corpus=corpus,
            num_topics=num_topics,
            random_seed=10,
            iterations=iterations,
            id2word=dictionary,
        )
        model_list.append(model)
        coherencemodel = gensim.models.CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [None]:
# Sweep the topic counts 9 to 19 (range() excludes the limit).
start, limit, step = 9, 20, 1
model_list, coherence_values = compute_coherence_values(
    dictionary=dictionary,
    corpus=corpus,
    texts=texts,
    start=start,
    limit=limit,
    step=step,
)

In [None]:
# Bare expression: the notebook shows the list of coherence scores as the cell output.
coherence_values

In [None]:
# Plot the coherence score against the number of topics tried.
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
# legend() expects a sequence of labels. ("coherence_values") is just a
# string, which would be consumed one character per label, so the label is
# passed as a one-element list.
plt.legend(["coherence_values"], loc='best')
plt.show()

In [None]:
# Print every coherence score and pick the topic count to use. A candidate
# replaces the current best only when its coherence exceeds the best so far
# by more than 0.005.
best_topic_no = 0
best_topic_cv = 0

for topic_count, coherence in zip(x, coherence_values):
    print("Num Topics =", topic_count, " has Coherence Value of", round(coherence, 4))
    if coherence > best_topic_cv + 0.005:
        best_topic_cv, best_topic_no = coherence, topic_count
print('Best coherence topic ', best_topic_no)

## Optimum values of Alpha and Eta

In [None]:
# Supporting function for the alpha / eta search.
def compute_coherence_values(corpus, dictionary, k, a, b):
    """Train one LdaMulticore model and return its c_v coherence.

    ``k`` is the number of topics, ``a`` the alpha value and ``b`` the eta
    value. The coherence is computed against the module-level ``texts``.
    Defining this function rebinds the name ``compute_coherence_values``
    used earlier in the file for the Mallet sweep. Progress lines are
    printed before each step.
    """
    print("lda_model = gensim.models.LdaMulticore(corpus=corpus,")
    lda_settings = dict(
        corpus=corpus,
        id2word=dictionary,
        workers=4,
        num_topics=k,
        random_state=10,
        passes=10,
        alpha=a,
        eta=b,
    )
    lda_model = gensim.models.LdaMulticore(**lda_settings)

    print("coherence_model_lda = gensim.models.CoherenceModel(model=lda_model, texts=texts, dictionary=ldamodel.id2word, coherence='c_v')")
    coherence_model_lda = gensim.models.CoherenceModel(
        model=lda_model,
        texts=texts,
        dictionary=dictionary,
        coherence='c_v',
    )

    print("return coherence_model_lda.get_coherence()")
    return coherence_model_lda.get_coherence()

In [None]:
import numpy as np

grid = {}
grid['Validation_Set'] = {}

# Topics range: only the topic count chosen from the coherence sweep.
min_topics = best_topic_no
max_topics = min_topics + 1
step_size = 1
topics_range = range(min_topics, max_topics, step_size)

# Alpha parameter: np.arange(0.01, 1, 0.5) yields 0.01 and 0.51.
alpha = list(np.arange(0.01, 1, 0.5))
#alpha.append('symmetric')
#alpha.append('asymmetric')

# Beta parameter
beta = list(np.arange(0.01, 1, 0.5))
#beta.append('symmetric')

# Validation sets. Only the full corpus is evaluated. corpus_sets and
# corpus_title are parallel lists: the title list used to carry an extra
# '75% Corpus' entry, which labelled the full-corpus runs as '75% Corpus'.
# To add a partial corpus, append it and its title together, e.g.
# gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.75)) / '75% Corpus'.
num_of_docs = len(corpus)
corpus_sets = [corpus]
corpus_title = ['100% Corpus']

if len(corpus_sets) != len(corpus_title):
    raise ValueError("corpus_sets and corpus_title must have the same length")

model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

# Can take a long time to run: one model is trained per
# (corpus, topic count, alpha, beta) combination.
for i, corpus_set in enumerate(corpus_sets):
    print("i " + str(i))
    # iterate through number of topics
    for k in topics_range:
        print("k "  + str(k))
        # iterate through alpha values
        for a in alpha:
            print("a " + str(a))
            # iterate through beta values
            for b in beta:
                print("b " + str(b))
                print("# get the coherence score for the given parameters")
                cv = compute_coherence_values(corpus=corpus_set, dictionary=dictionary, k=k, a=a, b=b)
                print("# Save the model results")
                model_results['Validation_Set'].append(corpus_title[i])
                model_results['Topics'].append(k)
                model_results['Alpha'].append(a)
                model_results['Beta'].append(b)
                model_results['Coherence'].append(cv)

# Persist every run so the search does not have to be repeated.
pd.DataFrame(model_results).to_csv('lda_tuning_results.csv', index=False)

In [None]:
# Let the notebook show up to 400 rows, then rank the runs by coherence,
# best first, with a fresh 0-based index.
pd.set_option('display.max_rows', 400)
model_results_df = (
    pd.DataFrame(model_results)
    .sort_values(by='Coherence', ascending=False)
    .reset_index(drop=True)
)
model_results_df

In [None]:
# Row 0 is the best-scoring run because the frame is sorted by coherence.
print(
    "The recommended nubmer of topics is", model_results_df.at[0, 'Topics'],
    "with value of Alpha=", model_results_df.at[0, 'Alpha'],
    "and Eta=", model_results_df.at[0, 'Beta'],
)

In [None]:
!pip freeze > requirements.txt