In [7]:
# Required Imports
import re
import numpy as np
import pandas as pd
from pprint import pprint

In [8]:
# Gensim for text preprocessing and LDA
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

In [9]:
# spacy for lemmatization
import spacy

In [10]:
# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

In [11]:
# Suppress all the warnings
import warnings
warnings.filterwarnings("ignore")

In [72]:
import multiprocessing

cores = multiprocessing.cpu_count()

In [14]:
df = pd.read_csv('C:/Users/dkushwaha/Desktop/DK/Digital/NLP/Topic Modelling/LDA/Amazon_Product_Reviews/consumer-reviews-of-amazon-products/train.csv', encoding='latin1')

In [15]:
data = df['reviews.text'].tolist()

In [16]:
names = df['name']

In [17]:
names = names.dropna().tolist()

In [18]:
category_length = len(names)

In [19]:
category_length

27900

In [20]:
len(data)

34660

In [21]:
# Remove new line characters
data = [re.sub('\s+', ' ', str(sent)) for sent in data]

In [22]:
data[:1]

['This product so far has not disappointed. My children love to use it and I like the ability to monitor control what content they see with ease.']

In [23]:
# Remove distracting single quotes
data = [re.sub("\'", "", sent) for sent in data]

In [24]:
data[:1]

['This product so far has not disappointed. My children love to use it and I like the ability to monitor control what content they see with ease.']

In [25]:
# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
nlp = spacy.load('en_core_web_sm')#, disable=['parser', 'ner'])

In [26]:
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
stop_words = list(set(stop_words + list(spacy_stopwords)))
len(stop_words)

361

In [27]:
# function to convert each para into tokens after preprocessing
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

In [28]:
data_words = list(sent_to_words(data))

In [29]:
data_words[:1]

[['this',
  'product',
  'so',
  'far',
  'has',
  'not',
  'disappointed',
  'my',
  'children',
  'love',
  'to',
  'use',
  'it',
  'and',
  'like',
  'the',
  'ability',
  'to',
  'monitor',
  'control',
  'what',
  'content',
  'they',
  'see',
  'with',
  'ease']]

In [30]:
# Function to remove stopwords
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

In [31]:
len(data_words)

34660

In [32]:
#data_reduced = [[x for x in data_words_1] for data_words_1 in data_words if len(data_words_1) > 10]

In [33]:
i = 0
data_reduced = []
new_names = []
for i, data_words_1 in enumerate(data_words) :
    data_reduced_1 = []
    if len(data_words_1) > 10 :
        if i < category_length :
            new_names.append(names[i])
        for x in data_words_1 :
            data_reduced_1.append(x)

    if len(data_reduced_1) > 0 :
        data_reduced.append(data_reduced_1)

In [34]:
len(data_reduced)

30273

In [35]:
len(new_names)

24395

In [36]:
len(data_reduced)

30273

In [37]:
data_reduced_sent = [" ".join(y) for y in data_reduced]

In [38]:
len(data_reduced_sent)

30273

In [39]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_reduced)

In [40]:
data_words_nostops[1]

['great', 'beginner', 'experienced', 'person', 'bought', 'gift', 'loves']

In [41]:
# remove words that appear only once
from collections import defaultdict
frequency = defaultdict(int)
for text in data_words_nostops:
    for token in text:
        frequency[token] += 1

In [42]:
import operator
most_frq = sorted(frequency.items(),key = operator.itemgetter(1), reverse=True)
most_frq

[('great', 10330),
 ('tablet', 8345),
 ('use', 7316),
 ('love', 6054),
 ('easy', 5758),
 ('amazon', 5569),
 ('kindle', 5162),
 ('bought', 4844),
 ('fire', 4688),
 ('good', 4474),
 ('price', 3756),
 ('like', 3663),
 ('tv', 3176),
 ('product', 2971),
 ('echo', 2857),
 ('read', 2728),
 ('music', 2714),
 ('works', 2706),
 ('alexa', 2597),
 ('apps', 2554),
 ('kids', 2537),
 ('books', 2486),
 ('loves', 2451),
 ('reading', 2362),
 ('device', 2308),
 ('best', 2250),
 ('time', 2218),
 ('games', 2209),
 ('buy', 2204),
 ('screen', 2157),
 ('old', 2154),
 ('purchased', 2114),
 ('play', 2108),
 ('got', 2011),
 ('better', 1890),
 ('gift', 1842),
 ('set', 1707),
 ('recommend', 1656),
 ('year', 1602),
 ('new', 1552),
 ('home', 1533),
 ('dont', 1513),
 ('little', 1482),
 ('perfect', 1474),
 ('nice', 1448),
 ('light', 1447),
 ('prime', 1377),
 ('im', 1376),
 ('quality', 1334),
 ('need', 1311),
 ('lot', 1302),
 ('purchase', 1289),
 ('able', 1257),
 ('battery', 1246),
 ('want', 1234),
 ('christmas', 1229)

In [43]:
#most_frq_words = [row[0] for i, row in enumerate(most_frq)]

#for i in ['ax','edu','subject','com','lines',]:
    #most_frq_words.remove(i)
#most_frq_words
most_frq_words = ['great','use','love',
                  'easy','good','bought',
                  'like','loves','best',
                  'buy','got','better',
                  'gift','recommend','perfect',
                  'ease','tv','work',
                  'purchase','far', 'tap']

In [44]:
# Select words to add tot the stopwords list from the above analysis
stop_words.extend(most_frq_words)
len(stop_words)

382

In [45]:
data_words_nostops = remove_stopwords(data_words_nostops)

In [46]:
data_words_nostops[1]

['beginner', 'experienced', 'person']

In [47]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words_nostops, min_count=1, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words_nostops], threshold=1)

In [48]:
# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [49]:
# Function to make bigrams
def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

In [50]:
# Function to make trigrams
def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

In [51]:
from nltk.stem import WordNetLemmatizer, PorterStemmer
lemmatizer = WordNetLemmatizer()
ps = PorterStemmer()

In [52]:
def lemmatization_stemming(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([lemmatizer.lemmatize(token.lemma_) for token in doc if token.pos_ in allowed_postags and len(token) > 1])
    return texts_out

In [53]:
#Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

In [54]:
data_words_bigrams[1]

['beginner', 'experienced', 'person']

In [55]:
# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
data_words_bigrams[:1]

[['product',
  'disappointed',
  'children',
  'ability',
  'monitor',
  'control',
  'content']]

In [56]:
# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization_stemming(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

In [57]:
print(data_lemmatized[:1])

[['product', 'disappointed', 'child', 'ability', 'monitor', 'control', 'content']]


In [58]:
id2word = corpora.Dictionary(data_lemmatized)

In [59]:
import nltk
ADJ = []
for i in range(11917):
    if nltk.pos_tag([id2word[i]])[0][1] == 'JJ':
        #print()
        ADJ.append(id2word[i])
ADJ

['disappointed',
 'inexpensive',
 'able',
 'futuristic',
 'nice',
 'additional',
 'basic',
 'grand',
 'little',
 'big',
 'fantastic',
 'happy',
 'popular',
 'previous',
 'easy',
 'large',
 'clear',
 'graphic',
 'old',
 'expandable',
 'small',
 'important',
 'interactive',
 'surprised',
 'durable',
 'visual',
 'new',
 'sharp',
 'second',
 'solid',
 'high',
 'available',
 'difficult',
 'limited',
 'affordable',
 'comic',
 'overall',
 'inexspensive',
 'useful',
 'bad',
 'possible',
 'hard',
 'negative',
 'unusable',
 'electronic',
 'personal',
 'portable',
 'black',
 'regular',
 'white',
 'expensive',
 'knowledgeable',
 'low',
 'ready',
 'casual',
 'certain',
 'real',
 'daily',
 'safe',
 'complete',
 'rid',
 'unwanted',
 'special',
 'open',
 'satisfied',
 'automatic',
 'free',
 'worried',
 'unlimited',
 'common',
 'different',
 'extra',
 'normal',
 'unable',
 'outstanding',
 'generous',
 'educational',
 'jealous',
 'fourth',
 'single',
 'ergonomic',
 'intuitive',
 'rapid',
 'heavy',
 'ori

In [60]:
len(ADJ)

844

In [61]:
stop_words.extend(ADJ)

In [62]:
data_lemmatized = remove_stopwords(data_lemmatized)

In [63]:
data_lemmatized[1]

['beginner', 'experience', 'person']

In [64]:
id2word = corpora.Dictionary(data_lemmatized)

In [65]:
# Create Corpus
texts = data_lemmatized

In [66]:
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

In [67]:
# View
print(corpus[:1])
len(corpus)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1)]]


30273

In [68]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('ability', 1),
  ('child', 1),
  ('content', 1),
  ('control', 1),
  ('monitor', 1),
  ('product', 1)]]

In [75]:
# Build LDA model
# lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
#                                            id2word=id2word,
#                                            random_state=100,
#                                            num_topics=3,
#                                            passes=10,
#                                            per_word_topics=True)
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       random_state=100,
                                       num_topics=3,
                                       passes=10,
                                       per_word_topics=True,
                                       workers=cores)
# Build LDA model
#lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
#                                           id2word=id2word,
#                                           num_topics=4,
#                                           random_state=42,
#                                           update_every=1,
#                                           passes=20,
#                                           alpha='auto',
#                                           #eta='auto',
#                                           per_word_topics=True)

In [62]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics(num_words=15))
doc_lda = lda_model[corpus]
# Kindle 
# Echo
# Firestick/TV

[(0,
  '0.026*"echo" + 0.025*"device" + 0.024*"alexa" + 0.023*"music" + '
  '0.022*"amazon" + 0.018*"play" + 0.015*"prime" + 0.014*"movie" + '
  '0.012*"home" + 0.011*"product" + 0.011*"set" + 0.011*"thing" + '
  '0.011*"remote" + 0.010*"sound" + 0.010*"ask"'),
 (1,
  '0.064*"fire" + 0.028*"amazon" + 0.026*"app" + 0.026*"tablet" + 0.023*"box" '
  '+ 0.020*"stick" + 0.018*"price" + 0.016*"product" + 0.015*"cable" + '
  '0.014*"fast" + 0.013*"kid" + 0.010*"year" + 0.010*"watch" + 0.010*"want" + '
  '0.009*"game"'),
 (2,
  '0.064*"kindle" + 0.037*"read" + 0.025*"book" + 0.019*"screen" + '
  '0.016*"light" + 0.015*"cover" + 0.010*"time" + 0.010*"device" + '
  '0.009*"size" + 0.009*"paperwhite" + 0.009*"reader" + 0.008*"charge" + '
  '0.008*"case" + 0.008*"reading" + 0.007*"need"')]


In [63]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.


Perplexity:  -6.76268801494521


In [64]:
# Compute Coherence Score
##coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')

In [65]:
coherence_model_lda.get_coherence()

0.6572496293091735

In [66]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word,)
vis

In [87]:
# Finding the dominant topic in each sentence
def format_topics_sentences(ldamodel, corpus, texts, new_names):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel.get_document_topics(corpus, minimum_probability=0.0)):
        
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([pd.Series(new_names), sent_topics_df, contents], axis=1)
    return(sent_topics_df)

In [71]:
d
# Show
#sentences = df_dominant_topic['Keywords'].tolist()

In [72]:
# 1 model, Alpha, eta =  default , passes = 20
print(3250/5144)

# 2 model, Alpha, eta =  auto , passes = 10
print(2172/4036)

# 3rd model
print(3653/4108)

# 4th model
print(3301/4108)

# 5th model
print(3544/4108)

print(2990/3575)

0.6318040435458787
0.5381565906838454
0.8892405063291139
0.8035540408958131
0.8627069133398247
0.8363636363636363


In [76]:
df_test = pd.read_csv('C:/Users/dkushwaha/Desktop/DK/Digital/NLP/Topic Modelling/LDA/consumer-reviews-of-amazon-products/Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products.csv', encoding='latin1')
#sentence = ["The Iris dataset was used in R.A. Fisher's classic 1936 paper, The Use of Multiple Measurements in Taxonomic Problems, and can also be found on the UCI Machine Learning Repository.It includes three iris species with 50 samples each as well as some properties about each flower. One flower species is linearly separable from the other two, but the other two are not linearly separable from each other.The columns in this dataset are "]
#sentence = ["Whilst using an graft to tidy up a excavation an LV cable was struck. The cable had been picked up by the cable avoidance tool but the operator had failed to see the cable in the excavation as he used the graft. Site mark out was good and the use of the cable avoidance tool was excellent."]

In [80]:
sentence_test = df_test['reviews.text'].tolist()
#Cleaning
sentence_test = [re.sub('\s+', ' ', sent) for sent in sentence_test]
sentence_test = [re.sub("\'", "", sent) for sent in sentence_test]
sent_words_test = list(sent_to_words(sentence_test))
sent_words_nostops_test = remove_stopwords(sent_words_test)
sent_words_bigrams_test = make_bigrams(sent_words_nostops_test)
sent_lemmatized_test = lemmatization_stemming(sent_words_bigrams_test, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
id2word_test = corpora.Dictionary(sent_lemmatized_test)
ADJ_test = []
for i in range(3966):
    if nltk.pos_tag([id2word_test[i]])[0][1] == 'JJ':
        ADJ_test.append(id2word_test[i])
#stop_words.extend(ADJ_test)
sent_lemmatized_test = remove_stopwords(sent_lemmatized_test)
id2word_test = corpora.Dictionary(sent_lemmatized_test)
corpus_test = [id2word.doc2bow(text) for text in sent_lemmatized_test]
ls = lda_model.get_document_topics(corpus,minimum_probability=0.00)

In [85]:
data_reduced_sent_test = [" ".join(y) for y in sent_words_test]
new_names_test = df_test['name']

In [88]:
df_topic_sents_keywords = format_topics_sentences(lda_model, corpus_test, data_reduced_sent_test, new_names_test)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No','Names','Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

df_dominant_topic.to_csv('C:/Users/dkushwaha/Desktop/DK/Digital/NLP/Topic Modelling/LDA/consumer-reviews-of-amazon-products/Final_results_test.csv')

In [89]:
# Accuracy over test data
(355/423)

0.8392434988179669

In [None]:
# find the optimal number of topics for LDA
def compute_coherence_values(dictionary, corpus, texts, start=1, limit = 8, step=1):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=id2word,
                                                num_topics=num_topics,
                                                random_state=None,
                                                update_every=1,
                                                chunksize=3466,
                                                passes=20,
                                                alpha='auto',
                                                eta='auto',
                                                per_word_topics=True)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [None]:
# create multiple models with variation in number of topics
# can take a long time
model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus,texts=data_lemmatized, start=1, limit=6, step=1)

In [None]:
# Show graph
print(coherence_values)
limit = 6; start=1; step=1
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence Score")
plt.legend(("coherence_values"), loc='best')
plt.show()

In [None]:
# Print the coherence scores
for m, cv in zip(x, coherence_values):
      print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

In [None]:
# Select the model and print the topics
optimal_model = model_list[2]
model_topics = optimal_model.show_topics(formatted=False)
pprint(optimal_model.print_topics(num_words=10))
# 1. Fire tablet Tablet
# 2. Kindle
# 3. Alexa
# 4. Fire Tv/ Stick

In [None]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(optimal_model, corpus, id2word)
vis

In [None]:
# Finding the dominant topic in each sentence
def format_topics_sentences(ldamodel, corpus=corpus, texts=data):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel.get_document_topics(corpus, minimum_probability=0.0)):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)

In [None]:
df_topic_sents_keywords = format_topics_sentences(ldamodel=optimal_model, corpus=corpus, texts=data)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
sentences = df_dominant_topic['Keywords'].tolist()

In [None]:
df_dominant_topic.head(10)
to_csv('C:/Users/dkushwaha/Desktop/DK/Digital/NLP/Topic Modelling/LDA/consumer-reviews-of-amazon-products/Final_results_train.csv')

In [None]:
df = pd.read_csv('C:/Users/dkushwaha/Desktop/DK/Digital/NLP/Topic Modelling/LDA/consumer-reviews-of-amazon-products/Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products.csv', encoding='latin1')
#sentence = ["The Iris dataset was used in R.A. Fisher's classic 1936 paper, The Use of Multiple Measurements in Taxonomic Problems, and can also be found on the UCI Machine Learning Repository.It includes three iris species with 50 samples each as well as some properties about each flower. One flower species is linearly separable from the other two, but the other two are not linearly separable from each other.The columns in this dataset are "]
#sentence = ["Whilst using an graft to tidy up a excavation an LV cable was struck. The cable had been picked up by the cable avoidance tool but the operator had failed to see the cable in the excavation as he used the graft. Site mark out was good and the use of the cable avoidance tool was excellent."]
sentence = df['reviews.text'].tolist()
#Cleaning
sentence = [re.sub('\s+', ' ', sent) for sent in sentence]
sentence = [re.sub("\'", "", sent) for sent in sentence]
sent_words = list(sent_to_words(sentence))
sent_words_nostops = remove_stopwords(sent_words)
sent_words_bigrams = make_bigrams(sent_words_nostops)
sent_lemmatized = lemmatization(sent_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

id2word = corpora.Dictionary(sent_lemmatized)
corpus = [id2word.doc2bow(text) for text in sent_lemmatized]
ls = optimal_model.get_document_topics(corpus,minimum_probability=0.00)
ls[0]

In [None]:
df_topic_sents_keywords = format_topics_sentences(ldamodel=optimal_model, corpus=corpus, texts=sentence)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
#sentences = df_dominant_topic['Keywords'].tolist()

In [None]:
df_dominant_topic.to_csv('C:/Users/dkushwaha/Desktop/DK/Digital/NLP/Topic Modelling/LDA/consumer-reviews-of-amazon-products/Final_results.csv')

In [None]:
# Implementation using Mallet

In [None]:
#Building LDA Mallet Model
import os
from gensim.models.wrappers import LdaMallet

In [None]:
os.environ['MALLET_HOME'] = 'C:\\new_mallet\\mallet-2.0.8\\'

In [None]:
mallet_path = 'C:\\new_mallet\\mallet-2.0.8\\bin\\mallet'
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=20, id2word=id2word)

In [None]:
# Show Topics
pprint(ldamallet.show_topics(formatted=False))

In [None]:
coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)

In [None]:
# find the optimal number of topics for LDA?
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=num_topics, id2word=id2word)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [None]:
# create multiple models with variation in number of topics
# can take a long time
model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus,texts=data_lemmatized, start=5, limit=20, step=5)

In [None]:
# Show graph
limit = 20; start=5; step=5
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence Score")
plt.legend(("coherence_values"), loc='best')
plt.show()

In [None]:
# Print the coherence scores
for m, cv in zip(x, coherence_values):
      print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

In [None]:
# Select the model and print the topics
optimal_model = model_list[2]
model_topics = optimal_model.show_topics(formatted=False)
pprint(optimal_model.print_topics(num_words=10))

In [None]:
conv_model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(optimal_model)

In [None]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(conv_model, corpus, id2word)
vis

In [None]:
# Finding the dominant topic in each sentence
def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        print(row)
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)

In [None]:
df_topic_sents_keywords = format_topics_sentences(ldamodel=optimal_model, corpus=corpus, texts=data)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
sentences = df_dominant_topic['Keywords'].tolist()

In [None]:
type(optimal_model)

In [None]:
# print dominant topics
df_dominant_topic

In [None]:
from gensim.models import Word2Vec

#model = Word2Vec(sentences, min_count=1)
print(df_dominant_topic['Keywords'].tolist())

In [None]:
df_dominant_topic

In [None]:
# Find the most representative document for each topic
# Group top 5 sentences under each topic
sent_topics_sorteddf_mallet = pd.DataFrame()

sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

In [None]:
for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet, 
                                             grp.sort_values(['Perc_Contribution'], ascending=[0]).head(1)], 
                                            axis=0)

# Reset Index    
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)

# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]

# Show
sent_topics_sorteddf_mallet

In [None]:
# Topic distribution across documents

In [None]:
# Number of Documents for Each Topic
topic_counts = df_topic_sents_keywords['Dominant_Topic'].value_counts()

In [None]:
topic_counts

In [None]:
# Percentage of Documents for Each Topic
topic_contribution = round(topic_counts/topic_counts.sum(), 4)
topic_contribution

In [None]:
# Topic Number and Keywords
topic_num_keywords = df_topic_sents_keywords[['Dominant_Topic', 'Topic_Keywords']]
topic_num_keywords

In [None]:
# Concatenate Column wise
df_dominant_topics = pd.concat([topic_num_keywords, topic_counts, topic_contribution], axis=1)
df_dominant_topics

In [None]:
# Change Column names
df_dominant_topics.columns = ['Dominant_Topic', 'Topic_Keywords', 'Num_Documents', 'Perc_Documents']

In [None]:
# Show
df_dominant_topics