# **Latent Dirichlet Allocation (LDA)-based Topic Modeling and Clustering**

In [0]:
import pandas as pd
import nltk
import gensim
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from gensim.parsing.preprocessing import STOPWORDS
import numpy as np
from gensim import corpora,models
import time
import pickle
nltk.download('stopwords')
nltk.download('wordnet')
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [0]:
#Mounting google drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
#importing the CSV file of the master reports from google drive into a dataframe
master_reports = pd.read_csv('drive/My Drive/duplicate_detection/master_reports.csv')
master_reports = master_reports.drop(columns=['Unnamed: 0'])

In [0]:
def lemmatize(text):
    return WordNetLemmatizer().lemmatize(text, pos='v')

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 5:
            result.append(lemmatize(token))
    return result

In [0]:
master_reports['Description'] = master_reports['Description'].map(preprocess)

### **Creating Bag of Words (BoW)**

In [0]:
#Creating a dictionary using gensim library
dictionary = gensim.corpora.Dictionary(master_reports['Description'])
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [0]:
#Print top 20 words from the dictionary
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 20:
        break

0 actually
1 builders
2 change
3 comment
4 compare
5 complete
6 consider
7 consistency
8 contain
9 default
10 derive
11 document
12 editor
13 effect
14 ensure
15 external
16 extremely
17 inconsistent
18 internal
19 maintain
20 mandatory


In [0]:
#Creating BoW using the the dictionary
bow_corpus = [dictionary.doc2bow(doc) for doc in master_reports['Description']]

In [0]:
#Printing the BoW for single document
bow_doc_8 = bow_corpus[8]
for i in range(len(bow_doc_8)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_8[i][0], 
                                               dictionary[bow_doc_8[i][0]], 
bow_doc_8[i][1]))

Word 2 ("change") appears 1 time.
Word 30 ("resource") appears 1 time.
Word 48 ("expect") appears 1 time.
Word 114 ("future") appears 1 time.
Word 181 ("implementation") appears 1 time.
Word 182 ("iresource") appears 1 time.
Word 183 ("obsolete") appears 1 time.


In [0]:
# open a file, where you stored the pickled data
f= open('drive/My Drive/duplicate_detection/bow_corpus.pickle', 'wb')

# dump information to that file
pickle.dump(bow_corpus, f)

In [0]:
# open a file, where you stored the pickled data
file = open('drive/My Drive/duplicate_detection/dictionary.pickle', 'wb')

# dump information to that file
pickle.dump(dictionary, file)

### **LDA-based Topic Modeling**

In [0]:
#Preparing the parameters for LDA model
corpus = bow_corpus
no_of_topics = 10
dictionary = dictionary
p = 20
k = 2
epochs = 100

#Training the LDA model on the BoW corpus
lda_model = gensim.models.LdaMulticore(corpus, num_topics=no_of_topics, id2word=dictionary, passes=p, workers=k, iterations=epochs)

In [0]:
# save model to disk (no need to use pickle module)
lda_model.save('drive/My Drive/duplicate_detection/lda_model.model')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
# Printing the topics and the propability distributions of words in those topics
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.117*"project" + 0.042*"create" + 0.031*"workspace" + 0.027*"change" + 0.021*"delete" + 0.021*"target" + 0.020*"folder" + 0.019*"resource" + 0.019*"package" + 0.015*"eclipse"
Topic: 1 
Words: 0.127*"thread" + 0.057*"javathread" + 0.055*"libjvmdylib" + 0.048*"threadblocked" + 0.035*"libsystembdylib" + 0.034*"libclientdylib" + 0.031*"native" + 0.029*"available" + 0.027*"worker" + 0.021*"symbol"
Topic: 2 
Words: 0.085*"update" + 0.060*"feature" + 0.047*"search" + 0.045*"version" + 0.023*"filter" + 0.021*"install" + 0.019*"result" + 0.018*"eclipse" + 0.016*"plugin" + 0.016*"instal"
Topic: 3 
Words: 0.088*"dialog" + 0.052*"button" + 0.045*"select" + 0.030*"launch" + 0.025*"preferences" + 0.023*"preference" + 0.017*"background" + 0.016*"progress" + 0.015*"change" + 0.015*"default"
Topic: 4 
Words: 0.075*"editor" + 0.029*"select" + 0.028*"perspective" + 0.027*"window" + 0.021*"action" + 0.017*"switch" + 0.017*"create" + 0.016*"reproduce" + 0.016*"eclipse" + 0.015*"editors"
T

In [0]:
#Let's evaluate the model using Perplexity and Coherence Bag of words- Title
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(bow_corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data['Description'], dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -6.389649142641688


  numerator = (co_occur_count / num_docs) + EPSILON
  denominator = (w_prime_count / num_docs) * (w_star_count / num_docs)
  co_doc_prob = co_occur_count / num_docs



Coherence Score:  nan


In [0]:
# pip install pyldavis

In [0]:
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

In [0]:
# Visualize the topics for LDA model
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, bow_corpus, dictionary)
vis

### **Clustering premised on Topic Modeling**

In [0]:
#Creating 10 empty clusters and pushing master reports in each of them based on topic modeling and saving them in individual csv file
for c in range(10):
    exec('topic_{} = pd.DataFrame()'.format(c))
    for i in range(len(master_reports)):
        topic=lda_model[dictionary.doc2bow(master_reports.Description[i])]
        topic= np.asarray(topic)
        if int(topic[np.argmax(topic[:,1]),0])== c:
            exec('topic_{} = topic_{}.append(master_reports.loc[[i]])'.format(c,c))
            exec('topic_{} = topic_{}.reset_index(drop=True)'.format(c,c))
            exec('topic_{}.to_csv("drive/My Drive/duplicate_detection/topic_{}.csv")'.format(c,c))