## Topic Modeling with pyLDAvis
-kernel = env mypython

In [5]:
#Import Libraries
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu

import os

import pyLDAvis.gensim
import pickle
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

#hide warnings
import warnings
warnings.filterwarnings('ignore')

In [6]:
data = dataiku.Dataset("CAPSTONEDATAMATT.ICT_TICKETS_prepared")
df = data.get_dataframe()
df.head()

Unnamed: 0,key,resolution_name,fields_resolutiondate,fields_created,time_span_days,priority_name,labels,status,components_0,components_1,components_2,description,summary
0,ICT-13585,,,2018-10-10 23:50:07-03:00,,Medium,,Submitted,03 Control,,,A common recommendation is to use boost's stra...,TPP : Investigate using boost's strand library
1,ICT-5354,Rejected,2016-06-13 15:59:25-04:00,2015-06-13 16:11:34-03:00,366.0,Medium,"[""OnKibana""]",Closed,01 ACA Control,,,"In PRTSIR-6361, a problem is reported that the...","Sub-scan aborted with ""waiting for a dump time..."
2,ICT-11343,Rejected,2022-03-08 03:46:55-03:00,2017-11-03 18:06:08-03:00,1585.0,Medium,"[""aoscheck""]",Closed,07-08 AOSCheck,,,Make a new profiling of aos-check service (pyt...,AOS-Check Timeout Issues
3,ICT-5698,Verified,2015-10-28 22:17:39-03:00,2015-08-03 19:02:38-03:00,86.0,High,"[""201608-CYCLE4-ON""]",Closed,01 ACA Control,,,This issue was reported in [PRTSIR-6935|http:/...,ACACORR/CDPMIF/MASTER: Caught CORBA::SystemExc...
4,ICT-9945,Cannot Reproduce,2023-02-13 01:50:50-03:00,2017-05-13 14:24:12-03:00,2101.0,Medium,,Closed,03-04 DataCapturer,,,Two days in a row we have been instances of da...,DataCapturer container running out of memory


In [3]:
#drop missing values
#df = df.dropna()

In [7]:
len(df)

673

In [8]:
#extract text from summary_cleaned
summary_corpus = df['summary'].tolist()
# Corpus as a list of text documents
#summary_corpus = [' '.join(text.split()) for text in summary_corpus]
summary_corpus

["TPP : Investigate using boost's strand library",
 'Sub-scan aborted with "waiting for a dump timed out" message for all ACA-CDP nodes',
 'AOS-Check Timeout Issues',
 'ACACORR/CDPMIF/MASTER: Caught CORBA::SystemException while prepare for spectral data',
 'DataCapturer container running out of memory',
 'Mount command executed in the wrong timing event',
 'AOS: Insufficient lead time for the subscan / Not enough lead time',
 'ARCHIVE/ACC/javaContainer run out memory',
 'bugs/improvements for AOScheck/webAQUA post 2019JUN release',
 'Subscan fails due to "Not found spectral data" in CDPMIF',
 'Create a new version of AutoInstall with IO/CPU/Net bandwith limitation',
 'Change sourcecat (xmlrpc) service log level to log on INFO',
 'Antennas inaccessible after a FAR probably an initialization issue',
 'IERSpredict table corrupted error from time to time',
 'Compilation problem in R2016.3',
 'ACASPEC: BDF header is malformed',
 'Stopping an SB while Interferometry Mode initialization does 

In [10]:
len(summary_corpus)

673

In [9]:
#extract text from description_cleaned
description_corpus = df['description'].tolist()
# Corpus as a list of text documents
#description_corpus = [' '.join(text.split()) for text in description_corpus]
description_corpus

["A common recommendation is to use boost's strand library when using boost's asio sockets in a multithreaded application.  We are currently using timed mutexes to prevent race conditions, particularly important when closing the sockets.  These were implemented in response to ICT-9725 and ICT-11533.\n\nThis ticket is to investigate switching to using strand.  If there is benefit in performance and/or maintainability, we'll make the change.",
 "In PRTSIR-6361, a problem is reported that the subscan with the ACA correlator has terminated with a message 'Failed to receive correlation data from ACA-CDPCs.' And immediately before that, all CDPCs failed to receive 16ms dump data from the ACA correlator at 2015-06-12T12:02:26 as excerpted below. These messages suggest the direct cause of the problem is that all CIP modules stopped data transmission simultaneously or all CDPCs failed to receive data simultaneously for some reason.\n{noformat}\n2015-06-12T12:02:26.306 [ACACORR/CDPCIF/NODE_01 - 

In [11]:
len(description_corpus)

673

#### Vectorize & TFIDF

In [13]:
#summary_corpus
summary_tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
summary_dtm_tf = summary_tf_vectorizer.fit_transform(summary_corpus)
summary_tfidf_vectorizer = TfidfVectorizer(**summary_tf_vectorizer.get_params())
summary_dtm_tfidf = summary_tfidf_vectorizer.fit_transform(summary_corpus)

In [14]:
#summary_dtm_tf
#summary_tfidf_vectorizer
#summary_dtm_tfidf

In [15]:
#description_corpus
description_tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
description_dtm_tf = description_tf_vectorizer.fit_transform(description_corpus)
description_tfidf_vectorizer = TfidfVectorizer(**description_tf_vectorizer.get_params())
description_dtm_tfidf = description_tfidf_vectorizer.fit_transform(description_corpus)

In [0]:
#description_dtm_tf
#description_tfidf_vectorizer
#description_dtm_tfidf

#### LDA MODEL

In [18]:
#summary_corpus
#for TF DTM
summary_lda_tf = LatentDirichletAllocation(n_components=10, random_state=0)
summary_lda_tf.fit(summary_dtm_tf)
# for TFIDF DTM
lda_tfidf = LatentDirichletAllocation(n_components=4, random_state=1)
lda_tfidf.fit(summary_dtm_tfidf)

LatentDirichletAllocation(n_components=4, random_state=1)

In [19]:
#description_corpus
#for TF DTM
description_lda_tf = LatentDirichletAllocation(n_components=10, random_state=0)
description_lda_tf.fit(description_dtm_tf)
# for TFIDF DTM
lda_tfidf = LatentDirichletAllocation(n_components=4, random_state=1)
lda_tfidf.fit(description_dtm_tfidf)

LatentDirichletAllocation(n_components=4, random_state=1)

#### Vizualize Topics

In [25]:
#Prepare the visualization for summary_corpus
summary_prepared_data = pyLDAvis.sklearn.prepare(summary_lda_tf, summary_dtm_tf, summary_tf_vectorizer)
summary_prepared_data.topic_coordinates

Unnamed: 0_level_0,x,y,topics,cluster,Freq
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2,0.196746,-0.073495,1,1,13.554534
1,0.180212,0.02195,2,1,12.08088
7,-0.089439,-0.185821,3,1,10.725377
6,-0.069002,-0.102995,4,1,10.496826
5,0.040754,0.022931,5,1,9.690243
0,-0.150869,0.029729,6,1,9.461492
9,-0.055392,0.019582,7,1,9.335762
3,-0.012814,0.023671,8,1,8.662044
8,0.005497,0.05638,9,1,8.166833
4,-0.045693,0.188068,10,1,7.826009


In [21]:
#Prepare the visualization for description_corpus
description_prepared_data = pyLDAvis.sklearn.prepare(description_lda_tf, description_dtm_tf, description_tf_vectorizer)
description_prepared_data.topic_coordinates

Unnamed: 0_level_0,x,y,topics,cluster,Freq
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
9,0.026615,-0.018676,1,1,15.750124
3,0.020994,-0.138852,2,1,12.636626
7,-0.159067,-0.077495,3,1,11.966657
6,0.060747,0.017111,4,1,11.919862
4,0.127057,0.195309,5,1,11.384356
1,0.10838,-0.119118,6,1,10.559208
5,0.009012,0.222567,7,1,8.027096
0,0.020778,-0.130596,8,1,7.790202
8,0.09783,-0.013684,9,1,5.393448
2,-0.312345,0.063434,10,1,4.57242


In [22]:
#Summary_corpus

summary_prepared_data.topic_coordinates['x'] = summary_prepared_data.topic_coordinates['x'].apply(lambda x: x.real)
summary_prepared_data.topic_coordinates['y'] = summary_prepared_data.topic_coordinates['y'].apply(lambda x: x.real)

pyLDAvis.display(summary_prepared_data)

In [23]:
#Description_corpus

description_prepared_data.topic_coordinates['x'] = description_prepared_data.topic_coordinates['x'].apply(lambda x: x.real)
description_prepared_data.topic_coordinates['y'] = description_prepared_data.topic_coordinates['y'].apply(lambda x: x.real)

pyLDAvis.display(description_prepared_data)