## Topic Modeling with pyLDAvis - Java
-kernel = env mypython

In [1]:
#Import Libraries
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu

import os

import pyLDAvis.gensim
import pickle 
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

#hide warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Dataset JIRA_filteredBy_Transfered_PRTSIR renamed to JIRA_filteredBy_Transfer_PRTSIR by vkb6bn on 2023-03-09 14:41:38
data = dataiku.Dataset("JIRA_filteredBy_fixed_transfer_java")
df = data.get_dataframe()
df.head()

Unnamed: 0,key,fields.resolution.name,fields.description_cleaned,fields.summary_cleaned
0,PRTSPR-17210,Transfer,run focus sbs version --fail-- start end proje...,aos bl pr1 correlator resource conflict
1,PRTSPR-10272,Transfer,fail noformat 13t06:48:29.718 vlbitestobs scan...,tfint fail problem complete subscan error invo...
2,PRTSPR-60997,Transfer,auto generate ticket link exec webshiftlog?ebu...,aca7m+pr3 fail
3,PRTSPR-16078,Fixed,observing telcal problem prtspr-16066 sb crash...,aos bl photonic reference timeout
4,PRTSPR-8542,Fixed,run sciver_lb_test sdp.81 version --fail-- sta...,aos bl control array004 correlator resource co...


In [3]:
#drop missing values
df = df.dropna()
len(df)

2721

In [4]:
#extract text from summary_cleaned
summary_corpus = df['fields.summary_cleaned'].tolist()
# Corpus as a list of text documents
#summary_corpus = [' '.join(text.split()) for text in summary_corpus]
summary_corpus

#extract text from description_cleaned
description_corpus = df['fields.description_cleaned'].tolist()
# Corpus as a list of text documents
#description_corpus = [' '.join(text.split()) for text in description_corpus]
description_corpus

["run focus sbs version --fail-- start end project code 0000.0.00187.csv pi nphillip schedblock focus band z execblock uid://a002 xae3696 x136 sb uid uid://a002 x78fe3d x5 qa0 status band alma_rb_06 alma build 201508-cycle3-on b-2015 array array003 array corr m]/64-antenna focus sb fail follow exception:\\ noformat 22t23:59:34.730 none error script execution acserr errortrace(file='/alma acs-2014.6 acssw bin linenum=138 routine='<module host='gas01 process='25331 thread='mainthread timestamp=136701215747259058l sourceobject= errortype=10100l errorcode=5l severity error shortdescriptio n='general scriptexecutor runtime error data= previouserror=[acserr errortrace(file='subscansequenceexecutor.java linenum=54 routine='run host='gas01 p rocess='control acc javacontainer thread='thread-1704 timestamp=136701215627750000l sourceobject='control array003 errortype=10000l errorcode=16l severity error shortdescription='an unrecoverable error occur data=[acserr namevalue(name='subscan value='5 ac

#### Vectorize & TFIDF

In [5]:
#summary_corpus
summary_tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
summary_dtm_tf = summary_tf_vectorizer.fit_transform(summary_corpus)
summary_tfidf_vectorizer = TfidfVectorizer(**summary_tf_vectorizer.get_params())
summary_dtm_tfidf = summary_tfidf_vectorizer.fit_transform(summary_corpus)

#summary_dtm_tf
#summary_tfidf_vectorizer
#summary_dtm_tfidf

#description_corpus
description_tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
description_dtm_tf = description_tf_vectorizer.fit_transform(description_corpus)
description_tfidf_vectorizer = TfidfVectorizer(**description_tf_vectorizer.get_params())
description_dtm_tfidf = description_tfidf_vectorizer.fit_transform(description_corpus)

#description_dtm_tf
#description_tfidf_vectorizer
#description_dtm_tfidf

#### LDA MODEL

In [6]:
#summary_corpus
#for TF DTM
summary_lda_tf = LatentDirichletAllocation(n_components=3, random_state=0)
summary_lda_tf.fit(summary_dtm_tf)
# for TFIDF DTM
#lda_tfidf = LatentDirichletAllocation(n_components=4, random_state=1)
#lda_tfidf.fit(summary_dtm_tfidf)

#description_corpus
#for TF DTM
description_lda_tf = LatentDirichletAllocation(n_components=3, random_state=0)
description_lda_tf.fit(description_dtm_tf)
# for TFIDF DTM
#lda_tfidf = LatentDirichletAllocation(n_components=4, random_state=1)
#lda_tfidf.fit(description_dtm_tfidf)

LatentDirichletAllocation(n_components=3, random_state=0)

#### Visualize Topics

In [7]:
#Prepare the visualization for summary_corpus
summary_prepared_data = pyLDAvis.sklearn.prepare(summary_lda_tf, summary_dtm_tf, summary_tf_vectorizer)
summary_prepared_data.topic_coordinates

Unnamed: 0_level_0,x,y,topics,cluster,Freq
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,-0.05137,-0.183344,1,1,38.263458
2,-0.166329,0.12846,2,1,37.251678
0,0.217699,0.054884,3,1,24.484864


In [8]:
#Prepare the visualization for description_corpus
description_prepared_data = pyLDAvis.sklearn.prepare(description_lda_tf, description_dtm_tf, description_tf_vectorizer)
description_prepared_data.topic_coordinates

Unnamed: 0_level_0,x,y,topics,cluster,Freq
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2,0.22637,-0.02877,1,1,34.701519
1,-0.148101,-0.125509,2,1,33.320956
0,-0.078269,0.154279,3,1,31.977525


In [9]:
#Summary_corpus 

summary_prepared_data.topic_coordinates['x'] = summary_prepared_data.topic_coordinates['x'].apply(lambda x: x.real)
summary_prepared_data.topic_coordinates['y'] = summary_prepared_data.topic_coordinates['y'].apply(lambda x: x.real)

pyLDAvis.display(summary_prepared_data)

In [10]:
#Description_corpus 

description_prepared_data.topic_coordinates['x'] = description_prepared_data.topic_coordinates['x'].apply(lambda x: x.real)
description_prepared_data.topic_coordinates['y'] = description_prepared_data.topic_coordinates['y'].apply(lambda x: x.real)

pyLDAvis.display(description_prepared_data)