## Topic Modeling with pyLDAvis - Correlator
-kernel = env mypython

In [1]:
#Import Libraries
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu

import os

import pyLDAvis.gensim
import pickle 
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

#hide warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Dataset JIRA_filteredBy_Transfered_PRTSIR renamed to JIRA_filteredBy_Transfer_PRTSIR by vkb6bn on 2023-03-09 14:41:38
data = dataiku.Dataset("JIRA_filteredBy_fixed_transfer_correlator")
df = data.get_dataframe()
df.head()

Unnamed: 0,key,fields.resolution.name,fields.description_cleaned,fields.summary_cleaned
0,PRTSPR-17210,Transfer,run focus sbs version --fail-- start end proje...,aos bl pr1 correlator resource conflict
1,PRTSPR-13826,Transfer,mention sb execute suddenly fail scan sb nofor...,aos_bl_pr1 da52 az go shutdown
2,PRTSPR-30334,Transfer,auto generate ticket link exec webshiftlog?ebu...,ape2 aca sbex wrong tsys trx value antenna
3,PRTSPR-10272,Transfer,fail noformat 13t06:48:29.718 vlbitestobs scan...,tfint fail problem complete subscan error invo...
4,PRTSPR-7614,Fixed,interferometricpointing sky fail correlator re...,aos bl correlator resource conflict


In [3]:
#drop missing values
df = df.dropna()
len(df)

3893

In [4]:
#extract text from summary_cleaned
summary_corpus = df['fields.summary_cleaned'].tolist()
# Corpus as a list of text documents
#summary_corpus = [' '.join(text.split()) for text in summary_corpus]
summary_corpus

['aos bl pr1 correlator resource conflict',
 'aos_bl_pr1 da52 az go shutdown',
 'ape2 aca sbex wrong tsys trx value antenna',
 'tfint fail problem complete subscan error invoke observe mode function',
 'aos bl correlator resource conflict',
 'aos bl dv17 fe not lock band6',
 'aos bl photonic reference timeout',
 "aos bl recoverable error occur ','timed wait second",
 'aos64 aca hw crash',
 'aos bl control array004 correlator resource conflict type=20000 code=8 invalid scan subscan end cdp master externally',
 'aos bl sb fail scan subscan not stop second check timeout long han subscan duration',
 'ape2 aca corr subsystem go error initializing_pass2 available',
 'ape1 pm01 pm04 not command antenna problem mount component',
 'aos aca dv01 dgck time issue',
 'aos bl da54 cppcontainer crash',
 'aos bl subscan not stop second',
 'aos pr4 aca sbex scan atm calib subscan not end sb not crash',
 'aos da43 control da43 container go',
 'ape2 bl da65 axis go shutdown',
 'aos dv11 suddenly antenna 

In [5]:
#extract text from description_cleaned
description_corpus = df['fields.description_cleaned'].tolist()
# Corpus as a list of text documents
#description_corpus = [' '.join(text.split()) for text in description_corpus]
description_corpus

["run focus sbs version --fail-- start end project code 0000.0.00187.csv pi nphillip schedblock focus band z execblock uid://a002 xae3696 x136 sb uid uid://a002 x78fe3d x5 qa0 status band alma_rb_06 alma build 201508-cycle3-on b-2015 array array003 array corr m]/64-antenna focus sb fail follow exception:\\ noformat 22t23:59:34.730 none error script execution acserr errortrace(file='/alma acs-2014.6 acssw bin linenum=138 routine='<module host='gas01 process='25331 thread='mainthread timestamp=136701215747259058l sourceobject= errortype=10100l errorcode=5l severity error shortdescriptio n='general scriptexecutor runtime error data= previouserror=[acserr errortrace(file='subscansequenceexecutor.java linenum=54 routine='run host='gas01 p rocess='control acc javacontainer thread='thread-1704 timestamp=136701215627750000l sourceobject='control array003 errortype=10000l errorcode=16l severity error shortdescription='an unrecoverable error occur data=[acserr namevalue(name='subscan value='5 ac

In [6]:
#summary_corpus
summary_tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
summary_dtm_tf = summary_tf_vectorizer.fit_transform(summary_corpus)
summary_tfidf_vectorizer = TfidfVectorizer(**summary_tf_vectorizer.get_params())
summary_dtm_tfidf = summary_tfidf_vectorizer.fit_transform(summary_corpus)

#summary_dtm_tf
#summary_tfidf_vectorizer
#summary_dtm_tfidf

#description_corpus
description_tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
description_dtm_tf = description_tf_vectorizer.fit_transform(description_corpus)
description_tfidf_vectorizer = TfidfVectorizer(**description_tf_vectorizer.get_params())
description_dtm_tfidf = description_tfidf_vectorizer.fit_transform(description_corpus)

#description_dtm_tf
#description_tfidf_vectorizer
#description_dtm_tfidf

#### LDA MODEL

In [7]:
#summary_corpus
#for TF DTM
summary_lda_tf = LatentDirichletAllocation(n_components=10, random_state=0)
summary_lda_tf.fit(summary_dtm_tf)
# for TFIDF DTM
#lda_tfidf = LatentDirichletAllocation(n_components=4, random_state=1)
#lda_tfidf.fit(summary_dtm_tfidf)

#description_corpus
#for TF DTM
description_lda_tf = LatentDirichletAllocation(n_components=10, random_state=0)
description_lda_tf.fit(description_dtm_tf)
# for TFIDF DTM
#lda_tfidf = LatentDirichletAllocation(n_components=4, random_state=1)
#lda_tfidf.fit(description_dtm_tfidf)

LatentDirichletAllocation(random_state=0)

#### Visualize Topics

In [8]:
#Prepare the visualization for summary_corpus
summary_prepared_data = pyLDAvis.sklearn.prepare(summary_lda_tf, summary_dtm_tf, summary_tf_vectorizer)
summary_prepared_data.topic_coordinates

Unnamed: 0_level_0,x,y,topics,cluster,Freq
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
7,0.07308,0.127063,1,1,13.788581
6,-0.080274,0.170303,2,1,13.688652
8,-0.179278,-0.227751,3,1,12.901061
2,0.253245,-0.085416,4,1,11.06466
1,0.19974,0.00133,5,1,10.288863
5,-0.19241,-0.022533,6,1,8.803201
9,0.002353,-0.098788,7,1,8.314828
4,-0.129438,0.153194,8,1,8.122375
3,0.013423,-0.051059,9,1,6.779037
0,0.03956,0.033658,10,1,6.248743


In [9]:
#Prepare the visualization for description_corpus
description_prepared_data = pyLDAvis.sklearn.prepare(description_lda_tf, description_dtm_tf, description_tf_vectorizer)
description_prepared_data.topic_coordinates

Unnamed: 0_level_0,x,y,topics,cluster,Freq
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,-0.196974,-0.034636,1,1,32.442068
5,-0.017458,-0.114283,2,1,10.646376
2,-0.10785,-0.009833,3,1,10.293762
9,0.043844,0.17284,4,1,8.427285
7,-0.153859,0.020184,5,1,8.10801
3,0.260023,-0.283599,6,1,7.64412
0,-0.059624,0.010747,7,1,6.62199
6,0.180019,0.145678,8,1,6.174286
8,-0.114746,-0.065139,9,1,5.066467
4,0.166624,0.15804,10,1,4.575636


In [10]:
#Summary_corpus 

summary_prepared_data.topic_coordinates['x'] = summary_prepared_data.topic_coordinates['x'].apply(lambda x: x.real)
summary_prepared_data.topic_coordinates['y'] = summary_prepared_data.topic_coordinates['y'].apply(lambda x: x.real)

pyLDAvis.display(summary_prepared_data)

In [11]:
#Description_corpus 

description_prepared_data.topic_coordinates['x'] = description_prepared_data.topic_coordinates['x'].apply(lambda x: x.real)
description_prepared_data.topic_coordinates['y'] = description_prepared_data.topic_coordinates['y'].apply(lambda x: x.real)

pyLDAvis.display(description_prepared_data)