## Topic Modeling with pyLDAvis - Antenna
-kernel = env mypython

In [1]:
#Import Libraries
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu

import os

import pyLDAvis.gensim
import pickle 
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

#hide warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Dataset JIRA_filteredBy_Transfered_PRTSIR renamed to JIRA_filteredBy_Transfer_PRTSIR by vkb6bn on 2023-03-09 14:41:38
data = dataiku.Dataset("JIRA_filteredby_fixed_transfer_antenna")
df = data.get_dataframe()
df.head()

Unnamed: 0,key,fields.resolution.name,fields.description_cleaned,fields.summary_cleaned
0,PRTSPR-17210,Transfer,run focus sbs version --fail-- start end proje...,aos bl pr1 correlator resource conflict
1,PRTSPR-13826,Transfer,mention sb execute suddenly fail scan sb nofor...,aos_bl_pr1 da52 az go shutdown
2,PRTSPR-30334,Transfer,auto generate ticket link exec webshiftlog?ebu...,ape2 aca sbex wrong tsys trx value antenna
3,PRTSPR-45268,Transfer,auto generate ticket link exec webshiftlog?ebu...,ape2 dv04 subreflector issue
4,PRTSPR-7282,Fixed,run interactive sb realize antenna stop go sou...,aos da45 mount problem mount


In [3]:
#drop missing values
df = df.dropna()
len(df)

5860

In [4]:
#extract text from summary_cleaned
summary_corpus = df['fields.summary_cleaned'].tolist()
# Corpus as a list of text documents
#summary_corpus = [' '.join(text.split()) for text in summary_corpus]
summary_corpus

['aos bl pr1 correlator resource conflict',
 'aos_bl_pr1 da52 az go shutdown',
 'ape2 aca sbex wrong tsys trx value antenna',
 'ape2 dv04 subreflector issue',
 'aos da45 mount problem mount',
 'ape2 aca-7 cm11 el axis go shutdown el f encoder alarm',
 'ape1 bl da53 datum flag bbpr pol ifp1 level not correctly optimize',
 'ape2 da65 antenna hvac chill present alarm compresor',
 "aos bl recoverable error occur ','timed wait second",
 'ape2 cm05 fe device go stop camb error',
 'aos bl control array004 correlator resource conflict type=20000 code=8 invalid scan subscan end cdp master externally',
 'ape1 bl dv03 not set signal level antenna ifproc0 stop',
 'ape2 aca cm01 frontend device stop state',
 'aos bl sb fail scan subscan not stop second check timeout long han subscan duration',
 'ape1 bl llc unlock',
 'ape2 pm03 az lr enconder alarm trigger',
 'ape1 api gui antenn_control not da65 metaframe delays fix',
 'ape2 bl obs da43 wvr idle state',
 'ape1 cm03 high trx band7 bbs k pol k pol x

In [5]:
#extract text from description_cleaned
description_corpus = df['fields.description_cleaned'].tolist()
# Corpus as a list of text documents
#description_corpus = [' '.join(text.split()) for text in description_corpus]
description_corpus

["run focus sbs version --fail-- start end project code 0000.0.00187.csv pi nphillip schedblock focus band z execblock uid://a002 xae3696 x136 sb uid uid://a002 x78fe3d x5 qa0 status band alma_rb_06 alma build 201508-cycle3-on b-2015 array array003 array corr m]/64-antenna focus sb fail follow exception:\\ noformat 22t23:59:34.730 none error script execution acserr errortrace(file='/alma acs-2014.6 acssw bin linenum=138 routine='<module host='gas01 process='25331 thread='mainthread timestamp=136701215747259058l sourceobject= errortype=10100l errorcode=5l severity error shortdescriptio n='general scriptexecutor runtime error data= previouserror=[acserr errortrace(file='subscansequenceexecutor.java linenum=54 routine='run host='gas01 p rocess='control acc javacontainer thread='thread-1704 timestamp=136701215627750000l sourceobject='control array003 errortype=10000l errorcode=16l severity error shortdescription='an unrecoverable error occur data=[acserr namevalue(name='subscan value='5 ac

#### Vectorize & TFIDF

In [6]:
#summary_corpus
summary_tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
summary_dtm_tf = summary_tf_vectorizer.fit_transform(summary_corpus)
summary_tfidf_vectorizer = TfidfVectorizer(**summary_tf_vectorizer.get_params())
summary_dtm_tfidf = summary_tfidf_vectorizer.fit_transform(summary_corpus)

#summary_dtm_tf
#summary_tfidf_vectorizer
#summary_dtm_tfidf

#description_corpus
description_tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
description_dtm_tf = description_tf_vectorizer.fit_transform(description_corpus)
description_tfidf_vectorizer = TfidfVectorizer(**description_tf_vectorizer.get_params())
description_dtm_tfidf = description_tfidf_vectorizer.fit_transform(description_corpus)

#description_dtm_tf
#description_tfidf_vectorizer
#description_dtm_tfidf

#### LDA MODEL

In [7]:
#summary_corpus
#for TF DTM
summary_lda_tf = LatentDirichletAllocation(n_components=10, random_state=0)
summary_lda_tf.fit(summary_dtm_tf)
# for TFIDF DTM
#lda_tfidf = LatentDirichletAllocation(n_components=4, random_state=1)
#lda_tfidf.fit(summary_dtm_tfidf)

#description_corpus
#for TF DTM
description_lda_tf = LatentDirichletAllocation(n_components=10, random_state=0)
description_lda_tf.fit(description_dtm_tf)
# for TFIDF DTM
#lda_tfidf = LatentDirichletAllocation(n_components=4, random_state=1)
#lda_tfidf.fit(description_dtm_tfidf)

LatentDirichletAllocation(random_state=0)

#### Visualize Topics

In [8]:
#Prepare the visualization for summary_corpus
summary_prepared_data = pyLDAvis.sklearn.prepare(summary_lda_tf, summary_dtm_tf, summary_tf_vectorizer)
summary_prepared_data.topic_coordinates

Unnamed: 0_level_0,x,y,topics,cluster,Freq
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
8,-0.124211,0.006087,1,1,20.222448
5,-0.059654,0.214862,2,1,11.535323
7,0.001123,-0.23319,3,1,10.886881
6,0.280357,0.039041,4,1,10.808077
1,0.118715,-0.011355,5,1,10.039997
4,-0.066578,0.013864,6,1,8.830903
2,-0.112459,-0.193527,7,1,8.252498
9,-0.112614,0.010447,8,1,7.544613
3,-0.127809,0.140998,9,1,5.961954
0,0.20313,0.012772,10,1,5.917306


In [9]:
#Prepare the visualization for description_corpus
description_prepared_data = pyLDAvis.sklearn.prepare(description_lda_tf, description_dtm_tf, description_tf_vectorizer)
description_prepared_data.topic_coordinates

Unnamed: 0_level_0,x,y,topics,cluster,Freq
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
3,-0.153229,-0.167395,1,1,23.612288
0,-0.003172,-0.233815,2,1,13.445881
2,0.436793,-0.069814,3,1,11.892184
1,-0.111274,0.079641,4,1,9.857504
8,0.114753,0.165612,5,1,8.719659
6,0.043049,0.099432,6,1,7.814649
4,-0.121075,0.044717,7,1,7.498717
7,0.026262,0.097125,8,1,7.107617
9,-0.087225,-0.168286,9,1,5.847859
5,-0.144881,0.152783,10,1,4.203642


In [10]:
#Summary_corpus 

summary_prepared_data.topic_coordinates['x'] = summary_prepared_data.topic_coordinates['x'].apply(lambda x: x.real)
summary_prepared_data.topic_coordinates['y'] = summary_prepared_data.topic_coordinates['y'].apply(lambda x: x.real)

pyLDAvis.display(summary_prepared_data)

In [11]:
#Description_corpus 

description_prepared_data.topic_coordinates['x'] = description_prepared_data.topic_coordinates['x'].apply(lambda x: x.real)
description_prepared_data.topic_coordinates['y'] = description_prepared_data.topic_coordinates['y'].apply(lambda x: x.real)

pyLDAvis.display(description_prepared_data)