## Topic Modeling with pyLDAvis
-kernel = env mypython

In [1]:
#Import Libraries
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu

import os

import pyLDAvis.gensim
import pickle
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Hide warnings: install a blanket 'ignore' filter so that no Python warning
# is shown in any cell output from this point on.
# NOTE(review): the filter has no category/module restriction, so it also
# hides warnings raised by the vectorizer, LDA and pyLDAvis steps — consider
# narrowing it if those matter.
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Open the Dataiku dataset "JIRA_PRTSPR", load it into the frame `df`, and
# preview the first rows. Later cells read the 'fields.summary_cleaned' and
# 'fields.description_cleaned' columns from `df`.
data = dataiku.Dataset("JIRA_PRTSPR")
df = data.get_dataframe()
df.head()

Unnamed: 0,key,fields.resolution.description,fields.resolution.name,fields.resolutiondate,fields.updated,fields.description,fields.summary,fields.created,fields.description_cleaned,fields.summary_cleaned
0,PRTSPR-20847,The problem is a duplicate of an existing issue.,Duplicate,2016-07-21 12:57:23+00:00,2016-08-05 17:23:37+00:00,When observing (in parallel) a DV14 Total Powe...,AOS: CONTROL/ACC/TOTALPOWER/cppContainer strea...,2016-07-21 04:08:17+00:00,observe parallel dv14 total power no correlato...,aos control acc totalpower cppcontainer stream...
1,PRTSPR-20943,,Unresolved,2016-10-14 16:42:19+00:00,2016-10-14 16:44:13+00:00,"After recreating the array, as described in PR...",AOS: BL: Failed to cleanly activate the Observ...,2016-07-26 06:05:29+00:00,recreate array describe prtspr-20942 nogo sb e...,aos bl failed cleanly activate observing mode ...
2,PRTSPR-17210,The issue is transferred to another area in JI...,Transfer,2017-03-05 16:02:53+00:00,2018-07-22 09:19:21+00:00,While running:\n\nFocus SBs (Version 0) --FAIL...,AOS:BL:PR1: Correlator resource conflict,2015-12-22 21:15:44+00:00,run focus sbs version --fail-- start end proje...,aos bl pr1 correlator resource conflict
3,PRTSPR-13826,The issue is transferred to another area in JI...,Transfer,2017-03-05 16:15:09+00:00,2018-07-22 09:10:43+00:00,While the mentioned SB was executing it sudden...,AOS_BL_PR1: DA52 AZ went to shutdown,2015-07-26 21:25:50+00:00,mention sb execute suddenly fail scan sb nofor...,aos_bl_pr1 da52 az go shutdown
4,PRTSPR-36766,,Unresolved,2018-10-14 02:07:36+00:00,2018-10-14 02:07:39+00:00,_Auto generated ticket_\n[link to EXEC entry|h...,[Statistics] APE1: ACA: 7m: Aborted in order t...,2018-10-14 02:06:31+00:00,auto generate ticket link exec webshiftlog?ebu...,statistics ape1 aca aborted order run grid survey


In [0]:
#drop missing values
#df = df.dropna()

In [3]:
# Number of rows currently held in df.
len(df)

21046

In [13]:
# Keep only rows that have a cleaned description: rows whose
# 'fields.description_cleaned' value is missing are dropped, and df is rebound
# to the filtered frame.
# NOTE(review): the recorded execution counts show this cell was run after the
# summary-corpus cells that follow it, and the recorded corpus lengths differ
# (21046 summaries vs 20680 descriptions). On a top-to-bottom run the summary
# corpus would be built from the filtered frame instead — confirm which is
# intended.
df = df.dropna(subset=['fields.description_cleaned'])

In [4]:
# Build the summary corpus: a plain Python list with one entry per row of df,
# taken from the 'fields.summary_cleaned' column in row order.
# The column is used as stored; no extra whitespace normalisation is applied.
summary_corpus = df['fields.summary_cleaned'].tolist()
# Preview only the first few documents. Echoing the whole list would write
# every row of the dataset into the notebook output.
summary_corpus[:5]

['aos control acc totalpower cppcontainer stream dataflow crash telcal acacorr subsystem crash',
 'aos bl failed cleanly activate observing mode antenna da43 use subarray array010',
 'aos bl pr1 correlator resource conflict',
 'aos_bl_pr1 da52 az go shutdown',
 'statistics ape1 aca aborted order run grid survey',
 'aos es handover coredump user apply point result',
 'aos2 dv01 el axis go standby',
 'aos network connection intermittent',
 'statistic sb fail source elevation',
 'ape1 bl dv02 phase atm datum flag',
 'ape2 bl dv06 fe43 not lock band fe#43',
 'aos bl cm02 bb_2 polx negative trec see band',
 'ape1 cm10 high trec b6 bbpr 1&3 pol',
 'ape2 aca sbex wrong tsys trx value antenna',
 'tfint fail problem complete subscan error invoke observe mode function',
 'ape2 da53 drx bit report no signal power',
 'ape2 dv04 subreflector issue',
 'ape1 bl dv13 container crash cause pi observation fail',
 'aos bl correlator resource conflict',
 'aos bl dv17 fe not lock band6',
 'aca7m+pr3 fail',

In [5]:
# Number of documents in the summary corpus.
len(summary_corpus)

21046

In [14]:
# Build the description corpus: a plain Python list with one entry per row of
# df, taken from the 'fields.description_cleaned' column in row order.
# The column is used as stored; no extra whitespace normalisation is applied.
description_corpus = df['fields.description_cleaned'].tolist()
# Preview only the first few documents. Descriptions are long, and echoing the
# whole list would write every row of the dataset into the notebook output.
description_corpus[:5]

['observe parallel dv14 total power no correlator observation bl antenna sb aca observation cascade crash control acc totalpower cppcontainer stream dataflow telcal error jlog acacorr subsystem crash error state telcal cppcontainer acacorr cdpmif master container crash call sw effect crash unsafe try recovery sfuica suggest fsr recovery imply hand archive telcal control totalpower component restart',
 'recreate array describe prtspr-20942 nogo sb execute however crash begin follow error noformat summary failed cleanly activate observing mode not initialize interferometry controller not initialize correlator invalid array details error trace code type file interferometryobservingmodeimpl.java host gas01 line number process control acc javacontainer routine initializecontrollers short description unrecoverable error occur source object control array012 thread requestprocessor-133928 additional data usererrormessage value failed cleanly activate observing mode javaex.class value alma cont

In [15]:
# Number of documents in the description corpus.
len(description_corpus)

20680

#### Vectorize & TFIDF

In [16]:
# Summary corpus -> document-term matrices.
# Term-count vectorizer: English stop words removed, terms must occur in at
# least 2 documents and in at most 95% of documents.
summary_tf_vectorizer = CountVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.95,
)
# TF-IDF vectorizer configured with exactly the same parameters as the
# count vectorizer above.
summary_tfidf_vectorizer = TfidfVectorizer(**summary_tf_vectorizer.get_params())

# Fit each vectorizer on the corpus and keep the resulting matrices.
summary_dtm_tf = summary_tf_vectorizer.fit_transform(summary_corpus)
summary_dtm_tfidf = summary_tfidf_vectorizer.fit_transform(summary_corpus)

In [17]:
#summary_dtm_tf
#summary_tfidf_vectorizer
#summary_dtm_tfidf

In [18]:
# Description corpus -> document-term matrices.
# Term-count vectorizer: English stop words removed, terms must occur in at
# least 2 documents and in at most 95% of documents.
description_tf_vectorizer = CountVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.95,
)
# TF-IDF vectorizer configured with exactly the same parameters as the
# count vectorizer above.
description_tfidf_vectorizer = TfidfVectorizer(**description_tf_vectorizer.get_params())

# Fit each vectorizer on the corpus and keep the resulting matrices.
description_dtm_tf = description_tf_vectorizer.fit_transform(description_corpus)
description_dtm_tfidf = description_tfidf_vectorizer.fit_transform(description_corpus)

In [19]:
#description_dtm_tf
#description_tfidf_vectorizer
#description_dtm_tfidf

#### LDA MODEL

In [20]:
# Summary corpus: 10-topic LDA fit on the term-count matrix, with a fixed
# random_state so repeated runs give the same topics.
summary_lda_tf = LatentDirichletAllocation(
    random_state=0,
    n_components=10,
).fit(summary_dtm_tf)
# Disabled alternative on the TF-IDF matrix:
# lda_tfidf = LatentDirichletAllocation(n_components=4, random_state=1).fit(summary_dtm_tfidf)

# Show the fitted estimator as the cell output.
summary_lda_tf

LatentDirichletAllocation(random_state=0)

In [21]:
# Description corpus: 10-topic LDA fit on the term-count matrix, with a fixed
# random_state so repeated runs give the same topics.
description_lda_tf = LatentDirichletAllocation(
    random_state=0,
    n_components=10,
).fit(description_dtm_tf)
# Disabled alternative on the TF-IDF matrix:
# lda_tfidf = LatentDirichletAllocation(n_components=4, random_state=1).fit(description_dtm_tfidf)

# Show the fitted estimator as the cell output.
description_lda_tf

LatentDirichletAllocation(random_state=0)

#### Visualize Topics

In [22]:
# Summary corpus: build the pyLDAvis data from the fitted LDA model, the
# term-count matrix it was trained on, and the vectorizer that produced it.
summary_prepared_data = pyLDAvis.sklearn.prepare(
    summary_lda_tf,
    summary_dtm_tf,
    summary_tf_vectorizer,
)
# Inspect the topic coordinate table as the cell output.
summary_prepared_data.topic_coordinates

Unnamed: 0_level_0,x,y,topics,cluster,Freq
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
7,-0.229184,0.041973,1,1,14.742163
2,-0.153134,-0.062732,2,1,13.110438
0,0.149495,-0.104417,3,1,11.569868
6,0.156829,-0.163368,4,1,9.424598
1,0.085887,-0.119846,5,1,9.206559
4,0.168253,0.248096,6,1,8.825258
9,0.176866,0.128463,7,1,8.599398
3,-0.164995,0.139517,8,1,8.377422
5,-0.091931,-0.076281,9,1,8.153923
8,-0.098084,-0.031405,10,1,7.990374


In [23]:
# Description corpus: build the pyLDAvis data from the fitted LDA model, the
# term-count matrix it was trained on, and the vectorizer that produced it.
description_prepared_data = pyLDAvis.sklearn.prepare(
    description_lda_tf,
    description_dtm_tf,
    description_tf_vectorizer,
)
# Inspect the topic coordinate table as the cell output.
description_prepared_data.topic_coordinates

Unnamed: 0_level_0,x,y,topics,cluster,Freq
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,-0.209338,-0.091443,1,1,16.580821
5,-0.172332,-0.134123,2,1,16.218238
1,-0.121147,0.069489,3,1,12.851747
6,0.333373,0.076085,4,1,11.151491
9,0.202561,0.143381,5,1,8.305626
4,-0.066421,0.175666,6,1,8.054185
8,0.234746,-0.324163,7,1,7.106583
3,-0.098258,-0.087081,8,1,6.637868
7,-0.122743,0.087182,9,1,6.550927
2,0.019558,0.085007,10,1,6.542513


In [24]:
# Summary corpus: render the interactive topic view.
# Each coordinate column is first replaced by the real part of its values.
# NOTE(review): presumably a guard against complex-valued coordinates that
# the display step cannot handle — confirm.
for axis in ('x', 'y'):
    summary_prepared_data.topic_coordinates[axis] = (
        summary_prepared_data.topic_coordinates[axis].apply(lambda value: value.real)
    )

pyLDAvis.display(summary_prepared_data)

In [25]:
# Description corpus: render the interactive topic view.
# Each coordinate column is first replaced by the real part of its values.
# NOTE(review): presumably a guard against complex-valued coordinates that
# the display step cannot handle — confirm.
for axis in ('x', 'y'):
    description_prepared_data.topic_coordinates[axis] = (
        description_prepared_data.topic_coordinates[axis].apply(lambda value: value.real)
    )

pyLDAvis.display(description_prepared_data)