## Topic Modeling with pyLDAvis
-kernel = env mypython

In [1]:
#Import Libraries
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu

import os

import pyLDAvis.gensim
import pickle 
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

#hide warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
data = dataiku.Dataset("JIRA_filteredBy_Fixed_PRTSPR")
df = data.get_dataframe()
df.head()

Unnamed: 0,key,fields.resolution.name,fields.description_cleaned,fields.summary_cleaned
0,PRTSPR-8689,Fixed,happened sub array test essentially network cu...,aos network connection intermittent
1,PRTSPR-7614,Fixed,interferometricpointing sky fail correlator re...,aos bl correlator resource conflict
2,PRTSPR-7464,Fixed,running focus notice dv17 not lock band ccl ao...,aos bl dv17 fe not lock band6
3,PRTSPR-16078,Fixed,observing telcal problem prtspr-16066 sb crash...,aos bl photonic reference timeout
4,PRTSPR-7282,Fixed,run interactive sb realize antenna stop go sou...,aos da45 mount problem mount


In [3]:
#drop missing values
df = df.dropna()

In [4]:
len(df)

2785

In [5]:
#extract text from summary_cleaned
summary_corpus = df['fields.summary_cleaned'].tolist()
# Corpus as a list of text documents
#summary_corpus = [' '.join(text.split()) for text in summary_corpus]
summary_corpus

['aos network connection intermittent',
 'aos bl correlator resource conflict',
 'aos bl dv17 fe not lock band6',
 'aos bl photonic reference timeout',
 'aos da45 mount problem mount',
 'aos bl control acc totalpower cppcontainer crash',
 'aos bl control array004 correlator resource conflict type=20000 code=8 invalid scan subscan end cdp master externally',
 'aos bl error invoke observe mode function timed wait second correlator archive sub scan',
 'ape1 api gui antenn_control not da65 metaframe delays fix',
 'ape2 aca corr subsystem go error initializing_pass2 available',
 'aos aca dv01 dgck time issue',
 'aos bl subscan not stop second',
 'ape1 bl da60 got bad response mount error',
 'aos da50 run metrology',
 'aos da48 az axis go shutdown',
 'aos bl not destroy array cm03 antenna go inaccesable',
 'aos pm02 fe b7 fail lock',
 'aos handover cm mount component not instance',
 'ape1 bl da58 srm not work',
 'ape1 aca_7 although execution checksourcecaltarget j0854 442.526ghz ref topo re

In [6]:
len(summary_corpus)

2785

In [7]:
#extract text from description_cleaned
description_corpus = df['fields.description_cleaned'].tolist()
# Corpus as a list of text documents
#description_corpus = [' '.join(text.split()) for text in description_corpus]
description_corpus

['happened sub array test essentially network cut cause archive subsystem error delaycal script keep fail',
 'interferometricpointing sky fail correlator resource conflict message reading log thing prior crash da60 acd situation apparently cause antlocontroller::setcalibrationdeviceasynch function crash not sure relate problem simply repeat script work',
 "running focus notice dv17 not lock band ccl aos 4]>fe status frontend ant dv17 controller =/ state degraded bands =/ available band alma_rb_03 alma_rb_04 alma_rb_06 alma_rb_07 available band alma_rb_08 alma_rb_09 alma_rb_10 powered band alma_rb_03 alma_rb_06 alma_rb_07 selected band =/ selected band alma_rb_06 lo1 frequency 230.506643[ghz sideband usb locked no sis mixers |_sis pol usb mode closed loopv -8.4465[mv -58.7158[ua |_sis pol lsb mode closed loopv 8.6044[mv 44.0063[ua |_sis pol usb mode closed loopv -9.0340[mv -46.5698[ua |_sis pol lsb mode closed loopv 8.9081[mv 30.9448[ua lo pa drain voltage pol 1.9609[v pol 2.2729[v phot

In [8]:
len(description_corpus)

2785

#### Vectorize & TFIDF

In [9]:
#summary_corpus
summary_tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
summary_dtm_tf = summary_tf_vectorizer.fit_transform(summary_corpus)
summary_tfidf_vectorizer = TfidfVectorizer(**summary_tf_vectorizer.get_params())
summary_dtm_tfidf = summary_tfidf_vectorizer.fit_transform(summary_corpus)

In [10]:
#summary_dtm_tf
#summary_tfidf_vectorizer
#summary_dtm_tfidf

In [11]:
#description_corpus
description_tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
description_dtm_tf = description_tf_vectorizer.fit_transform(description_corpus)
description_tfidf_vectorizer = TfidfVectorizer(**description_tf_vectorizer.get_params())
description_dtm_tfidf = description_tfidf_vectorizer.fit_transform(description_corpus)

In [12]:
#description_dtm_tf
#description_tfidf_vectorizer
#description_dtm_tfidf

#### LDA MODEL

In [13]:
#summary_corpus
#for TF DTM
summary_lda_tf = LatentDirichletAllocation(n_components=10, random_state=0)
summary_lda_tf.fit(summary_dtm_tf)
# for TFIDF DTM
#lda_tfidf = LatentDirichletAllocation(n_components=4, random_state=1)
#lda_tfidf.fit(summary_dtm_tfidf)

LatentDirichletAllocation(random_state=0)

In [14]:
#description_corpus
#for TF DTM
description_lda_tf = LatentDirichletAllocation(n_components=10, random_state=0)
description_lda_tf.fit(description_dtm_tf)
# for TFIDF DTM
#lda_tfidf = LatentDirichletAllocation(n_components=4, random_state=1)
#lda_tfidf.fit(description_dtm_tfidf)

LatentDirichletAllocation(random_state=0)

#### Visualize Topics

In [15]:
#Prepare the visualization for summary_corpus
summary_prepared_data = pyLDAvis.sklearn.prepare(summary_lda_tf, summary_dtm_tf, summary_tf_vectorizer)
summary_prepared_data.topic_coordinates

Unnamed: 0_level_0,x,y,topics,cluster,Freq
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
9,-0.23698,0.093055,1,1,18.703178
2,-0.207852,0.078642,2,1,14.079823
6,0.012052,-0.238439,3,1,10.780874
8,0.101029,-0.063608,4,1,10.559699
5,-0.031393,-0.085155,5,1,10.325728
3,0.004254,0.009667,6,1,8.909131
4,-0.054073,-0.112601,7,1,7.974011
7,0.090447,0.160148,8,1,7.063046
1,0.178298,0.046409,9,1,6.930319
0,0.144217,0.111882,10,1,4.674191


In [16]:
#Prepare the visualization for description_corpus
description_prepared_data = pyLDAvis.sklearn.prepare(description_lda_tf, description_dtm_tf, description_tf_vectorizer)
description_prepared_data.topic_coordinates

Unnamed: 0_level_0,x,y,topics,cluster,Freq
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,-0.147063,0.042899,1,1,25.617276
2,-0.05906,0.042802,2,1,14.552147
8,-0.078961,-0.196811,3,1,12.00066
4,-0.137865,0.098154,4,1,8.784446
7,-0.178409,-0.068196,5,1,8.270164
6,0.089439,0.14965,6,1,8.140487
9,-0.076862,0.105261,7,1,7.288066
5,0.192015,-0.294801,8,1,5.641253
3,0.096308,-0.047052,9,1,4.929902
0,0.300459,0.168094,10,1,4.775597


In [17]:
#Summary_corpus 

summary_prepared_data.topic_coordinates['x'] = summary_prepared_data.topic_coordinates['x'].apply(lambda x: x.real)
summary_prepared_data.topic_coordinates['y'] = summary_prepared_data.topic_coordinates['y'].apply(lambda x: x.real)

pyLDAvis.display(summary_prepared_data)

In [18]:
#Description_corpus 

description_prepared_data.topic_coordinates['x'] = description_prepared_data.topic_coordinates['x'].apply(lambda x: x.real)
description_prepared_data.topic_coordinates['y'] = description_prepared_data.topic_coordinates['y'].apply(lambda x: x.real)

pyLDAvis.display(description_prepared_data)