## Topic Modeling with pyLDAvis
-kernel = env mypython

In [1]:
#Import Libraries
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu

import os

import pyLDAvis.gensim
import pickle 
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

#hide warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
data = dataiku.Dataset("JIRA_filteredBy_Fixed_PRTSIR")
df = data.get_dataframe()
df.head()

Unnamed: 0,key,fields.resolution.name,fields.description_cleaned,fields.summary_cleaned
0,PRTSIR-1554,Fixed,perform sky point run notice dv04 point result...,dv04 erratic point sky point run
1,PRTSIR-12931,Fixed,start system dv07 appear local mode acu e stop...,ape2 dv07 e stop acu local mode not change
2,PRTSIR-3363,Fixed,last occurrence noformat 10t19:05:51.074 info ...,bl corr observation fail
3,PRTSIR-1212,Fixed,run -b -s --fail-- start end 07t23:57:20 08t00...,aos cm02 unexpectedly fail retrieve component ...
4,PRTSIR-10781,Fixed,dv12 show high trec tsys value observe band no...,aos64 bl_pr#1 da53 ex dv12 fe69 high trec tsys...


In [3]:
#drop missing values
df = df.dropna()
len(df)

187

In [4]:
#extract text from summary_cleaned
summary_corpus = df['fields.summary_cleaned'].tolist()
# Corpus as a list of text documents
#summary_corpus = [' '.join(text.split()) for text in summary_corpus]
summary_corpus

['dv04 erratic point sky point run',
 'ape2 dv07 e stop acu local mode not change',
 'bl corr observation fail',
 'aos cm02 unexpectedly fail retrieve component control cm02 antinterferometrycontroller',
 'aos64 bl_pr#1 da53 ex dv12 fe69 high trec tsys value pol band lo pa optimisation failure cause clna oscillation',
 'aos wca7 sn cm04 fe15 fail lock band ghz',
 'aos bl da58 fe15 band high trx cryogenic temperature spec',
 'datum cluster master node time problem cdpnode',
 'aos csv aca calsurvey uid://a002 x7a1660 x4bb no atmcal result',
 'aos handover dv06 servo failure',
 'aos bl da63 compressor go unnoticed fe warm',
 'qa2 signal teardrop dv16 fe37 band spw cca5',
 'ape2 da58 flagged temp band acd issue',
 'aos bl dv17 fe-47 frontend bus problem band band little',
 'aos handover dv04 lo2bbpair2 not visible canbus',
 'ape2 bl da58 wrong tsys trx pol band little',
 'aos acacorr not load configuration correlator',
 'ape2 da50 node not respond az axis',
 'aos handover dv02 hvac alarm',

In [5]:
#extract text from description_cleaned
description_corpus = df['fields.description_cleaned'].tolist()
# Corpus as a list of text documents
#description_corpus = [' '.join(text.split()) for text in description_corpus]
description_corpus

['perform sky point run notice dv04 point result consistent pointingn model scatter arcsec attach quicklook plot bad_pointing_quicklook_x43-dv04.png|thumbnail suggest initially assign ir ticket science investigate',
 'start system dv07 appear local mode acu e stop active remove e stop try utility mode restart script crash control dv07 cppcontainer crash good antenna pass inaccessible state far -p try operation again fail way take antenna array czamorano check antenna end declare operation',
 'last occurrence noformat 10t19:05:51.074 info control array003 subscan intent on_source take second start 10t19:06:03.450 info control acc javacontainer orb status connectionthreadsused=14 lose calls=0 requestqueuemaxusepercent=5 poa componentpoa_control array002 10t19:06:12.607 error control array003 error invoke observe mode function type=10410 code=0 message=\'problem complete subscan 10t19:06:12.606 error control array003 correlator resource conflict type=20000 code=8 cause=\'data cluster mast

#### Vectorize & TFIDF

In [6]:
#summary_corpus
summary_tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
summary_dtm_tf = summary_tf_vectorizer.fit_transform(summary_corpus)
summary_tfidf_vectorizer = TfidfVectorizer(**summary_tf_vectorizer.get_params())
summary_dtm_tfidf = summary_tfidf_vectorizer.fit_transform(summary_corpus)

#summary_dtm_tf
#summary_tfidf_vectorizer
#summary_dtm_tfidf

In [7]:
#description_corpus
description_tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
description_dtm_tf = description_tf_vectorizer.fit_transform(description_corpus)
description_tfidf_vectorizer = TfidfVectorizer(**description_tf_vectorizer.get_params())
description_dtm_tfidf = description_tfidf_vectorizer.fit_transform(description_corpus)

#description_dtm_tf
#description_tfidf_vectorizer
#description_dtm_tfidf

#### LDA MODEL

In [8]:
#summary_corpus
#for TF DTM
summary_lda_tf = LatentDirichletAllocation(n_components=3, random_state=0)
summary_lda_tf.fit(summary_dtm_tf)
# for TFIDF DTM
#lda_tfidf = LatentDirichletAllocation(n_components=4, random_state=1)
#lda_tfidf.fit(summary_dtm_tfidf)

LatentDirichletAllocation(n_components=3, random_state=0)

In [9]:
#description_corpus
#for TF DTM
description_lda_tf = LatentDirichletAllocation(n_components=3, random_state=0)
description_lda_tf.fit(description_dtm_tf)
# for TFIDF DTM
#lda_tfidf = LatentDirichletAllocation(n_components=4, random_state=1)
#lda_tfidf.fit(description_dtm_tfidf)

LatentDirichletAllocation(n_components=3, random_state=0)

#### Visualize Topics

In [10]:
#Prepare the visualization for summary_corpus
summary_prepared_data = pyLDAvis.sklearn.prepare(summary_lda_tf, summary_dtm_tf, summary_tf_vectorizer)
summary_prepared_data.topic_coordinates

Unnamed: 0_level_0,x,y,topics,cluster,Freq
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2,-0.053779,0.111582,1,1,43.265627
0,-0.099555,-0.091384,2,1,32.138262
1,0.153334,-0.020198,3,1,24.596111


In [11]:
#Prepare the visualization for description_corpus
description_prepared_data = pyLDAvis.sklearn.prepare(description_lda_tf, description_dtm_tf, description_tf_vectorizer)
description_prepared_data.topic_coordinates

Unnamed: 0_level_0,x,y,topics,cluster,Freq
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.16424,0.170303,1,1,33.963208
1,-0.253996,0.036901,2,1,33.299989
2,0.089756,-0.207204,3,1,32.736803


In [12]:
#Summary_corpus 

summary_prepared_data.topic_coordinates['x'] = summary_prepared_data.topic_coordinates['x'].apply(lambda x: x.real)
summary_prepared_data.topic_coordinates['y'] = summary_prepared_data.topic_coordinates['y'].apply(lambda x: x.real)

pyLDAvis.display(summary_prepared_data)

In [13]:
#Description_corpus 

description_prepared_data.topic_coordinates['x'] = description_prepared_data.topic_coordinates['x'].apply(lambda x: x.real)
description_prepared_data.topic_coordinates['y'] = description_prepared_data.topic_coordinates['y'].apply(lambda x: x.real)

pyLDAvis.display(description_prepared_data)