## Topic Modeling with pyLDAvis
-kernel = env mypython

In [1]:
#Import Libraries
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu

import os

import pyLDAvis.gensim
import pickle 
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

#hide warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
data = dataiku.Dataset("JIRA_filteredBy_Transfer_PRTSPR")
df = data.get_dataframe()
df.head()

Unnamed: 0,key,fields.resolution.name,fields.description_cleaned,fields.summary_cleaned
0,PRTSPR-17210,Transfer,run focus sbs version --fail-- start end proje...,aos bl pr1 correlator resource conflict
1,PRTSPR-13826,Transfer,mention sb execute suddenly fail scan sb nofor...,aos_bl_pr1 da52 az go shutdown
2,PRTSPR-58778,Transfer,problem affect band science project noformat f...,ape2 bl dv06 fe43 not lock band fe#43
3,PRTSPR-17265,Transfer,timestamp 29t01:35:00 seen scan 287.csv grid s...,aos bl cm02 bb_2 polx negative trec see band
4,PRTSPR-38083,Transfer,noformat ant cm10 band alma_rb_06 freq 229.75g...,ape1 cm10 high trec b6 bbpr 1&3 pol


In [3]:
#drop missing values
df = df.dropna()
len(df)

8874

In [4]:
#extract text from summary_cleaned
summary_corpus = df['fields.summary_cleaned'].tolist()
# Corpus as a list of text documents
#summary_corpus = [' '.join(text.split()) for text in summary_corpus]
summary_corpus

['aos bl pr1 correlator resource conflict',
 'aos_bl_pr1 da52 az go shutdown',
 'ape2 bl dv06 fe43 not lock band fe#43',
 'aos bl cm02 bb_2 polx negative trec see band',
 'ape1 cm10 high trec b6 bbpr 1&3 pol',
 'ape2 aca sbex wrong tsys trx value antenna',
 'tfint fail problem complete subscan error invoke observe mode function',
 'ape2 da53 drx bit report no signal power',
 'ape2 dv04 subreflector issue',
 'aca7m+pr3 fail',
 'ape1 aca_7 cm01 keep alive laser bbpr3',
 'ape2 dv21 fe69 high atm value bb pol band',
 'ape2 aca-7 cm11 el axis go shutdown el f encoder alarm',
 'ape2 bl high trx da44 fe26 dv21 fe69 band',
 'ape1 bl da53 datum flag bbpr pol ifp1 level not correctly optimize',
 'dv23 high phase rm band3',
 'ape2 bl dv20 bad atm bad metaframe',
 'aos64 bl total power bdf truncate wrong frame order different totalpower stream archive',
 'ape2 da65 antenna hvac chill present alarm compresor',
 "aos bl recoverable error occur ','timed wait second",
 'ape2 dv01 flag atm band fe61',


In [5]:
#extract text from description_cleaned
description_corpus = df['fields.description_cleaned'].tolist()
# Corpus as a list of text documents
#description_corpus = [' '.join(text.split()) for text in description_corpus]
description_corpus

["run focus sbs version --fail-- start end project code 0000.0.00187.csv pi nphillip schedblock focus band z execblock uid://a002 xae3696 x136 sb uid uid://a002 x78fe3d x5 qa0 status band alma_rb_06 alma build 201508-cycle3-on b-2015 array array003 array corr m]/64-antenna focus sb fail follow exception:\\ noformat 22t23:59:34.730 none error script execution acserr errortrace(file='/alma acs-2014.6 acssw bin linenum=138 routine='<module host='gas01 process='25331 thread='mainthread timestamp=136701215747259058l sourceobject= errortype=10100l errorcode=5l severity error shortdescriptio n='general scriptexecutor runtime error data= previouserror=[acserr errortrace(file='subscansequenceexecutor.java linenum=54 routine='run host='gas01 p rocess='control acc javacontainer thread='thread-1704 timestamp=136701215627750000l sourceobject='control array003 errortype=10000l errorcode=16l severity error shortdescription='an unrecoverable error occur data=[acserr namevalue(name='subscan value='5 ac

#### Vectorize & TFIDF

In [6]:
#summary_corpus
summary_tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
summary_dtm_tf = summary_tf_vectorizer.fit_transform(summary_corpus)
summary_tfidf_vectorizer = TfidfVectorizer(**summary_tf_vectorizer.get_params())
summary_dtm_tfidf = summary_tfidf_vectorizer.fit_transform(summary_corpus)

#description_corpus
description_tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
description_dtm_tf = description_tf_vectorizer.fit_transform(description_corpus)
description_tfidf_vectorizer = TfidfVectorizer(**description_tf_vectorizer.get_params())
description_dtm_tfidf = description_tfidf_vectorizer.fit_transform(description_corpus)

#### LDA MODEL

In [7]:
#summary_corpus
#for TF DTM
summary_lda_tf = LatentDirichletAllocation(n_components=10, random_state=0)
summary_lda_tf.fit(summary_dtm_tf)
# for TFIDF DTM
#lda_tfidf = LatentDirichletAllocation(n_components=4, random_state=1)
#lda_tfidf.fit(summary_dtm_tfidf)

#description_corpus
#for TF DTM
description_lda_tf = LatentDirichletAllocation(n_components=10, random_state=0)
description_lda_tf.fit(description_dtm_tf)
# for TFIDF DTM
#lda_tfidf = LatentDirichletAllocation(n_components=4, random_state=1)
#lda_tfidf.fit(description_dtm_tfidf)

LatentDirichletAllocation(random_state=0)

#### Visualize Topics

In [8]:
#Prepare the visualization for summary_corpus
summary_prepared_data = pyLDAvis.sklearn.prepare(summary_lda_tf, summary_dtm_tf, summary_tf_vectorizer)
summary_prepared_data.topic_coordinates

Unnamed: 0_level_0,x,y,topics,cluster,Freq
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
8,-0.22017,0.08519,1,1,15.920267
2,-0.034651,-0.208623,2,1,12.853615
4,-0.25169,0.093783,3,1,11.980645
0,0.068973,0.105365,4,1,11.194184
9,-0.125324,-0.013041,5,1,10.595758
6,0.055791,-0.15955,6,1,9.721498
3,0.143305,0.180456,7,1,8.761391
1,0.044099,-0.203374,8,1,7.127929
5,0.14993,0.10573,9,1,6.485515
7,0.169737,0.014065,10,1,5.359199


In [9]:
#Prepare the visualization for description_corpus
description_prepared_data = pyLDAvis.sklearn.prepare(description_lda_tf, description_dtm_tf, description_tf_vectorizer)
description_prepared_data.topic_coordinates

Unnamed: 0_level_0,x,y,topics,cluster,Freq
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
8,-0.191621,0.120004,1,1,25.870198
5,0.358751,0.229574,2,1,12.578686
0,-0.158845,0.000987,3,1,12.250543
9,0.20817,0.033557,4,1,8.492164
6,0.099513,-0.08111,5,1,8.071517
1,0.144926,-0.321201,6,1,7.657099
2,-0.063603,-0.018043,7,1,7.409344
3,-0.148367,0.014676,8,1,6.996018
4,-0.204591,0.059817,9,1,6.619119
7,-0.044333,-0.038261,10,1,4.055314


In [10]:
#Summary_corpus 

summary_prepared_data.topic_coordinates['x'] = summary_prepared_data.topic_coordinates['x'].apply(lambda x: x.real)
summary_prepared_data.topic_coordinates['y'] = summary_prepared_data.topic_coordinates['y'].apply(lambda x: x.real)

pyLDAvis.display(summary_prepared_data)

In [11]:
#Description_corpus 

description_prepared_data.topic_coordinates['x'] = description_prepared_data.topic_coordinates['x'].apply(lambda x: x.real)
description_prepared_data.topic_coordinates['y'] = description_prepared_data.topic_coordinates['y'].apply(lambda x: x.real)

pyLDAvis.display(description_prepared_data)