## Topic Modeling with pyLDAvis - DV22
-kernel = env mypython

In [1]:
#Import Libraries
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu

import os

import pyLDAvis.gensim
import pickle 
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

#hide warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Dataset JIRA_filteredBy_Transfered_PRTSIR renamed to JIRA_filteredBy_Transfer_PRTSIR by vkb6bn on 2023-03-09 14:41:38
data = dataiku.Dataset("JIRA_filteredBy_fixed_transfer_dv22")
df = data.get_dataframe()
df.head()

Unnamed: 0,key,fields.resolution.name,fields.description_cleaned,fields.summary_cleaned
0,PRTSPR-6103,Fixed,co isotopic ratio enhancement pa alpha select ...,aos bl subscan not stop second
1,PRTSPR-34598,Transfer,antenna show flag tsys trx atm scan antenna at...,ape2 bl antenna flag datum atm scan band
2,PRTSPR-30539,Transfer,begin focus sb antenna dv05 not capable source...,ape2 bl dv05 axis go shutdown mount timeout error
3,PRTSPR-20455,Transfer,prtspr-20453 focus sb fail attempt scan occasi...,aos bl_pr#1 waituntilonsourcecb function got n...
4,PRTSPR-30808,Transfer,sb crash noformat summary timed wait second an...,ape2 bl da57 antenna not arrive source tiime


In [3]:
#drop missing values
df = df.dropna()
len(df)

598

In [4]:
#extract text from summary_cleaned
summary_corpus = df['fields.summary_cleaned'].tolist()
# Corpus as a list of text documents
#summary_corpus = [' '.join(text.split()) for text in summary_corpus]
summary_corpus

['aos bl subscan not stop second',
 'ape2 bl antenna flag datum atm scan band',
 'ape2 bl dv05 axis go shutdown mount timeout error',
 'aos bl_pr#1 waituntilonsourcecb function got no response antenna dv18 srm not initialize',
 'ape2 bl da57 antenna not arrive source tiime',
 'ape2 bl da49 devices go stop configured cambus error',
 'ape1 bl dv17 acu access mode undefined',
 'aos bl bad trx antenna science observation',
 'ape2 dv22 dv13 dv07 da65 da51 da45 da44 da43 tsys trx flagged bbp4 pol x.',
 'aos_bl_pr1 timed wait second correlator archive sub scan',
 'aos bl intermitent lose coherence bbpr2 polx bbpr3 polx bbpr4 poly',
 'ape1 bl timed wait second antenna source antennas not respond dv04',
 'ape1 bl handover da53 not source control gui stick',
 'aos bl dv22 high tsys bbpr pol band',
 'ape2 bl timed wait second antenna source dv18 subreflector not source',
 'aos handover dv22 lost communication antenna',
 'ape2 bl dv25 lose communication wvr',
 'aos bl mmex fail scan datum cluster 

In [5]:
#extract text from description_cleaned
description_corpus = df['fields.description_cleaned'].tolist()
# Corpus as a list of text documents
#description_corpus = [' '.join(text.split()) for text in description_corpus]
description_corpus

["co isotopic ratio enhancement pa alpha select merge luminous infrared galaxy version --fail-- start end project code pi toshikisaito schedblock ngc_3110_a_03_te execblock uid://a002 x856bb8 x7da sb uid uid://a001 x12b x193 qa0 status fail band alma_rb_03 alma build 201404-cycle2-on b-2014 array array013 array corr m]/64-antenna sbex fail start subscan scan error log javacontainer noformat 28t20:31:52.653 info control array013 subscan intent on_source take second start 28t20:31:56.590 info control acc javacontainer orb status connectionthreadsused=16 lose calls=0 requestqueuemaxusepercent=25 poa componentpoa_control array013 28t20:32:22.080 info control array013 moving antenna source specify ephemeris initial ra dec 28t20:32:28.816 error control array013 function time type=10006 code=7 message='problem determine antenna(s dv02 dv04 dv07 dv16 dv18 dv22 complete waituntilonsource command completion null perhaps timeout short assuming problem 28t20:32:30.890 info control array013 tuning 

In [6]:
#summary_corpus
summary_tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
summary_dtm_tf = summary_tf_vectorizer.fit_transform(summary_corpus)
summary_tfidf_vectorizer = TfidfVectorizer(**summary_tf_vectorizer.get_params())
summary_dtm_tfidf = summary_tfidf_vectorizer.fit_transform(summary_corpus)

#summary_dtm_tf
#summary_tfidf_vectorizer
#summary_dtm_tfidf

#description_corpus
description_tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
description_dtm_tf = description_tf_vectorizer.fit_transform(description_corpus)
description_tfidf_vectorizer = TfidfVectorizer(**description_tf_vectorizer.get_params())
description_dtm_tfidf = description_tfidf_vectorizer.fit_transform(description_corpus)

#description_dtm_tf
#description_tfidf_vectorizer
#description_dtm_tfidf

In [7]:
#summary_corpus
#for TF DTM
summary_lda_tf = LatentDirichletAllocation(n_components=10, random_state=0)
summary_lda_tf.fit(summary_dtm_tf)
# for TFIDF DTM
#lda_tfidf = LatentDirichletAllocation(n_components=4, random_state=1)
#lda_tfidf.fit(summary_dtm_tfidf)

#description_corpus
#for TF DTM
description_lda_tf = LatentDirichletAllocation(n_components=10, random_state=0)
description_lda_tf.fit(description_dtm_tf)
# for TFIDF DTM
#lda_tfidf = LatentDirichletAllocation(n_components=4, random_state=1)
#lda_tfidf.fit(description_dtm_tfidf)

LatentDirichletAllocation(random_state=0)

In [8]:
#Prepare the visualization for summary_corpus
summary_prepared_data = pyLDAvis.sklearn.prepare(summary_lda_tf, summary_dtm_tf, summary_tf_vectorizer)
summary_prepared_data.topic_coordinates

Unnamed: 0_level_0,x,y,topics,cluster,Freq
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
4,-0.068955,0.075615,1,1,21.559915
5,-0.000253,-0.145684,2,1,12.783571
6,-0.16354,-0.012689,3,1,11.952371
9,-0.162588,0.190805,4,1,11.230498
8,0.230266,0.088668,5,1,8.991471
3,-0.102423,-0.067275,6,1,7.621857
2,0.038528,-0.135672,7,1,7.397463
7,0.155672,0.124042,8,1,7.324115
1,0.070354,-0.102641,9,1,5.864785
0,0.002939,-0.015171,10,1,5.273955


In [9]:
#Prepare the visualization for description_corpus
description_prepared_data = pyLDAvis.sklearn.prepare(description_lda_tf, description_dtm_tf, description_tf_vectorizer)
description_prepared_data.topic_coordinates

Unnamed: 0_level_0,x,y,topics,cluster,Freq
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.383308,-0.073238,1,1,17.133312
2,-0.022238,-0.27838,2,1,15.271924
9,-0.161178,0.147197,3,1,13.708905
0,-0.106647,-0.009537,4,1,12.482907
7,-0.063492,-0.126554,5,1,11.082892
5,-0.161075,0.083532,6,1,8.329698
6,-0.082156,-0.028493,7,1,6.353024
4,0.378619,0.154513,8,1,5.945779
8,-0.082709,0.135109,9,1,5.17902
3,-0.082432,-0.004147,10,1,4.512539


In [10]:
#Summary_corpus 

summary_prepared_data.topic_coordinates['x'] = summary_prepared_data.topic_coordinates['x'].apply(lambda x: x.real)
summary_prepared_data.topic_coordinates['y'] = summary_prepared_data.topic_coordinates['y'].apply(lambda x: x.real)

pyLDAvis.display(summary_prepared_data)

In [11]:
#Description_corpus 

description_prepared_data.topic_coordinates['x'] = description_prepared_data.topic_coordinates['x'].apply(lambda x: x.real)
description_prepared_data.topic_coordinates['y'] = description_prepared_data.topic_coordinates['y'].apply(lambda x: x.real)

pyLDAvis.display(description_prepared_data)