## Topic Modeling with pyLDAvis
-kernel = env mypython

In [1]:
#Import Libraries
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu

import os

import pyLDAvis.gensim
import pickle
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

#hide warnings
import warnings
warnings.filterwarnings('ignore')

In [6]:
data = dataiku.Dataset("ICT_TICKETS_verfied")
df = data.get_dataframe()
df.head()

Unnamed: 0,key,resolution_name,fields_resolutiondate,fields_created,time_span_days,priority_name,labels,status,components_0,components_1,components_2,description,summary
0,ICT-13684,Verified,2019-01-15 12:43:12-03:00,2018-10-26 07:17:10-03:00,81,High,"[""2018DEC"",""PhA-Verified""]",Closed,06-25 Aqua/QA0,,,Background:\n\nFor TP observing with a fixed o...,AQUA TP target time on-source with absolute of...
1,ICT-7342,Verified,2016-09-26 21:27:12-03:00,2016-05-09 15:02:54-03:00,140,Medium,,Closed,03 Control,,,If for some reason an antenna fails to publish...,PublishMountStatusData is flooding the alarm s...
2,ICT-13592,Verified,2018-11-28 17:36:11-03:00,2018-10-11 23:52:45-03:00,47,Medium,"[""OnKibana"",""simulation""]",Closed,03 Control,,,As it was noticed at [PRTSPR-36686|https://jir...,Increase setPoweredBands timeout
3,ICT-14557,Verified,2019-04-05 20:40:03-03:00,2019-04-01 21:32:18-03:00,3,High,,Closed,08 Scheduling,,,"One of the scheduling improvements, is that af...",SCHEDULING execution events filtering
4,ICT-14647,Verified,2019-08-20 21:55:04-04:00,2019-04-13 21:16:26-04:00,129,Medium,,Closed,01 ACA Control,,,From the investigation of https://jira.alma.cl...,ACACORR/CDPMIF: possible race condition betwee...


In [0]:
#drop missing values
#df = df.dropna()

In [7]:
len(df)

127

In [8]:
#extract text from summary_cleaned
summary_corpus = df['summary'].tolist()
# Corpus as a list of text documents
#summary_corpus = [' '.join(text.split()) for text in summary_corpus]
summary_corpus

['AQUA TP target time on-source with absolute off positions should use on position time and names',
 'PublishMountStatusData is flooding the alarm system on failure',
 'Increase  setPoweredBands timeout',
 'SCHEDULING execution events filtering',
 'ACACORR/CDPMIF: possible race condition between deactivation and wvr events',
 'Component references obtained by the total-power processor should not leak memory',
 'doSubscanSequence crash due to CORBA.TRANSIENT in timeToTune method',
 'Error retrieving final metadata',
 "AntLOControllerImpl getAttenuatorValues() and setAttenuatorValues() don't catch and log exceptions when accessing monitor points",
 'Antenna container crash while CASA MeasConvert internal destructor called.',
 'NC Supplier event type name memory leak',
 'Fix permission for loadLkmModule, unloadLkmModule, loadcalibrationTest, unloadcalibrationTest, loadswitchTest, unloadswitchTest',
 'ACACORR node fails to acquire mutex',
 'Array creation - Optimisation of the SAS Polarisa

In [9]:
len(summary_corpus)

127

In [10]:
#extract text from description_cleaned
description_corpus = df['description'].tolist()
# Corpus as a list of text documents
#description_corpus = [' '.join(text.split()) for text in description_corpus]
description_corpus

['Background:\n\nFor TP observing with a fixed off positions, PRTSPR-37055 found that the time per source was incorrectly calculated in AQUA QA0. It is getting <50% per target. \xa0this is for\xa0uid://A002/Xd3e89f/Xe4\n\nAlso, it incorrectly reports the target names as:\xa0OBSERVE_TARGET (28178-0091_OFF_0, 24013+0488_OFF_0, 31946+0076_OFF_0) \xa0(also wrong in the list of scans in AQUA)\xa0whereas they should be\xa028178-0091, 24013+0488, 31946+0076\n\nanother example is\xa0uid://A002/Xd3e89f/X8e21 \xa0 (for just one target)\xa0also\xa0uid://A002/Xd395f6/X15375\n\nRequirement:\n\nThe OFF positions should not be included in the target list.\xa0\n\nThe total on-source time should only include the ON time.',
 'If for some reason an antenna fails to publish mount status data to the CONTROL_REALTIME_XX notify service, it starts activating and deactivating the alarm around 20 times per second. This was seen in [PRTSIR-9899|http://jira.alma.cl/browse/PRTSIR-9899] and ended up crashing the al

In [11]:
len(description_corpus)

127

#### Vectorize & TFIDF

In [12]:
#summary_corpus
summary_tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
summary_dtm_tf = summary_tf_vectorizer.fit_transform(summary_corpus)
summary_tfidf_vectorizer = TfidfVectorizer(**summary_tf_vectorizer.get_params())
summary_dtm_tfidf = summary_tfidf_vectorizer.fit_transform(summary_corpus)

In [13]:
#summary_dtm_tf
#summary_tfidf_vectorizer
#summary_dtm_tfidf

In [14]:
#description_corpus
description_tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
description_dtm_tf = description_tf_vectorizer.fit_transform(description_corpus)
description_tfidf_vectorizer = TfidfVectorizer(**description_tf_vectorizer.get_params())
description_dtm_tfidf = description_tfidf_vectorizer.fit_transform(description_corpus)

In [15]:
#description_dtm_tf
#description_tfidf_vectorizer
#description_dtm_tfidf

#### LDA MODEL

In [16]:
#summary_corpus
#for TF DTM
summary_lda_tf = LatentDirichletAllocation(n_components=10, random_state=0)
summary_lda_tf.fit(summary_dtm_tf)
# for TFIDF DTM
lda_tfidf = LatentDirichletAllocation(n_components=4, random_state=1)
lda_tfidf.fit(summary_dtm_tfidf)

LatentDirichletAllocation(n_components=4, random_state=1)

In [17]:
#description_corpus
#for TF DTM
description_lda_tf = LatentDirichletAllocation(n_components=10, random_state=0)
description_lda_tf.fit(description_dtm_tf)
# for TFIDF DTM
lda_tfidf = LatentDirichletAllocation(n_components=4, random_state=1)
lda_tfidf.fit(description_dtm_tfidf)

LatentDirichletAllocation(n_components=4, random_state=1)

#### Vizualize Topics

In [18]:
#Prepare the visualization for summary_corpus
summary_prepared_data = pyLDAvis.sklearn.prepare(summary_lda_tf, summary_dtm_tf, summary_tf_vectorizer)
summary_prepared_data.topic_coordinates

Unnamed: 0_level_0,x,y,topics,cluster,Freq
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
7,0.072114,-0.149731,1,1,13.203353
3,-0.203471,-0.020087,2,1,12.472156
8,0.211517,0.030018,3,1,12.140839
6,0.055115,0.090768,4,1,9.681088
1,-0.011954,-0.183133,5,1,9.489927
4,-0.148902,0.036309,6,1,9.294094
9,0.046391,0.141231,7,1,9.124365
5,0.039448,-0.080678,8,1,8.969764
2,-0.022957,0.017694,9,1,8.486538
0,-0.037302,0.11761,10,1,7.137877


In [21]:
#Prepare the visualization for description_corpus
description_prepared_data = pyLDAvis.sklearn.prepare(description_lda_tf, description_dtm_tf, description_tf_vectorizer)
description_prepared_data.topic_coordinates

Unnamed: 0_level_0,x,y,topics,cluster,Freq
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.193311,-0.14471,1,1,14.352331
7,0.11979,0.086016,2,1,14.184915
5,-0.18659,-0.124967,3,1,13.053367
9,-0.055623,0.060314,4,1,11.196855
8,0.094589,0.034134,5,1,10.026431
2,0.141173,-0.093632,6,1,7.970397
3,-0.127165,0.209486,7,1,7.711263
0,-0.002465,0.110331,8,1,7.470257
6,0.017243,0.037822,9,1,7.093988
4,-0.194263,-0.174794,10,1,6.940196


In [19]:
#Summary_corpus

summary_prepared_data.topic_coordinates['x'] = summary_prepared_data.topic_coordinates['x'].apply(lambda x: x.real)
summary_prepared_data.topic_coordinates['y'] = summary_prepared_data.topic_coordinates['y'].apply(lambda x: x.real)

pyLDAvis.display(summary_prepared_data)

In [22]:
#Description_corpus

description_prepared_data.topic_coordinates['x'] = description_prepared_data.topic_coordinates['x'].apply(lambda x: x.real)
description_prepared_data.topic_coordinates['y'] = description_prepared_data.topic_coordinates['y'].apply(lambda x: x.real)

pyLDAvis.display(description_prepared_data)