## Topic Modeling with pyLDAvis
-kernel = env mypython

In [1]:
#Import Libraries
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu

import os

import pyLDAvis.gensim
import pickle
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

#hide warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
data = dataiku.Dataset("ICT_TICKETS_validated")
df = data.get_dataframe()
df.head()

Unnamed: 0,key,resolution_name,fields_resolutiondate,fields_created,time_span_days,priority_name,labels,status,components_0,components_1,components_2,description,summary
0,ICT-10644,Validated,2017-12-11 15:32:29-03:00,2017-08-10 16:11:27-04:00,122,High,"[""Telcal-Cy5TRR""]",Resolved,10 Telescope Calibration (TelCal),,,Telcal is crashing due a failed assertion on a...,Cycle5: Telcal failed assertion in boost multi...
1,ICT-8046,Validated,2018-11-27 22:02:03-03:00,2016-09-01 19:53:33-03:00,817,High,"[""CorrSubArrays""]",Closed,03-02 Baseline Correlator,,,symptoms reported in PRTSPR-21613 points to a ...,allocation algorithm mishandled SCC configurat...
2,ICT-14897,Validated,2019-06-19 13:07:07-04:00,2019-05-23 20:53:38-04:00,26,Medium,,Closed,03 Control,,,At [https://jira.alma.cl/browse/PRTSPR-39750] ...,TOTALPOWER container has excessive virtual mem...
3,ICT-13659,Validated,2019-03-06 18:48:16-03:00,2018-10-23 01:36:19-03:00,134,Medium,,Closed,11 SSR,11-01 Targets (scan execution),,A LONG polarization SB execution made with ONL...,PointingCalTarget accumulates the TelCal resul...
4,ICT-8124,Validated,2018-05-15 13:21:04-04:00,2016-09-13 21:54:34-03:00,608,Medium,"[""simulation""]",Closed,03 Control,,,The ArrayMountController ofshoot sends command...,Improve the way the array mount controller sen...


In [3]:
#drop missing values
#df = df.dropna()

In [4]:
len(df)

113

In [5]:
#extract text from summary_cleaned
summary_corpus = df['summary'].tolist()
# Corpus as a list of text documents
#summary_corpus = [' '.join(text.split()) for text in summary_corpus]
summary_corpus

['Cycle5: Telcal failed assertion in boost multi array',
 'allocation algorithm mishandled SCC configuration slots',
 'TOTALPOWER container has excessive virtual memory growth',
 'PointingCalTarget accumulates the TelCal result and passing all the results to CONTROL',
 'Improve the way the array mount controller send commands to each antenna',
 'alma.Control.InterferometryObservingMode#isObservable CORBA.Timeout (use callbacks for isObservable)',
 'AOS Check returns 502 tsys IndexError: list index out of range: in 1-pol datasets',
 'Array Panel takes up to 5 minutes to present its schedblock list',
 'Resolve a glich in the handling of clean-up targets (triggered by multi phase calibrator SB)',
 'Update QoS configuration file for tuning up bulk data transfer in APE2 and APE1',
 'DATACAPTURER references not being released after executions finish archiving, leading to DC using too many resources and becoming unresponsive',
 'CORR/CCC: scheduling conflict detected',
 'Improvements to SD so

In [6]:
len(summary_corpus)

113

In [7]:
#extract text from description_cleaned
description_corpus = df['description'].tolist()
# Corpus as a list of text documents
#description_corpus = [' '.join(text.split()) for text in description_corpus]
description_corpus

["Telcal is crashing due a failed assertion on a multi array structure, as consequence TECAL and TP container are crashing. \n\nTwo similar events reported in [PRTSPR-27565|https://jira.alma.cl/browse/PRTSPR-27565] and [PRTSPR-27569|https://jira.alma.cl/browse/PRTSPR-27569], \n\nAssigning as high priority, because is affecting E2E if Cycle5.\n\n{noformat}\n2017-08-10T06:07:42.526 [TELCAL - ] TELCAL/Array008/X1cb9/5/PhaseCal/ACA scanDuration=604.80sec integrationTime=1.01sec; windowSize:2 / 600 samples (scan subintegrations)\nmaciContainer: /alma/ACS-DEC2016/boost/include/boost/multi_array/base.hpp:136: Reference boost::detail::multi_array::value_accessor_n<T, NumDims>::access(boost::type<Reference>, boost::detail::multi_array::multi_array_base::index, TPtr, const boost::detail::multi_array::multi_array_base::size_type*, const boost::detail::multi_array::multi_array_base::index*, const boost::detail::multi_array::multi_array_base::index*) const [with Reference = boost::detail::multi_arr

In [8]:
len(description_corpus)

113

#### Vectorize & TFIDF

In [9]:
#summary_corpus
summary_tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
summary_dtm_tf = summary_tf_vectorizer.fit_transform(summary_corpus)
summary_tfidf_vectorizer = TfidfVectorizer(**summary_tf_vectorizer.get_params())
summary_dtm_tfidf = summary_tfidf_vectorizer.fit_transform(summary_corpus)

In [10]:
#summary_dtm_tf
#summary_tfidf_vectorizer
#summary_dtm_tfidf

In [11]:
#description_corpus
description_tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
description_dtm_tf = description_tf_vectorizer.fit_transform(description_corpus)
description_tfidf_vectorizer = TfidfVectorizer(**description_tf_vectorizer.get_params())
description_dtm_tfidf = description_tfidf_vectorizer.fit_transform(description_corpus)

In [12]:
#description_dtm_tf
#description_tfidf_vectorizer
#description_dtm_tfidf

#### LDA MODEL

In [13]:
#summary_corpus
#for TF DTM
summary_lda_tf = LatentDirichletAllocation(n_components=10, random_state=0)
summary_lda_tf.fit(summary_dtm_tf)
# for TFIDF DTM
lda_tfidf = LatentDirichletAllocation(n_components=4, random_state=1)
lda_tfidf.fit(summary_dtm_tfidf)

LatentDirichletAllocation(n_components=4, random_state=1)

In [14]:
#description_corpus
#for TF DTM
description_lda_tf = LatentDirichletAllocation(n_components=10, random_state=0)
description_lda_tf.fit(description_dtm_tf)
# for TFIDF DTM
lda_tfidf = LatentDirichletAllocation(n_components=4, random_state=1)
lda_tfidf.fit(description_dtm_tfidf)

LatentDirichletAllocation(n_components=4, random_state=1)

#### Vizualize Topics

In [16]:
#Prepare the visualization for summary_corpus
summary_prepared_data = pyLDAvis.sklearn.prepare(summary_lda_tf, summary_dtm_tf, summary_tf_vectorizer)
summary_prepared_data.topic_coordinates

Unnamed: 0_level_0,x,y,topics,cluster,Freq
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
8,-0.10054,0.169601,1,1,18.209354
6,-0.26363,-0.033744,2,1,14.663619
0,-0.049113,-0.072802,3,1,12.250738
1,0.074176,0.026542,4,1,11.442774
7,0.098289,-0.071579,5,1,8.733379
5,0.073477,0.089394,6,1,8.518226
3,0.083655,-0.027701,7,1,7.31146
2,-0.012076,-0.164947,8,1,6.907799
4,0.101287,0.036278,9,1,6.844622
9,-0.005525,0.048958,10,1,5.11803


In [17]:
#Prepare the visualization for description_corpus
description_prepared_data = pyLDAvis.sklearn.prepare(description_lda_tf, description_dtm_tf, description_tf_vectorizer)
description_prepared_data.topic_coordinates

Unnamed: 0_level_0,x,y,topics,cluster,Freq
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.107516,0.194932,1,1,19.041654
2,-0.134447,0.017749,2,1,18.634492
8,-0.31579,0.084425,3,1,16.426558
6,-0.178724,-0.021137,4,1,12.445664
3,-0.010373,-0.291595,5,1,7.094231
0,0.100608,0.101103,6,1,7.069555
9,0.058848,0.034737,7,1,6.697961
5,0.182824,0.011434,8,1,6.055481
4,0.070432,-0.054651,9,1,5.054464
7,0.119107,-0.076997,10,1,1.479939


In [18]:
#Summary_corpus

summary_prepared_data.topic_coordinates['x'] = summary_prepared_data.topic_coordinates['x'].apply(lambda x: x.real)
summary_prepared_data.topic_coordinates['y'] = summary_prepared_data.topic_coordinates['y'].apply(lambda x: x.real)

pyLDAvis.display(summary_prepared_data)

In [19]:
#Description_corpus

description_prepared_data.topic_coordinates['x'] = description_prepared_data.topic_coordinates['x'].apply(lambda x: x.real)
description_prepared_data.topic_coordinates['y'] = description_prepared_data.topic_coordinates['y'].apply(lambda x: x.real)

pyLDAvis.display(description_prepared_data)