## Topic Modeling with pyLDAvis
-kernel = env mypython

In [1]:
#Import Libraries
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu

import os

import pyLDAvis.gensim
import pickle
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

#hide warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
data = dataiku.Dataset("PRTSIR_filter_antenna")
df = data.get_dataframe()
df.head()

Unnamed: 0,key,description_cleaned,summary_cleaned
0,PRTSIR-18629,happen fsr error messge noformat summary aband...,ape2 aca calibration[pre/0000000002 succeed ho...
1,PRTSIR-9722,run delay model measurement version --fail-- s...,aos aca cm04 el stowpin retracted false timeou...
2,PRTSIR-16042,observe da41 crash axis change standby noforma...,ape2 bl da41 axis go standby
3,PRTSIR-16409,noformat uid://a002 xe64b7b x16027 atmosphere ...,ape2 cm01 fe33 high trec band bbpr fe33
4,PRTSIR-1342,observatory calibration amplitude grid monitor...,aos bl sbex array instance no attribute getsin...


In [3]:
#drop missing values
#df = df.dropna()

In [4]:
len(df)

4046

In [5]:
df = df.dropna(subset=['description_cleaned'])

In [6]:
#extract text from summary_cleaned
summary_corpus = df['summary_cleaned'].tolist()
# Corpus as a list of text documents
#summary_corpus = [' '.join(text.split()) for text in summary_corpus]
summary_corpus

['ape2 aca calibration[pre/0000000002 succeed however fail create calibration data[id=1',
 'aos aca cm04 el stowpin retracted false timeout true',
 'ape2 bl da41 axis go standby',
 'ape2 cm01 fe33 high trec band bbpr fe33',
 'aos bl sbex array instance no attribute getsinglefieldinterferometryobservingmode',
 'ape1 dv19 band trec flag spectral check band sweep',
 'aos_bl_pr1 dv09 band locking problem',
 'aos dv11 antenna go unaccessable',
 'aos bl invalid array error define array correlator',
 'aos da62 antenna not operational',
 'ape1 bl dv07 srm error cause science sb fail',
 'aos blcorr not specify total power processor consume datum antenna array',
 'aos handover bl_pr#2 dv02 ifproc1 go stop',
 'aos bl error lock frontends pr2',
 'dv06 psd go shutdown hvac problem',
 'aos clo antenna come problem synchronize lorr',
 'antenna crash ace_cdr::total_length message',
 'aos handover da58 el drive not initialize brake disengage time',
 'aos bl pr not lock photonic reference',
 'aos sb err

In [7]:
len(summary_corpus)

4046

In [8]:
#extract text from description_cleaned
description_corpus = df['description_cleaned'].tolist()
# Corpus as a list of text documents
#description_corpus = [' '.join(text.split()) for text in description_corpus]
description_corpus

['happen fsr error messge noformat summary abandoning antenna motion tune need correlator calibration not calibrate correlator underlie error type=20000 code=0 error trace error trace code type file acacorr_observationcontrolimpl.cpp host coj cc-1 line number process acacorr observation_control cppcontainer routine acacorr_observationcontrolimpl::createcallbackparam short description failed handle interface method invocation source object thread orbtask additional data timestamp details error trace code type file calibrationexecutor.java host gas01 line number process control acc javacontainer routine docalibrations short description unrecoverable error occur source object control array3-aca thread requestprocessor-3013 additional data usererrormessage value abandoning antenna motion tune need correlator calibration javaex.class value alma controlexceptions.wrappers acsjfatalex error trace code type file arraycontrollerbase.java host gas01 line number process control acc javacontainer 

In [9]:
len(description_corpus)

4046

#### Vectorize & TFIDF

In [11]:
#summary_corpus
summary_tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
summary_dtm_tf = summary_tf_vectorizer.fit_transform(summary_corpus)
summary_tfidf_vectorizer = TfidfVectorizer(**summary_tf_vectorizer.get_params())
summary_dtm_tfidf = summary_tfidf_vectorizer.fit_transform(summary_corpus)

In [12]:
#summary_dtm_tf
#summary_tfidf_vectorizer
#summary_dtm_tfidf

In [13]:
#description_corpus
description_tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
description_dtm_tf = description_tf_vectorizer.fit_transform(description_corpus)
description_tfidf_vectorizer = TfidfVectorizer(**description_tf_vectorizer.get_params())
description_dtm_tfidf = description_tfidf_vectorizer.fit_transform(description_corpus)

In [14]:
#description_dtm_tf
#description_tfidf_vectorizer
#description_dtm_tfidf

#### LDA MODEL

In [15]:
#summary_corpus
#for TF DTM
summary_lda_tf = LatentDirichletAllocation(n_components=10, random_state=0)
summary_lda_tf.fit(summary_dtm_tf)
# for TFIDF DTM
#lda_tfidf = LatentDirichletAllocation(n_components=4, random_state=1)
#lda_tfidf.fit(summary_dtm_tfidf)

LatentDirichletAllocation(random_state=0)

In [16]:
#description_corpus
#for TF DTM
description_lda_tf = LatentDirichletAllocation(n_components=10, random_state=0)
description_lda_tf.fit(description_dtm_tf)
# for TFIDF DTM
#lda_tfidf = LatentDirichletAllocation(n_components=4, random_state=1)
#lda_tfidf.fit(description_dtm_tfidf)

LatentDirichletAllocation(random_state=0)

#### Vizualize Topics

In [17]:
#Prepare the visualization for summary_corpus
summary_prepared_data = pyLDAvis.sklearn.prepare(summary_lda_tf, summary_dtm_tf, summary_tf_vectorizer)
summary_prepared_data.topic_coordinates

Unnamed: 0_level_0,x,y,topics,cluster,Freq
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2,-0.108692,0.231802,1,1,13.048079
1,0.154847,-0.025773,2,1,12.004483
6,-0.042204,0.165147,3,1,10.880438
3,-0.104517,0.003894,4,1,10.375798
5,0.189585,-0.027621,5,1,10.141733
0,0.17777,-0.002672,6,1,9.706657
9,0.201576,-0.035596,7,1,9.570022
8,-0.128163,-0.171858,8,1,9.041604
7,-0.152496,0.046797,9,1,8.180659
4,-0.187705,-0.18412,10,1,7.050527


In [18]:
#Prepare the visualization for description_corpus
description_prepared_data = pyLDAvis.sklearn.prepare(description_lda_tf, description_dtm_tf, description_tf_vectorizer)
description_prepared_data.topic_coordinates

Unnamed: 0_level_0,x,y,topics,cluster,Freq
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
7,0.156752,-0.048383,1,1,21.138319
6,0.078822,-0.108364,2,1,16.370861
5,0.127203,-0.028785,3,1,14.108711
1,-0.429853,-0.2004,4,1,10.97553
2,0.015519,0.086414,5,1,8.12551
0,-0.158075,0.232064,6,1,6.729089
9,0.051126,-0.045003,7,1,6.182292
4,-0.079165,0.159493,8,1,6.137201
3,0.158903,-0.154499,9,1,5.136048
8,0.078768,0.107462,10,1,5.096441


In [19]:
#Summary_corpus

summary_prepared_data.topic_coordinates['x'] = summary_prepared_data.topic_coordinates['x'].apply(lambda x: x.real)
summary_prepared_data.topic_coordinates['y'] = summary_prepared_data.topic_coordinates['y'].apply(lambda x: x.real)

pyLDAvis.display(summary_prepared_data)

In [20]:
#Description_corpus

description_prepared_data.topic_coordinates['x'] = description_prepared_data.topic_coordinates['x'].apply(lambda x: x.real)
description_prepared_data.topic_coordinates['y'] = description_prepared_data.topic_coordinates['y'].apply(lambda x: x.real)

pyLDAvis.display(description_prepared_data)