## Topic Modeling with pyLDAvis
-kernel = env mypython

In [1]:
# Import libraries
# Dataiku API (dataset access) plus the usual data-handling stack.
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu

import os

# pyLDAvis and its adapter submodules. Of the adapters, only pyLDAvis.sklearn
# is used in the visible cells of this notebook.
import pyLDAvis.gensim
import pickle
import pyLDAvis
import pyLDAvis.sklearn
# Have pyLDAvis render its visualizations inline in the notebook.
pyLDAvis.enable_notebook()

# Bag-of-words / TF-IDF vectorizers and the LDA topic model.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Hide warnings
# NOTE(review): the 'ignore' action is installed without a category or module
# filter, so it silences every warning for the rest of the session, including
# deprecation notices from the libraries above. Consider narrowing it.
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Name of the Dataiku dataset holding the cleaned ticket text.
DATASET_NAME = "PRTSIR_dv22"

# Open a handle on the dataset, then pull it into a pandas DataFrame.
data = dataiku.Dataset(DATASET_NAME)
df = data.get_dataframe()

# Preview the first rows as the cell output.
df.head()

Unnamed: 0,key,description_cleaned,summary_cleaned
0,PRTSIR-2252,"error try run focus sbs bl corr version 0)""(pr...",aos blcorr not specify total power processor c...
1,PRTSIR-3330,-r -b 86.2433e9 -d -n fdm -w -s vy_cma fail fo...,aos bl error lock frontends pr2
2,PRTSIR-11648,observe -b --blanksky -c -m --array array003 -...,ape1 bl pr3 wait retune antenna dv23
3,PRTSIR-8202,execute mention sb correlator aca bl suddenly ...,aos bl aca correlator timeout finish scan plus...
4,PRTSIR-9098,run nogo version --fail-- start end project co...,aos dv01 srm connection failure


In [0]:
#drop missing values
#df = df.dropna()

In [3]:
# Row count of the loaded frame, taken before any null filtering.
df.shape[0]

415

In [4]:
# Keep only the rows that actually carry a cleaned description.
df = df[df['description_cleaned'].notna()]

In [5]:
# Extract the cleaned summary text as a plain list of documents (one string each).
# Only `description_cleaned` is null-filtered earlier in the notebook, so a row
# with a missing summary would otherwise reach CountVectorizer as NaN and make
# fit_transform fail. Drop those entries here; the summary corpus is modeled on
# its own, so it does not need to stay row-aligned with `df`.
summary_corpus = df['summary_cleaned'].dropna().tolist()
summary_corpus

['aos blcorr not specify total power processor consume datum antenna array',
 'aos bl error lock frontends pr2',
 'ape1 bl pr3 wait retune antenna dv23',
 'aos bl aca correlator timeout finish scan plus archive go error',
 'aos dv01 srm connection failure',
 'ape2 bl da58 device go stop psa check overtemp=',
 'ape2 bl pm04 timed wait second antenna source get no response antenna',
 'ape2 bl obs callback antlocontroller::lockfrontend function got error antenna',
 'aos da54 timedout wait second antenna source',
 'aos bl error execute sb control acc javacontainer',
 'aos crc03 mmex fail clue strange character square japanese aramaic like character appear shiflog',
 'aos bl error invoke observe mode function timed wait second correlator archive sub scan no point results!\\n',
 'aos bl da51 mount component not operational bus comunication error',
 'aos pr bl dv05 antenna not lock band',
 'aos da50 high trec polarization band',
 'da61 lpr configure',
 'aos bl dv01 control dv01 cppcontainer g

In [6]:
# Number of documents in the summary corpus (sanity check against the row count).
len(summary_corpus)

415

In [7]:
# Build the description corpus: one cleaned description string per remaining row,
# held as a plain Python list so it can be handed straight to the vectorizers.
description_corpus = df.loc[:, 'description_cleaned'].tolist()
description_corpus

['error try run focus sbs bl corr version 0)"(prtspr-4696 prtspr-4697 system complain know issue noformat 01t07:20:58.429 error control array030 error initialize observe mode type=10410 code=5 message=\'problem callback array=\'array030 01t07:20:58.377 error control array030 failure report callback type=10099 code=1 dv25= error trace code type file totalpowerimpl.cpp host dv25-abm line number process control dv25 cppcontainer routine virtual void control::totalpowerimpl::setdataconsumerasynch(const char control::antennacallback short description problem hardware source object control array030 thread orbtask additional data javaex.class value alma.acs.exceptions defaultacsjexception javaex.msg value error trace code type file totalpowerimpl.cpp host dv25-abm line number process control dv25 cppcontainer routine virtual void control::totalpowerimpl::setdataconsumer(const char short description problem hardware source object control array030 thread orbtask additional data detail value not

In [8]:
# Number of documents in the description corpus (sanity check against the row count).
len(description_corpus)

415

#### Vectorize & TFIDF

In [9]:
# Summary corpus: build a term-count (TF) matrix and a TF-IDF matrix.
# Vocabulary pruning: skip terms found in more than 95% of the documents or in
# fewer than 2 documents, and remove scikit-learn's built-in English stop words.
summary_tf_vectorizer = CountVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.95,
)
# Clone the count vectorizer's constructor settings so both share one configuration.
summary_tfidf_vectorizer = TfidfVectorizer(**summary_tf_vectorizer.get_params())

# Learn each vocabulary and produce the two document-term matrices.
summary_dtm_tf = summary_tf_vectorizer.fit_transform(summary_corpus)
summary_dtm_tfidf = summary_tfidf_vectorizer.fit_transform(summary_corpus)

In [10]:
#summary_dtm_tf
#summary_tfidf_vectorizer
#summary_dtm_tfidf

In [11]:
# Description corpus: build a term-count (TF) matrix and a TF-IDF matrix.
# Vocabulary pruning: skip terms found in more than 95% of the documents or in
# fewer than 2 documents, and remove scikit-learn's built-in English stop words.
description_tf_vectorizer = CountVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.95,
)
# Clone the count vectorizer's constructor settings so both share one configuration.
description_tfidf_vectorizer = TfidfVectorizer(**description_tf_vectorizer.get_params())

# Learn each vocabulary and produce the two document-term matrices.
description_dtm_tf = description_tf_vectorizer.fit_transform(description_corpus)
description_dtm_tfidf = description_tfidf_vectorizer.fit_transform(description_corpus)

In [12]:
#description_dtm_tf
#description_tfidf_vectorizer
#description_dtm_tfidf

#### LDA MODEL

In [13]:
# Summary corpus: 10-topic LDA trained on the term-count (TF) matrix.
# fit() hands back the estimator itself, so the chained call stores the fitted
# model; the bare name on the last line echoes its repr as the cell output.
summary_lda_tf = LatentDirichletAllocation(random_state=0, n_components=10).fit(summary_dtm_tf)
summary_lda_tf
# An LDA on the TF-IDF matrix (summary_dtm_tfidf) is not fitted here; an earlier
# draft sketched one with n_components=4, random_state=1.

LatentDirichletAllocation(random_state=0)

In [14]:
# Description corpus: 10-topic LDA trained on the term-count (TF) matrix.
# fit() hands back the estimator itself, so the chained call stores the fitted
# model; the bare name on the last line echoes its repr as the cell output.
description_lda_tf = LatentDirichletAllocation(random_state=0, n_components=10).fit(description_dtm_tf)
description_lda_tf
# An LDA on the TF-IDF matrix (description_dtm_tfidf) is not fitted here; an
# earlier draft sketched one with n_components=4, random_state=1.

LatentDirichletAllocation(random_state=0)

#### Visualize Topics

In [15]:
# Assemble the pyLDAvis payload for the summary model from the fitted LDA,
# its term-count matrix, and the vectorizer that produced that matrix.
summary_prepared_data = pyLDAvis.sklearn.prepare(
    summary_lda_tf,
    summary_dtm_tf,
    summary_tf_vectorizer,
)
# Show the per-topic coordinate table as the cell output.
summary_prepared_data.topic_coordinates

Unnamed: 0_level_0,x,y,topics,cluster,Freq
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
8,-0.077965,0.005395,1,1,17.492806
9,-0.07667,-0.02731,2,1,13.292986
7,-0.008136,-0.232317,3,1,12.762161
2,0.150022,-0.01186,4,1,10.630264
5,0.121341,0.087613,5,1,10.158005
1,-0.049055,-0.148006,6,1,9.992399
3,0.142585,0.070151,7,1,9.014597
6,0.120326,0.046754,8,1,6.269744
0,-0.202838,0.122988,9,1,6.065336
4,-0.119609,0.086593,10,1,4.321701


In [16]:
# Assemble the pyLDAvis payload for the description model from the fitted LDA,
# its term-count matrix, and the vectorizer that produced that matrix.
description_prepared_data = pyLDAvis.sklearn.prepare(
    description_lda_tf,
    description_dtm_tf,
    description_tf_vectorizer,
)
# Show the per-topic coordinate table as the cell output.
description_prepared_data.topic_coordinates

Unnamed: 0_level_0,x,y,topics,cluster,Freq
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
9,-0.138062,-0.059348,1,1,17.056255
4,-0.198469,0.036404,2,1,13.543836
1,0.015548,-0.241163,3,1,11.52798
0,-0.108944,-0.109466,4,1,10.814512
7,0.137884,-0.161335,5,1,10.775178
8,0.428833,0.034338,6,1,9.99395
3,-0.137984,-0.004268,7,1,9.808712
2,0.026374,0.279038,8,1,8.358242
6,-0.043848,0.220547,9,1,4.445653
5,0.018669,0.005254,10,1,3.675681


In [17]:
# Summary corpus: reduce each topic coordinate to its real component, then render.
# NOTE(review): presumably a guard against complex-valued coordinates coming out
# of pyLDAvis' dimensionality reduction; confirm it is still needed.
summary_coords = summary_prepared_data.topic_coordinates
for axis_name in ('x', 'y'):
    summary_coords[axis_name] = summary_coords[axis_name].apply(lambda value: value.real)

pyLDAvis.display(summary_prepared_data)

In [18]:
# Description corpus: reduce each topic coordinate to its real component, then render.
# NOTE(review): presumably a guard against complex-valued coordinates coming out
# of pyLDAvis' dimensionality reduction; confirm it is still needed.
description_coords = description_prepared_data.topic_coordinates
for axis_name in ('x', 'y'):
    description_coords[axis_name] = description_coords[axis_name].apply(lambda value: value.real)

pyLDAvis.display(description_prepared_data)