## Topic Modeling with pyLDAvis
-kernel = env mypython

In [1]:
#Import Libraries
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu

import os

import pyLDAvis.gensim
import pickle
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

#hide warnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
data = dataiku.Dataset("PRTSIR_Band")
df = data.get_dataframe()
df.head()

Unnamed: 0,key,description_cleaned,summary_cleaned
0,PRTSIR-9834,successfully finish follow sb noformat hub tel...,aos aca_7m_pr2 sb datum not find archive
1,PRTSIR-18629,happen fsr error messge noformat summary aband...,ape2 aca calibration[pre/0000000002 succeed ho...
2,PRTSIR-770,running noformat deeper look life cycle molecu...,aos aca yellow triangle scan
3,PRTSIR-9722,run delay model measurement version --fail-- s...,aos aca cm04 el stowpin retracted false timeou...
4,PRTSIR-9286,run noformat failure galactic star formation r...,aos aca failed receive correlation datum aca c...


In [0]:
#drop missing values
#df = df.dropna()

In [4]:
len(df)

3389

In [5]:
df = df.dropna(subset=['description_cleaned'])

In [6]:
#extract text from summary_cleaned
summary_corpus = df['summary_cleaned'].tolist()
# Corpus as a list of text documents
#summary_corpus = [' '.join(text.split()) for text in summary_corpus]
summary_corpus

['aos aca_7m_pr2 sb datum not find archive',
 'ape2 aca calibration[pre/0000000002 succeed however fail create calibration data[id=1',
 'aos aca yellow triangle scan',
 'aos aca cm04 el stowpin retracted false timeout true',
 'aos aca failed receive correlation datum aca cdpcs',
 'ape1 aca tp pm04 axis shutdown',
 'ape2 cm01 fe33 high trec band bbpr fe33',
 'aos bl sbex array instance no attribute getsinglefieldinterferometryobservingmode',
 'ape1 dv19 band trec flag spectral check band sweep',
 'ape1 bl dv03 anomalous tsys trx band pol scan',
 'aos_bl_pr1 dv09 band locking problem',
 'aos dv11 antenna go unaccessable',
 'aos aca no point result telcal exception',
 'aos bl invalid array error define array correlator',
 'aos bl dv17 fe not lock band6',
 'aos aca timed wait second correlator start sub scan',
 'aos bl error lock frontends pr2',
 'da61 very high trx band',
 'aos bl pr not lock photonic reference',
 'aos sb error invoke observe mode function',
 'aos dv14 suddenly container 

In [7]:
len(summary_corpus)

3389

In [8]:
#extract text from description_cleaned
description_corpus = df['description_cleaned'].tolist()
# Corpus as a list of text documents
#description_corpus = [' '.join(text.split()) for text in description_corpus]
description_corpus

 'happen fsr error messge noformat summary abandoning antenna motion tune need correlator calibration not calibrate correlator underlie error type=20000 code=0 error trace error trace code type file acacorr_observationcontrolimpl.cpp host coj cc-1 line number process acacorr observation_control cppcontainer routine acacorr_observationcontrolimpl::createcallbackparam short description failed handle interface method invocation source object thread orbtask additional data timestamp details error trace code type file calibrationexecutor.java host gas01 line number process control acc javacontainer routine docalibrations short description unrecoverable error occur source object control array3-aca thread requestprocessor-3013 additional data usererrormessage value abandoning antenna motion tune need correlator calibration javaex.class value alma controlexceptions.wrappers acsjfatalex error trace code type file arraycontrollerbase.java host gas01 line number process control acc javacontainer 

In [9]:
len(description_corpus)

3389

#### Vectorize & TFIDF

In [10]:
#summary_corpus
summary_tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
summary_dtm_tf = summary_tf_vectorizer.fit_transform(summary_corpus)
summary_tfidf_vectorizer = TfidfVectorizer(**summary_tf_vectorizer.get_params())
summary_dtm_tfidf = summary_tfidf_vectorizer.fit_transform(summary_corpus)

In [11]:
#summary_dtm_tf
#summary_tfidf_vectorizer
#summary_dtm_tfidf

In [12]:
#description_corpus
description_tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
description_dtm_tf = description_tf_vectorizer.fit_transform(description_corpus)
description_tfidf_vectorizer = TfidfVectorizer(**description_tf_vectorizer.get_params())
description_dtm_tfidf = description_tfidf_vectorizer.fit_transform(description_corpus)

In [13]:
#description_dtm_tf
#description_tfidf_vectorizer
#description_dtm_tfidf

#### LDA MODEL

In [14]:
#summary_corpus
#for TF DTM
summary_lda_tf = LatentDirichletAllocation(n_components=10, random_state=0)
summary_lda_tf.fit(summary_dtm_tf)
# for TFIDF DTM
#lda_tfidf = LatentDirichletAllocation(n_components=4, random_state=1)
#lda_tfidf.fit(summary_dtm_tfidf)

LatentDirichletAllocation(random_state=0)

In [17]:
#description_corpus
#for TF DTM
description_lda_tf = LatentDirichletAllocation(n_components=10, random_state=0)
description_lda_tf.fit(description_dtm_tf)
# for TFIDF DTM
#lda_tfidf = LatentDirichletAllocation(n_components=4, random_state=1)
#lda_tfidf.fit(description_dtm_tfidf)

LatentDirichletAllocation(random_state=0)

#### Vizualize Topics

In [18]:
#Prepare the visualization for summary_corpus
summary_prepared_data = pyLDAvis.sklearn.prepare(summary_lda_tf, summary_dtm_tf, summary_tf_vectorizer)
summary_prepared_data.topic_coordinates

Unnamed: 0_level_0,x,y,topics,cluster,Freq
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
4,-0.282622,0.069735,1,1,16.782459
8,0.189675,0.184014,2,1,15.203467
5,-0.152336,0.057567,3,1,12.724393
3,0.080032,-0.143173,4,1,10.149932
9,0.050381,-0.000473,5,1,9.858323
7,-0.096541,-0.021993,6,1,8.968292
6,0.089507,-0.176565,7,1,8.187902
2,-0.197164,-0.0204,8,1,7.526326
0,0.183222,0.163566,9,1,6.846476
1,0.135847,-0.112278,10,1,3.75243


In [19]:
#Prepare the visualization for description_corpus
description_prepared_data = pyLDAvis.sklearn.prepare(description_lda_tf, description_dtm_tf, description_tf_vectorizer)
description_prepared_data.topic_coordinates

Unnamed: 0_level_0,x,y,topics,cluster,Freq
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
8,-0.10065,-0.185548,1,1,26.111052
1,-0.15345,0.000263,2,1,12.844936
5,-0.094482,-0.09369,3,1,9.670926
2,0.068687,0.190208,4,1,9.642727
7,0.439991,0.000579,5,1,9.2772
6,-0.088135,0.042612,6,1,8.198135
4,0.105664,-0.249196,7,1,7.520711
9,-0.162563,0.046244,8,1,6.698667
0,-0.091692,0.13217,9,1,5.270406
3,0.07663,0.116359,10,1,4.76524


In [20]:
#Summary_corpus

summary_prepared_data.topic_coordinates['x'] = summary_prepared_data.topic_coordinates['x'].apply(lambda x: x.real)
summary_prepared_data.topic_coordinates['y'] = summary_prepared_data.topic_coordinates['y'].apply(lambda x: x.real)

pyLDAvis.display(summary_prepared_data)

In [21]:
#Description_corpus

description_prepared_data.topic_coordinates['x'] = description_prepared_data.topic_coordinates['x'].apply(lambda x: x.real)
description_prepared_data.topic_coordinates['y'] = description_prepared_data.topic_coordinates['y'].apply(lambda x: x.real)

pyLDAvis.display(description_prepared_data)