## Topic Modeling with pyLDAvis

In [1]:
#Cleaning of text column completed before this step using Dataiku Text Preparation Plugin
#Resources: https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0
#https://towardsdatascience.com/topic-model-visualization-using-pyldavis-fecd7c18fbf6

In [2]:
#Import Libraries
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu

import os

import pyLDAvis.gensim
import pickle 
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

#hide warnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Read recipe inputs
Text_Cleaned = dataiku.Dataset("Text_Cleaned")
Text_Cleaned_df = Text_Cleaned.get_dataframe()
Text_Cleaned_df

Unnamed: 0,SE_DOWNTIMETYPE,SE_SUBJECT_concat_cleaned
0,Other,pr bl da63 control cppcontainer go shutdown pr...
1,Scheduling,bl ape2 tp statistic waiting end execution inc...
2,Technical,slt test please ignore pr bl totalpowerprocess...
3,Weather,bad wather bad weather bad weather shutdown ac...


In [4]:
# Extract text column as a list
text_list = Text_Cleaned_df['SE_SUBJECT_concat_cleaned'].tolist()
# Corpus as a list of text documents
corpus_full = [' '.join(text.split()) for text in text_list]

In [5]:
corpus_full

["pr bl da63 control cppcontainer go shutdown pr bl failed activate component control array001 totalpowerprocessor',pr bl interferometric point fail getinterferometryoffshoot alma-10_6_0-b-2014 00,pr6 tp not work fdm acs cm05 container crash array dv08 mount subreflector power failure pr1 bl correlator resource conflict receive callbacks expect 3',other da62 high tsys value da58 not lock band7,other da47 not lock band7,pr1 bl correlator resource conflict receive callbacks expect 3',pr1 bl corr cdp_node n16 lp fail stop continue stop sequence ,other dv06 sudden servo failure axis ,other da47 fail lock band 6.,other cm03 cm05 cm12 cm07 container crash time handover control go error dv09 fe wca3 show weird status failed activate component control array /totalpowerprocessor tfint pm01 beam distortion pr1 bl correlator resourse conflict value exception throw clustercommander.cpp:244 little node fail execute method startsubscansequence),other dv06 acd little motor not position',other da49 ac

In [6]:
len(corpus_full)

4

#### Vectorize & TFIDF

In [7]:
#tf_vectorizer = CountVectorizer(strip_accents = 'unicode',
#                                stop_words = 'english',
#                                lowercase = True,
#                                token_pattern = r'\b[a-zA-Z]{3,}\b',
#                                max_df = 0.95, 
#                                min_df = 1)

tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
dtm_tf = tf_vectorizer.fit_transform(corpus_full)
tfidf_vectorizer = TfidfVectorizer(**tf_vectorizer.get_params())
dtm_tfidf = tfidf_vectorizer.fit_transform(corpus_full)

In [8]:
#dtm_tf
#tfidf_vectorizer
#dtm_tfidf

#### LDA Model

In [9]:
# for TF DTM
lda_tf = LatentDirichletAllocation(n_components=4, random_state=0)
lda_tf.fit(dtm_tf)
# for TFIDF DTM
#lda_tfidf = LatentDirichletAllocation(n_components=4, random_state=1)
#lda_tfidf.fit(dtm_tfidf)

LatentDirichletAllocation(n_components=4, random_state=0)

#### Vizualize Topics

In [10]:
# define a dictionary of topic names
topic_names = {
    1: 'Technical',
    2: 'Weather',
    3: 'Scheduling',
    4: 'Other'
}

In [11]:
#Prepare the visualization
prepared_data = pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer)
#prepared_data.topic_coordinates.rename(index={0: 'Other', 2: 'Scheduling', 3: 'Technical', 1: 'Weather'}, inplace=True)

# map the topic numbers to their corresponding names
#prepared_data.topic_coordinates['topics'] = prepared_data.topic_coordinates['topics'].map(topic_names)

prepared_data.topic_coordinates

Unnamed: 0_level_0,x,y,topics,cluster,Freq
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
3,-0.2201,0.011012,1,1,71.935173
1,0.161453,0.214129,2,1,20.826245
2,0.157851,-0.211951,3,1,7.23499
0,-0.099204,-0.01319,4,1,0.003591


In [12]:
prepared_data.topic_coordinates['x'] = prepared_data.topic_coordinates['x'].apply(lambda x: x.real)
prepared_data.topic_coordinates['y'] = prepared_data.topic_coordinates['y'].apply(lambda x: x.real)

pyLDAvis.display(prepared_data)

In [13]:
# Write recipe outputs
Topic_Modeling3 = dataiku.Dataset("Topic_Modeling3")
#Topic_Modeling3.write_with_schema(Topic_Modeling3_df)