# Topic Modelling (using Gensim Python library)

### Use the Gensim Python libary to do topic modeling. 

pip install gensim at first

### IVisualize the LDA topics using pyLDAvis.  

pip install pyldavis at first


In [1]:

import pandas as pd 
import gensim
from gensim import corpora,models
from gensim.models import LdaModel, LsiModel
import warnings
warnings.filterwarnings("ignore")

## 1. Preprocessing 

In [2]:
# Read data
df = pd.read_csv('Canon_200+.csv')
df

Unnamed: 0,title,rating,body
0,"Convenient printer, but could stand a few impr...",4,I had a devil of a time downloading the driver...
1,"Definitely not a ""throw away"" printer!",4,"OK, I have waited a while to post this review ..."
2,Amazing printer! Photo quality (scanning and p...,5,"I love this printer.It prints quickly, scans i..."
3,make sure you have room for this printer,3,I did not anticipate it being this big but non...
4,Excellent Canon MX922 All IN One Printer!,5,I ordered the Canon Office and Business MX922 ...
...,...,...,...
185,It is a great printer.,1,It is a great printer......when it works. It ...
186,B200 error - printer died less than 2 years af...,2,"This printer was fine while it lasted, but the..."
187,This is awesome ink can be a little costly,5,"This is awesome ink can be a little costly, bu..."
188,so good. Easy setup,4,"So far, so good. Easy setup... Just follow the..."


In [3]:
#convert all review text into list format
reviews = df['body'].tolist()
reviews

["I had a devil of a time downloading the drivers for this printer, but that is probably a problem with my computer rather than the printer.  Otherwise, set-up is easy.  For those who need it, there IS a quite good on-line manual for this printer, but the set-up instructions that come with the box don't tell you that.  I can't even find in the instructions an answer to this question, though:  Can you set it up for BOTH wired contact with a desktop PC AND a wireless laptop at the same time?  Seems like a simple and common question to ask, and it's puzzling that Canon hasn't anticipated it.I am very happy with the printer's operation....it does 1- to 2-sided printing and copying quite easily.  There something quite confusing about the screen options on the front though.  There is simply no menu option for printing.  There are options for fax, copying and one or two other things, but I still haven't figured out how to switch FROM copying to normal printing; sometimes the PC just overrides

In [4]:
#remove '\r' and punctuations 
import string

new_reviews = []
for review in reviews: 
    for ch in review:
        if ch in string.punctuation:
            review = review.replace(ch,'') #replace punctuation with nothing
        if ch == '\r':
            review = review.replace(ch,' ') #replace \r with space
    new_reviews.append(review) 

new_reviews

['I had a devil of a time downloading the drivers for this printer but that is probably a problem with my computer rather than the printer  Otherwise setup is easy  For those who need it there IS a quite good online manual for this printer but the setup instructions that come with the box dont tell you that  I cant even find in the instructions an answer to this question though  Can you set it up for BOTH wired contact with a desktop PC AND a wireless laptop at the same time  Seems like a simple and common question to ask and its puzzling that Canon hasnt anticipated itI am very happy with the printers operationit does 1 to 2sided printing and copying quite easily  There something quite confusing about the screen options on the front though  There is simply no menu option for printing  There are options for fax copying and one or two other things but I still havent figured out how to switch FROM copying to normal printing sometimes the PC just overrides the printers setting but sometim

In [5]:
#remove stop words, to lowercase and tokenize
from nltk.corpus import stopwords
mystopwords = stopwords.words('english')

tokens_list = [[word for word in review.lower().split(' ') if word not in mystopwords and word.isalpha()]
         for review in new_reviews]

#remove words that appear only once
from collections import defaultdict
frequency = defaultdict(int)

for tokens in tokens_list:
    for token in tokens:
        frequency[token] += 1
        
tokens_list = [[token for token in tokens if frequency[token]>1]
              for tokens in tokens_list]

print(tokens_list)

[['time', 'drivers', 'printer', 'probably', 'problem', 'computer', 'rather', 'printer', 'otherwise', 'setup', 'easy', 'need', 'quite', 'good', 'online', 'manual', 'printer', 'setup', 'instructions', 'come', 'box', 'dont', 'tell', 'cant', 'even', 'find', 'instructions', 'question', 'though', 'set', 'wired', 'contact', 'desktop', 'pc', 'wireless', 'laptop', 'time', 'seems', 'like', 'simple', 'common', 'question', 'ask', 'canon', 'hasnt', 'iti', 'happy', 'printers', 'printing', 'copying', 'quite', 'easily', 'something', 'quite', 'confusing', 'screen', 'options', 'front', 'though', 'simply', 'menu', 'option', 'printing', 'options', 'fax', 'copying', 'one', 'two', 'things', 'still', 'havent', 'figured', 'switch', 'copying', 'normal', 'printing', 'sometimes', 'pc', 'printers', 'setting', 'sometimes', 'bought', 'replace', 'canon', 'bought', 'months', 'ago', 'printer', 'trays', 'broke', 'couple', 'days', 'kind', 'use', 'trays', 'without', 'could', 'never', 'figure', 'replace', 'printer', 'tray

## 2. Generate Term Document Matrix

In [6]:
# generate token dictionary class
dictionary = corpora.Dictionary(tokens_list) 
print(dictionary)

Dictionary(1027 unique tokens: ['ago', 'ask', 'better', 'bit', 'bought']...)


In [7]:
# generate a unique token list 
sort_token = sorted(dictionary.items(),key=lambda k:k[0], reverse = False)
unique_token = [token for (ID,token) in sort_token]

In [8]:
# build a corpus
corpus = [dictionary.doc2bow(tokens) for tokens in tokens_list]
print(corpus) 

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 2), (5, 1), (6, 1), (7, 2), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 3), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 2), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 2), (50, 1), (51, 2), (52, 1), (53, 6), (54, 2), (55, 3), (56, 1), (57, 1), (58, 2), (59, 3), (60, 1), (61, 2), (62, 1), (63, 1), (64, 1), (65, 1), (66, 1), (67, 2), (68, 1), (69, 1), (70, 1), (71, 2), (72, 1), (73, 1), (74, 1), (75, 1), (76, 2), (77, 3), (78, 3), (79, 1), (80, 1), (81, 1), (82, 1), (83, 1)], [(7, 3), (15, 2), (17, 1), (19, 2), (23, 1), (27, 1), (30, 4), (31, 1), (39, 4), (44, 1), (46, 3), (48, 2), (52, 1), (53, 8), (54, 2), (55, 3), (56, 1), (60, 1), (61, 1), (64, 1), (68, 1), (71, 1), (72, 2), (73, 1), (75, 1), (77, 1), (79, 2), (80, 7),

In [9]:
# Save a Term Document Matrix
import numpy as np
matrix = gensim.matutils.corpus2dense(corpus,num_terms=len(dictionary),dtype = 'int')
matrix = matrix.T #transpose the matrix 

#convert the numpy matrix into pandas data frame
matrix_df = pd.DataFrame(matrix, columns=unique_token)

#write matrix dataframe into csv
matrix_df.to_csv('Term_Document_matrix.csv')

## 3. Topic modeling using LDA

In [10]:
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=3) #fit lda model

lda.print_topics(10) # Topic matrix (V matrix)

[(0,
  '0.022*"printer" + 0.017*"print" + 0.013*"ink" + 0.010*"one" + 0.009*"set" + 0.009*"get" + 0.008*"easy" + 0.008*"great" + 0.008*"use" + 0.007*"paper"'),
 (1,
  '0.033*"printer" + 0.018*"one" + 0.012*"use" + 0.012*"print" + 0.011*"would" + 0.009*"quality" + 0.008*"paper" + 0.008*"canon" + 0.008*"get" + 0.007*"printing"'),
 (2,
  '0.042*"printer" + 0.023*"print" + 0.015*"canon" + 0.013*"printing" + 0.011*"paper" + 0.010*"one" + 0.008*"time" + 0.008*"like" + 0.008*"ink" + 0.007*"works"')]

In [11]:
# Generate U Matrix for LDA model
corpus_lda = lda[corpus] #transform lda model

#convert corpus_lda to numpy matrix
U_matrix_lda = gensim.matutils.corpus2dense(corpus_lda,num_terms=10).T

#write U_matrix into pandas dataframe and output
U_matrix_lda_df = pd.DataFrame(U_matrix_lda)
U_matrix_lda_df.to_csv('U_matrix_lda.csv')

In [12]:
print (matrix_df.shape)
print (U_matrix_lda_df.shape)

(190, 1027)
(190, 10)


## 3. Visualization of LDA topics using pyLDAvis

In [13]:

import pyLDAvis.gensim

In [14]:
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(lda, corpus, dictionary)