In [1]:
# Install pyLDAvis, which provides the pyLDAvis.gensim module imported further down.
!pip install pyLDAvis



In [2]:

import re

import numpy as np
import pandas as pd

# NLTK
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

# Visualization
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib
%matplotlib inline
import seaborn as sns

# Model 1: Food and Animals

In [3]:
# Toy corpus for Model 1: two sentences about food, two about animals,
# and a final one that mentions both.
doc_1 = "I like to eat broccoli and bananas."
doc_2 = "I ate a banana and spinach smoothie for breakfast."
doc_3 = "Chinchillas and kittens are cute."
doc_4 = "My sister adopted a kitten yesterday."
doc_5 = "Look at this cute hamster munching on a piece of broccoli."

## Step 1: Preprocess our text.

In [4]:
def text_process(text):
    '''
    Clean one raw document and return its tokens.

    Performs the following, in order:
        1. Tokenizes on runs of word characters, which also removes punctuation
        2. Lowercases the tokens and removes English stopwords
        3. Stems the remaining tokens with the Porter stemmer
        4. Removes the first token equal to 'b', if there is one
        5. Returns a list of the cleaned tokens

    Parameters
    ----------
    text : str or missing value
        The raw document. Anything pd.isnull() reports as missing
        (None, NaN, ...) is treated as an empty document.

    Returns
    -------
    list of str
        The cleaned, stemmed tokens in their original order; [] for a
        missing document.
    '''
    # A missing document has nothing to tokenize.
    if pd.isnull(text):
        return []

    # tokenizing and removing punctuation
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)

    # removing any stopwords: fetch the stopword list once per call and keep
    # it in a set, instead of fetching it again and scanning it for every token
    english_stopwords = set(stopwords.words('english'))
    lowered = (token.lower() for token in tokens)
    tokens = [word for word in lowered if word not in english_stopwords]

    # stemming
    porter_stemmer = PorterStemmer()
    tokens = [porter_stemmer.stem(word) for word in tokens]

    # Drop one stray 'b' token when present. Only the first occurrence is
    # removed, and its absence is not an error.
    # NOTE(review): presumably this strips a leftover bytes-literal prefix
    # (b'...') from the raw text -- confirm against the data source.
    if 'b' in tokens:
        tokens.remove('b')

    return tokens  # returned as a list of separate tokens, not re-joined into one string

In [5]:
# Spot-check the cleaning function on the first toy document.
text_process(doc_1)

['like', 'eat', 'broccoli', 'banana']

In [6]:
# Clean every toy document with the same function, keeping their order.
texts = [text_process(doc) for doc in (doc_1, doc_2, doc_3, doc_4, doc_5)]

In [7]:
# Display the cleaned token lists, one list per toy document.
texts

[['like', 'eat', 'broccoli', 'banana'],
 ['ate', 'banana', 'spinach', 'smoothi', 'breakfast'],
 ['chinchilla', 'kitten', 'cute'],
 ['sister', 'adopt', 'kitten', 'yesterday'],
 ['look', 'cute', 'hamster', 'munch', 'piec', 'broccoli']]

## Step 2: Fit LDA Model.

In [8]:
from gensim import corpora, models
import pyLDAvis.gensim

# Let pyLDAvis render its output inline in the notebook.
pyLDAvis.enable_notebook()

# Seed NumPy's global random number generator.
# NOTE(review): presumably this is what makes the LDA fits below repeatable,
# since no random_state is passed to LdaModel -- confirm against gensim docs.
np.random.seed(42)

In [9]:
# Assign an integer id to every distinct token in the cleaned documents.
dictionary = corpora.Dictionary(texts)

# Encode each document as a bag of words: a list of (token id, count) pairs.
corpus = [dictionary.doc2bow(tokens) for tokens in texts]

# Fit the topic model on the toy corpus.
ldamodel = models.ldamodel.LdaModel(
    corpus,                    # the bag-of-words corpus built above
    id2word=dictionary,        # maps each token id back to its word
    num_topics=2,              # number of topics T to find
    passes=5,                  # passes through the corpus; similar to epochs
    minimum_probability=0.01,  # only include topics above this probability threshold
)

In [10]:
# Print the bag-of-words encoding of each document, one document per line.
for tokens in texts:
    bow = dictionary.doc2bow(tokens)
    print(bow)

[(0, 1), (1, 1), (2, 1), (3, 1)]
[(0, 1), (4, 1), (5, 1), (6, 1), (7, 1)]
[(8, 1), (9, 1), (10, 1)]
[(10, 1), (11, 1), (12, 1), (13, 1)]
[(1, 1), (9, 1), (14, 1), (15, 1), (16, 1), (17, 1)]


## Step 3: Visualize LDA model.

In [11]:
# Prepare the pyLDAvis view of the fitted model; being the last expression
# in the cell, its result is what the notebook displays.
pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  return pd.concat([default_term_info] + list(topic_dfs))


## Step 4: Update model with new data!

In [12]:
# A new, unseen document used to update the already-fitted model.
doc_6 = 'That cat is so cute! It looks good enough to eat.'

In [13]:
# Clean the new document with the same function used for the original corpus.
new_doc = text_process(doc_6)

In [14]:
# Update the existing model with the single new document, encoded against the
# existing dictionary (update() takes a corpus, hence the one-element list).
# NOTE(review): doc2bow is called without allow_update, so tokens of the new
# document that are not already in `dictionary` are presumably ignored --
# confirm against the gensim documentation.
ldamodel.update([dictionary.doc2bow(new_doc)])

In [15]:
# Re-draw the visualization after the update.
# NOTE(review): `corpus` still holds only the five original documents; the new
# document was passed to update() but never appended to `corpus`.
pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  return pd.concat([default_term_info] + list(topic_dfs))


# Model 2: Yelp Reviews

In [16]:
# Load the Yelp reviews into a DataFrame. The path is relative, so the file is
# looked up one directory above the notebook's working directory.
review = pd.read_json("../yelp_academic_dataset_review.json")

In [17]:
# Peek at the first few rows to see which columns are available.
review.head()

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,votes
0,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,fWKvX83p0-ka4JS3dc6E5A,5,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,"{'funny': 0, 'useful': 5, 'cool': 2}"
1,ZRJwVLyzEJq1VAihDhYiow,2011-07-27,IjZ33sJrzXqU-0X6U8NwyA,5,I have no idea why some people give bad review...,review,0a2KyEL0d3Yb1V6aivbIuQ,"{'funny': 0, 'useful': 0, 'cool': 0}"
10,AsSCv0q_BWqIe3mX2JqsOQ,2010-06-16,E11jzpKz9Kw5K7fuARWfRw,5,The oldish man who owns the store is as sweet ...,review,-OMlS6yWkYjVldNhC31wYg,"{'funny': 1, 'useful': 3, 'cool': 1}"
100,CrBsdxqOjPdnfsDxV89GJQ,2010-02-17,SmUMyCUNrT9HEo_DXdgUuQ,4,I have to admit that I find myself thinking th...,review,bZFRqP7s0Vszxeu8_IwYow,"{'funny': 0, 'useful': 1, 'cool': 0}"
1000,vfLog2bLJGl6hAFtGtr0GQ,2012-10-30,bmsk5foqmcBgQ_pylNDpCw,4,Great atmosphere with interesting lights (look...,review,3ltazFFclBfchSYlctX6iA,"{'funny': 0, 'useful': 0, 'cool': 0}"


In [18]:
# (number of rows, number of columns) of the loaded reviews.
review.shape

(229907, 8)

In [19]:
# Start a fresh list for the review tokens; `texts` previously held the
# cleaned toy documents of Model 1.
texts = []

In [20]:
import time

In [21]:
# Preprocess the first N_REVIEWS reviews (row labels 0 .. N_REVIEWS - 1),
# printing the elapsed time every PROGRESS_EVERY reviews.
N_REVIEWS = 10000
PROGRESS_EVERY = 1000

t0 = time.time()

# Build the list from scratch inside this cell, so that re-running the cell
# rebuilds `texts` instead of appending a second copy of every review.
texts = []

for i in range(N_REVIEWS):
    if i % PROGRESS_EVERY == 0:
        print("Iteration {}".format(i))
        print(str(time.time() - t0) + " seconds elapsed.")
    # .loc selects by row label, not by position.
    texts.append(text_process(review.loc[i, 'text']))

Iteration 0
0.000324249267578125 seconds elapsed.
Iteration 1000
25.63820219039917 seconds elapsed.
Iteration 2000
52.177359104156494 seconds elapsed.
Iteration 3000
79.06812024116516 seconds elapsed.
Iteration 4000
105.75904703140259 seconds elapsed.
Iteration 5000
131.3081660270691 seconds elapsed.
Iteration 6000
159.84377312660217 seconds elapsed.
Iteration 7000
188.4672031402588 seconds elapsed.
Iteration 8000
216.40954208374023 seconds elapsed.
Iteration 9000
245.6086630821228 seconds elapsed.


In [22]:
# Inspect the cleaned tokens of the first processed review.
texts[0]

['wife',
 'took',
 'birthday',
 'breakfast',
 'excel',
 'weather',
 'perfect',
 'made',
 'sit',
 'outsid',
 'overlook',
 'ground',
 'absolut',
 'pleasur',
 'waitress',
 'excel',
 'food',
 'arriv',
 'quickli',
 'semi',
 'busi',
 'saturday',
 'morn',
 'look',
 'like',
 'place',
 'fill',
 'pretti',
 'quickli',
 'earlier',
 'get',
 'better',
 'favor',
 'get',
 'bloodi',
 'mari',
 'phenomen',
 'simpli',
 'best',
 'ever',
 'pretti',
 'sure',
 'use',
 'ingredi',
 'garden',
 'blend',
 'fresh',
 'order',
 'amaz',
 'everyth',
 'menu',
 'look',
 'excel',
 'white',
 'truffl',
 'scrambl',
 'egg',
 'veget',
 'skillet',
 'tasti',
 'delici',
 'came',
 '2',
 'piec',
 'griddl',
 'bread',
 'amaz',
 'absolut',
 'made',
 'meal',
 'complet',
 'best',
 'toast',
 'ever',
 'anyway',
 'wait',
 'go',
 'back']

In [26]:
# Assign an integer id to every distinct token in the cleaned reviews.
dictionary = corpora.Dictionary(texts)

# Encode each review as a bag of words.
corpus = [dictionary.doc2bow(tokens) for tokens in texts]

# Fit the topic model on the review corpus.
# NOTE(review): the topic listing saved at the end of this notebook shows
# topics 0-11, so it came from a run with a different num_topics than the
# 10 set here; re-run the notebook to refresh it.
ldamodel = models.ldamodel.LdaModel(
    corpus,                    # the bag-of-words corpus built above
    id2word=dictionary,        # maps each token id back to its word
    num_topics=10,             # number of topics T to find
    passes=5,                  # passes through the corpus; similar to epochs
    minimum_probability=0.01,  # only include topics above this probability threshold
)

In [27]:
# Prepare the pyLDAvis view of the review topic model; being the last
# expression in the cell, its result is what the notebook displays.
pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [25]:
# Print each topic with its five highest-weighted words.
# NOTE(review): num_topics=40 is more than the model was fitted with;
# presumably gensim just returns every topic in that case -- confirm.
topic_summaries = ldamodel.print_topics(num_topics=40, num_words=5)
for topic in topic_summaries:
    print(topic)

(0, '0.039*"ice" + 0.037*"cream" + 0.031*"chocol" + 0.023*"flavor" + 0.022*"cake"')
(1, '0.036*"class" + 0.017*"school" + 0.013*"year" + 0.011*"festiv" + 0.010*"student"')
(2, '0.014*"nail" + 0.009*"time" + 0.008*"use" + 0.008*"well" + 0.008*"call"')
(3, '0.022*"yogurt" + 0.022*"gyro" + 0.018*"frozen" + 0.016*"hike" + 0.015*"pet"')
(4, '0.104*"dog" + 0.020*"hot" + 0.016*"ny" + 0.015*"park" + 0.012*"can"')
(5, '0.031*"pho" + 0.030*"noodl" + 0.021*"roll" + 0.016*"spring" + 0.016*"asian"')
(6, '0.023*"buffet" + 0.017*"food" + 0.015*"indian" + 0.012*"place" + 0.012*"restaur"')
(7, '0.027*"fri" + 0.022*"burger" + 0.016*"good" + 0.015*"place" + 0.015*"like"')
(8, '0.035*"game" + 0.034*"mall" + 0.016*"place" + 0.015*"park" + 0.015*"watch"')
(9, '0.012*"spa" + 0.011*"amaz" + 0.011*"beauti" + 0.011*"resort" + 0.010*"design"')
(10, '0.072*"breakfast" + 0.036*"egg" + 0.019*"toast" + 0.017*"brunch" + 0.016*"bacon"')
(11, '0.023*"us" + 0.021*"server" + 0.020*"hour" + 0.016*"happi" + 0.013*"food"')
