In [1]:
!pip install pyLDAvis

Collecting pyLDAvis
[?25l  Downloading https://files.pythonhosted.org/packages/a5/3a/af82e070a8a96e13217c8f362f9a73e82d61ac8fff3a2561946a97f96266/pyLDAvis-2.1.2.tar.gz (1.6MB)
[K    100% |████████████████████████████████| 1.6MB 2.1MB/s ta 0:00:011
Collecting numexpr (from pyLDAvis)
[?25l  Downloading https://files.pythonhosted.org/packages/e2/ef/f37e4f11eadc37af2aaf85cd8b13ca27724f67fc28c185bac6eaa8bddb03/numexpr-2.6.5-cp36-cp36m-macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl (166kB)
[K    100% |████████████████████████████████| 174kB 2.0MB/s ta 0:00:01
[?25hCollecting pytest (from pyLDAvis)
[?25l  Downloading https://files.pythonhosted.org/packages/d3/75/e79b66c9fe6166a90004bb8fb02bab06213c3348e93f3be41d7eaf625554/pytest-3.6.1-py2.py3-none-any.whl (194kB)
[K    100% |████████████████████████████████| 194kB 1.8MB/s ta 0:00:01
[?25hCollecting future (from pyLDAvis)
[?25l  Downloading https://files.pythonhosted.org/packages/00/2

In [9]:
import re

import pandas as pd
import numpy as np

# NLTK
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
lemmatizer = WordNetLemmatizer()

# Visualization
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib
%matplotlib inline
import seaborn as sns

# Model 1: Food and Animals

In [5]:
# Toy corpus of five short sentences used to demonstrate the LDA workflow.
doc_1, doc_2, doc_3, doc_4, doc_5 = (
    'I like to eat broccoli and bananas.',
    'I ate a banana and spinach smoothie for breakfast.',
    'Chinchillas and kittens are cute.',
    'My sister adopted a kitten yesterday.',
    'Look at this cute hamster munching on a piece of broccoli.',
)

## Step 1: Preprocess our text.

In [19]:
def text_process(text):
    '''
    Clean a raw string into a list of lower-cased, stemmed tokens.

    Steps:
        1. Tokenizes on runs of word characters (which drops punctuation)
        2. Lower-cases the tokens and removes English stopwords
        3. Stems each remaining token with the Porter stemmer
        4. Drops the first stray 'b' token, if one is present

    Parameters
    ----------
    text : str (or a null value such as None / NaN)
        The raw document text.

    Returns
    -------
    list of str
        The cleaned tokens in their original order, returned as separate
        list items rather than re-joined into one string. An empty list
        is returned when `text` is null.
    '''
    if pd.isnull(text):
        return []

    # tokenizing and removing punctuation
    tokens = RegexpTokenizer(r'\w+').tokenize(text)

    # removing any stopwords: build the lookup once per call instead of
    # re-reading the stopword list for every single token
    english_stopwords = set(stopwords.words('english'))
    lowered = (token.lower() for token in tokens)
    kept = [token for token in lowered if token not in english_stopwords]

    # stemming
    porter_stemmer = PorterStemmer()
    stemmed = [porter_stemmer.stem(token) for token in kept]

    # Remove one leftover 'b' token when present.
    # NOTE(review): presumably an artifact of stringified bytes literals
    # (b'...') in some input data - confirm against the real dataset.
    if 'b' in stemmed:
        stemmed.remove('b')

    return stemmed

In [20]:
# Sanity check: run the preprocessing on the first toy document and inspect the tokens.
text_process(doc_1)

['like', 'eat', 'broccoli', 'banana']

In [21]:
# Run every toy document through the same preprocessing and keep the
# resulting token lists together, in document order, as a tuple.
texts = tuple(
    text_process(document)
    for document in (doc_1, doc_2, doc_3, doc_4, doc_5)
)

In [22]:
# Display the processed token lists for all five documents.
texts

(['like', 'eat', 'broccoli', 'banana'],
 ['ate', 'banana', 'spinach', 'smoothi', 'breakfast'],
 ['chinchilla', 'kitten', 'cute'],
 ['sister', 'adopt', 'kitten', 'yesterday'],
 ['look', 'cute', 'hamster', 'munch', 'piec', 'broccoli'])

## Step 2: Fit LDA Model.

In [16]:
from gensim import corpora, models

In [17]:
import pyLDAvis.gensim

In [18]:
# Switch pyLDAvis into notebook mode so the prepared visualization is shown in the cell output.
pyLDAvis.enable_notebook() # for the visual

In [23]:
# Seed NumPy's global random number generator so repeated runs start from the same state.
# NOTE(review): presumably the LDA fit draws from this global generator when it is
# not given its own seed - confirm; this cell must then run right before the fit.
np.random.seed(42)

In [24]:
# Build a gensim Dictionary from the tokenized documents; it is used below
# to turn each token list into a bag of words.
dictionary=corpora.Dictionary(texts)

In [35]:
# Echo the dictionary object (the default repr only shows its type and memory address).
dictionary


<gensim.corpora.dictionary.Dictionary at 0x1a26e7a160>

In [26]:
# Turn each tokenized document into its bag-of-words representation.
corpus = list(map(dictionary.doc2bow, texts))

In [33]:
# Instantiate and fit the LDA topic model on the bag-of-words corpus.
ldamodel = models.ldamodel.LdaModel(
    corpus,                    # pass in our corpus
    id2word=dictionary,        # match documents to spot in dict
    num_topics=3,              # hyperparameter: number of topics
    passes=5,                  # how many times do we run through the corpus
    minimum_probability=0.01,  # topics below this probability are filtered out
    random_state=42,           # seed the model itself so re-running this cell alone is reproducible
)

In [31]:
# Print the bag-of-words encoding of each document, one document per line.
for tokens in texts:
    bow = dictionary.doc2bow(tokens)
    print(bow)

[(0, 1), (1, 1), (2, 1), (3, 1)]
[(0, 1), (4, 1), (5, 1), (6, 1), (7, 1)]
[(8, 1), (9, 1), (10, 1)]
[(10, 1), (11, 1), (12, 1), (13, 1)]
[(1, 1), (9, 1), (14, 1), (15, 1), (16, 1), (17, 1)]


## Step 3: Visualize LDA model.

In [34]:
# Build the pyLDAvis visualization from the fitted model, the bag-of-words corpus
# and the dictionary; as the last expression of the cell it is displayed as output.
pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  return pd.concat([default_term_info] + list(topic_dfs))


## Step 4: Update model with new data!

In [None]:
# A sixth sentence that is not part of the five documents the model was fitted on.
doc_6 = 'That cat is so cute! It looks good enough to eat.'

# Model 2: Yelp Reviews

In [None]:
# Load the Yelp reviews into a DataFrame.
# The Yelp academic dataset is distributed as JSON Lines (one review object per
# line), so it has to be parsed with lines=True; without it pandas tries to read
# the whole file as a single JSON document and fails with a "Trailing data" error.
review = pd.read_json("./yelp_academic_dataset_review.json", lines=True)