In [1]:
from bs4 import BeautifulSoup
import sys
import time
import logging
import argparse
import requests
import codecs
import urllib
import os
import requests
import json
import nltk
from nltk.corpus import stopwords
import gensim
from gensim.corpora import BleiCorpus
from gensim import corpora
from gensim.models import LdaModel

In [2]:
# YELP DATASET: train on business reviews for businesses within the "Coffee & Tea" category

coffee_places = set()

with open('yelp_academic_dataset_business.json') as businesses:
    for item in businesses:
        biz = json.loads(item)
        if biz['categories'] is not None:
            if "Coffee & Tea" in biz['categories']:
                coffee_places.add(biz['business_id'])
        
reviewData = {}

with open('yelp_academic_dataset_review.json') as reviews:
    for item in reviews:
        rev = json.loads(item)
        if rev['business_id'] in coffee_places:
            reviewData[rev['review_id']] = {'text':rev['text'], 'stars':rev['stars']}            

In [3]:
list(reviewData.items())[:100]

[('46nn9iMfGWb-VqfontnFkQ',
  {'stars': 4,
   'text': "The decor for this bakery sure is nifty. Part of me hates is because it's borderline hoarder mentality with the overwhelming amount of random stuff and mismatched tables and chairs. But a part of me also adore it because it is so unique. \n\nMaybe I'm just a closest hoarder.\n\nCame on the day of my flight back to NY, figuring that as a bakery that is opened 24/7, and I'm part of the church belt, it'd be sort of slow. Nope. I was wrong, thankfully the service was quick and I go the front of the line within 10 minutes or so.\n\nI ordered a two macaroons, a mini berry tart, two bags of coffee, terragon & mint tea, and a roast beef sandwich. The tea itself was a bit on the disappointing side(should've just gotten iced black coffee). The tart, macaroons, and roast beef sandwich however, were very tasty. And still kind of left me wanting more. \n\nI dug it. You bet I'll be back if I'm ever back in Charlotte again."}),
 ('_xKDt1ozO_2zG4z

In [4]:
# Split review into sentences, remove stopwords, extract parts-of-speech tags
# (opt. if lots of reviews) store each review into MongoDB db called 'Reviews'

stopWords = set(stopwords.words('english'))

for revId in reviewData:
    reviewWords = []
    sentences = nltk.sent_tokenize(reviewData[revId]['text'].lower())
    
    for sentence in sentences:
        tokens = nltk.word_tokenize(sentence)
        text = [w for w in tokens if w not in stopWords]
        tagged_text = nltk.pos_tag(text)
        
        for word, tag in tagged_text:
            reviewWords.append({'word': word, 'pos': tag})
    
    reviewData[revId]['review_words'] = reviewWords

list(reviewData.items())[:100]

[('46nn9iMfGWb-VqfontnFkQ',
  {'review_words': [{'pos': 'NN', 'word': 'decor'},
    {'pos': 'NN', 'word': 'bakery'},
    {'pos': 'JJ', 'word': 'sure'},
    {'pos': 'NN', 'word': 'nifty'},
    {'pos': '.', 'word': '.'},
    {'pos': 'NN', 'word': 'part'},
    {'pos': 'VBZ', 'word': 'hates'},
    {'pos': 'POS', 'word': "'s"},
    {'pos': 'JJ', 'word': 'borderline'},
    {'pos': 'NN', 'word': 'hoarder'},
    {'pos': 'NN', 'word': 'mentality'},
    {'pos': 'VBG', 'word': 'overwhelming'},
    {'pos': 'NN', 'word': 'amount'},
    {'pos': 'JJ', 'word': 'random'},
    {'pos': 'NN', 'word': 'stuff'},
    {'pos': 'VBD', 'word': 'mismatched'},
    {'pos': 'NNS', 'word': 'tables'},
    {'pos': 'NNS', 'word': 'chairs'},
    {'pos': '.', 'word': '.'},
    {'pos': 'NN', 'word': 'part'},
    {'pos': 'RB', 'word': 'also'},
    {'pos': 'RB', 'word': 'adore'},
    {'pos': 'JJ', 'word': 'unique'},
    {'pos': '.', 'word': '.'},
    {'pos': 'RB', 'word': 'maybe'},
    {'pos': 'VBP', 'word': "'m"},
    {'pos

In [5]:
def lemmatize(reviewDict):
    # loop through the reviews
    # get nouns and group them by lemma
    reviewCorpus = {}
    lemmatizer = nltk.WordNetLemmatizer()

    for review_count, review_content in reviewDict.items():
        nouns = []
        words = [w for w in review_content['review_words'] if w['pos'] in ['NN','NNS']]
        
        for w in words:
            nouns.append(lemmatizer.lemmatize(w['word']))
            
        reviewCorpus[review_count] = {'review_stars' : review_content['stars'], \
                                      'review_text' : review_content['text'], \
                                      'review_nouns' : nouns} 
    
    return reviewCorpus

In [14]:
# feed reviews to LDA model using k topics
def train(reviewDict, k):
    
    '''
    create id2word
    cannot filter extremes when the set is too small (only 1 page of yelp)
    '''
    id2word = corpora.Dictionary(reviewDict[review]["review_nouns"] for review in reviewDict)
    print('length of id2word : ' + str(len(id2word)))
    
    # filter extremes (not working)
    # id2word.filter_extremes(keep_n=50)
    # id2word.compactify()

    corpora_dict = corpora.Dictionary(reviewDict[review]["review_nouns"] for review in reviewDict)
    corpora_dict.save('lda_fe/dictionary.dict')
    
    corpus = [corpora_dict.doc2bow(reviewDict[review]["review_nouns"]) for review in reviewDict]
    corpora.BleiCorpus.serialize('lda_fe/corpus.lda-c', corpus)
    corpus = corpora.BleiCorpus('lda_fe/corpus.lda-c')
    
    if k == 50:
        # save lda model for 50 topics
        lda = gensim.models.LdaModel(corpus, num_topics=50, id2word=id2word)
        lda.save('lda_fe/lda_50_topics.lda')
    
    elif k == 25:
        # save lda model for 25 topics
        lda = gensim.models.LdaModel(corpus, num_topics=25, id2word=id2word)
        lda.save('lda_fe/lda_25_topics.lda')
    
    return lda

In [15]:
import requests
import json

def main():
    global REVIEW_DICT 
    REVIEW_DICT = reviewData 
        
    '''
    Check if the folder for the lda model exists
    If it doesnt create the folder 
    '''
    if not os.path.exists('lda_fe'):
        os.makedirs('lda_fe')
    
    train(lemmatize(REVIEW_DICT), 25)

    
if __name__ == '__main__':
    main()

length of id2word : 91531


IndexError: index 50 is out of bounds for axis 1 with size 50

In [None]:
# Get all 50 topics using K=50
WEIGHT_TOPIC = []

dictionary_path = "lda_fe/dictionary.dict"
corpus_path = "lda_fe/corpus.lda-c"
lda_model_path = "lda_fe/lda_50_topics.lda"

dictionary = corpora.Dictionary.load(dictionary_path)
corpus = corpora.BleiCorpus(corpus_path)
lda = LdaModel.load(lda_model_path)

# See topics
TOPIC_DICT = dict(lda.show_topics(num_topics=50))

for topicN, topicWeights in TOPIC_DICT.items():
    print('Topic ' + str(topicN) + ' : \n' + str(topicWeights) + '\n')

In [None]:
# Get all 25 topics using K=25

WEIGHT_TOPIC = []

dictionary_path = "lda_fe/dictionary.dict"
corpus_path = "lda_fe/corpus.lda-c"
lda_model_path = "lda_fe/lda_25_topics.lda"

dictionary = corpora.Dictionary.load(dictionary_path)
corpus = corpora.BleiCorpus(corpus_path)
lda = LdaModel.load(lda_model_path)

# See topics
TOPIC_DICT = dict(lda.show_topics(num_topics=25))

for topicN, topicWeights in TOPIC_DICT.items():
    print('Topic ' + str(topicN) + ' : \n' + str(topicWeights) + '\n')