In [1]:
from bs4 import BeautifulSoup
import sys
import time
import logging
import argparse
import requests
import codecs
import urllib
import os
import requests
import json
import nltk
from nltk.corpus import stopwords
import gensim
from gensim.corpora import BleiCorpus
from gensim import corpora
from gensim.models import LdaModel

In [5]:
# YELP DATASET: train on business reviews for businesses within the "Coffee & Tea" category

# business_id of every business whose 'categories' field mentions "Coffee & Tea"
coffee_places = set()

# Both dumps are read as JSON Lines (one JSON object per line). The encoding is
# given explicitly so that non-ASCII review text is decoded the same way on every
# platform instead of depending on the locale's default encoding.
with open('yelp_academic_dataset_business.json', encoding='utf-8') as businesses:
    for item in businesses:
        if not item.strip():
            continue  # tolerate blank lines, e.g. a trailing newline at end of file
        biz = json.loads(item)
        categories = biz.get('categories')  # may be null/absent for some businesses
        if categories and "Coffee & Tea" in categories:
            coffee_places.add(biz['business_id'])

# review_id -> {'text': <review body>, 'stars': <rating>} for the selected businesses
reviewData = {}

with open('yelp_academic_dataset_review.json', encoding='utf-8') as reviews:
    for item in reviews:
        if not item.strip():
            continue  # tolerate blank lines
        rev = json.loads(item)
        if rev['business_id'] in coffee_places:
            reviewData[rev['review_id']] = {'text': rev['text'], 'stars': rev['stars']}

In [43]:
#list(reviewData.items())[:100]

179409

In [6]:
# Split each review into sentences, POS-tag every token, then drop stopwords.
# (opt. if lots of reviews) store each review into MongoDB db called 'Reviews'

stopWords = set(stopwords.words('english'))

for revId in reviewData:
    reviewWords = []
    sentences = nltk.sent_tokenize(reviewData[revId]['text'].lower())

    for sentence in sentences:
        tokens = nltk.word_tokenize(sentence)
        # Tag the complete sentence BEFORE removing stopwords: the tagger is given
        # the token sequence as a whole, so stripping function words first hands it
        # a sentence that no longer resembles normal text and degrades the tags.
        tagged_text = nltk.pos_tag(tokens)

        for word, tag in tagged_text:
            if word not in stopWords:
                reviewWords.append({'word': word, 'pos': tag})

    # Attach the tagged, stopword-free tokens to the review (overwritten on re-run).
    reviewData[revId]['review_words'] = reviewWords

# Preview only a few reviews; dumping 100 of them floods the notebook output.
list(reviewData.items())[:3]

[('1UJoZE55HxmEPSIVdFThUg',
  {'review_words': [{'pos': 'NNS', 'word': 'les'},
    {'pos': 'NNS', 'word': 'choses'},
    {'pos': 'VBP', 'word': 'se'},
    {'pos': 'JJ', 'word': 'sont'},
    {'pos': 'NN', 'word': 'beaucoup'},
    {'pos': 'NN', 'word': 'améliorées'},
    {'pos': 'NN', 'word': 'au'},
    {'pos': 'NN', 'word': 'café'},
    {'pos': '.', 'word': '.'},
    {'pos': 'NN', 'word': "j'y"},
    {'pos': 'VBP', 'word': 'ai'},
    {'pos': 'NN', 'word': 'mangé'},
    {'pos': 'JJ', 'word': 'une'},
    {'pos': 'NN', 'word': 'assiette'},
    {'pos': 'NN', 'word': "d'oeufs"},
    {'pos': 'NNS', 'word': 'bénédictines'},
    {'pos': 'VBP', 'word': 'très'},
    {'pos': 'JJ', 'word': 'originale'},
    {'pos': ',', 'word': ','},
    {'pos': 'JJ', 'word': 'et'},
    {'pos': 'NN', 'word': 'mon'},
    {'pos': 'NN', 'word': 'accompagnatrice'},
    {'pos': 'NNS', 'word': 'des'},
    {'pos': 'NNS', 'word': 'crêpes'},
    {'pos': 'JJ', 'word': 'red'},
    {'pos': 'NN', 'word': 'velvet'},
    {'pos': 

In [7]:
def lemmatize(reviewDict):
    """Reduce every review to the lemmas of its nouns.

    For each entry of ``reviewDict`` the items of its ``'review_words'`` list
    whose ``'pos'`` is ``NN`` or ``NNS`` are kept and their ``'word'`` is passed
    through NLTK's WordNet lemmatizer.

    Returns a new dict with the same keys as ``reviewDict``; each value holds
    ``'review_stars'``, ``'review_text'`` and ``'review_nouns'``.
    """
    noun_tags = ['NN', 'NNS']
    wordnet = nltk.WordNetLemmatizer()
    reviewCorpus = {}

    for review_id in reviewDict:
        content = reviewDict[review_id]

        # keep only the noun tokens, already reduced to their lemma
        noun_lemmas = [
            wordnet.lemmatize(entry['word'])
            for entry in content['review_words']
            if entry['pos'] in noun_tags
        ]

        reviewCorpus[review_id] = {
            'review_stars': content['stars'],
            'review_text': content['text'],
            'review_nouns': noun_lemmas,
        }

    return reviewCorpus

In [9]:
# feed reviews to LDA model using k topics
def train(reviewDict, k, random_state=None):
    """Train a gensim LDA model with ``k`` topics on the review nouns and save it.

    Files written (the ``lda`` directory is created when missing):
      * ``lda/dictionary.dict``     - token <-> id mapping
      * ``lda/corpus.lda-c``        - bag-of-words corpus serialized with BleiCorpus
      * ``lda/lda_<k>_topics.lda``  - the trained model (e.g. ``lda_25_topics.lda``)

    Parameters
    ----------
    reviewDict : dict
        Mapping whose values each contain a ``'review_nouns'`` list of tokens
        (the structure returned by ``lemmatize``).
    k : int
        Number of topics. Any positive integer is accepted; previously only
        25 and 50 worked and every other value failed at ``return``.
    random_state : optional
        Forwarded unchanged to ``LdaModel`` so a run can be made repeatable.
        The default ``None`` keeps gensim's own default behaviour.

    Returns
    -------
    The trained ``LdaModel``.

    Raises
    ------
    ValueError
        If ``k`` is not a positive whole number. Raised before anything is
        written to disk.
    """
    num_topics = int(k)
    if num_topics != k or num_topics < 1:
        raise ValueError('k must be a positive integer, got {!r}'.format(k))

    # Every artifact lives under lda/, so make the function usable on its own.
    os.makedirs('lda', exist_ok=True)

    # One token list per review, in reviewDict iteration order.
    documents = [reviewDict[review]["review_nouns"] for review in reviewDict]

    # A single dictionary serves both as the saved vocabulary and as id2word
    # (the original built two identical dictionaries from the same documents).
    # filter_extremes is deliberately left disabled: it cannot be used when the
    # set is too small (only 1 page of yelp).
    id2word = corpora.Dictionary(documents)
#     id2word.filter_extremes(keep_n=10000)
#     id2word.compactify()
    id2word.save('lda/dictionary.dict')

    # Serialize the bag-of-words corpus, then reopen it from disk so training
    # reads from the saved file rather than from the in-memory list.
    corpus = [id2word.doc2bow(document) for document in documents]
    corpora.BleiCorpus.serialize('lda/corpus.lda-c', corpus)
    corpus = corpora.BleiCorpus('lda/corpus.lda-c')

    lda = gensim.models.LdaModel(corpus, num_topics=num_topics, id2word=id2word,
                                 random_state=random_state)
    lda.save('lda/lda_{}_topics.lda'.format(num_topics))

    return lda

In [11]:
import requests
import json

def main():
    """Train and save the LDA models that the topic-inspection cells load.

    Publishes ``reviewData`` as the global ``REVIEW_DICT``, makes sure the
    ``lda`` output folder exists, lemmatizes the reviews once and trains one
    model per topic count.
    """
    global REVIEW_DICT
    REVIEW_DICT = reviewData

    # Create the folder for the LDA artifacts if it doesn't exist yet.
    os.makedirs('lda', exist_ok=True)

    # Lemmatize once and reuse the result for every model.
    reviewCorpus = lemmatize(REVIEW_DICT)

    # The cells further down load both lda/lda_50_topics.lda and
    # lda/lda_25_topics.lda, so both models must be produced here for the
    # notebook to run top to bottom on a fresh kernel.
    for k in (50, 25):
        train(reviewCorpus, k)


if __name__ == '__main__':
    main()

In [41]:
# Load the saved K=50 LDA model and print each of its 50 topics
lda_model_path = "lda/lda_50_topics.lda"
corpus_path = "lda/corpus.lda-c"
dictionary_path = "lda/dictionary.dict"

WEIGHT_TOPIC = []

# Artifacts written by the training step
dictionary = corpora.Dictionary.load(dictionary_path)
corpus = corpora.BleiCorpus(corpus_path)
lda = LdaModel.load(lda_model_path)

# topic number -> the topic's 'weight*"term" + ...' description
TOPIC_DICT = {number: terms for number, terms in lda.show_topics(num_topics=50)}

for topicN in TOPIC_DICT:
    topicWeights = TOPIC_DICT[topicN]
    print('Topic %s : \n%s\n' % (topicN, topicWeights))

Topic 0 : 
0.058*"table" + 0.042*"space" + 0.029*"place" + 0.023*"chair" + 0.022*"seat" + 0.020*"lot" + 0.020*"coffee" + 0.019*"sit" + 0.018*"people" + 0.018*"seating"

Topic 1 : 
0.078*"water" + 0.040*"card" + 0.038*"half" + 0.036*"pork" + 0.024*"dollar" + 0.024*"charge" + 0.022*"cash" + 0.022*"cost" + 0.020*"ball" + 0.018*"credit"

Topic 2 : 
0.101*"game" + 0.055*"board" + 0.042*"boyfriend" + 0.032*"size" + 0.030*"room" + 0.026*"purchase" + 0.021*"girlfriend" + 0.019*"disappointment" + 0.017*"hey" + 0.016*"energy"

Topic 3 : 
0.242*"location" + 0.067*"drive" + 0.047*"thru" + 0.040*"time" + 0.035*"car" + 0.031*"service" + 0.024*"staff" + 0.015*"way" + 0.014*"line" + 0.012*"order"

Topic 4 : 
0.056*"butter" + 0.053*"pie" + 0.053*"pizza" + 0.047*"flavour" + 0.024*"cat" + 0.024*"peanut" + 0.022*"pasta" + 0.015*"crust" + 0.014*"oil" + 0.013*"balance"

Topic 5 : 
0.067*"check" + 0.047*"excellent" + 0.046*"wow" + 0.033*"neighborhood" + 0.027*"cold" + 0.021*"food" + 0.020*"jam" + 0.018*"rang

In [42]:
# Load the saved K=25 LDA model and print each of its 25 topics
lda_model_path = "lda/lda_25_topics.lda"
corpus_path = "lda/corpus.lda-c"
dictionary_path = "lda/dictionary.dict"

WEIGHT_TOPIC = []

# Artifacts written by the training step
dictionary = corpora.Dictionary.load(dictionary_path)
corpus = corpora.BleiCorpus(corpus_path)
lda = LdaModel.load(lda_model_path)

# topic number -> the topic's 'weight*"term" + ...' description
TOPIC_DICT = {number: terms for number, terms in lda.show_topics(num_topics=25)}

for topicN in TOPIC_DICT:
    topicWeights = TOPIC_DICT[topicN]
    print('Topic %s : \n%s\n' % (topicN, topicWeights))

Topic 0 : 
0.089*"place" + 0.052*"staff" + 0.031*"love" + 0.026*"pie" + 0.025*"selection" + 0.021*"music" + 0.019*"night" + 0.018*"owner" + 0.015*"day" + 0.013*"fun"

Topic 1 : 
0.147*"latte" + 0.048*"game" + 0.045*"chai" + 0.027*"croissant" + 0.025*"vanilla" + 0.025*"board" + 0.023*"almond" + 0.021*"place" + 0.020*"mocha" + 0.017*"milk"

Topic 2 : 
0.045*"time" + 0.042*"place" + 0.042*"food" + 0.034*"order" + 0.031*"service" + 0.021*"crepe" + 0.020*"minute" + 0.018*"people" + 0.014*"star" + 0.013*"wait"

Topic 3 : 
0.030*"guy" + 0.028*"smile" + 0.026*"dog" + 0.020*"lady" + 0.018*"thank" + 0.016*"airport" + 0.015*"girl" + 0.013*"yesterday" + 0.013*"counter" + 0.013*"hummus"

Topic 4 : 
0.155*"donut" + 0.030*"doughnut" + 0.020*"dozen" + 0.018*"apple" + 0.016*"dunkin" + 0.015*"shop" + 0.013*"place" + 0.012*"creme" + 0.011*"mint" + 0.010*"sugar"

Topic 5 : 
0.021*"wir" + 0.013*"vacation" + 0.012*"immer" + 0.011*"earl" + 0.011*"kale" + 0.011*"parfait" + 0.011*"frappe" + 0.010*"vor" + 0.010