In [20]:
from bs4 import BeautifulSoup
import sys
import time
import logging
import argparse
import requests
import codecs
import urllib
import os
import requests
import json
import nltk
from nltk.corpus import stopwords

import gensim
from gensim.corpora import BleiCorpus
from gensim import corpora
from gensim.models import LdaModel

In [None]:
# YELP DATASET: train on business reviews for businesses within the "Coffee & Tea" category

coffee_places = set()

with open('yelp_academic_dataset_business.json') as businesses:
    for item in businesses:
        biz = json.loads(item)
        if biz['categories'] is not None:
            if "Coffee & Tea" in biz['categories']:
                coffee_places.add(biz['business_id'])
        
reviewData = {}

with open('yelp_academic_dataset_review.json') as reviews:
    for item in reviews:
        rev = json.loads(item)
        if rev['business_id'] in coffee_places:
            reviewData[rev['review_id']] = {'text':rev['text'], 'stars':rev['stars']}            

In [98]:
list(reviewData.items())[:100]

[('z--dbq4Qegp23GdECZYI9A',
  {'stars': 3,
   'text': "I love the location since its so close to home and work. I WISH they had a drive thru! With that being said, the service has recently gone down. I go about everyday and no matter what time I go there is always a wait. It seems like only one person is making the drinks. I'm assuming they are just short of staff. The staff that is working there seems very stressed out and not happy. Just doesn't feel very relaxing. Hopefully things get back to normal soon as I do love my coffee!"}),
 ('fMQUroqu5LzPfIGiwdvmwA',
  {'stars': 3,
   'text': 'Beau petit café "3e vague" près de McGill, 4,50 $ le latté ! Il était bon. Ça fait changement des autres Second Cup ou Starbucks au centre-ville. Wi-fi gratuit. Par contre, le service était correct, même pas un sourire et un merci du jeune proprio, mon latté déposé sur le comptoir sans rien dire et continue sa conversation avec son associé.... bof, j\'aime un peu moins.'}),
 ('v6qi_7mTgPIu0tiqnlSNDw',

In [101]:
# Split review into sentences, remove stopwords, extract parts-of-speech tags
# (opt. if lots of reviews) store each review into MongoDB db called 'Reviews'

stopWords = set(stopwords.words('english'))

for revId in reviewData:
    reviewWords = []
    sentences = nltk.sent_tokenize(reviewData[revId]['text'].lower())
    
    for sentence in sentences:
        tokens = nltk.word_tokenize(sentence)
        text = [w for w in tokens if w not in stopWords]
        tagged_text = nltk.pos_tag(text)
        
        for word, tag in tagged_text:
            reviewWords.append({'word': word, 'pos': tag})
    
    reviewData[revId]['review_words'] = reviewWords

list(reviewData.items())[:100]

[('z--dbq4Qegp23GdECZYI9A',
  {'review_words': [{'pos': 'NN', 'word': 'love'},
    {'pos': 'NN', 'word': 'location'},
    {'pos': 'IN', 'word': 'since'},
    {'pos': 'JJ', 'word': 'close'},
    {'pos': 'NN', 'word': 'home'},
    {'pos': 'NN', 'word': 'work'},
    {'pos': '.', 'word': '.'},
    {'pos': 'JJ', 'word': 'wish'},
    {'pos': 'NN', 'word': 'drive'},
    {'pos': 'NN', 'word': 'thru'},
    {'pos': '.', 'word': '!'},
    {'pos': 'VBD', 'word': 'said'},
    {'pos': ',', 'word': ','},
    {'pos': 'NN', 'word': 'service'},
    {'pos': 'RB', 'word': 'recently'},
    {'pos': 'VBN', 'word': 'gone'},
    {'pos': '.', 'word': '.'},
    {'pos': 'VB', 'word': 'go'},
    {'pos': 'JJ', 'word': 'everyday'},
    {'pos': 'NN', 'word': 'matter'},
    {'pos': 'NN', 'word': 'time'},
    {'pos': 'VBP', 'word': 'go'},
    {'pos': 'RB', 'word': 'always'},
    {'pos': 'RB', 'word': 'wait'},
    {'pos': '.', 'word': '.'},
    {'pos': 'VBZ', 'word': 'seems'},
    {'pos': 'IN', 'word': 'like'},
    {'po

In [102]:
def lemmatize(reviewDict):
    # loop through the reviews
    # get nouns and group them by lemma
    reviewCorpus = {}
    lemmatizer = nltk.WordNetLemmatizer()

    for review_count, review_content in reviewDict.items():
        nouns = []
        words = [w for w in review_content['review_words'] if w['pos'] in ['NN','NNS']]
        
        for w in words:
            nouns.append(lemmatizer.lemmatize(w['word']))
            
        reviewCorpus[review_count] = {'review_stars' : review_content['stars'], \
                                      'review_text' : review_content['text'], \
                                      'review_nouns' : nouns} 
    
    return reviewCorpus

In [103]:
# feed reviews to LDA model using k topics
def train(reviewDict, k):
    
    '''
    create id2word
    cannot filter extremes when the set is too small (only 1 page of yelp)
    '''
    id2word = corpora.Dictionary(reviewDict[review]["review_nouns"] for review in reviewDict)
#     id2word.filter_extremes(keep_n=10000)
#     id2word.compactify()

    corpora_dict = corpora.Dictionary(reviewDict[review]["review_nouns"] for review in reviewDict)
    corpora_dict.save('lda/dictionary.dict')
    
    corpus = [corpora_dict.doc2bow(reviewDict[review]["review_nouns"]) for review in reviewDict]
    corpora.BleiCorpus.serialize('lda/corpus.lda-c', corpus)
    corpus = corpora.BleiCorpus('lda/corpus.lda-c')
    
    if k == 50:
        # save lda model for 50 topics
        lda = gensim.models.LdaModel(corpus, num_topics=50, id2word=id2word)
        lda.save('lda/lda_50_topics.lda')
    
    else if k == 25:
        # save lda model for 25 topics
        lda = gensim.models.LdaModel(corpus, num_topics=25, id2word=id2word)
        lda.save('lda/lda_25_topics.lda')
    
    return lda

In [104]:
# Scrape Yelp reviews

import requests
import json

def main():
    global REVIEW_DICT 
    REVIEW_DICT = reviewData 
        
    '''
    Check if the folder for the lda model exists
    If it doesnt create the folder 
    '''
    if not os.path.exists('lda'):
        os.makedirs('lda')
    
    train(lemmatize(REVIEW_DICT, 25))

    
if __name__ == '__main__':
    main()

In [105]:
WEIGHT_TOPIC = []

dictionary_path = "lda/dictionary.dict"
corpus_path = "lda/corpus.lda-c"
lda_model_path = "lda/lda_50_topics.lda"

dictionary = corpora.Dictionary.load(dictionary_path)
corpus = corpora.BleiCorpus(corpus_path)
lda = LdaModel.load(lda_model_path)

i = 0
for topic in lda.show_topics():
    topic_list = topic[1].replace(' ','').replace('\"', '').split('+')

    weight_topic = [x.split('*') for x in topic_list ]
    global WEIGHT_TOPIC
    WEIGHT_TOPIC.append(weight_topic)
    print((weight_topic) + '\n')

[['0.169', 'cake'], ['0.057', 'cupcake'], ['0.026', 'macaroon'], ['0.021', 'velvet'], ['0.016', 'moist'], ['0.015', 'baby'], ['0.014', 'birthday'], ['0.014', 'salt'], ['0.013', 'sweet'], ['0.013', 'dessert']]

[['0.098', 'line'], ['0.061', 'people'], ['0.031', 'time'], ['0.029', 'order'], ['0.023', 'wait'], ['0.022', 'person'], ['0.021', 'door'], ['0.021', 'way'], ['0.020', 'morning'], ['0.017', 'day']]

[['0.141', 'love'], ['0.077', 'kid'], ['0.040', 'yum'], ['0.032', 'place'], ['0.029', 'family'], ['0.019', 'fresh'], ['0.015', 'city'], ['0.014', 'special'], ['0.013', 'jar'], ['0.012', 'cocoa']]

[['0.058', 'table'], ['0.042', 'space'], ['0.029', 'place'], ['0.023', 'chair'], ['0.022', 'seat'], ['0.020', 'lot'], ['0.020', 'coffee'], ['0.019', 'sit'], ['0.018', 'people'], ['0.018', 'seating']]

[['0.078', 'hour'], ['0.059', 'weekend'], ['0.039', 'place'], ['0.034', 'time'], ['0.024', 'student'], ['0.017', 'food'], ['0.017', 'complaint'], ['0.017', 'evening'], ['0.016', 'system'], ['0.0

In [108]:
# See topics
TOPIC_DICT = dict(lda.show_topics(num_topics=50))

In [None]:
topic_text = []
for review in REVIEW_DICT:
    review_text = REVIEW_DICT[review]['text']
    review_stars = REVIEW_DICT[review]['stars']
    counter = 0
    for weight_topic in WEIGHT_TOPIC:
        for topic in weight_topic:
            if topic[1] in review_text: counter +=1
                
        if counter >= 4: topic_text.append((review_stars, review_text, weight_topic))


In [195]:
# Test on an example – applying LDA model to unseen review
unseenReview = '''I really wanted to like this coffee shop, but after my first visit today I can't say I'm impressed. My biggest issue is with the atmosphere. I believe interior decor plays a big role, and this doesn't seem like it's been renovated since the late 90s/ early 2000s. The flooring, walls, tables, counter, everything contributes to a less-than-pleasing environment and not somewhere I'd like to spend a lot of time. There are spots on the walls where paint is falling off, all tables aren't of same style, the interior temperature was cold, and the music unfitting/ distracting (old hip-hip, rap, and pop).

Secondly it was very dirty when I visited. I had to wipe off my table before I could place my laptop down, there were napkins and paper cups all over the floors, and the service counter was very unappealing. I don't know whether this is a consistent issue or I just happened to visit at a bad time, but it affected my first impression nonetheless. 

As far as the coffee goes, I wasn't impressed. Despite ordering to stay, I was served in a paper cup. It's not a big issue, but I would preferred a regular cup. My caffè mocha was mediocre at best, not the worst I've had, but both Starbucks and Pavement up the street serves better. The prices weren't particularly impressive either, paying over $4.5 (including tax) for a medium (12oz) mocha.

I didn't order any food, but their their display of cookies/ cakes was rather sad. Everything looked dry, unorganized, and very unsanitary. They did have a menu for warm sandwiches, some of which sounded really good, but I didn't have a chance to try any of them. However several of the other customers did order a sandwich, so I can only assume they're worth the price.

The final point I'd like to address is the internet. They do provide free internet which is nice (though expected in this day and age), however it's among the slowest I've experienced. Definitely not somewhere I could get work done, and I don't understand how so many people stay here for an extended period of time on their laptops. A random YouTube video I opened took over 2 minutes to start playing at 240p, and when I adjusted it to 1080p it never continued. It took me over 5 minutes just to open Yelp and navigate to this page.

On the positive side, the cafe is open late until 10pm every night which is a plus. However the nearby Starbucks is open just as late, albeit a little less seating for those late night study sessions.
'''
                                
def predTopics(review_text):
    separated_text = review_text.lower().split()
    
    # apply LDA model
    dictionary_path = "lda/dictionary.dict"
    corpus_path = "lda/corpus.lda-c"
    lda_model_path = "lda/lda_50_topics.lda"

    dictionary = corpora.Dictionary.load(dictionary_path)
    corpus = corpora.BleiCorpus(corpus_path)
    lda = LdaModel.load(lda_model_path)
    
    review_bow = dictionary.doc2bow(separated_text)
    
    return lda[review_bow]

for (topic, weight) in predTopics(unseenReview):
    print('Review has ' + str(weight*100) + '% weight ' + 'for topic \n' + TOPIC_DICT[topic] + '\n')


Review has 7.62426870771% weight for topic 
0.058*"table" + 0.042*"space" + 0.029*"place" + 0.023*"chair" + 0.022*"seat" + 0.020*"lot" + 0.020*"coffee" + 0.019*"sit" + 0.018*"people" + 0.018*"seating"

Review has 2.05755183731% weight for topic 
0.101*"game" + 0.055*"board" + 0.042*"boyfriend" + 0.032*"size" + 0.030*"room" + 0.026*"purchase" + 0.021*"girlfriend" + 0.019*"disappointment" + 0.017*"hey" + 0.016*"energy"

Review has 3.03601666243% weight for topic 
0.067*"check" + 0.047*"excellent" + 0.046*"wow" + 0.033*"neighborhood" + 0.027*"cold" + 0.021*"food" + 0.020*"jam" + 0.018*"range" + 0.016*"bottle" + 0.014*"place"

Review has 1.38018249044% weight for topic 
0.098*"line" + 0.061*"people" + 0.031*"time" + 0.029*"order" + 0.023*"wait" + 0.022*"person" + 0.021*"door" + 0.021*"way" + 0.020*"morning" + 0.017*"day"

Review has 4.57130235468% weight for topic 
0.066*"nice" + 0.055*"staff" + 0.039*"strip" + 0.034*"afternoon" + 0.028*"mall" + 0.027*"helpful" + 0.024*"talk" + 0.022*"sugg

In [114]:
# Next steps
# (1) more robust training set 
# (2) topic labeling 
# (3) factor in star-ratings
# (4) sentiment analysis (Stanford NLP / treebank)

In [196]:
import corenlp
import numpy as np

def sentimentScore(text):
    # Before running corenlp, must start up NLP server using:
    # java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000

    sentences = list(filter(None, text.replace('\n','').split('.')))
    sentimentVals = []
    for sentence in sentences:
        try:
            sentimentVals.append(corenlp.sentiment_analysis_on_sentence(sentence))
        except:
            continue
        
    return np.mean(sentimentVals)

def get_reviews(theurl):
    reviewInfo = {}
    
    stop = set(stopwords.words('english'))
    
    main_page = requests.get(theurl)
    if main_page.status_code == 200:
        soup = BeautifulSoup(main_page.content, "html.parser")
    else:
        print('Non-200 status code. Get request for url failed.')
    
    review_div = soup.findAll('div',{'itemprop':'review'})

    reviewCount = 1
    
    for i in review_div: # iterating through review_div 
        # get review star rating
        reviewStar = float(i.find('meta',{'itemprop':'ratingValue'}).get('content', None))

        # get review body text
        reviewBody = i.find('p',{'itemprop':'description'})
        for txt in reviewBody:
            if type(txt) != '<p>' and not str(txt).startswith('<p>'):
                reviewText = txt
        
        topics = predTopics(reviewText)
        sentiment = sentimentScore(reviewText)
        
        reviewInfo[reviewCount] = {'review_stars' : reviewStar, \
                                   'review_text': reviewText, \
                                   'review_topics' : topics, \
                                   'sentiment_score' : sentiment} 
        reviewCount += 1
        
    return reviewInfo
    
# def getAllReviews(restaurant_url):
#     reviewDict = {}
    
#     stop=0

#     while(stop == 0):
#         reviewDict.update(get_reviews(restaurant_url))
#         this_page = urllib.request.urlopen(restaurant_url)
#         soup = BeautifulSoup(this_page, "html.parser")
#         review_div = soup.findAll('link',{'rel':'next'})
    
#         if len(review_div) != 0:
#             for i in review_div:
#                 restaurant_url = i.get('href', None)
#         else:
#             stop = 1 

#     return reviewDict


In [197]:
pavementUrl = 'https://www.yelp.com/biz/pavement-coffeehouse-boston'
pavementReviews = get_reviews(pavementUrl)

In [258]:
import pandas as pd

# Topic labels for K=50 topics
TOPIC_LABELS = {
0 : 'tables/space/seating',
1 : 'price',
2 : 'people/atmosphere/energy',
3 : 'location/convenience/service (cars, line)',
4 : 'savory food',
5 : 'Unknown #1',
6 : 'desserts/sweets',
7 : 'wait/line/order',
8 : 'Unknown #2',
9 : 'food / meat',
10 : 'design',
11 : 'convenient / easy to grab food',
12 : 'helpfulness/friendliness of staff',
13 : 'trendiness / uniqueness',
14 : 'bakery items / pastries',
15 : 'alcoholic beverages',
16 : 'Unknown #3',
17 : 'compared to Starbucks',
18 : 'cake / dessert',
19 : 'atmosphere / fun',
20 : 'order / waittime / service',
21 : 'specialty drinks (e.g. latte)',
22 : 'business / chain',
23 : 'seasonal drinks (WINTER?)',
24 : 'food service / servers',
25 : 'crepes',
26 : 'vegan / gluten-free? (slight)',
27 : 'chocolate desserts',
28 : 'star rating rationale?',
29 : 'breakfast/brunch',
30 : 'blended drinks (e.g. macchiato, matcha, fruit)',
31 : 'good for families/kids',
32 : 'hours / opening',
33 : 'tea beverages (e.g. boba)',
34 : 'lunch foods (sandwich, salad, soup)',
35 : 'ice cream, donut, waffle (sweet things)',
36 : 'refreshing things (coconut, tapioca, slushie)',
37 : 'General coffee shop (cup, bean, place)',
38 : 'nighttime / cocktails and bars',
39 : 'Would recommend the place ',
40 : 'Store / shopping',
41 : 'French cafe?',
42 : 'breakfast foods (eggs/bacon/toast,etc)',
43 : 'juice and smoothies',
44 : 'customer service',
45 : 'ambience / interior',
46 : 'Unknown #4',
47 : 'Unknown #5',
48 : 'Hotel cafe',
49 : 'Unknown #6',
}

# Topic labels for K=25 topics

In [252]:
pavementData = {}

for count, items in pavementReviews.items():
    pavementData[count] = {'review_stars' : items['review_stars'],
                           'review_text' : items['review_text'],
                           'review_topics' : sorted([(t,TOPIC_LABELS[t],w) for (t,w) in items['review_topics']], key=lambda x: x[2], reverse=True),
                           'sentiment_score' : items['sentiment_score']}

In [284]:
pavementTopicWeights = {}
topicSentiments = {topic_name: [] for n, topic_name in TOPIC_LABELS.items()}
topicStars = {topic_name: [] for n, topic_name in TOPIC_LABELS.items()}

for count, items in pavementData.items():
    for topic in items['review_topics']:
        if topic[1] not in pavementTopicWeights:
            pavementTopicWeights[topic[1]] = topic[2]            
        else:
            pavementTopicWeights[topic[1]] += topic[2]
            
        topicSentiments[topic[1]].append(items['sentiment_score'])
        topicStars[topic[1]].append(items['review_stars'])

pavementRankedTopics = sorted(list(pavementTopicWeights.items()), key=lambda x: x[1], reverse=True)

# calculate average sentiment scores and star ratings for each topic
for k in topicSentiments:
    topicSentiments[k] = np.mean(topicSentiments[k])
    topicStars[k] = np.mean(topicStars[k])
    
topicSentimentsRanked = sorted(list(topicSentiments.items()), key=lambda x: x[1], reverse=True)
topicStarsRanked = sorted(list(topicStars.items()), key=lambda x: x[1], reverse=True)

In [285]:
'''
Pros of Pavement (>2 sentiment score)
- Tea 
- Employees 
- Juice 
- Trendiness
- Sandwich offerings

Cons of Pavement (<2 sentiments score)
- Atmosphere 
- Pastries and deserts 
- Wait times 
- Price 

'''
topicSentimentsRanked

[('Unknown #5', 3.0),
 ('tea beverages (e.g. boba)', 2.3214285714285712),
 ('food service / servers', 2.304761904761905),
 ('juice and smoothies', 2.2857142857142856),
 ('savory food', 2.2857142857142856),
 ('trendiness / uniqueness', 2.2000000000000002),
 ('Unknown #1', 2.1190476190476191),
 ('lunch foods (sandwich, salad, soup)', 2.0297619047619047),
 ('alcoholic beverages', 2.0076884920634921),
 ('Would recommend the place ', 2.0029265873015873),
 ('helpfulness/friendliness of staff', 1.9957671957671959),
 ('chocolate desserts', 1.9722222222222223),
 ('convenient / easy to grab food', 1.9676587301587303),
 ('food / meat', 1.9642857142857142),
 ('good for families/kids', 1.9402116402116403),
 ('atmosphere / fun', 1.926851851851852),
 ('compared to Starbucks', 1.906084656084656),
 ('location/convenience/service (cars, line)', 1.9058201058201059),
 ('bakery items / pastries', 1.8980158730158727),
 ('seasonal drinks (WINTER?)', 1.8956709956709956),
 ('crepes', 1.892857142857143),
 ('bre

In [286]:
topicStarsRanked

[('juice and smoothies', 5.0),
 ('savory food', 5.0),
 ('tea beverages (e.g. boba)', 4.5),
 ('crepes', 4.25),
 ('chocolate desserts', 4.166666666666667),
 ('food service / servers', 4.0),
 ('trendiness / uniqueness', 4.0),
 ('helpfulness/friendliness of staff', 4.0),
 ('Unknown #5', 4.0),
 ('food / meat', 4.0),
 ('location/convenience/service (cars, line)', 3.8888888888888888),
 ('good for families/kids', 3.8888888888888888),
 ('Unknown #2', 3.8571428571428572),
 ('specialty drinks (e.g. latte)', 3.8333333333333335),
 ('hours / opening', 3.8333333333333335),
 ('tables/space/seating', 3.8181818181818183),
 ('Unknown #1', 3.75),
 ('alcoholic beverages', 3.75),
 ('Would recommend the place ', 3.75),
 ('breakfast foods (eggs/bacon/toast,etc)', 3.75),
 ('lunch foods (sandwich, salad, soup)', 3.75),
 ('business / chain', 3.7272727272727271),
 ('Hotel cafe', 3.7142857142857144),
 ('atmosphere / fun', 3.6666666666666665),
 ('breakfast/brunch', 3.6666666666666665),
 ('nighttime / cocktails and 

In [288]:
# Linear regression of sentiment score on star rating
from scipy import stats

slope, intercept, r_value, p_value, std_err = stats.linregress(list(topicSentiments.values()), list(topicStars.values()))

print('slope : ', slope)
print('intercept : ', intercept)
print('r_squared : ', r_value**2)
print('p_value : ', p_value)
print('std_err : ', std_err)

slope :  1.09542755434
intercept :  1.63514422113
r_squared :  0.465771731906
p_value :  4.79676766587e-08
std_err :  0.169332340372
