In [1]:
from bs4 import BeautifulSoup
import sys
import time
import logging
import argparse
import requests
import codecs
import urllib
import os
import urllib.request
import json
import nltk
from nltk.corpus import stopwords

import gensim
from gensim.corpora import BleiCorpus
from gensim import corpora
from gensim.models import LdaModel


def get_reviews(theurl, reviewCount):
    reviewInfo = {}
    
    stop =  stop = set(stopwords.words('english'))
    
    main_page = urllib.request.urlopen(theurl)
    soup = BeautifulSoup(main_page, "html.parser")
    
    review_div = soup.findAll('div',{'itemprop':'review'})

#     reviewCount = 1
    for i in review_div: # iterating through review_div 
        # get review star rating
        reviewStar = float(i.find('meta',{'itemprop':'ratingValue'}).get('content', None))

        # get review body text
        reviewBody = i.find('p',{'itemprop':'description'})
        for txt in reviewBody:
            if type(txt) != '<p>' and not str(txt).startswith('<p>'):
                reviewText = txt
                
                # Split review into sentences, remove stopwords, extract parts-of-speech tags
                # (opt. if lots of reviews) store each review into MongoDB db called 'Reviews'

                reviewWords = []
                sentences = nltk.sent_tokenize(reviewText.lower())
                
                for sentence in sentences:
                    tokens = nltk.word_tokenize(sentence)
                    text = [w for w in tokens if w not in stop]
                    tagged_text = nltk.pos_tag(text)
                
                for word, tag in tagged_text:
                    reviewWords.append({'word': word, 'pos': tag})
        
        reviewInfo[reviewCount] = {'review_stars' : reviewStar, \
                                   'review_text' : reviewText, \
                                   'review_words' : reviewWords} 
        reviewCount += 1
        
    ## TODO: ITERATE THROUGH ALL PAGES OF REVIEWS FOR RESTAURANT ##
        
    return reviewInfo

In [2]:
def lemmatize(reviewDict):
    # loop through the reviews
    # get nouns and group them by lemma
    reviewCorpus = {}
    lemmatizer = nltk.WordNetLemmatizer()

    for review_count, review_content in reviewDict.items():
        nouns = []
        words = [w for w in review_content['review_words'] if w['pos'] in ['NN','NNS']]
        
        for w in words:
            nouns.append(lemmatizer.lemmatize(w['word']))
            
        reviewCorpus[review_count] = {'review_stars' : review_content['review_stars'], \
                                      'review_text' : review_content['review_text'], \
                                      'review_nouns' : nouns} 
    
    return reviewCorpus

In [32]:
# feed reviews to gensim LDA model

def train(reviewDict):
    
    '''
    create id2word
    cannot filter extremes when the set is too small (only 1 page of yelp)
    '''
    id2word = corpora.Dictionary(reviewDict[review]["review_nouns"] for review in reviewDict)
    #id2word.filter_extremes(keep_n=5)
    #id2word.compactify()


    corpora_dict = corpora.Dictionary(reviewDict[review]["review_nouns"] for review in reviewDict)
    corpora_dict.save('lda/dictionary.dict')
    
    corpus = [corpora_dict.doc2bow(reviewDict[review]["review_nouns"]) for review in reviewDict]
    corpora.BleiCorpus.serialize('lda/corpus.lda-c', corpus)
    corpus = corpora.BleiCorpus('lda/corpus.lda-c')
    
    lda = gensim.models.LdaModel(corpus, num_topics=5, id2word=id2word)
    lda.save('lda/lda_50_topics.lda')
    
    return lda

In [33]:
# Test: First page of Pavement reviews

def main():
    
    '''
    Loops though the pages of a restaurant and collects all the reviews
    Probably split this up into another function later 
    
    Needs to be extended to look at all restaurants over Boston
    '''
    
    url = 'https://www.yelp.com/biz/pavement-coffeehouse-boston'
    stop = 0
    reviewDict ={}
    reviewCount = 0
    while(stop ==0):
        reviewDict.update(get_reviews(url, reviewCount))

        this_page = urllib.request.urlopen(url)
        soup = BeautifulSoup(this_page, "html.parser")
        review_div = soup.findAll('link',{'rel':'next'})
        if len(review_div) != 0:
            for i in review_div:
                url = i.get('href', None)

        else:
            stop = 1 
        reviewCount += 20

    

    
    
    '''
    Check if the folder for the lda model exists
    If it doesnt create the folder 
    '''
    if not os.path.exists('lda'):
        os.makedirs('lda')
    
#     print(train(lemmatize(reviewDict)))
    train(lemmatize(reviewDict))

    


if __name__ == '__main__':
    main()

In [34]:
def print_lda_topics():
    dictionary_path = "lda/dictionary.dict"
    corpus_path = "lda/corpus.lda-c"
    lda_model_path = "lda/lda_50_topics.lda"

    dictionary = corpora.Dictionary.load(dictionary_path)
    corpus = corpora.BleiCorpus(corpus_path)
    lda = LdaModel.load(lda_model_path)

    i = 0
    for topic in lda.show_topics(formatted=True):
        print('#' + str(i) + ': ' + str(topic))
        i += 1
        print()
        
print_lda_topics()

#0: (0, '0.037*"time" + 0.035*"coffee" + 0.022*"service" + 0.022*"staff" + 0.019*"price" + 0.015*"shop" + 0.015*"drink" + 0.015*"friend" + 0.014*"sandwich" + 0.012*"pavement"')

#1: (1, '0.106*"coffee" + 0.042*"place" + 0.021*"star" + 0.016*"service" + 0.016*"drink" + 0.016*"pavement" + 0.014*"area" + 0.013*"shop" + 0.011*"way" + 0.011*"ambiance"')

#2: (2, '0.076*"place" + 0.020*"hour" + 0.017*"bagel" + 0.016*"location" + 0.016*"latte" + 0.012*"coffee" + 0.012*"service" + 0.012*"people" + 0.012*"music" + 0.012*"work"')

#3: (3, '0.033*"bagel" + 0.028*"place" + 0.027*"pavement" + 0.023*"internet" + 0.016*"coffee" + 0.016*"time" + 0.016*"sandwich" + 0.016*"something" + 0.012*"day" + 0.012*"music"')

#4: (4, '0.020*"hipster" + 0.014*"way" + 0.014*"hour" + 0.014*"staff" + 0.014*"wifi" + 0.014*"breakfast" + 0.013*"worth" + 0.009*"bagel" + 0.008*"place" + 0.008*"time"')



In [None]:
# Predict topics for input review here 