In [399]:
from bs4 import BeautifulSoup
from selenium import webdriver 
from selenium.webdriver.chrome.options import Options
import re 
import urllib
import time

### To use this application, paste the Yelp restaurant link into the `site` variable below

In [400]:
# Yelp business page whose reviews will be analysed; replace with your own link.
site = 'https://www.yelp.com/biz/everyday-kitchen-champaign-champaign?osq=Restaurants'  # example link
# Another example link:
# site = 'https://www.yelp.com/biz/naya-bar-and-eatery-champaign-3?osq=Restaurants'

In [401]:
# Configure Chrome to run in headless mode.
options = Options()
options.headless = True
# The chromedriver binary at ./chromedriver must match the installed Chrome version.
# Downloads: https://chromedriver.chromium.org/downloads
# NOTE(review): passing the driver path positionally and assigning
# `options.headless` are both deprecated in Selenium 4 -- confirm which
# Selenium version this notebook targets before upgrading.
driver = webdriver.Chrome('./chromedriver',options=options) 

### Utils

In [402]:
def get_js_soup(url,driver):
    """Load `url` in the given webdriver and parse the rendered page body.

    The body markup is read back through JavaScript once the driver has
    loaded the page, so it reflects whatever scripts have inserted by then.

    Args:
        url: Address of the page to open.
        driver: Selenium webdriver instance used to load the page.

    Returns:
        A BeautifulSoup object built from the body's inner HTML using the
        'html.parser' backend.
    """
    driver.get(url)
    rendered_body = driver.execute_script('return document.body.innerHTML')
    return BeautifulSoup(rendered_body, 'html.parser')

### Scrape function

In [403]:
def scrape_yel_page(site,driver):
    """Collect the candidate review elements from a Yelp business page.

    Args:
        site: URL of the Yelp business page.
        driver: Selenium webdriver used to render the page.

    Returns:
        list: The <span> elements whose class attribute is
        ' raw__09f24__T4Ezm', in document order. The list is empty when no
        document was parsed or nothing matched, so callers always receive a
        list they can slice and iterate.
    """
    print ('-'*20,'Scraping Yelp page review','-'*20)
    # Render the page (JavaScript included) before parsing the HTML.
    soup = get_js_soup(site,driver)
    if not soup:
        # Return an empty list rather than a sentinel string: downstream code
        # slices and copies the result, which a string would break.
        return []
    # NOTE(review): the class name (leading space included) looks like a
    # build-generated identifier copied from Yelp's markup -- expect to update
    # it when the page's CSS changes.
    return list(soup.find_all('span', class_=' raw__09f24__T4Ezm'))
    

### Get the reviews

In [404]:
# Scrape the configured page. `reviews` receives every element that matches
# the review CSS class, which also includes non-review text (e.g. the address
# lines), so it still needs trimming before analysis.

reviews = scrape_yel_page(site,driver)
print(reviews)

-------------------- Scraping Yelp page review --------------------
[<span class=" raw__09f24__T4Ezm">1807 S Neil St</span>, <span class=" raw__09f24__T4Ezm">Champaign, IL 61820</span>, <span class=" raw__09f24__T4Ezm">Yelp users haven’t asked any questions yet about <strong>Everyday Kitchen - Champaign</strong>.</span>, <span class=" raw__09f24__T4Ezm" lang="en">Excellent service! The waitress was super sweet and not sure who it was possibly the owner kindly brought some water for mg doggo. <br/><br/>The breakfast sandwich may sound like any normal one however there is this delicious relish that makes it phenomenal. <br/><br/>Here a short time but coming back because both food and service leaves you feel so good!</span>, <span class=" raw__09f24__T4Ezm">Hi Payal,<br/><br/>I know how precious time is and it means the world that you would take the time to share your experience.  I am so happy that we were able to provide you with an excellent experience and we look forward to welcoming 

### Clean the reviews
- Remove all HTML tags

In [405]:
# Every span sharing the review CSS class was scraped, so non-review matches
# must be trimmed. The first three are the two address lines and the
# "no questions yet" notice.
# NOTE(review): the last three are assumed to be non-review text as well --
# confirm against the page; both offsets depend on Yelp's current layout.
# NOTE(review): business-owner replies use the same class and are kept here,
# so they are scored like customer reviews -- confirm this is intended.
review_spans = reviews[3:-3]  # slicing already yields a new list
cleaned_data = []
for review in review_spans:
    # get_text() drops every tag (not just specific <span> spellings) and
    # decodes HTML entities. The space separator keeps the words on either
    # side of a <br/> from being glued into a single token.
    new_review = review.get_text(separator=' ')
    new_review = new_review.replace('\\', '')
    # \xa0 is a non-breaking space; swap it for a regular space.
    new_review = new_review.replace('\xa0', ' ')
    cleaned_data.append(new_review)
print(len(cleaned_data))

17


### Now do some analysis

- Install the metapy library so we can tokenize the reviews

In [406]:
!pip install metapy pytoml

[33mDEPRECATION: Python 3.5 reached the end of its life on September 13th, 2020. Please upgrade your Python as Python 3.5 is no longer maintained. pip 21.0 will drop support for Python 3.5 in January 2021. pip 21.0 will remove support for this functionality.[0m


In [407]:
import metapy 

In [408]:
def tok_string(str):
    """Turn raw text into a list of normalized tokens using metapy.

    The text passes through an ICU tokenizer (created with
    suppress_tags=True), then a lowercase filter, a length filter
    (min=2, max=15) and finally a Porter2 stemming filter.

    Args:
        str: Text to tokenize. The name shadows the built-in `str` inside
            this function; it is kept so existing callers keep working.

    Returns:
        list: The tokens produced by the filter chain, in order.
    """
    doc = metapy.index.Document()
    doc.content(str)
    # Build the filter chain inside-out: tokenizer -> lowercase -> length -> stem.
    stream = metapy.analyzers.Porter2Filter(
        metapy.analyzers.LengthFilter(
            metapy.analyzers.LowercaseFilter(
                metapy.analyzers.ICUTokenizer(suppress_tags=True)
            ),
            min=2,
            max=15
        )
    )
    stream.set_content(doc.content())
    return [token for token in stream]

- start tokenizing

In [409]:
# Run every cleaned review through the tokenizer: one token list per review.
tokenized_review = [tok_string(text) for text in cleaned_data]
print(tokenized_review)

[['excel', 'servic', 'the', 'waitress', 'was', 'super', 'sweet', 'and', 'not', 'sure', 'who', 'it', 'was', 'possibl', 'the', 'owner', 'kind', 'brought', 'some', 'water', 'for', 'mg', 'doggo', 'the', 'breakfast', 'sandwich', 'may', 'sound', 'like', 'ani', 'normal', 'one', 'howev', 'there', 'is', 'this', 'delici', 'relish', 'that', 'make', 'it', 'phenomen', 'here', 'short', 'time', 'but', 'come', 'back', 'becaus', 'both', 'food', 'and', 'servic', 'leav', 'you', 'feel', 'so', 'good'], ['hi', 'payal', 'know', 'how', 'precious', 'time', 'is', 'and', 'it', 'mean', 'the', 'world', 'that', 'you', 'would', 'take', 'the', 'time', 'to', 'share', 'your', 'experi', 'am', 'so', 'happi', 'that', 'we', 'were', 'abl', 'to', 'provid', 'you', 'with', 'an', 'excel', 'experi', 'and', 'we', 'look', 'forward', 'to', 'welcom', 'you', 'back', 'our', 'main', 'restaur', 'will', 'be', 're', 'open', 'on', 'juli', '1st', 'we', 'hope', 'we', 'will', 'be', 'abl', 'to', 'serv', 'you', 'at', 'dinner', 'or', 'brunch', '

### Tokenize the positive and negative datasets

- Load the data, then tokenize it

In [410]:
# Read each sentiment file in a single call; the `with` blocks close the
# file handles.
with open('positives.txt','r') as f:
    pos_word = f.read()

with open('negatives.txt','r') as f2:
    neg_word = f2.read()

# Tokenize both files with the same pipeline used for the reviews so the
# resulting tokens can be compared directly with review tokens.
pos_token = tok_string(pos_word)
neg_token = tok_string(neg_word)

### Find the number of positive and negative reviews of the restaurant

In [414]:
# Membership tests dominate this loop, so look words up in sets instead of
# scanning the full token lists once per word.
pos_vocab = set(pos_token)
neg_vocab = set(neg_token)

pos = 0
neg = 0
for review in tokenized_review:
    # Count how many tokens of this review appear in each vocabulary.
    num_of_pos = sum(1 for word in review if word in pos_vocab)
    num_of_neg = sum(1 for word in review if word in neg_vocab)
    if num_of_pos > num_of_neg:
        pos += 1
    elif num_of_pos < num_of_neg:
        neg += 1
    # A tie is counted as neither positive nor negative.

# Total number of reviews examined, ties included.
count = len(tokenized_review)

In [415]:
# Positive count on the first line, negative count on the second.
print(pos, neg, sep='\n')

17
0


### Create JSON

In [416]:
import json

# Keep the counts dict and its serialized form under separate names so that
# `restaurant_review` only ever holds the JSON string.
review_counts = {
    "positive": pos,
    "negative": neg
}
# sort_keys pins the key order, so the JSON text is identical on every run,
# including interpreters whose dicts do not preserve insertion order.
restaurant_review = json.dumps(review_counts, sort_keys=True)
print(restaurant_review)

{"negative": 0, "positive": 17}
