In [250]:
from bs4 import BeautifulSoup
from selenium import webdriver 
from selenium.webdriver.chrome.options import Options
import re 
import urllib
import time

In [251]:
# Configure Chrome to run headless, i.e. without opening a visible browser window.
options = Options()
options.headless = True
# Use the chromedriver build that matches your installed Chrome version:
# https://chromedriver.chromium.org/downloads
# The driver binary is looked up at ./chromedriver, relative to the working directory.
# NOTE(review): newer Selenium releases changed how headless mode and the driver
# path are supplied - confirm this call against the installed Selenium version.
driver = webdriver.Chrome('./chromedriver',options=options) 

### Utils

In [252]:
#uses webdriver object to execute javascript code and get dynamically loaded webcontent
def get_js_soup(url,driver):
    """Load ``url`` in ``driver`` and parse the rendered page body.

    The body HTML is read back through JavaScript after navigating, so the
    markup is whatever the browser holds in ``document.body`` at that moment.

    Args:
        url: Address handed to ``driver.get``.
        driver: Object exposing ``get`` and ``execute_script`` (the Selenium
            WebDriver created earlier in this notebook).

    Returns:
        A ``BeautifulSoup`` object built from the body HTML with the
        ``'html.parser'`` backend.
    """
    driver.get(url)
    rendered_body = driver.execute_script('return document.body.innerHTML')
    return BeautifulSoup(rendered_body, 'html.parser')

### Scrape function

In [253]:
def scrape_yel_page(site,driver):
    """Collect the review ``<span>`` elements from a rendered Yelp page.

    Args:
        site: URL of the page to load.
        driver: Browser driver passed through to ``get_js_soup``.

    Returns:
        list: The elements returned by ``find_all`` for ``<span>`` tags with
        the review-text class. The list is empty when nothing matches or when
        no parsed document is available, so callers can always treat the
        result as a list (copy it, slice it, iterate it).
    """
    print ('-'*20,'Scraping Yelp page review','-'*20)
    # Run the page's JavaScript in the browser, then parse the loaded HTML.
    soup = get_js_soup(site,driver)
    if not soup:
        # This branch used to return the string "it's empty", which broke
        # callers that expect a list (e.g. ``reviews.copy()``).
        return []
    # The class string is kept exactly as written, leading space included.
    # NOTE(review): it looks like a generated class name and may change when
    # the site is redeployed - verify if the result comes back empty.
    return list(soup.find_all('span', class_=' raw__09f24__T4Ezm'))
    

### Get the reviews

In [263]:
# Scrape the reviews from one Yelp business page.
# Alternative restaurant page, kept for reference:
#site ='https://www.yelp.com/biz/everyday-kitchen-champaign-champaign?osq=Restaurants'
site = 'https://www.yelp.com/biz/naya-bar-and-eatery-champaign-3?osq=Restaurants' # URL of the Yelp business page to scrape
reviews = scrape_yel_page(site,driver)
#print(reviews)

-------------------- Scraping Yelp page review --------------------


### Clean the reviews
- Remove all HTML tags

In [264]:
# Keep only the middle of the scraped list: the first three and the last three
# matches are dropped before cleaning.
# NOTE(review): presumably those are non-review spans - confirm against the page.
temp = reviews.copy()[3:-3]

# Literal fragments to rewrite, applied in this order as (old, new) pairs.
_REPLACEMENTS = (
    ('<span class=" raw__09f24__T4Ezm" lang="en">', ''),
    ('<span class=" raw__09f24__T4Ezm">', ''),
    ('<br/>', ''),
    ('</span>', ''),
    ('\\', ''),
    # \xa0 is a non-breaking space; swap it for one ordinary space.
    (u'\xa0', u' '),
)

cleaned_data = []
for review in temp:
    new_review = str(review)
    for old, new in _REPLACEMENTS:
        new_review = new_review.replace(old, new)
    cleaned_data.append(new_review)
print(len(cleaned_data))

11


### Now do some analysis

- Install the metapy library so we can tokenize the reviews

In [265]:
!pip install metapy pytoml

[33mDEPRECATION: Python 3.5 reached the end of its life on September 13th, 2020. Please upgrade your Python as Python 3.5 is no longer maintained. pip 21.0 will drop support for Python 3.5 in January 2021. pip 21.0 will remove support for this functionality.[0m


In [266]:
import metapy 

In [267]:
def tok_string(str):
    """Tokenize a piece of text with a metapy analyzer chain.

    The chain is built innermost-first: ICU tokenizer (with
    ``suppress_tags=True``), lowercase filter, length filter (``min=2``,
    ``max=15``), then the Porter2 filter.

    Args:
        str: Text to tokenize. The name shadows the built-in inside this
            function; it is kept so existing keyword callers still work.

    Returns:
        list: The tokens produced by iterating the finished chain.
    """
    document = metapy.index.Document()
    document.content(str)
    pipeline = metapy.analyzers.Porter2Filter(
        metapy.analyzers.LengthFilter(
            metapy.analyzers.LowercaseFilter(
                metapy.analyzers.ICUTokenizer(suppress_tags=True)
            ),
            min=2,
            max=15
        )
    )
    # Feed the text back out of the document object, as the chain expects raw content.
    pipeline.set_content(document.content())
    return [piece for piece in pipeline]

- start tokenizing

In [268]:
# One token list per cleaned review, in the same order as cleaned_data.
tokenized_review = [tok_string(review) for review in cleaned_data]
#print(tokenized_review)

### Tokenize positive and negative dataset

- import the data then tokenize it

In [269]:
def _read_text(path):
    """Return the full contents of the text file at ``path``."""
    with open(path,'r') as handle:
        return handle.read()


# Load the three word lists as single strings.
pos_word = _read_text('positives.txt')
neg_word = _read_text('negatives.txt')

# NOTE(review): not read by any cell shown here; kept in case a later cell uses it.
count = 0
neg2_word = _read_text('negatives2.txt')

# Run the word lists through the same tokenizer as the reviews so that the
# stemmed, lowercased forms line up.
pos_token = tok_string(pos_word)
neg_token = tok_string(neg_word)
neg2_token = tok_string(neg2_word)

### Find the number of positive and negative reviews of the restaurant

In [270]:
# Build lookup sets once: testing membership against the token lists scans the
# whole list for every word, while a set lookup is constant time on average.
pos_vocab = set(pos_token)
neg_vocab = set(neg_token) | set(neg2_token)

pos = 0
neg = 0
for review in tokenized_review:
    # Every occurrence counts, so a repeated word contributes more than once.
    num_of_pos = sum(1 for word in review if word in pos_vocab)
    # A word found in either negative list counts once per occurrence.
    num_of_neg = sum(1 for word in review if word in neg_vocab)
    # Ties (including reviews with no matches at all) are counted as negative.
    if num_of_pos > num_of_neg:
        pos += 1
    else:
        neg += 1

In [273]:
# Show the positive count, then the negative count, one per line.
print(pos, neg, sep='\n')

11
0


### Create json

In [276]:
import json

# Serialise the two tallies. After this cell the name holds the JSON text,
# not a dict.
restaurant_review = json.dumps({
    "positive": pos,
    "negative": neg
})
print(restaurant_review)

{"negative": 0, "positive": 11}
