# Dependencies

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification  # for using pretrained mode from huggingface.co
import torch
import requests
from bs4 import BeautifulSoup
import re
import numpy as np
import pandas as pd


# Input for Yelp place

In [2]:
# Yelp business page whose reviews will be scraped.
url = 'https://www.yelp.co.uk/biz/federal-cafe-and-bar-manchester-2'
# Number of review pages to request from that business page.
total_page = 10

# Getting Reviews from the URL

In [3]:
reviews = []

# Build the paginated base URL under a separate name so that re-running this
# cell does not keep appending '?start=' to the configured `url`.
paged_url = url + '?start='

# User reviews are under a class like "comment__09f24__gu0rG css-1sufhje", so any
# <p> whose class contains "comment" is selected. Compiled once, outside the loop.
regex = re.compile('.*comment.*')

# Scrape every requested page; the `start` offset advances by 10 per page
# (presumably Yelp's page size - confirm if the site layout changes).
for page in range(0, total_page * 10, 10):
    pageurl = paged_url + str(page)
    print("scraping url {}".format(pageurl))
    # A timeout keeps a stalled connection from hanging the notebook forever.
    r = requests.get(pageurl, timeout=30)
    # Fail loudly on HTTP errors instead of silently parsing an error page
    # and ending up with zero reviews for that page.
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')
    results = soup.find_all('p', {'class': regex})
    # Extend in place rather than rebuilding the whole list on each page.
    reviews.extend(result.text for result in results)

scraping url https://www.yelp.co.uk/biz/federal-cafe-and-bar-manchester-2?start=0
scraping url https://www.yelp.co.uk/biz/federal-cafe-and-bar-manchester-2?start=10
scraping url https://www.yelp.co.uk/biz/federal-cafe-and-bar-manchester-2?start=20
scraping url https://www.yelp.co.uk/biz/federal-cafe-and-bar-manchester-2?start=30
scraping url https://www.yelp.co.uk/biz/federal-cafe-and-bar-manchester-2?start=40
scraping url https://www.yelp.co.uk/biz/federal-cafe-and-bar-manchester-2?start=50
scraping url https://www.yelp.co.uk/biz/federal-cafe-and-bar-manchester-2?start=60
scraping url https://www.yelp.co.uk/biz/federal-cafe-and-bar-manchester-2?start=70
scraping url https://www.yelp.co.uk/biz/federal-cafe-and-bar-manchester-2?start=80
scraping url https://www.yelp.co.uk/biz/federal-cafe-and-bar-manchester-2?start=90


In [4]:
# Wrap the scraped review texts in a one-column DataFrame named 'review'.
review_array = np.array(reviews)
df = pd.DataFrame(review_array, columns=['review'])
# Preview the first three rows.
df.head(3)

Unnamed: 0,review
0,This review is solely based on the take out me...
1,I've walked by this place a lot and decided to...
2,Federal Cafe & Bar is one of Manchester's most...


# Pretrained BERT model and tokenizer

In [5]:
# Hugging Face checkpoint shared by the tokenizer and the classifier.
checkpoint = 'nlptown/bert-base-multilingual-uncased-sentiment'

# Tokenizer that matches the pretrained sentiment model.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Pretrained BERT sequence-classification model used for scoring reviews.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

In [6]:
# Sentiment score function
def sentiment_score(review):
    """Return the predicted star rating for a single review.

    Args:
        review: The review text to score.

    Returns:
        int: The 1-based position of the largest logit. The model emits one
        value per star rating, so this is the predicted number of stars.
    """
    # Truncate at the token level so long reviews cannot exceed the model's
    # input limit (512 tokens for this BERT checkpoint).
    tokens = tokenizer.encode(
        review,
        return_tensors='pt',
        truncation=True,
        max_length=512,
    )
    # Inference only: skip gradient tracking to save memory and time.
    with torch.no_grad():
        result = model(tokens)
    # torch.argmax returns the 0-based index of the highest logit; +1 turns
    # that index into a star rating.
    return int(torch.argmax(result.logits)) + 1

In [7]:
# Score every review and store the result in a new 'sentiment' column.
# Only the first 512 characters of each review are passed to the scorer
# (the slice counts characters, not tokens).
df['sentiment'] = df['review'].map(lambda text: sentiment_score(text[:512]))


In [8]:
# Mean of the predicted star ratings across all scraped reviews.
average_rating = df['sentiment'].mean()
print('Average rating of this place is', average_rating)

Average rating of this place is 4.2772277227722775
