In [1]:
import numpy as np 
import pandas as pd

import requests 
#for sending a request to the websites in this case yelp

from bs4 import BeautifulSoup
#will give us the potential to format the data and only get what we want and also scrape from the website

import re 
#for matching strings patterns will help us when we scrape review comments from yelp

import torch
#this library we're going to use in our model aswell for argmax function

from transformers import AutoTokenizer, AutoModelForSequenceClassification 
#allow us to get a pre-trained model that we'll use

In [2]:
tokenizer = AutoTokenizer.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')
model = AutoModelForSequenceClassification.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')
#loading the bert model
#make sure you download the libraries before running this notebook

In [3]:
req = requests.get('https://www.yelp.com/biz/thats-amore-woodfire-pizza-san-francisco')
#sedning request to yelp.com/biz/thats-amore-woodfire-pizza-san-francisco
#the return data type is a response
req #200 indicates that it was successful


<Response [200]>

In [4]:
req.text # this is the whole html file with all the data in it
# and we only want the comments to make the sentiment analysis

'<!DOCTYPE html><html lang="en-US" prefix="og: http://ogp.me/ns#" style="margin: 0;padding: 0; border: 0; font-size: 100%; font: inherit; vertical-align: baseline;"><head><script>document.documentElement.className=document.documentElement.className.replace(/\x08no-js\x08/,"js");</script><meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /><meta http-equiv="Content-Language" content="en-US" /><meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no"><link rel="mask-icon" sizes="any" href="https://s3-media0.fl.yelpcdn.com/assets/srv0/yelp_large_assets/b2bb2fb0ec9c/assets/img/logos/yelp_burst.svg" content="#FF1A1A"><link rel="shortcut icon" href="https://s3-media0.fl.yelpcdn.com/assets/srv0/yelp_large_assets/dcfe403147fc/assets/img/logos/favicon.ico"><script> window.ga=window.ga||function(){(ga.q=ga.q||[]).push(arguments)};ga.l=+new Date;window.ygaPageStartTime=new Date().getTime();</script><script>\n            window.yelp = window.yelp || {};\

In [5]:
soup = BeautifulSoup(req.text, 'html.parser')
# making a soup variable to utilize the beautiful soup library html.parser indicating it's a html file to deal with

In [6]:
print(type(soup))
print(type(req.text))

<class 'bs4.BeautifulSoup'>
<class 'str'>


In [8]:
comment = re.compile('.*comment.*') #getting an expression of 'comment' as every class elemnt of a comment has 'comment' in it
results = soup.find_all('p', {'class':comment}) #now we're only selecting elemts of class that has 'comment' in it


In [9]:
results_modified =[]
for result in (results):
    results_modified.append(result.text)
    

In [11]:
results_modified[0]
#this is the data without any unnecessary elements

"We may have a new favorite pizza restaurant. Wood fired pizza. Great prices. Super friendly staff. Neighborhood place with nice atmosphere.We just returned from Italy--and took a pizza making class in the birthplace of pizza, Naples. We sucked at it... but we can affirm this is authentic Neapolitan crust!All pizzas are one size, medium. We've tried the Genovese (mozzarella, pesto, ricotta, sun dried, tomatoes) and the St. Clair Taekwondo twice (named for a neighboring taekwondo place--mozzarella, sausage, basil, olive oil).The salads are good size and good variety--like a grain bowl ( quinoa, etc), or spinach with roasted almonds. The mixed green includes beets!Beer on tap and by the bottle, plus wine.Eat in special: buy my 2 pizzas and get a free salad--which would feed four light eaters. Re atmosphere: games on the tv on mute--so you can enjoy or ignore.My only complaint... padded seats would be more comfortable. ( I have a skinny, flat ass--no padding)"

In [12]:
arr= np.array(results_modified) #converting the array into a np array so we can use it in pandas
df = pd.DataFrame(arr,columns = ["review"]) #getting a dataframe so we can manipulate data easier
df.head()

Unnamed: 0,review
0,We may have a new favorite pizza restaurant. W...
1,The best pizza in ocean!!We ordered pesto chee...
2,"Great wood fired oven pizzas!One size, great f..."
3,The fetuccine alfredo pasta was creamy but ver...
4,Just passing through as we flew into SFO ... a...


In [13]:
df['sentiment_value'] = 0
for i in range(len(df['sentiment_value'])):
    tokens = tokenizer.encode(df['review'][i], return_tensors='pt') #takes every review and slices it into tokens in a list
    result = model(tokens) #takes the sliced tokens and then put it in the bert model to predict the value
    df['sentiment_value'][i] = int(torch.argmax(result.logits))+1 # changing the data type to an int value from 1 to 5 and 
    #agrmax is finding the max in the output array which is going to represent the value
    #1 is very bad 5 is the best

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sentiment_value'][i] = int(torch.argmax(result.logits))+1 # changing the data type to an int value from 1 to 5 and


In [16]:
df.head()

Unnamed: 0,review,sentiment_value
0,We may have a new favorite pizza restaurant. W...,4
1,The best pizza in ocean!!We ordered pesto chee...,5
2,"Great wood fired oven pizzas!One size, great f...",5
3,The fetuccine alfredo pasta was creamy but ver...,3
4,Just passing through as we flew into SFO ... a...,5


In [17]:
df['sentiment_value'].iloc[0]

4

In [18]:
df.describe()

Unnamed: 0,sentiment_value
count,10.0
mean,4.7
std,0.674949
min,3.0
25%,5.0
50%,5.0
75%,5.0
max,5.0


In [None]:
# lets check another resturaunt in yelp


In [19]:
req2=requests.get('https://www.yelp.com/biz/marufuku-ramen-san-francisco-5')
soup2 = BeautifulSoup(req.text, 'html.parser')
comment2 = re.compile('.*comment.*')
results2 = soup.find_all('p', {'class':comment})
results_modified2 =[]
for result in (results2):
    results_modified2.append(result.text)
arr2 = np.array(results_modified2) 
df2 = pd.DataFrame(arr2,columns = ["review"]) 
df2['sentiment_value'] = 0
for i in range(len(df2['sentiment_value'])):
    tokens2 = tokenizer.encode(df2['review'][i], return_tensors='pt') 
    result2 = model(tokens) 
    df2['sentiment_value'][i] = int(torch.argmax(result2.logits))+1
#this code is all the previous code but just in one cell


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['sentiment_value'][i] = int(torch.argmax(result2.logits))+1


In [20]:
df2

Unnamed: 0,review,sentiment_value
0,We may have a new favorite pizza restaurant. W...,5
1,The best pizza in ocean!!We ordered pesto chee...,5
2,"Great wood fired oven pizzas!One size, great f...",5
3,The fetuccine alfredo pasta was creamy but ver...,5
4,Just passing through as we flew into SFO ... a...,5
5,This is exactly what this neighborhood needs- ...,5
6,Great food - thin traditional woodfire pizza a...,5
7,Yes! Try this place first time and I was very ...,5
8,We loved this relaxed atmosphere Italian resta...,5
9,My girlfriend and I really enjoyed the pizza t...,5
