## 1. Install and import dependencies

In [104]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import requests
from bs4 import BeautifulSoup
import re

## 2. Instantiate the model

In [106]:
tokenizer = AutoTokenizer.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment") # Can also try with 'https://www.yelp.com/biz/social-brew-cafe-pyrmont'
model = AutoModelForSequenceClassification.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

## 3. Encode and calculate sentiment score

In [108]:
tokens = tokenizer.encode("I hate this, this is the worst", return_tensors='pt')

In [109]:
tokens

tensor([[  101,   151, 39487, 10372,   117, 10372, 10127, 10103, 43060,   102]])

In [110]:
tokens[0] # decoding test

tensor([  101,   151, 39487, 10372,   117, 10372, 10127, 10103, 43060,   102])

In [111]:
tokenizer.decode(tokens[0]) # encoding test

'[CLS] i hate this, this is the worst [SEP]'

In [112]:
result = model(tokens)

In [113]:
result     

SequenceClassifierOutput(loss=None, logits=tensor([[ 4.9954,  1.5542, -0.9295, -2.9246, -1.8990]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

Understand result: 
- The output from the model is a one-hot ecoded list of scores. 
- The position with the highest score represents the sentiment of the text.  

In [115]:
torch.argmax(result.logits)+1

tensor(1)

In [116]:
# Another example
tokens = tokenizer.encode("It was good but could have been better", return_tensors='pt')

In [158]:
# Calculate the sentiment
result = model(tokens)

In [162]:
# Extract the sentiment score
result 

SequenceClassifierOutput(loss=None, logits=tensor([[-1.9994,  0.0689,  2.2148,  1.2780, -1.3454]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [119]:
torch.argmax(result.logits)+1

tensor(3)

## 4. Collect Reviews

In [137]:
r = requests.get('https://www.yelp.com/biz/mejico-sydney-2') # Assign the result of the GET request to `r`
soup = BeautifulSoup(r.text, 'html.parser') # Parse the content with BeautifulSoup
regex = re.compile('.*comment.*') # Compile the regex for finding elements with 'comment' in their class names
results = soup.find_all('p', attrs={'class':regex}) # 'p' stands for paragraph
reviews = [result.text for result in results]

In [139]:
results[0].text # Just get the text, not the HTML tag

"Seated without a booking on a super busy Saturday night. Lovely, warm, and Theo right hostess also looked after our table and went out of her way to give detailed ingredients in every dish to avoid allergies for one of us. And the food was great! Guacamole made right at our table, everything prepared with our allergies in mind, and great dish recommendations. We'd been visiting Sydney for about a week from Melbourne, and this was by far our best dining experience. I'd definitely return here in the future."

## 5. Load reviews into dataframe and score

In [144]:
import pandas as pd
import numpy as np

In [150]:
df = pd.DataFrame(np.array(reviews), columns=['review'])

In [152]:
df.head()

Unnamed: 0,review
0,Seated without a booking on a super busy Satur...
1,The food was decent not great.. We had the gu...
2,"Food was okay, guacamole was below average. Se..."
3,The food and service here was really good. It...
4,Visiting from Texas and decided to give this r...


In [154]:
df['review'].iloc[0]

"Seated without a booking on a super busy Saturday night. Lovely, warm, and Theo right hostess also looked after our table and went out of her way to give detailed ingredients in every dish to avoid allergies for one of us. And the food was great! Guacamole made right at our table, everything prepared with our allergies in mind, and great dish recommendations. We'd been visiting Sydney for about a week from Melbourne, and this was by far our best dining experience. I'd definitely return here in the future."

In [156]:
def sentiment_score(review):
    tokens = tokenizer.encode(review, return_tensors='pt')
    result = model(tokens)
    return torch.argmax(result.logits)+1

In [164]:
sentiment_score(df['review'].iloc[1])

tensor(2)

In [166]:
# Calculate the sentiment score for all reviews and store in a new column in the dataframe
df['sentiment'] = df['review'].apply(lambda x: sentiment_score(x[:512])) # loop thro|ugh each review 

In [168]:
df

Unnamed: 0,review,sentiment
0,Seated without a booking on a super busy Satur...,tensor(5)
1,The food was decent not great.. We had the gu...,tensor(2)
2,"Food was okay, guacamole was below average. Se...",tensor(2)
3,The food and service here was really good. It...,tensor(5)
4,Visiting from Texas and decided to give this r...,tensor(5)
5,Don't come here expecting legit Mexican food b...,tensor(3)
6,Out of all the restaurants that I tried in Syd...,tensor(5)
7,"Great atmosphere, attentive service, solid mar...",tensor(3)
8,We came here on a Thursday night @ 5pm and by ...,tensor(4)
9,The food is fresh and tasty. The scallop cevi...,tensor(4)


In [170]:
df['review'].iloc[3]

'The food and service here was really good.  It was more like tapas food than Mexican food!  The drinks were amazing too!'