In [1]:
import re
from urllib.parse import urljoin

import pandas as pd
import requests
import torch
from bs4 import BeautifulSoup
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [2]:
# Hugging Face checkpoint shared by the tokenizer and the classifier. Keeping
# the name in one constant guarantees the two are always loaded from the same
# checkpoint (previously the literal was repeated in both calls).
MODEL_NAME = 'nlptown/bert-base-multilingual-uncased-sentiment'

# Both calls download from huggingface.co on first use, so they need network
# access unless the checkpoint is already in the local cache.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

'HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /nlptown/bert-base-multilingual-uncased-sentiment/resolve/main/tokenizer_config.json (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000001CA2A1E2AD0>, 'Connection to huggingface.co timed out. (connect timeout=10)'))' thrown while requesting HEAD https://huggingface.co/nlptown/bert-base-multilingual-uncased-sentiment/resolve/main/tokenizer_config.json


### Proof of Concept

In [3]:
# Human-readable label for each rating from 1 (worst) to 5 (best).
# Keys are 1-based, so a zero-based class index needs +1 before lookup.
output_mapping = dict(
    enumerate(
        ["Terrible", "Bad", "Neutral", "Good", "Amazing"],
        start=1,
    )
)

In [4]:
# Small hand-written sample of reviews used to sanity-check the classifier.
data = dict(
    Review=[
        "This food is absolutely amazing! I love it.",
        "The food was good, but the service was terrible.",
        "I had a neutral experience with the food.",
        "The taste of the food was bad, but the ambiance was great.",
        "This place serves the best food in town. I'm a fan!",
    ],
)
df = pd.DataFrame.from_dict(data)

In [5]:
# Display the sample reviews before any predictions are added.
df

Unnamed: 0,Review
0,This food is absolutely amazing! I love it.
1,"The food was good, but the service was terrible."
2,I had a neutral experience with the food.
3,"The taste of the food was bad, but the ambianc..."
4,This place serves the best food in town. I'm a...


In [6]:
# Single clearly negative sentence used to smoke-test the model end to end.
text = "This is the worst thing I have ever experienced"

In [7]:
# Encode the sentence into token ids; return_tensors='pt' asks for a PyTorch
# tensor so the result can be passed straight to the model.
tokens = tokenizer.encode(text, return_tensors='pt')

In [8]:
# Inference only: torch.no_grad() stops PyTorch from recording an autograd
# graph for the forward pass, which is wasted memory when nothing is trained.
with torch.no_grad():
    result = model(tokens)

In [9]:
# Raw (unnormalised) class scores; the position of the largest value is the
# predicted class index.
result.logits

tensor([[ 4.8149,  1.8245, -0.6126, -3.0968, -2.1913]],
       grad_fn=<AddmmBackward0>)

In [10]:
# The winning class index is zero-based; shift it to the 1-based keys of
# output_mapping before looking up the label.
best_index = result.logits.argmax().item()
model_output = best_index + 1
predicted_label = output_mapping[model_output]
print("Predicted Label: {}".format(predicted_label))


Predicted Label: Terrible


In [11]:
def predict_sentiment(review):
    """Classify one review with the module-level ``tokenizer`` and ``model``.

    Parameters
    ----------
    review : str
        Review text to score.

    Returns
    -------
    tuple
        ``(model_output, predicted_label)`` where ``model_output`` is the
        1-based position of the highest logit and ``predicted_label`` is the
        matching entry of ``output_mapping``.
    """
    # Tokenize the text; truncation=True asks the tokenizer to cut over-long input
    encoded_text = tokenizer(review, padding=True, truncation=True, return_tensors="pt")

    # Inference only: without no_grad() every call records an autograd graph,
    # which costs memory and time when scoring many reviews.
    with torch.no_grad():
        result = model(**encoded_text)

    # argmax is zero-based, the keys of output_mapping start at 1
    model_output = int(torch.argmax(result.logits)) + 1
    predicted_label = output_mapping[model_output]
    return model_output, predicted_label

In [12]:
# predict_sentiment returns a (rating, label) tuple per review; zip(*...)
# transposes that sequence of tuples into two parallel columns.
df['Output'], df['Sentiment'] = zip(*df['Review'].apply(predict_sentiment))


In [13]:
# Display the sample reviews together with the predicted rating and label.
df

Unnamed: 0,Review,Output,Sentiment
0,This food is absolutely amazing! I love it.,5,Amazing
1,"The food was good, but the service was terrible.",2,Bad
2,I had a neutral experience with the food.,3,Neutral
3,"The taste of the food was bad, but the ambianc...",3,Neutral
4,This place serves the best food in town. I'm a...,5,Amazing


### Tested Performance on Live Yelp Reviews

In [52]:
# Build the Yelp business-page URL from the restaurant's display name:
# lowercase the name and join its space-separated words with hyphens.
restaurant = "Lucy Liu Kitchen and Bar Melbourne"
processed_text = '-'.join(restaurant.lower().split(' '))
base_url = 'https://www.yelp.com/biz/' + processed_text

processed_text

'lucy-liu-kitchen-and-bar-melbourne'

In [59]:
# Scrape the restaurant's review pages, following the "Next" link page by page.
#
# Fixes over the previous version of this cell:
#   * the loop now requests the *next* page instead of re-fetching base_url
#     forever (the computed next URL was never used);
#   * reviews from every page are accumulated in all_reviews rather than
#     overwritten, and `reviews` is always bound for the cells below;
#   * a single lookup decides whether a next page exists, so a missing link
#     ends the loop instead of raising TypeError on None['href'];
#   * the request has a timeout and a non-2xx response stops the crawl.
MAX_PAGES = 10        # hard cap so a pagination cycle cannot hammer the site
REQUEST_TIMEOUT = 10  # seconds; requests applies no timeout unless one is given

all_reviews = []
reviews_found = False

# Review text sits in <p> tags whose class contains "comment"; compile once.
comment_class = re.compile('.*comment.*')

page_url = base_url
visited = set()

while page_url and page_url not in visited and len(visited) < MAX_PAGES:
    visited.add(page_url)

    r = requests.get(page_url, timeout=REQUEST_TIMEOUT)
    if not r.ok:
        # Blocked or missing page: keep what was collected so far and stop.
        break

    soup = BeautifulSoup(r.text, 'html.parser')
    results = soup.find_all('p', class_=comment_class)
    if results:  # Reviews found
        reviews_found = True
        all_reviews.extend(p.text for p in results)

    # Locate the "Next" button; accept either class name the old code used.
    next_button = soup.find('a', class_='next-link')
    if next_button is None:
        next_button = soup.find('a', class_='next')
    next_url = next_button.get('href') if next_button is not None else None

    # urljoin handles both relative ("/biz/...?start=10") and absolute hrefs.
    page_url = urljoin(page_url, next_url) if next_url else None

# Downstream cells read `reviews`; keep it bound even when nothing was found.
reviews = all_reviews

In [60]:
# `reviews` may be unbound when the scraping cell found nothing, so only read
# it when reviews_found is set; otherwise build an empty frame instead of
# failing with NameError.
df = pd.DataFrame({'Review': reviews if reviews_found else []})

def predict_sentiment(review):
    """Return the 1-based position of the highest logit for one review.

    NOTE(review): this shadows the earlier ``predict_sentiment`` defined in the
    proof-of-concept section, which returns a ``(rating, label)`` tuple. The
    int return is kept here so existing callers keep working; consider giving
    one of the two functions a distinct name.

    Parameters
    ----------
    review : str
        Review text to score.

    Returns
    -------
    int
        Index of the largest logit plus one.
    """
    # Tokenize the text
    encoded_text = tokenizer(review, padding=True, truncation=True, return_tensors="pt")

    # Inference only: no_grad() avoids recording an autograd graph per review
    with torch.no_grad():
        result = model(**encoded_text)
    model_output = int(torch.argmax(result.logits)) + 1
    return model_output

if not reviews_found:
    df['Output'] = "Error: 1 No Reviews Found."
else:
    df['Output'] = df['Review'].apply(predict_sentiment)
    # The results table shows a Sentiment column that no visible cell creates
    # (presumably left over from kernel state); build it here so the notebook
    # reproduces under Restart & Run All.
    df['Sentiment'] = df['Output'].map(output_mapping)

In [62]:
# Display the scraped reviews with their predicted ratings.
df

Unnamed: 0,Review,Output,Sentiment
0,My colleague was working here for her second j...,5,Amazing
1,Very good Customer Service but food gave me ga...,3,Neutral
2,Very good food but rude employees and trashy i...,4,Good
3,The best culinary experience I've had for many...,5,Amazing
4,3 out of 5 dishes were excellent. Fried rice ...,3,Neutral
5,I've tried so many Asian fusion restaurants ar...,5,Amazing
6,Solid asian fusion place but some of the dishe...,3,Neutral
7,We arrived at the restaurant without a reserva...,5,Amazing
8,I really enjoyed both my meals at Lucy Liu (ca...,4,Good
9,Everything was absolutely great. Highly recomm...,5,Amazing
