# This file scrapes reviews and ratings from Yelp.com

In [1]:
import json
import httpx
from parsel import Selector
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup
import re
import dataframe_image as dfi

from IPython.core.interactiveshell import InteractiveShell 
InteractiveShell.ast_node_interactivity = "all"
import warnings
warnings.filterwarnings('ignore')

## 1.) Function Setup for Scraping

In [253]:
def review_and_ratings(url):
    """
    Download a single Yelp review page and return the raw html of its reviews and ratings.

    The page at `url` is fetched once and queried with two XPath selectors, one for the
    review-text spans and one for the star-rating divs. The first 4 review matches and the
    first 2 rating matches are discarded, and at most as many ratings as remaining reviews
    are kept. Return is a tuple of lists of html strings: (reviews, ratings).
    """
    # Fetch the page with an HTTP/2-enabled client that follows redirects
    with httpx.Client(http2=True, follow_redirects=True) as client:
        page = client.get(url)
    document = Selector(text=page.text)

    # The class names below are tied to Yelp's current markup and have changed before
    review_nodes = document.xpath(r'//span[@class=" raw__09f24__T4Ezm"]').getall()
    rating_nodes = document.xpath(r'//div[@class="yelp-emotion-9tnml4"]').getall() # Changed from css-14g69b3 -- 05/02/2024

    # Skip the leading matches (presumably not user reviews -- confirm against the live page)
    page_reviews = review_nodes[4:]
    # Ratings start two matches in; take no more than one per kept review
    page_ratings = rating_nodes[2:len(page_reviews) + 2]
    return page_reviews, page_ratings

In [269]:
def get_all_biz(url, stop):
    """
    Gather the reviews and ratings html of one restaurant across several result pages.

    `url` is the restaurant url ending in "start="; the page offset is appended to it.
    `stop` is the number of pages to fetch, using offsets 0, 10, ..., (stop - 1) * 10.
    Return is a tuple of lists: (reviews, ratings).
    """
    reviews, ratings = [], []
    # Yelp addresses pages by review offset, which advances in steps of 10
    for page_number in range(stop):
        offset = page_number * 10
        page_reviews, page_ratings = review_and_ratings(url + str(offset))
        reviews.extend(page_reviews)
        ratings.extend(page_ratings)
    return reviews, ratings

## 2.) Test to retrieve all reviews for two pages for GB Eats in Great Barrington

In [256]:
# Smoke test on GB Eats: fetch two pages (offsets 0 and 10) and keep the raw html lists
rev,rat = get_all_biz("https://www.yelp.com/biz/gb-eats-great-barrington?start=", 2)

100%|██████████| 5/5 [00:16<00:00,  3.21s/it]


## 3.) Create list of popular restaurant URLs in Great Barrington with stop page numbers

In [260]:
# Goal was 7000 reviews; this list yields approximately 2.5k
# Each entry is a (url, stop) pair: get_all unpacks it and passes both to get_all_biz, where stop is the number of pages to fetch
# Note every starter url must end with "start=" because the page offset is appended directly to the url
starter_urls = [("https://www.yelp.com/biz/gb-eats-great-barrington?start=", 17),
                ("https://www.yelp.com/biz/baba-louies-sourdough-pizza-great-barrington-3?start=", 35),
                ("https://www.yelp.com/biz/soco-creamery-great-barrington?start=", 19),
                ("https://www.yelp.com/biz/rio-cafe-great-barrington?start=", 7),
                ("https://www.yelp.com/biz/prairie-whale-great-barrington?osq=Restaurants&start=", 35),
                ("https://www.yelp.com/biz/the-well-great-barrington?osq=Restaurants&start=", 11),
                ("https://www.yelp.com/biz/aegean-breeze-restaurant-great-barrington-2?osq=Restaurants&start=", 20),
                ("https://www.yelp.com/biz/the-bistro-box-great-barrington?osq=Restaurants&start=", 18),
                ("https://www.yelp.com/biz/marketplace-kitchen-table-great-barrington?osq=Restaurants&start=", 11),
                ("https://www.yelp.com/biz/barrington-brewery-and-restaurant-great-barrington?osq=Restaurants&start=", 39),
                ("https://www.yelp.com/biz/xicohtencatl-great-barrington?osq=Restaurants&start=", 29),
                ("https://www.yelp.com/biz/pleasant-and-main-housatonic?osq=Restaurants&start=", 21)]

## 4.) Finally retrieve 2.5k reviews and ratings from 12 restaurants in Great Barrington

In [270]:
def get_all(urls=starter_urls):
    """
    This function gets all reviews and ratings html for every business in `urls`.

    Parameters:
        urls: iterable of (url, stop) pairs, where url ends in "start=" and stop is the
            number of pages to fetch for that business. Defaults to starter_urls.

    Returns:
        A tuple of lists (reviews, ratings). If scraping a business fails, the failure is
        printed and the results collected from the earlier businesses are returned.

    Raises:
        ValueError: if a business yields a different number of reviews and ratings,
            because the two lists could then no longer be paired by position.
    """
    reviews = []
    ratings = []
    for url, stop in tqdm(urls):
        try:
            rev, rat = get_all_biz(url, stop) # Get all reviews for a business
        except Exception as err:
            # Best effort: keep what has been scraped so far. Catching Exception rather
            # than using a bare except lets Ctrl-C (KeyboardInterrupt) stop a long run.
            print("Failed at: "+url)
            print(f"Reason: {err!r}")
            return reviews, ratings
        if len(rev) != len(rat):
            # Raise instead of calling exit(): exiting would kill the session and
            # throw away everything scraped up to this point.
            raise ValueError(
                f"Got {len(rev)} reviews but {len(rat)} ratings for {url}"
            )
        reviews += rev
        ratings += rat
    return reviews, ratings # All reviews for a list of urls

In [272]:
# Scrape every business in starter_urls; the recorded run below took roughly 13 minutes
rev,rat = get_all()

100%|██████████| 12/12 [12:46<00:00, 63.92s/it]


## 5.) Function setup to parse html and extract reviews and ratings from html components

In [308]:
def extract_text_from_html(html_string):
    """
    This function parses an html string with BeautifulSoup and returns all of its text
    content as a single string, with leading and trailing whitespace removed.
    """
    parsed = BeautifulSoup(html_string, 'html.parser') # Built-in parser, no extra dependency
    return parsed.get_text().strip() # Text of every tag, not only <p>

def extract_rating_from_html(html_string):
    """
    This function uses regex to find the number of stars announced in
    aria-label="N star rating" attributes of an html string.

    Return is a list with one string per match (e.g. ['4'] or ['4.5']), and an empty
    list when the html has no such label. Conversion to float is left to the caller.
    """
    star_pattern = re.compile(r'aria-label="(\d+(?:\.\d+)?)\sstar\srating"') # Captures whole or decimal star counts
    return star_pattern.findall(html_string)

## 6.) Iterate through scraped data to extract ratings and reviews and perform manual classification
##### Manual Classification is performed by labeling a review as:
* ##### positive if the number of stars > 3
* ##### negative if the number of stars <= 3

In [321]:
# Build the labelled dataset from the scraped html: one row per review that has a star rating
reviews = []
ratings = []
sentiment = []
for review_html, rating_html in zip(rev, rat):
    star = extract_rating_from_html(rating_html) # List of matched star strings, empty when there is no rating
    if not star:
        # Some entries don't have stars (disguised as business owner response, same XPath).
        # The check must be on emptiness: the list is never None, so star[0] would raise.
        continue
    stars = float(star[0])
    reviews.append(extract_text_from_html(review_html))
    ratings.append(stars)
    sentiment.append(1 if stars > 3.0 else 0) # Rule-based label: more than 3 stars is positive
save_df = pd.DataFrame(data={'review': reviews, 'rating': ratings, 'sentiment': sentiment}) # 1 for positive, 0 for negative

## 7.) Save dataset and its unbalanced labels

In [26]:
# value_counts() orders its rows by frequency, so each row is named from its actual
# 0/1 value instead of assuming that the positive class is always the larger one
sentiment_names = {1: "positive", 0: "negative"}
grouped = save_df.sentiment.value_counts()
grouped.index = [sentiment_names[label] for label in grouped.index]
grouped
dfi.export(pd.DataFrame(grouped).style,'../Results/01_table_review_imbalance.png')

positive    1876
negative     701
Name: count, dtype: int64

In [323]:
# Write the labelled reviews to disk, leaving out the DataFrame index column
csv_path = "../Data/2.5k_reviews.csv"
save_df.to_csv(csv_path, index=False)