# Define previous scraping functions

In [1]:
# helper functions
def hash_function(text):
    return abs(hash(text)) % (10**8)

From YELP

In [2]:
import requests

API_KEY = "e2d13cd3a0msh972b270d9391f69p17bd85jsn44d5cbf79484"
BASE_URL = "https://red-flower-business-data.p.rapidapi.com"
SEARCH_URL = f"{BASE_URL}/business-search"
REVIEWS_URL = f"{BASE_URL}/business-reviews"

# Define the search parameters
headers = {"X-RapidAPI-Key": API_KEY}
params = {
    "query": " ",
    "location": "Rome, Italy",
    "categories": "italian restaurants",
    "limit": 10,
    "sort_by": "HIGHEST_RATED"
}

# Make the request
response = requests.get(SEARCH_URL, headers=headers, params=params)

# Parse the response
if response.status_code == 200:
    data = response.json()
    for business in data['data']:
        url = f"{REVIEWS_URL}?business_id={business['id']}&page=1&page_size=10"
        response_business = requests.get(url, headers=headers)
        if response_business.status_code == 200:
            data_business = response_business.json()['data']
            review_count = data_business.get('total')
            reviews = data_business.get("reviews")
            if len(reviews) > 0:
                first_review = reviews[0]['review_text']
                print(f"Name: {business['name']}")
                print(f"Rating: {business['rating']}")
                print(f"Review count: {review_count}")
                print(f"First Review: {first_review[0:min(40, len(first_review))]}...")
                print("-" * 40)
else:
    print(f"Error: {response.status_code} - {response.text}")

In [3]:
def get_yelp_reviews(max_restaurants, max_reviews):
    """
    retrieves reviews from OpenTable.
    :return: A list of reviews, each being: {'review_id', 'restaurant_id', 'restaurant', 'text', 'date'}
    """

From OpenTalbe

In [4]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import hashlib

# needed so to have the lists (restaurants and reviews) loaded from the website
def scroll_down_page(driver, speed=8):
    current_scroll_position, new_height= 0, 1
    while current_scroll_position <= new_height:
        current_scroll_position += speed
        driver.execute_script("window.scrollTo(0, {});".format(current_scroll_position))
        new_height = driver.execute_script("return document.body.scrollHeight")

# obtain the list of restaurants based on the predefined criteria
def scrape_opentable_restaurants(keep_open=False, max_restaurants=10):
    # queries OpenTable restaurants in 'Paris' under 'Italian' cousine category, ordered by rating
    url = "https://www.opentable.com/s?term=paris&cuisineIds%5B%5D=48e9d049-40cf-4cb9-98d9-8c47d0d58986&sortBy=rating"

    # open the browser
    driver = webdriver.Chrome()
    driver.get(url)
    time.sleep(3)
    # scroll down the page so to have all the restaurants loaded
    scroll_down_page(driver)
    time.sleep(3)

    restaurants = []

    try:
        # Extract restaurant elements
        restaurant_elements = driver.find_elements(By.CLASS_NAME, 'qCITanV81-Y-')
        restaurant_counter = 0
        for restaurant_element in restaurant_elements:
            try:
                restaurant_name = restaurant_element.text # name
                restaurant_link = restaurant_element.get_attribute('href') # link
                restaurant_link = restaurant_link[:-122] # remove parameters
                restaurants.append({'restaurant_name': restaurant_name, 'restaurant_link': restaurant_link})
                restaurant_counter += 1
                if max_restaurants != 0 and restaurant_counter >= max_restaurants:
                    break

            except Exception as e:
                print(f"Error extracting restaurants: {e}")
                continue

    except Exception as e:
        print(f"Error during scraping: {e}")
    finally:
        if not keep_open:
            driver.quit()

    return restaurants, driver

# scrape reviews from the given restaurant
def scrape_opentable_reviews(driver, url, keep_open=False, max_reviews=10):

    # open the browser if not already open
    if driver is None:
        driver = webdriver.Chrome()
    driver.get(url)
    time.sleep(3)

    reviews = []

    try:
        # scroll down the page so to have all the reviews loaded
        scroll_down_page(driver)
        time.sleep(1)
        # Extract review elements
        review_elements = driver.find_elements(By.CLASS_NAME, 'afkKaa-4T28-')
        review_counter = 0

        for review_element in review_elements:
            try:
                review_text = review_element.find_element(By.CLASS_NAME, '_6rFG6U7PA6M-').text # review text
                review_date = review_element.find_element(By.CLASS_NAME, 'iLkEeQbexGs-').text # review date
                reviews.append({'review_text': review_text, 'review_date': review_date})
                
                review_counter += 1
                if max_reviews != 0 and review_counter >= max_reviews:
                    break

            except Exception as e:
                print(f"Error extracting review: {e}")
                continue

    except Exception as e:
        print(f"Error during scraping: {e}")
    finally:
        if not keep_open:
            driver.quit()

    return reviews

def get_opentable_reviews(output_csv, max_restaurants=30, max_reviews=10):
    """
    retrieves reviews from OpenTable.
    :return: A list of reviews, each being: {'review_id', 'restaurant_id', 'restaurant', 'text', 'date'}
    """
    
    unique_restaurants_already_scraped = set()
    try:
        with open(output_csv, mode='r', encoding='utf-8') as file:
            reader = csv.DictReader(file)

            for row in reader:
                restaurant_name = row.get('restaurant_id', '').strip()
                unique_restaurants_already_scraped.add(restaurant_name)

    except Exception as e:
        print(f"Error reading CSV file: {e}")
        return []
    
    result = []
    
    if len(unique_restaurants_already_scraped) >= max_restaurants:
        try:
            with open(output_csv, mode='r', encoding='utf-8') as file:
                reader = csv.DictReader(file)

                # Check if 'review_text' column exists
                if 'review_text' not in reader.fieldnames:
                    print("Column 'review_text' not found in the CSV file.")
                    return []

                for row in reader:
                    review = {
                        'review_id': row.get('review_id', '').strip(),
                        'restaurant_id': row.get('restaurant_id', '').strip(),
                        'restaurant': row.get('restaurant', '').strip(),
                        'text': row.get('text', '').strip(),
                        'date': row.get('date', '').strip(),
                    }
                    result.append(review)

        except Exception as e:
            print(f"Error reading CSV file: {e}")
            return []

        return result

    # scrape restaurants
    restaurants, driver = scrape_opentable_restaurants(True, max_restaurants)
    
    # scrape reviews
    for n, restaurant in enumerate(restaurants):
        if restaurant['restaurant_name'] in unique_restaurants_already_scraped:
            continue
        reviews = scrape_opentable_reviews(driver, restaurant['restaurant_link'], True if n < len(restaurants) else False, max_reviews)
        for review in reviews:
            result.append({
                'review_id': hash_function(f"{restaurant['restaurant_name']}{review['review_text']}"),
                'restaurant_id': hash_function(restaurant['restaurant_name']),
                'restaurant': restaurant['restaurant_name'],
                'text': review['review_text'],
                'date': review['review_date']
            })

    # save the results to a CSV file
    with open(output_csv, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=['review_id', 'restaurant_id', 'restaurant', 'text', 'date'])
        if file.tell() == 0:  # Check if the file is empty to write the header
            writer.writeheader()
        writer.writerows(result)

    return result

# Save reviews in a CSV file

In [5]:
import os
import csv

get_opentable_reviews("opentable_reviews.csv")
get_yelp_reviews("yelp_reviews.csv")

TypeError: get_yelp_reviews() missing 1 required positional argument: 'max_reviews'

In [None]:
import pandas as pd

# Load the CSV files
opentable_reviews = pd.read_csv('opentable_reviews.csv')
yelp_reviews = pd.read_csv('yelp_reviews.csv')

# Combine the two DataFrames
reviews = pd.concat([opentable_reviews, yelp_reviews], ignore_index=True)

# Save the combined DataFrame to a new CSV file
reviews.to_csv('combined_reviews.csv', index=False)
reviews