# Define previous scraping functions

In [19]:
# helper functions
def hash_function(text):
    return abs(hash(text)) % (10**8)

From YELP

In [7]:
import requests

API_KEY = "10ccda3760msh06408370be84a12p114ca8jsnc974ddf5859c"
BASE_URL = "https://red-flower-business-data.p.rapidapi.com"
SEARCH_URL = f"{BASE_URL}/business-search"
REVIEWS_URL = f"{BASE_URL}/business-reviews"
        

In [13]:
def add_to_file(file_path, fieldnames, data):
  with open(file_path, mode='a', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    for row in data:
        filtered_row = {key: value for key, value in row.items() if key in fieldnames}
        writer.writerow(filtered_row)

In [24]:
def get_yelp_restaurants(start=0):
    headers = {"X-RapidAPI-Key": API_KEY, "x-rapidapi-host": "red-flower-business-data.p.rapidapi.com"}
    params = {
        "query": "italian restaurant",
        "location": "Rome, RM, Italy",
        "yelp_domain": "yelp.com.tr",
        "sort_by": "HIGHEST_RATED",
        "start": start 
    }

    b_data = []
    
    response = requests.get(SEARCH_URL, headers=headers, params=params)

    if response.status_code == 200:
        data = response.json()
        businesses = data["data"]
        print(f"Found {len(businesses)} businesses")
        for business in businesses:
            b_data.append({
                "id": business['id'],
                "name": business['name'],
                "alias": business["alias"],
                "rating": business["rating"],
                "review_count": business["review_count"]
            })
            
    return b_data


In [30]:
import csv
import os

data = get_yelp_restaurants(50) ## The number passed is the start value: the api fetches 10 restaurants at time, with this number we say how many to skip so we can get more (like a pagination)

if not os.path.exists(file_path):
    with open(file_path, mode='w', encoding='utf-8') as csvfile:
       csvfile.write("id,name,alias,rating,review_count\n")
    print(f"File created")
else:
    print(f"File already exists: {file_path}")
    
file_path = "yelp_restaurants.csv"
fieldnames = ["id","name","alias","rating","review_count"]

add_to_file(file_path, fieldnames, data)

Found 10 businesses
File already exists: yelp_restaurants.csv


In [1]:
import csv

def load_restaurants_from_csv(file_path):
    restaurants = []
    
    try:
        with open(file_path, mode='r', newline='', encoding='utf-8') as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                restaurant = {
                    'id': row['id'],
                    'name': row['name'],
                    'alias': row['alias'],
                    'rating': float(row['rating']),
                    'review_count': int(row['review_count'])
                }
                restaurants.append(restaurant)
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
    except KeyError as e:
        print(f"Error: Missing expected field {e}")
    except ValueError as e:
        print(f"Error: Data type conversion issue: {e}")

    return restaurants


In [15]:
def get_yelp_reviews(restaurants):
  output = []
  headers = {
      "accept": "application/json",
      "Authorization": "Bearer 3lX3EqE4bLsHCwaN8tyZ3kNNg_tykrIiw8cgEDcbNOeGYo9m22YYW5as-1dPp-f0Gy_X8_12CDEiqVgbM0SdKgKE2x94_w4-_PLu8Kfufdj-kBvbYWCGNmUUjyZHZ3Yx"
  }

  for business in restaurants:
    url = f"https://api.yelp.com/v3/businesses/{business['id']}/reviews?limit=20&sort_by=yelp_sort"
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
      data = response.json()
      reviews = data['reviews']
      if len(reviews) > 0:
        for r in reviews:
            if 'text' in r:
              new_data = {
                  'review_id': r['id'], 
                  'restaurant_id': business['id'], 
                  'restaurant': business['name'], 
                  'text': r['text'].replace(",",""), 
                  'date': r['time_created']
              }
              output.append(new_data)
                
  return output

In [4]:
file_path = 'yelp_restaurants.csv'
restaurants = load_restaurants_from_csv(file_path)
print(restaurants)

[{'id': '4OsDhJHURGqfw1_22Gslgg', 'name': 'La Terrazza', 'alias': 'la-terrazza-roma-4', 'rating': 4.9, 'review_count': 32}, {'id': 'O-PwNezcRXYyx2xiu9qVHQ', 'name': 'Bottega Rocchi', 'alias': 'bottega-rocchi-roma', 'rating': 4.8, 'review_count': 5}, {'id': 'N1s7kKRwvSIkyuS-xbdwUA', 'name': 'Il Giardino Ristorante', 'alias': 'il-giardino-ristorante-roma-2', 'rating': 4.7, 'review_count': 23}, {'id': '-i5sf6JOXkYY6BhtO2EMMw', 'name': 'Cantina dei Papi', 'alias': 'cantina-dei-papi-roma', 'rating': 4.7, 'review_count': 566}, {'id': 'dDaMYkgOett1PVEq-qk6Zg', 'name': 'I Monticiani', 'alias': 'i-monticiani-roma', 'rating': 4.7, 'review_count': 31}, {'id': 'ehrW9keNRzswszJh7_nszw', 'name': 'La Tavernaccia', 'alias': 'la-tavernaccia-roma', 'rating': 4.6, 'review_count': 81}, {'id': '7wVeQWsPlNG5RSTB6PipPw', 'name': 'Mama Pasta', 'alias': 'mama-pasta-roma', 'rating': 4.6, 'review_count': 24}, {'id': 'rLaPNO6TLGJLONJjUPhjSg', 'name': 'Al Forno della Soffitta', 'alias': 'al-forno-della-soffitta-ro

In [16]:
import csv
import os

file_path = "yelp_reviews.csv"
fieldnames = ["review_id", "restaurant_id", "restaurant", "text", "date"]

if not os.path.exists(file_path):
    with open(file_path, mode='w', encoding='utf-8') as csvfile:
       csvfile.write("review_id,restaurant_id,restaurant,text,date\n")
    print(f"File created")
else:
    print(f"File already exists: {file_path}")

print("Getting data...")

reviews = get_yelp_reviews(restaurants)

print(f"Saving {len(reviews)} reviews...")
add_to_file(file_path, fieldnames, reviews)

File already exists: yelp_reviews.csv
Getting data...
Saving 179 reviews...


From OpenTalbe

In [4]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import hashlib

# needed so to have the lists (restaurants and reviews) loaded from the website
def scroll_down_page(driver, speed=8):
    current_scroll_position, new_height= 0, 1
    while current_scroll_position <= new_height:
        current_scroll_position += speed
        driver.execute_script("window.scrollTo(0, {});".format(current_scroll_position))
        new_height = driver.execute_script("return document.body.scrollHeight")

# obtain the list of restaurants based on the predefined criteria
def scrape_opentable_restaurants(keep_open=False, max_restaurants=10):
    # queries OpenTable restaurants in 'Paris' under 'Italian' cousine category, ordered by rating
    url = "https://www.opentable.com/s?term=paris&cuisineIds%5B%5D=48e9d049-40cf-4cb9-98d9-8c47d0d58986&sortBy=rating"

    # open the browser
    driver = webdriver.Chrome()
    driver.get(url)
    time.sleep(3)
    # scroll down the page so to have all the restaurants loaded
    scroll_down_page(driver)
    time.sleep(3)

    restaurants = []

    try:
        # Extract restaurant elements
        restaurant_elements = driver.find_elements(By.CLASS_NAME, 'qCITanV81-Y-')
        restaurant_counter = 0
        for restaurant_element in restaurant_elements:
            try:
                restaurant_name = restaurant_element.text # name
                restaurant_link = restaurant_element.get_attribute('href') # link
                restaurant_link = restaurant_link[:-122] # remove parameters
                restaurants.append({'restaurant_name': restaurant_name, 'restaurant_link': restaurant_link})
                restaurant_counter += 1
                if max_restaurants != 0 and restaurant_counter >= max_restaurants:
                    break

            except Exception as e:
                print(f"Error extracting restaurants: {e}")
                continue

    except Exception as e:
        print(f"Error during scraping: {e}")
    finally:
        if not keep_open:
            driver.quit()

    return restaurants, driver

# scrape reviews from the given restaurant
def scrape_opentable_reviews(driver, url, keep_open=False, max_reviews=10):

    # open the browser if not already open
    if driver is None:
        driver = webdriver.Chrome()
    driver.get(url)
    time.sleep(3)

    reviews = []

    try:
        # scroll down the page so to have all the reviews loaded
        scroll_down_page(driver)
        time.sleep(1)
        # Extract review elements
        review_elements = driver.find_elements(By.CLASS_NAME, 'afkKaa-4T28-')
        review_counter = 0

        for review_element in review_elements:
            try:
                review_text = review_element.find_element(By.CLASS_NAME, '_6rFG6U7PA6M-').text # review text
                review_date = review_element.find_element(By.CLASS_NAME, 'iLkEeQbexGs-').text # review date
                reviews.append({'review_text': review_text, 'review_date': review_date})
                
                review_counter += 1
                if max_reviews != 0 and review_counter >= max_reviews:
                    break

            except Exception as e:
                print(f"Error extracting review: {e}")
                continue

    except Exception as e:
        print(f"Error during scraping: {e}")
    finally:
        if not keep_open:
            driver.quit()

    return reviews

def get_opentable_reviews(output_csv, max_restaurants=30, max_reviews=10):
    """
    retrieves reviews from OpenTable.
    :return: A list of reviews, each being: {'review_id', 'restaurant_id', 'restaurant', 'text', 'date'}
    """
    
    unique_restaurants_already_scraped = set()
    try:
        with open(output_csv, mode='r', encoding='utf-8') as file:
            reader = csv.DictReader(file)

            for row in reader:
                restaurant_name = row.get('restaurant_id', '').strip()
                unique_restaurants_already_scraped.add(restaurant_name)

    except Exception as e:
        print(f"Error reading CSV file: {e}")
        return []
    
    result = []
    
    if len(unique_restaurants_already_scraped) >= max_restaurants:
        try:
            with open(output_csv, mode='r', encoding='utf-8') as file:
                reader = csv.DictReader(file)

                # Check if 'review_text' column exists
                if 'review_text' not in reader.fieldnames:
                    print("Column 'review_text' not found in the CSV file.")
                    return []

                for row in reader:
                    review = {
                        'review_id': row.get('review_id', '').strip(),
                        'restaurant_id': row.get('restaurant_id', '').strip(),
                        'restaurant': row.get('restaurant', '').strip(),
                        'text': row.get('text', '').strip(),
                        'date': row.get('date', '').strip(),
                    }
                    result.append(review)

        except Exception as e:
            print(f"Error reading CSV file: {e}")
            return []

        return result

    # scrape restaurants
    restaurants, driver = scrape_opentable_restaurants(True, max_restaurants)
    
    # scrape reviews
    for n, restaurant in enumerate(restaurants):
        if restaurant['restaurant_name'] in unique_restaurants_already_scraped:
            continue
        reviews = scrape_opentable_reviews(driver, restaurant['restaurant_link'], True if n < len(restaurants) else False, max_reviews)
        for review in reviews:
            result.append({
                'review_id': hash_function(f"{restaurant['restaurant_name']}{review['review_text']}"),
                'restaurant_id': hash_function(restaurant['restaurant_name']),
                'restaurant': restaurant['restaurant_name'],
                'text': review['review_text'],
                'date': review['review_date']
            })

    # save the results to a CSV file
    with open(output_csv, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=['review_id', 'restaurant_id', 'restaurant', 'text', 'date'])
        if file.tell() == 0:  # Check if the file is empty to write the header
            writer.writeheader()
        writer.writerows(result)

    return result

# Save reviews in a CSV file

In [5]:
import os
import csv

get_opentable_reviews("opentable_reviews.csv")
get_yelp_reviews("yelp_reviews.csv")

TypeError: get_yelp_reviews() missing 1 required positional argument: 'max_reviews'

In [None]:
import pandas as pd

# Load the CSV files
opentable_reviews = pd.read_csv('opentable_reviews.csv')
yelp_reviews = pd.read_csv('yelp_reviews.csv')

# Combine the two DataFrames
reviews = pd.concat([opentable_reviews, yelp_reviews], ignore_index=True)

# Save the combined DataFrame to a new CSV file
reviews.to_csv('combined_reviews.csv', index=False)
reviews