# Retrieve Reviews
In this section the reviews from the two different sources are retrieved.

The reviews are then stored in two different CSV files, one for each source.

In [16]:
import pandas as pd
import numpy as np
import csv
!pip install fast-langdetect --quiet
from fast_langdetect import detect

In [17]:
# helper functions
def hash_function(text):
    return abs(hash(text)) % (10**8)

## YELP
Using APIs

In [18]:
import requests

# API_KEY foir rapidapi
API_KEY = "10ccda3760msh06408370be84a12p114ca8jsnc974ddf5859c"
BASE_URL = "https://red-flower-business-data.p.rapidapi.com"
SEARCH_URL = f"{BASE_URL}/business-search"
REVIEWS_URL = f"{BASE_URL}/business-reviews"
        

In [19]:
# Function for adding rows to a file
def add_to_file(file_path, fieldnames, data):
  with open(file_path, mode='a', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    for row in data:
        filtered_row = {key: value for key, value in row.items() if key in fieldnames}
        writer.writerow(filtered_row)

In [20]:
# Function for fetching restaurants
def get_yelp_restaurants(start=0):
    headers = {"X-RapidAPI-Key": API_KEY, "x-rapidapi-host": "red-flower-business-data.p.rapidapi.com"}
    params = {
        "query": "italian restaurant",
        "location": "Rome, RM, Italy",
        "yelp_domain": "yelp.com.tr",
        "sort_by": "HIGHEST_RATED",
        "start": start 
    }

    b_data = []
    
    response = requests.get(SEARCH_URL, headers=headers, params=params)

    if response.status_code == 200:
        data = response.json()
        businesses = data["data"]
        print(f"Found {len(businesses)} businesses")
        for business in businesses:
            b_data.append({
                "id": business['id'],
                "name": business['name'],
                "alias": business["alias"],
                "rating": business["rating"],
                "review_count": business["review_count"]
            })
            
    return b_data


In [21]:
import csv
import os

file_path = "yelp_restaurants.csv"
fieldnames = ["id","name","alias","rating","review_count"]

# Fetching and saving restaurants
for start in [0, 10, 20, 30, 40, 50]:
    data = get_yelp_restaurants(start) ## The number passed is the start value: the api fetches 10 restaurants at time, with this number we say how many to skip so we can get more (like a pagination)

    if not os.path.exists(file_path):
        with open(file_path, mode='w', encoding='utf-8') as csvfile:
            csvfile.write("id,name,alias,rating,review_count\n")
            print(f"File created")
    else:
        print(f"File already exists: {file_path}")
        
    add_to_file(file_path, fieldnames, data)

File already exists: yelp_restaurants.csv
File already exists: yelp_restaurants.csv
File already exists: yelp_restaurants.csv
File already exists: yelp_restaurants.csv
File already exists: yelp_restaurants.csv
File already exists: yelp_restaurants.csv


In [22]:
import csv

# Function for taking saved data
def load_restaurants_from_csv(file_path):
    restaurants = []
    try:
        with open(file_path, mode='r', newline='', encoding='utf-8') as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                restaurant = {
                    'id': row['id'],
                    'name': row['name'],
                    'alias': row['alias'],
                    'rating': float(row['rating']),
                    'review_count': int(row['review_count'])
                }
                restaurants.append(restaurant)
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
    except KeyError as e:
        print(f"Error: Missing expected field {e}")
    except ValueError as e:
        print(f"Error: Data type conversion issue: {e}")

    return restaurants


In [23]:
# Function for getting reviews
def get_yelp_reviews(restaurants):
  output = []
  headers = {
      "accept": "application/json",
      "Authorization": "Bearer 3lX3EqE4bLsHCwaN8tyZ3kNNg_tykrIiw8cgEDcbNOeGYo9m22YYW5as-1dPp-f0Gy_X8_12CDEiqVgbM0SdKgKE2x94_w4-_PLu8Kfufdj-kBvbYWCGNmUUjyZHZ3Yx"
  }

  for business in restaurants:
    url = f"https://api.yelp.com/v3/businesses/{business['id']}/reviews?limit=20&sort_by=yelp_sort"
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
      data = response.json()
      reviews = data['reviews']
      if len(reviews) > 0:
        for r in reviews:
            if 'text' in r:
              review_text = re.sub(r"\n", "", r['text'].replace(",",""))
              new_data = {
                  'review_id': r['id'], 
                  'restaurant_id': business['id'], 
                  'restaurant': business['name'], 
                  'text': review_text, 
                  'date': r['time_created']
              }
              output.append(new_data)
                
  return output

In [24]:
file_path = 'yelp_restaurants.csv'
restaurants = load_restaurants_from_csv(file_path)
restaurants

[]

In [25]:
import csv
import os

# Finally getting and saving reviews

file_path = "yelp_reviews.csv"
fieldnames = ["review_id", "restaurant_id", "restaurant", "text", "date"]

if not os.path.exists(file_path):
    with open(file_path, mode='w', encoding='utf-8') as csvfile:
       csvfile.write("review_id,restaurant_id,restaurant,text,date\n")
    print(f"File created")
else:
    print(f"File already exists: {file_path}")

print("Getting data...")

reviews = get_yelp_reviews(restaurants)

print(f"Saving {len(reviews)} reviews...")
add_to_file(file_path, fieldnames, reviews)

File already exists: yelp_reviews.csv
Getting data...
Saving 0 reviews...


## OpenTable
Using Selenium

### Define scrapping functions

In [14]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import time

# needed so to have the lists (restaurants and reviews) loaded from the website
def scroll_down_page(driver, speed=8):
    current_scroll_position, new_height= 0, 1
    while current_scroll_position <= new_height:
        current_scroll_position += speed
        driver.execute_script("window.scrollTo(0, {});".format(current_scroll_position))
        new_height = driver.execute_script("return document.body.scrollHeight")

# obtain the list of restaurants based on the predefined criteria
def scrape_opentable_restaurants(keep_open=False, max_restaurants=10):
    # queries OpenTable restaurants in 'Paris' under 'Italian' cousine category, ordered by rating
    url = "https://www.opentable.com/s?term=paris&cuisineIds%5B%5D=48e9d049-40cf-4cb9-98d9-8c47d0d58986&sortBy=rating"

    # open the browser
    driver = webdriver.Chrome()
    driver.get(url)
    time.sleep(3)
    # scroll down the page so to have all the restaurants loaded
    scroll_down_page(driver)
    time.sleep(3)

    restaurants = []

    try:
        # Extract restaurant elements
        restaurant_elements = driver.find_elements(By.CLASS_NAME, 'qCITanV81-Y-')
        restaurant_counter = 0
        for restaurant_element in restaurant_elements:
            try:
                restaurant_name = restaurant_element.text # name
                restaurant_link = restaurant_element.get_attribute('href') # link
                restaurant_link = restaurant_link[:-122] # remove parameters
                restaurants.append({'restaurant_name': restaurant_name, 'restaurant_link': restaurant_link})
                restaurant_counter += 1
                if max_restaurants != 0 and restaurant_counter >= max_restaurants:
                    break

            except Exception as e:
                print(f"Error extracting restaurants: {e}")
                continue

    except Exception as e:
        print(f"Error during scraping: {e}")
    finally:
        if not keep_open:
            driver.quit()

    return restaurants, driver

# scrape reviews from the given restaurant
def scrape_opentable_reviews(driver, url, keep_open=False, max_reviews=10):

    # open the browser if not already open
    if driver is None:
        driver = webdriver.Chrome()
    driver.get(url)
    time.sleep(3)

    reviews = []

    try:
        # scroll down the page so to have all the reviews loaded
        scroll_down_page(driver)
        time.sleep(1)
        # Extract review elements
        review_elements = driver.find_elements(By.CLASS_NAME, 'afkKaa-4T28-')
        review_counter = 0

        for review_element in review_elements:
            try:
                review_text = review_element.find_element(By.CLASS_NAME, '_6rFG6U7PA6M-').text # review text
                review_date = review_element.find_element(By.CLASS_NAME, 'iLkEeQbexGs-').text # review date
                reviews.append({'review_text': review_text, 'review_date': review_date})
                
                review_counter += 1
                if max_reviews != 0 and review_counter >= max_reviews:
                    break

            except Exception as e:
                print(f"Error extracting review: {e}")
                continue

    except Exception as e:
        print(f"Error during scraping: {e}")
    finally:
        if not keep_open:
            driver.quit()

    return reviews


### Get reviews

In [19]:
import os
import csv

def get_opentable_reviews(output_csv, max_restaurants=30, max_reviews=10):
    """
    retrieves reviews from OpenTable.
    :return: A list of reviews, each being: {'review_id', 'restaurant_id', 'restaurant', 'text', 'date'}
    """
    
    unique_restaurants_already_scraped = set()
    if os.path.exists(output_csv):
        try:
            with open(output_csv, mode='r', encoding='utf-8') as file:
                reader = csv.DictReader(file)
    
                for row in reader:
                    restaurant_name = row.get('restaurant_id', '').strip()
                    unique_restaurants_already_scraped.add(restaurant_name)
    
        except Exception as e:
            print(f"Error reading CSV file: {e}")
            return []

    result = []

    if len(unique_restaurants_already_scraped) >= max_restaurants:
        try:
            with open(output_csv, mode='r', encoding='utf-8') as file:
                reader = csv.DictReader(file)

                for row in reader:
                    review = {
                        'review_id': row.get('review_id', '').strip(),
                        'restaurant_id': row.get('restaurant_id', '').strip(),
                        'restaurant': row.get('restaurant', '').strip(),
                        'text': row.get('text', '').strip(),
                        'date': row.get('date', '').strip(),
                    }
                    result.append(review)

        except Exception as e:
            print(f"Error reading CSV file: {e}")
            return []

        return result

    # scrape restaurants
    restaurants, driver = scrape_opentable_restaurants(True, max_restaurants)

    # scrape reviews
    for n, restaurant in enumerate(restaurants):
        if restaurant['restaurant_name'] in unique_restaurants_already_scraped:
            continue
        reviews = scrape_opentable_reviews(driver, restaurant['restaurant_link'], True if n < len(restaurants) else False, max_reviews)
        for review in reviews:
            result.append({
                'review_id': hash_function(f"{restaurant['restaurant_name']}{review['review_text']}"),
                'restaurant_id': hash_function(restaurant['restaurant_name']),
                'restaurant': restaurant['restaurant_name'],
                'text': review['review_text'],
                'date': review['review_date']
            })

    # save the results to a CSV file
    with open(output_csv, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=['review_id', 'restaurant_id', 'restaurant', 'text', 'date'])
        if file.tell() == 0:  # Check if the file is empty to write the header
            writer.writeheader()
        writer.writerows(result)

    return result

get_opentable_reviews("opentable_reviews.csv")

opentable_reviews = pd.read_csv("opentable_reviews.csv")
opentable_reviews.head()

Unnamed: 0,review_id,restaurant_id,restaurant,text,date
0,45775551,73093209,Restaurant Zo,"Being from out of town, it’s always worrisome ...",Dined 7 days ago
1,43409730,73093209,Restaurant Zo,"the restaurant Zo was close, and it was far aw...","Dined on August 16, 2024"
2,39410757,73093209,Restaurant Zo,Amazing dinner with friends. Food was deliciou...,"Dined on August 2, 2024"
3,32270298,73093209,Restaurant Zo,Dined on Thursday evening with group of 9 coll...,"Dined on April 18, 2024"
4,44992094,73093209,Restaurant Zo,Great food and staff was very friendly. Fun p...,"Dined on April 5, 2024"


# Data preprocessing
This section is dedicated to the preprocessing of the data retrieved from the two sources.

The foreign characters from the reviews' text are removed and each of the entry is annotated with its language.

## Text cleaning

In [28]:
import re
import pandas as pd

def clean_text(text):
    """
    Cleans the text by removing non-text characters while retaining accented characters.

    Args:
    - text (str): The input text to be cleaned.

    Returns:
    - str: The cleaned text.
    """
    # Keep Unicode letters, digits, spaces, and basic punctuation
    return re.sub(r'Read more$', '', re.sub(r'[^\w\s.,!?\'\"-]', '', text.replace("\n", ""), flags=re.UNICODE), flags=re.UNICODE)

def clean_reviews(df):
    df['text'] = df['text'].apply(clean_text)

opentable_reviews = pd.read_csv("opentable_reviews.csv")
yelp_reviews = pd.read_csv("yelp_reviews.csv")
clean_reviews(opentable_reviews)
clean_reviews(yelp_reviews)

## Language detection

In [29]:
def detect_language(text):
    try:
        return detect(text.replace("\n", ""), low_memory=False)["lang"]
    except Exception:
        return 'unknown'

def annotate_language(df):
    df['language'] = df['text'].apply(detect_language)

annotate_language(opentable_reviews)
annotate_language(yelp_reviews)

## Save preprocessed data

In [30]:
opentable_reviews.to_csv("opentable_reviews_cleaned.csv", index=False)
yelp_reviews.to_csv("yelp_reviews_cleaned.csv", index=False)

In [None]:
opentable_reviews.head()

In [None]:
yelp_reviews.head()