In [1]:
import os
import json
import urllib.request
import random
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.service import Service


In [2]:
# Location of the chromedriver binary that is passed to Selenium's Service.
# Set the CHROMEDRIVER_PATH environment variable to point at a different
# binary; when it is unset the original machine-specific default is used.
driver_path = os.environ.get(
    "CHROMEDRIVER_PATH",
    r"F:\Fork_git\Labelling_Menu_Data\menu_scraper\webdriver\chromedriver\chromedriver.exe",
)


In [3]:
# Load the city-name -> TripAdvisor geo-code mapping from the JSON file.
# The context manager closes the file even if parsing raises, and the explicit
# encoding keeps the read independent of the platform's default codec.
with open('us_city_codes.json', encoding='utf-8') as f:
    city_code = json.load(f)

In [4]:
# Display the loaded mapping as a quick sanity check.
city_code

{'New York City New': '60763',
 'Los Angeles': '32655',
 'Chicago': '35805',
 'Houston': '56003',
 'Brooklyn New': '60827',
 'San Francisco': '60713',
 'Las Vegas': '45963',
 'Philadelphia': '60795',
 'San Diego': '60750',
 'San Antonio': '60956',
 'Miami': '34438',
 'Dallas': '55711',
 'Portland': '52024',
 'Atlanta': '60898',
 'Austin': '30196',
 'Seattle': '60878',
 'Orlando': '34515',
 'Phoenix': '31310',
 'Oahu': '29222',
 'Denver': '33388'}

In [5]:


# Cuisine labels that disqualify a listing. get_links_and_names() drops any
# restaurant whose comma-separated type contains one of these labels; the
# 'Type not available' placeholder is listed so untyped entries are dropped too.
excluded_categories = {
    'American',
    'British',
    'Indian',
    'Italian',
    'Japanese',
    'Korean',
    'Singaporean',
    'Thai',
    'Type not available',
    'Vietnamese',
}

# Update function to get links and names as per the provided HTML snippet
def get_links_and_names(driver):
    """Collect name, link, cuisine type and price for the restaurant cards on the current page.

    Args:
        driver: Selenium WebDriver positioned on a restaurant listing page.

    Returns:
        A list of dicts with the keys 'name', 'link', 'type' and 'price'.
        A card is left out when its comma-separated type shares any label
        with ``excluded_categories``, or when reading it raises an error
        (the error is printed and the card is skipped).
    """
    cards = driver.find_elements(By.XPATH, '//div[@class="vIjFZ Gi o VOEhq"]')
    results = []

    for card in cards:
        try:
            # The anchor carries both the display name and the detail-page URL.
            anchor = card.find_element(By.XPATH, './/a[@href and @target="_blank" and contains(@class, "BMQDV")]')
            name = anchor.text
            link = anchor.get_attribute('href')

            # Type and price share the same span class, so they are told apart
            # by their text; either may be missing and keeps its placeholder.
            spans = card.find_elements(By.XPATH, './/span[@class="YECgr"]')
            restaurant_type = "Type not available"
            price = "Price not available"

            for span in spans:
                label = span.text
                if 'chinese' in label.lower():
                    # Any text mentioning "Chinese" is treated as the cuisine type.
                    restaurant_type = label
                elif "£" in label or "$" in label or "€" in label:
                    # A currency symbol marks the price band.
                    price = label

            labels = {part.strip() for part in restaurant_type.split(',')}
            if not labels.isdisjoint(excluded_categories):
                # At least one excluded cuisine label: drop this card.
                continue

            results.append({
                'name': name,
                'link': link,
                'type': restaurant_type,
                'price': price
            })
        except Exception as e:
            # Best effort: report the problem and move on to the next card.
            print(f"Skipping a restaurant due to an error: {str(e)}")

    return results


# Update function to get page links as per the provided HTML snippet
def get_page_links(driver):
    """Return the href of every "next page" arrow on the current page.

    Args:
        driver: Selenium WebDriver positioned on a listing page.

    Returns:
        A list of href values, one per matching anchor; empty when the page
        has no next-page arrow.
    """
    hrefs = []
    for arrow in driver.find_elements(By.XPATH, '//a[@data-smoke-attr="pagination-next-arrow"]'):
        hrefs.append(arrow.get_attribute('href'))
    return hrefs

In [6]:


# Base link template: `{}` is filled with a city's code from `city_code`.
base_link_template = "https://www.tripadvisor.co.uk/FindRestaurants?geo={}&cuisines=5379&establishmentTypes=10591&priceTypes=10954%2C10955&broadened=false"

# Folder that receives one JSON file per city.
output_dir = 'us_tripadvisoor'

# Selenium setup
service = Service(executable_path=driver_path)
options = Options()
# Pass the options object through so that any flag added to it takes effect.
driver = webdriver.Chrome(service=service, options=options)

# Everything that uses the browser runs inside try/finally so the Chrome
# session is always shut down, even when a page load or parse step raises.
try:
    # Create the output folder up front; otherwise a missing folder would only
    # surface when the first city's results are written, after its crawl.
    os.makedirs(output_dir, exist_ok=True)

    # Open the first city's page so the user has somewhere to log in.
    first_city_code = list(city_code.values())[0]
    first_city_page = base_link_template.format(first_city_code)
    driver.get(first_city_page)

    # Wait for user to login
    print("Please login to the website and press Enter here to continue.")
    input()

    # Loop through each city
    for city, code in city_code.items():
        base_page = base_link_template.format(code)
        filename = f'{output_dir}/trip_advisor_{city.lower().replace(" ", "_")}.json'

        # Start from previously saved results so re-runs only add new entries.
        if os.path.exists(filename):
            with open(filename, 'r', encoding='utf-8') as f:
                all_shop_data = json.load(f)
        else:
            all_shop_data = []

        driver.get(base_page)
        time.sleep(2 + random.random() * 5)  # Random delay

        print(f"Processing city: {city}")

        page_count = 1
        while True:
            print(f'Accessing page {page_count} of {city}')
            time.sleep(5 + random.random() * 5)  # Random delay

            # Read the next-page link before scraping, while still on this page.
            nav_links = get_page_links(driver)
            shop_data = get_links_and_names(driver)

            # Skip records that are already stored (exact dict equality).
            for shop in shop_data:
                if shop not in all_shop_data:
                    all_shop_data.append(shop)

            page_count += 1

            # No next-page arrow means this was the last page for the city.
            if not nav_links:
                break
            driver.get(nav_links[0])

        # Dump the data into the city-specific JSON file
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(all_shop_data, f, ensure_ascii=False, indent=4)

        print(f"Saved {len(all_shop_data)} total shop data to {filename}.")

        time.sleep(5 + random.random() * 5)  # Random delay between city crawls
finally:
    driver.quit()

Please login to the website and press Enter here to continue.
Processing city: New York City New
Accessing page 1 of New York City New
Accessing page 2 of New York City New
Accessing page 3 of New York City New
Accessing page 4 of New York City New
Accessing page 5 of New York City New
Accessing page 6 of New York City New
Accessing page 7 of New York City New
Saved 168 total shop data to us_tripadvisoor/trip_advisor_new_york_city_new.json.
Processing city: Los Angeles
Accessing page 1 of Los Angeles
Accessing page 2 of Los Angeles
Accessing page 3 of Los Angeles
Saved 71 total shop data to us_tripadvisoor/trip_advisor_los_angeles.json.
Processing city: Chicago
Accessing page 1 of Chicago
Accessing page 2 of Chicago
Accessing page 3 of Chicago
Saved 66 total shop data to us_tripadvisoor/trip_advisor_chicago.json.
Processing city: Houston
Accessing page 1 of Houston
Accessing page 2 of Houston
Accessing page 3 of Houston
Accessing page 4 of Houston
Saved 86 total shop data to us_tripadv

In [None]:
# Scrape the restaurant cards from whatever page `driver` currently has loaded.
# NOTE(review): the crawl cell ends with driver.quit(), so this presumably needs
# a still-open browser session — confirm before running it after a full crawl.
shop_data = get_links_and_names(driver)

In [None]:
# Show the scraped records for inspection.
shop_data

[{'name': '61. New Kamara Restaurant',
  'link': 'https://www.tripadvisor.co.uk/Restaurant_Review-g32655-d19859331-Reviews-New_Kamara_Restaurant-Los_Angeles_California.html',
  'type': 'Chinese',
  'price': '££ - £££'},
 {'name': '63. Kung Pao Bistro',
  'link': 'https://www.tripadvisor.co.uk/Restaurant_Review-g32655-d468653-Reviews-Kung_Pao_Bistro-Los_Angeles_California.html',
  'type': 'Chinese, Cantonese',
  'price': '££ - £££'},
 {'name': '64. China Star',
  'link': 'https://www.tripadvisor.co.uk/Restaurant_Review-g32655-d509379-Reviews-China_Star-Los_Angeles_California.html',
  'type': 'Chinese',
  'price': '££ - £££'},
 {'name': '65. New Flavors',
  'link': 'https://www.tripadvisor.co.uk/Restaurant_Review-g32655-d5453267-Reviews-New_Flavors-Los_Angeles_California.html',
  'type': 'Chinese',
  'price': '££ - £££'},
 {'name': '66. Western Doma Noodles',
  'link': 'https://www.tripadvisor.co.uk/Restaurant_Review-g32655-d525985-Reviews-Western_Doma_Noodles-Los_Angeles_California.html

In [None]:
def random_sleep(min_time=1, max_time=3):
    """Pause for a random duration drawn uniformly between the two bounds.

    Args:
        min_time: Lower bound of the pause, in seconds.
        max_time: Upper bound of the pause, in seconds.
    """
    delay = random.uniform(min_time, max_time)
    time.sleep(delay)

# Define a function to download and save images from URLs
def download_image(img_url, save_path):
    """Download ``img_url`` to ``save_path`` unless that file already exists.

    The data is first written to a sibling ``<save_path>.part`` file and only
    renamed to ``save_path`` once the transfer has finished. An interrupted
    download therefore never leaves a truncated file at ``save_path`` that a
    later call would mistake for a finished image.

    Args:
        img_url: URL of the image to fetch.
        save_path: Destination file path (str, bytes or os.PathLike).

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises on a failed transfer;
        the temporary file is removed before the error propagates.
    """
    if os.path.exists(save_path):
        return

    # Build the temporary name in the same type (str or bytes) as the target.
    target = os.fspath(save_path)
    tmp_path = target + (b".part" if isinstance(target, bytes) else ".part")

    try:
        urllib.request.urlretrieve(img_url, tmp_path)
        # Same-directory rename, so the final file appears all at once.
        os.replace(tmp_path, target)
    finally:
        # Only present here if the download or the rename failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    print(f'Saved image: {save_path}')

# Define a function to get image URLs
def get_image_urls(driver):
    """Return the ``src`` of every thumbnail image on the current page.

    Args:
        driver: Selenium WebDriver positioned on the page to inspect.

    Returns:
        A list with one ``src`` attribute value per matching ``<img>``, in
        document order; empty when nothing matches.
    """
    sources = []
    for thumb in driver.find_elements(By.XPATH, '//div[@class="img"]/a/img'):
        sources.append(thumb.get_attribute('src'))
    return sources

def get_high_res_image_url(driver):
    """Return the ``src`` of the main-stage image, waiting up to 10 seconds for it.

    Args:
        driver: Selenium WebDriver positioned on an image viewer page.

    Returns:
        The image's ``src`` attribute, or ``None`` when the element does not
        appear in time or reading it fails.
    """
    locator = (By.XPATH, '//div[@class="main-pic-stage"]/img')
    try:
        # until() hands back the element the condition located, so there is no
        # need to look it up a second time (which could race with a page update).
        elem = WebDriverWait(driver, 10).until(EC.presence_of_element_located(locator))
        return elem.get_attribute('src')
    except Exception:
        # Best effort for Selenium/runtime errors only; KeyboardInterrupt and
        # SystemExit are deliberately not caught so the run can still be stopped.
        return None

def go_to_next_image(driver):
    """Click the "next" arrow of the image viewer.

    Args:
        driver: Selenium WebDriver positioned on an image viewer page.

    Returns:
        True after the click has been performed, False when a
        NoSuchElementException is raised (e.g. the arrow is not on the page).
    """
    try:
        arrow = driver.find_element(By.XPATH, '//a[@class="next J_pic-next"]')
        chain = ActionChains(driver)
        chain.click(arrow)
        chain.perform()
    except NoSuchElementException:
        advanced = False
    else:
        advanced = True
    return advanced