In [13]:
import os
import json
import urllib.request
import random
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.service import Service

In [14]:
# Location of the chromedriver executable. The CHROMEDRIVER_PATH environment
# variable, when set, overrides the machine-specific default below so the
# notebook can run on another machine without editing this cell.
driver_path = os.environ.get(
    "CHROMEDRIVER_PATH",
    r"F:\Fork_git\Labelling_Menu_Data\menu_scraper\webdriver\chromedriver-win64\chromedriver.exe",
)

In [15]:
def random_sleep(min_time=1, max_time=3):
    """Block for a random number of seconds drawn uniformly from the given bounds.

    Args:
        min_time: One bound of the delay, in seconds.
        max_time: The other bound of the delay, in seconds.
    """
    delay_seconds = random.uniform(min_time, max_time)
    time.sleep(delay_seconds)

# Define a function to download and save images from URLs
def download_image(img_url, save_path):
    """Download ``img_url`` to ``save_path`` unless that file already exists.

    The data is first written to ``save_path`` + ``.part`` and only moved to
    ``save_path`` once the transfer has completed. A failed or interrupted
    download therefore never leaves a truncated file at ``save_path`` that
    the existence check would treat as already downloaded on later runs.

    Args:
        img_url: URL of the image to fetch.
        save_path: Destination file path.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises; the partial file is
        removed before the exception propagates.
    """
    if os.path.exists(save_path):
        return

    target = os.fspath(save_path)
    partial_path = target + (b'.part' if isinstance(target, bytes) else '.part')
    try:
        urllib.request.urlretrieve(img_url, partial_path)
        # os.replace is atomic on the same filesystem, so save_path is either
        # absent or complete.
        os.replace(partial_path, target)
    except BaseException:
        # BaseException so that a KeyboardInterrupt mid-download also cleans up.
        try:
            os.remove(partial_path)
        except OSError:
            pass
        raise
    print(f'Saved image: {save_path}')

# Define a function to get image URLs
def get_image_urls(driver):
    """Return the ``src`` attribute of every ``<img>`` matched on the current page.

    Args:
        driver: WebDriver whose current page is searched with the XPath
            ``//div[@class="img"]/a/img``.

    Returns:
        A list with one ``src`` value per matched element, in match order.
    """
    sources = []
    for image in driver.find_elements(By.XPATH, '//div[@class="img"]/a/img'):
        sources.append(image.get_attribute('src'))
    return sources

def get_high_res_image_url(driver):
    """Return the ``src`` of the image inside the ``main-pic-stage`` container.

    Waits up to 10 seconds for ``//div[@class="main-pic-stage"]/img`` to be
    present on the current page and reads its ``src`` attribute.

    Args:
        driver: WebDriver positioned on a photo page.

    Returns:
        The ``src`` attribute value, or ``None`` if the element did not appear
        in time or any other error occurred while looking it up.
    """
    locator = (By.XPATH, '//div[@class="main-pic-stage"]/img')
    try:
        # until() returns the located element, so no second lookup is needed.
        image = WebDriverWait(driver, 10).until(EC.presence_of_element_located(locator))
        return image.get_attribute('src')
    except Exception:
        # Best-effort lookup: callers treat None as "no more images".
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate, so interrupting the kernel actually stops the crawl.
        return None

def go_to_next_image(driver):
    """Click the "next" link of the photo viewer, if the page has one.

    Args:
        driver: WebDriver positioned on a photo page.

    Returns:
        True after the click has been performed, False when looking up
        ``//a[@class="next J_pic-next"]`` (or performing the click) raises
        ``NoSuchElementException``.
    """
    try:
        next_link = driver.find_element(By.XPATH, '//a[@class="next J_pic-next"]')
        click_action = ActionChains(driver).click(next_link)
        click_action.perform()
    except NoSuchElementException:
        return False
    else:
        return True

In [16]:


# Load the results file from a previous run if there is one; otherwise start
# from the raw restaurant list and give every entry empty photo lists.
if os.path.exists('blue-frog-menu.json'):
    with open('blue-frog-menu.json', 'r', encoding='utf-8') as f:
        print('Loading existing JSON file...')
        restaurants = json.load(f)
    # Backfill any list key missing from an older or hand-edited results file;
    # the processing loop indexes all three keys and would raise KeyError otherwise.
    for restaurant in restaurants:
        for key in ('menu_photos', 'receipt_photos', 'mix_photos'):
            restaurant.setdefault(key, [])
else:
    with open('blue-frog.json', 'r', encoding='utf-8') as f:
        print('Creating new JSON file...')
        restaurants = json.load(f)
        for restaurant in restaurants:
            restaurant['menu_photos'] = []
            restaurant['receipt_photos'] = []
            restaurant['mix_photos'] = []

# Create the directory for downloaded images (no-op if it already exists)
os.makedirs('images', exist_ok=True)

# Chrome launch options.
options = Options()
# options.add_argument("--headless")  # Enable headless mode
options.add_argument('--disable-blink-features=AutomationControlled')
# options.add_argument(f'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{random.randint(70, 91)}.0.4472.124 Safari/537.36')

# The Service only describes how to start the chromedriver process.
service = Service(executable_path=driver_path)

# Browser options must be given to the WebDriver itself. Previously they were
# passed to Service under a misspelled keyword and never reached Chrome.
driver = webdriver.Chrome(service=service, options=options)

# Random viewport size
random_width = random.choice([1920, 1366, 1024])
random_height = random.choice([1080, 768, 600])
driver.set_window_size(random_width, random_height)


# Process each restaurant. The try/finally guarantees the browser is shut down
# even when the crawl raises or is interrupted from the keyboard.
try:
    for restaurant in restaurants:

        # Any non-empty photo list marks the restaurant as already handled.
        if any(restaurant[key] for key in ['menu_photos', 'receipt_photos', 'mix_photos']):
            print(f'Skipping restaurant {restaurant["name"]}. Already collected photos.')
            continue
        print(f'Processing restaurant: {restaurant["name"]}')

        # Set when a category page shows the site's error box; this enables the
        # generic fallback category at the end of the list below.
        encountered_error = False

        # (URL suffix, key of the list that receives the URLs). The first suffix
        # is the percent-encoded form of '/photos/tag-价目表-菜单'.
        for photo_type, list_key in [('/photos/tag-%E4%BB%B7%E7%9B%AE%E8%A1%A8-%E8%8F%9C%E5%8D%95', 'menu_photos'),
                                     ('/photos/tag-价目表-账单', 'receipt_photos'),
                                     ('/photos/tag-价目表', 'mix_photos')]:
            if photo_type == '/photos/tag-价目表' and not encountered_error:
                continue  # Skip this category if no error was encountered in the previous two
            img_count = 1  # Running index used in the saved file names

            url = restaurant['link'] + photo_type + '#p=' + str(img_count)
            driver.get(url)

            while True:
                error_elems = driver.find_elements(By.XPATH, '//div[@class="aboutBox errorMessage"]')
                # Error box present: give up on this category after a pause.
                # Despite the message, the category is not re-requested; the
                # flag only makes the fallback category run.
                if error_elems:
                    print("Encountered an error. Retrying in a bit...")
                    random_sleep(10, 20)
                    encountered_error = True
                    break

                high_res_photo_url = get_high_res_image_url(driver)
                if high_res_photo_url is None:
                    print(f'Finished collecting {list_key} from {restaurant["name"]}')
                    break

                img_path = os.path.join('images', f"{restaurant['name']}_{list_key}_{img_count}.jpg")

                if high_res_photo_url not in restaurant[list_key]:
                    # Download first so a URL is only recorded once its file
                    # was fetched without raising.
                    download_image(high_res_photo_url, img_path)
                    restaurant[list_key].append(high_res_photo_url)
                else:
                    print(f'Skipped existing image: {img_path}')

                img_count += 1  # Increment the counter

                driver.execute_script("window.scrollTo(0, document.body.scrollHeight/4);")
                random_sleep(0.5, 2)

                # Go to the next image
                if not go_to_next_image(driver):
                    print(f'Finished collecting {list_key} from {restaurant["name"]}')
                    break

            # Save the updated JSON after each type of photos are processed
            with open('blue-frog-menu.json', 'w', encoding='utf-8') as f:
                json.dump(restaurants, f, ensure_ascii=False, indent=4)
finally:
    driver.quit()

print('Done!')



Loading existing JSON file...
1111
Skipping restaurant bluefrog蓝蛙(长宁来福士店). Already collected photos.
Skipping restaurant bluefrog蓝蛙(正大乐城店). Already collected photos.
Skipping restaurant bluefrog蓝蛙(恒丰路店). Already collected photos.
Skipping restaurant bluefrog蓝蛙(仲盛世界商城店). Already collected photos.
Skipping restaurant bluefrog蓝蛙(虹桥龙湖天街店). Already collected photos.
Skipping restaurant bluefrog蓝蛙(七宝万科广场店). Already collected photos.
Skipping restaurant bluefrog蓝蛙(189弄购物中心店). Already collected photos.
Processing restaurant: bluefrog蓝蛙(万象城店)
Saved image: images\bluefrog蓝蛙(万象城店)_menu_photos_1.jpg
Skipped existing image: images\bluefrog蓝蛙(万象城店)_menu_photos_2.jpg
Saved image: images\bluefrog蓝蛙(万象城店)_menu_photos_3.jpg
Saved image: images\bluefrog蓝蛙(万象城店)_menu_photos_4.jpg
Saved image: images\bluefrog蓝蛙(万象城店)_menu_photos_5.jpg
Saved image: images\bluefrog蓝蛙(万象城店)_menu_photos_6.jpg
Saved image: images\bluefrog蓝蛙(万象城店)_menu_photos_7.jpg
Saved image: images\bluefrog蓝蛙(万象城店)_menu_photos_8.jpg
Saved ima

KeyboardInterrupt: 

In [None]:
# Shut down the browser session held by `driver`. Run this cell manually if
# the scraping cell was interrupted before it could quit the driver itself.
driver.quit()

In [None]:
# service = Service(executable_path=driver_path)

# options = Options()
# # options.add_argument("--headless")  # Enable headless mode
# # options.add_argument(f'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{random.randint(70, 91)}.0.4472.124 Safari/537.36')

# #
# driver = webdriver.Chrome(service=service)