In [1]:
import glob
import os
import re
import time

import pandas as pd
import requests
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

In [2]:
def extract_number(filename):
    """Build a numeric sort key from the digits in a file's base name.

    Only the final path component is inspected. Every digit character found
    there is concatenated, in order, and the result is converted to ``int``.

    Args:
        filename: Path (or bare name) of the file.

    Returns:
        The integer formed by the digits, or ``float('inf')`` when the base
        name contains no digit at all (so such files sort last).
    """
    digits = [ch for ch in os.path.basename(filename) if ch.isdigit()]
    if not digits:
        return float('inf')
    return int(''.join(digits))
# Directory that holds the CSV files listing the recipe URLs.
folder_path = 'foodcom_url'

# Collect every CSV in that directory, ordered by the number embedded in its file name.
csv_files = sorted(glob.glob(os.path.join(folder_path, '*.csv')), key=extract_number)
if not csv_files:
    # Without this guard pd.concat([]) fails with the much less helpful
    # "No objects to concatenate".
    raise FileNotFoundError(f"No CSV files found in {folder_path!r}")

# Read each CSV into its own DataFrame.
dataframes = [pd.read_csv(file) for file in csv_files]

# Stack all frames into one and drop the first row of the combined result.
# NOTE(review): pd.read_csv already consumes each file's header line, so the
# .iloc[1:] below discards the first *data* row - confirm this is intended.
combined_df = pd.concat(dataframes, ignore_index=True).iloc[1:, :]

# The recipe URL is taken from the first column of every remaining row.
recipe_url_list = combined_df.iloc[:, 0].tolist()

In [3]:
# Directory that receives the downloaded recipe images.
save_dir = r"C:\Users\USER\OneDrive\final_group_project\foodcom_images"
# save_dir = r"C:\Users\student\OneDrive\final_group_project\foodcom_images"
save_path = []

# One regular expression per nutrition field; capture group 1 holds the number.
patterns = dict([
    ("Calories", r"Calories:\s+([\d.]+)"),
    ("Calories from Fat", r"Calories from Fat\s+([\d.]+)\s+g"),
    ("Total Fat", r"Total Fat\s+([\d.]+)\s+g"),
    ("Saturated Fat", r"Saturated Fat\s+([\d.]+)\s+g"),
    ("Cholesterol", r"Cholesterol\s+([\d.]+)\s+mg"),
    ("Sodium", r"Sodium\s+([\d.]+)\s+mg"),
    ("Total Carbohydrate", r"Total Carbohydrate\s+([\d.]+)\s+g"),
    ("Dietary Fiber", r"Dietary Fiber\s+([\d.]+)\s+g"),
    ("Sugars", r"Sugars\s+([\d.]+)\s+g"),
    ("Protein", r"Protein\s+([\d.]+)\s+g"),
])

# Character class of the characters treated as illegal in a file name.
illegal_chars_pattern = r'[\/\\\?<>:\*\|"]'

# Result table layout: the image file name, then every nutrition field in
# the same order as the patterns above.
column_names = ['Filename', *patterns]
data_df = pd.DataFrame(columns=column_names)

In [4]:

# Chrome launch flags, applied in the order listed.
chrome_flags = [
    # "--headless",              # run in the background without a visible browser window
    "--start-maximized",         # maximize the window
    "--incognito",               # open in incognito mode
    "--disable-popup-blocking",  # turn off the pop-up blocker
    "--disable-notifications",   # suppress Chrome push notifications
]

my_options = webdriver.ChromeOptions()
for flag in chrome_flags:
    my_options.add_argument(flag)

# Point the service at the ChromeDriver executable; replace the path if the
# binary lives somewhere other than the working directory.
service = Service(executable_path="./chromedriver.exe")

# Start Chrome through the WebDriver with the options assembled above.
driver = webdriver.Chrome(service=service, options=my_options)
# driver.implicitly_wait(15)

In [5]:

def _parse_nutrition(text, field_patterns):
    """Extract one numeric value per nutrition field from scraped text.

    Args:
        text: Text collected from the nutrition panel.
        field_patterns: Mapping of field name -> regex whose capture group 1
            holds the number.

    Returns:
        A list of floats in the mapping's iteration order. NaN marks a field
        that is absent or not parseable, so the list always has exactly
        ``len(field_patterns)`` entries and fits the result table's columns.
    """
    values = []
    for pattern in field_patterns.values():
        match = re.search(pattern, text)
        try:
            values.append(float(match.group(1)) if match else float('nan'))
        except ValueError:
            # "[\d.]+" can also capture strings such as "." or "1.2.3".
            values.append(float('nan'))
    return values


def _download_image(image_url, destination):
    """Download ``image_url`` to ``destination``; return True on success."""
    try:
        response = requests.get(image_url, timeout=30)
    except requests.RequestException as exc:
        print('download img failed!', type(exc).__name__)
        return False
    if response.status_code != 200:
        print('download img failed!')
        return False
    with open(destination, 'wb') as f:
        f.write(response.content)
    print('img save to:', destination)
    return True


# Make sure both output locations exist before anything is written.
file_path = 'foodcom_images/foodcom_data.csv'
os.makedirs(save_dir, exist_ok=True)
os.makedirs(os.path.dirname(file_path), exist_ok=True)

for i, recipe_url in enumerate(recipe_url_list[:200]):
    print(f"i = {i}, recipe_url = {recipe_url}")
    try:
        driver.get(recipe_url)
        time.sleep(3)  # give the page time to render
    except WebDriverException as exc:
        # Catching only WebDriver errors keeps Ctrl-C / kernel interrupt working.
        # Skip the recipe: otherwise the previous page would be scraped again
        # and stored under this recipe's name.
        print('recipe_url not found', type(exc).__name__)
        continue

    # The last URL segment (slug + id) doubles as the image file name.
    filename = recipe_url.rstrip('/').split('/')[-1] + '.jpg'
    print(f"filename = {filename}")

    # --- recipe photo ---
    try:
        image = driver.find_element(By.CSS_SELECTOR, 'img[style*="--aspect-ratio: 5/4;"]')
        image_url = image.get_attribute('src')
    except WebDriverException:
        # Reset explicitly so a URL left over from the previous recipe is
        # never downloaded under the current file name.
        image_url = None

    if image_url:
        print('Image URL:', image_url)
        save_path = os.path.join(save_dir, filename)
        _download_image(image_url, save_path)
        time.sleep(2)  # short pause after the download request
    else:
        print('Image not found')

    # --- nutrition facts ---
    nutrition = ''
    try:
        # Open the "Nutrition information" panel, then read its entries.
        driver.find_element(By.CSS_SELECTOR, 'button.facts__nutrition').click()
        nutrition_facts = driver.find_elements(By.CSS_SELECTOR, 'p.recipe-nutrition__item.svelte-epeb0m')
        # Join with newlines so the text of adjacent entries cannot fuse.
        nutrition = '\n'.join(fact.text for fact in nutrition_facts)
    except WebDriverException as exc:
        # A recipe without a usable nutrition panel still gets a row (all NaN)
        # instead of aborting the whole run.
        print('nutrition info not found', type(exc).__name__)

    # Append one row: file name followed by one value per pattern.
    data_df.loc[len(data_df.index)] = [filename] + _parse_nutrition(nutrition, patterns)

# Save the nutrition data to foodcom_data.csv: append to an existing file,
# and write the header only when the file is being created.
data_df.to_csv(file_path, mode='a', index=False, header=not os.path.exists(file_path))


i = 0, recipe_url = https://www.food.com/recipe/barbs-gumbo-82288
filename = barbs-gumbo-82288.jpg
Image URL: https://img.sndimg.com/food/image/upload/f_auto,c_thumb,q_55,w_744,ar_5:4/v1/img/recipes/82/28/8/cB14froTlmpDfRF49wmQ_gumbo%2520SITE-3.jpg
img save to: C:\Users\USER\OneDrive\final_group_project\foodcom_images\barbs-gumbo-82288.jpg
i = 1, recipe_url = https://www.food.com/recipe/bourbon-chicken-45809
filename = bourbon-chicken-45809.jpg
Image URL: https://img.sndimg.com/food/image/upload/f_auto,c_thumb,q_55,w_744,ar_5:4/v1/img/recipes/45/80/9/MwuCd6HpQ5mDvn4OLRkA_0S9A9886.jpg
img save to: C:\Users\USER\OneDrive\final_group_project\foodcom_images\bourbon-chicken-45809.jpg
i = 2, recipe_url = https://www.food.com/recipe/best-banana-bread-2886
filename = best-banana-bread-2886.jpg
Image URL: https://img.sndimg.com/food/image/upload/f_auto,c_thumb,q_55,w_744,ar_5:4/v1/img/recipes/28/86/wBZxNua1T8yfDwbfo9Fz_0S9A9315.jpg
img save to: C:\Users\USER\OneDrive\final_group_project\foodcom

In [36]:
# Shut down the WebDriver created earlier in the notebook.
driver.quit()