In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import time
import re




In [None]:
# Output file for the scraped recipes. Opening it in "w" mode (re)creates it,
# so after this cell runs it contains only the header row.
csv_file = "recipes.csv"

# newline="" leaves line-ending handling to the csv module.
with open(csv_file, "w", newline="", encoding="utf-8") as f:
    csv.writer(f).writerow(["Recipe_Name", "Recipe_Ingredients", "Recipe"])
    

In [3]:
def get_recipe_detail(url, timeout=10):
    """Download one recipe page and extract its title, ingredients and steps.

    Args:
        url: Address of the recipe page to fetch.
        timeout: Seconds to wait for the server before giving up; forwarded
            to ``requests.get`` so a stalled connection cannot hang the caller.

    Returns:
        A ``(title, ingredients, steps)`` tuple. ``title`` is the text of the
        first ``<h1>`` element, or the placeholder ``"未找到"`` when the page
        has none. ``ingredients`` is a list of ingredient names and ``steps``
        a list of strings shaped like ``"步骤N：<text>"``; either list may be
        empty when the page does not contain the expected markup.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/116.0.0.0 Safari/537.36"
    }

    resp = requests.get(url, headers=headers, timeout=timeout)

    # Response.text decodes with errors="replace" and therefore never raises
    # UnicodeDecodeError. Decode the raw bytes strictly instead, so that a page
    # which is not valid UTF-8 really does fall back to the detected encoding.
    try:
        html = resp.content.decode("utf-8")
    except UnicodeDecodeError:
        resp.encoding = resp.apparent_encoding
        html = resp.text

    soup = BeautifulSoup(html, "html.parser")

    # Recipe name: first <h1> on the page, or a placeholder when absent.
    title_tag = soup.find("h1")
    title = title_tag.get_text(strip=True) if title_tag else "未找到"

    # Ingredient names: one per "a.ing-line" row inside the ingredient container.
    ingredients = []
    container = soup.select_one("div.recipe-ingredient")
    if container:
        for a in container.select("a.ing-line"):
            name_tag = a.select_one("div.ing-name")
            if name_tag:
                ingredients.append(name_tag.get_text(strip=True))

    # Steps: numbered by the node's position on the page, so a node with no
    # text is skipped but still consumes its number.
    steps = []
    for i, step in enumerate(soup.select("div.step p.step-text"), 1):
        text = step.get_text(strip=True)
        if text:
            steps.append(f"步骤{i}：{text}")

    return title, ingredients, steps

In [None]:
# ======= Crawl loop =======
# Walk a contiguous range of recipe IDs, fetch each page and append every
# valid recipe to the CSV file as one row.
start_id = 107567000
end_id =   107580000  # small range for testing
recipe_count = 0      # number of recipes written to the CSV so far


for recipe_id in range(start_id, end_id):
    url = f"https://m.xiachufang.com/recipe/{recipe_id}/"
    print(f"正在爬取 ID {recipe_id} -> {url}")
    try:
        result = get_recipe_detail(url)
        if not result:
            print("  ❌ 不是菜谱")
        else:
            name, ingredients, steps = result
            if name == "未找到" or not ingredients or not steps:
                # Placeholder title or missing sections: not a usable recipe.
                print("  ❌ 不是有效菜谱")
            else:
                # Re-open in append mode for every row so each recipe is
                # written out to the file as soon as it has been scraped.
                with open(csv_file, "a", newline="", encoding="utf-8") as f:
                    writer = csv.writer(f)
                    writer.writerow([name, ";".join(ingredients), ";".join(steps)])
                recipe_count += 1
                print(f"  ✅ 成功: {name}")
    except Exception as e:
        # Best effort: report the failure and move on to the next ID.
        print(f"  ⚠ 错误: {e}")
    # Throttle after EVERY request. No branch above uses `continue`, so invalid
    # pages are rate-limited too instead of skipping the delay.
    time.sleep(0.5)


正在爬取 ID 107567000 -> https://m.xiachufang.com/recipe/107567000/
  ✅ 成功: 梅干菜排骨
正在爬取 ID 107567001 -> https://m.xiachufang.com/recipe/107567001/
  ✅ 成功: 蜂蜜柠檬玛德琳
正在爬取 ID 107567002 -> https://m.xiachufang.com/recipe/107567002/
  ✅ 成功: 番茄牛腩
正在爬取 ID 107567003 -> https://m.xiachufang.com/recipe/107567003/
  ✅ 成功: 敷衍版口水鸡
正在爬取 ID 107567004 -> https://m.xiachufang.com/recipe/107567004/
  ✅ 成功: 自制杨梅酒
正在爬取 ID 107567005 -> https://m.xiachufang.com/recipe/107567005/
  ✅ 成功: 青菜包，详细教程
正在爬取 ID 107567006 -> https://m.xiachufang.com/recipe/107567006/
  ✅ 成功: 海苔肉松纸杯蛋糕
正在爬取 ID 107567007 -> https://m.xiachufang.com/recipe/107567007/
  ✅ 成功: 吊龙青菜鸡蛋面(牛肉面)
正在爬取 ID 107567008 -> https://m.xiachufang.com/recipe/107567008/
  ❌ 不是有效菜谱
正在爬取 ID 107567009 -> https://m.xiachufang.com/recipe/107567009/
  ❌ 不是有效菜谱
正在爬取 ID 107567010 -> https://m.xiachufang.com/recipe/107567010/
  ✅ 成功: 不回软的炸蘑菇
正在爬取 ID 107567011 -> https://m.xiachufang.com/recipe/107567011/
  ✅ 成功: 蒸乌米饭
正在爬取 ID 107567012 -> https://m.xiachufang.com/recipe/1