In [1]:
!pip install -q beautifulsoup4 requests


In [4]:
# SCRAPING RECETTES HELLOFRESH → + desc_part_1..6 + total_time, prep_time, difficulty
import json, re, time, math
from pathlib import Path
import requests
from bs4 import BeautifulSoup

# HTTP headers sent with each page request: a desktop Chrome User-Agent string.
UA = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/124.0.0.0 Safari/537.36"
    )
}

# Unit tokens accepted after a number in an ingredient line. Plain and "(s)"
# spellings are separate literal entries; each one is regex-escaped when the
# UNIT pattern is built below.
UNITS = [
    "g","kg","mg","ml","cl","l",
    "cs","cc","c.à.s","c.à.c",
    "sachet","sachet(s)",
    "pièce","pièce(s)",
    "tranche","tranche(s)",
    "botte","botte(s)",
    "pincée","pincée(s)",
    "brin","brin(s)",
    "bouquet","bouquet(s)",
    "paquet","paquet(s)",
    "pot","pot(s)",
    "cm",
    "boule", "boule(s)",
    "boîte(s)", "boîte",
    "filet(s)",
    "tube(s)", "tube"
]
# Free-text quantities that carry no number (matched in lowercase by the parsers).
TEXT_QTY = ["selon le goût", "au goût", "à volonté"]

# Single-character fractions mapped to their numeric value. An entry whose
# value is None is ignored both by the regex class below and by frac_to_float.
FRACTION_MAP = {
    "¼": 1/4, "½": 1/2, "¾": 3/4,
    "⅐": 1/7, "⅑": 1/9, "⅒": 1/10,
    "⅓": 1/3, "⅔": 2/3,
    "⅕": 1/5, "⅖": 2/5, "⅗": 3/5, "⅘": 4/5,
    "⅙": 1/6, "⅚": 5/6,
    "ⅈ": None,  # rare, kept here just in case
    "⅛": 1/8, "⅜": 3/8, "⅝": 5/8, "⅞": 7/8,
}
# All fraction characters that have a numeric value, concatenated for use
# inside a regex character class.
FRACTIONS_CLASS = "".join(k for k,v in FRACTION_MAP.items() if v is not None)
# One number: digits with an optional "." or "," decimal part, an "a/b"
# fraction, or a single fraction character from FRACTIONS_CLASS.
NUM = rf"(?:\d+(?:[.,]\d+)?|\d+/\d+|[{re.escape(FRACTIONS_CLASS)}])"

# One unit from UNITS; the lookahead rejects a match that is immediately
# followed by a non-whitespace character (so "g" does not match inside "gousse").
UNIT = r"(?:{})(?!\S)".format("|".join([re.escape(u) for u in UNITS]))
# A number optionally followed by a unit.
QTY_CORE = rf"{NUM}(?:\s*{UNIT})?"
# Group 1: one or more whitespace-separated quantities at the start of the line;
# group 2: the remaining text (the ingredient name).
LEADING_QTY_RE = re.compile(rf"^\s*({QTY_CORE}(?:\s+{QTY_CORE})*)\s+(.+?)\s*$")
# Mirror image: group 1 is the leading text (the name), group 2 the quantities
# found at the end of the line.
TRAILING_QTY_RE = re.compile(rf"^\s*(.+?)\s+({QTY_CORE}(?:\s+{QTY_CORE})*)\s*$")

def normalize_space(s: str) -> str:
    """Trim *s* and squeeze every run of whitespace down to one space."""
    trimmed = s.strip()
    return re.sub(r"\s+", " ", trimmed)

def frac_to_float(s: str):
    """Turn a numeric token into a float, or return None when it is not a number.

    Accepted forms: a single fraction character listed in FRACTION_MAP, an
    "a/b" fraction, or a decimal number written with "." or ",".
    """
    token = s.strip().replace(",", ".")

    # Fraction characters first; entries mapped to None fall through.
    mapped = FRACTION_MAP.get(token)
    if mapped is not None:
        return mapped

    try:
        if "/" in token:
            numerator, denominator = token.split("/")
            return float(numerator) / float(denominator)
        return float(token)
    except Exception:
        # Malformed token, extra slashes or a zero denominator.
        return None

def split_qty_name(line: str):
    """Split an ingredient line into ``(quantity_text, name)``.

    Tried in order: a free-text quantity from TEXT_QTY at the start of the
    line, a numeric quantity at the start, a numeric quantity at the end.
    When nothing matches, the quantity is None and the name is the whole
    (whitespace-normalized) line.
    """
    text = normalize_space(line)
    lowered = text.lower()

    for phrase in TEXT_QTY:
        if lowered.startswith(phrase):
            remainder = normalize_space(text[len(phrase):])
            # If nothing follows the phrase, the whole line serves as the name.
            return phrase, (remainder or text)

    leading = LEADING_QTY_RE.match(text)
    if leading:
        return normalize_space(leading.group(1)), normalize_space(leading.group(2))

    trailing = TRAILING_QTY_RE.match(text)
    if trailing:
        return normalize_space(trailing.group(2)), normalize_space(trailing.group(1))

    return None, text

def split_number_and_unit(qty_text: str):
    """Split a quantity string into ``(number, unit)``.

    Returns ``(None, "")`` for None input, ``(None, phrase)`` for a free-text
    quantity from TEXT_QTY, and ``(None, text)`` when the text does not start
    with a number. Otherwise the number is an int when it is integral, or a
    float truncated to two decimals; the unit is whatever follows the number.
    """
    if qty_text is None:
        return None, ""

    stripped = qty_text.strip()
    lowered = stripped.lower()
    for phrase in TEXT_QTY:
        if lowered == phrase:
            return None, phrase

    parsed = re.match(rf"^\s*({NUM})(?:\s+(.+))?\s*$", stripped)
    if not parsed:
        return None, stripped

    unit = (parsed.group(2) or "").strip()
    value = frac_to_float(parsed.group(1))
    if value is None:
        return None, unit

    nearest = round(value)
    if abs(value - nearest) < 1e-9:
        value = int(nearest)
    else:
        # Truncate (not round) to two decimals; the epsilon absorbs float noise.
        value = math.floor(value * 100 + 1e-9) / 100.0
    return value, unit

def clean_title(title: str) -> str:
    """Remove a trailing "Recette | HelloFresh" suffix from *title* and trim it."""
    suffix_pattern = r"\s*Recette\s*\|\s*HelloFresh\s*$"
    without_suffix = re.sub(suffix_pattern, "", title)
    return without_suffix.strip()

# ---- Instructions
def get_instructions(soup: BeautifulSoup):
    """Return the recipe steps as a list of strings (possibly empty).

    Primary source: every element carrying ``data-test-id="instruction-step"``;
    its ``<li>`` texts are joined with spaces, or its whole text is used when
    it has no ``<li>``. Fallback: the ``<ul>`` lists found in the siblings that
    follow the parent of the first h2/h3/h4 whose text starts with
    "instructions", up to the next heading.
    """
    def li_texts(node):
        return [li.get_text(" ", strip=True) for li in node.find_all("li")]

    collected = []
    for block in soup.find_all(attrs={"data-test-id": "instruction-step"}):
        bullet_texts = li_texts(block)
        if bullet_texts:
            collected.append(" ".join(bullet_texts))
        else:
            whole_text = block.get_text(" ", strip=True)
            if whole_text:
                collected.append(whole_text)
    if collected:
        return collected

    # Fallback: locate the "Instructions" heading.
    heading = next(
        (
            h
            for h in soup.find_all(["h2", "h3", "h4"])
            if h.get_text(strip=True).lower().startswith("instructions")
        ),
        None,
    )
    if not heading:
        return collected

    fallback_steps = []
    for sibling in heading.parent.next_siblings:
        if getattr(sibling, "name", None) in ("h2", "h3", "h4"):
            break  # next section reached
        if not hasattr(sibling, "find_all"):
            continue  # plain text node, nothing to search
        for ul in sibling.find_all("ul"):
            bullet_texts = li_texts(ul)
            if bullet_texts:
                fallback_steps.append(" ".join(bullet_texts))

    return fallback_steps if fallback_steps else collected

# ---- Durées & difficulté
# Time-only ISO-8601 duration: "PT" followed by optional hours, minutes, seconds.
# Groups: 2 = hours, 3 = minutes, 4 = seconds (captured but not rendered).
ISO_RE = re.compile(r"^P(T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?)$", re.I)

def iso_to_human(iso: str) -> str:
    """Render a time-only ISO-8601 duration as a short human-readable label.

    Examples:
        'PT40M'   -> '40 minutes'
        'PT1M'    -> '1 minute'
        'PT1H30M' -> '1 h 30 min'
        'PT2H'    -> '2 h'

    Returns an empty string for empty or non-string input, for text that does
    not match ISO_RE (e.g. durations with a date part), and for durations whose
    hours and minutes are both zero. Seconds are ignored.
    """
    if not iso or not isinstance(iso, str):
        return ""
    m = ISO_RE.match(iso.strip())
    if m is None:
        return ""

    hours = int(m.group(2) or 0)
    minutes = int(m.group(3) or 0)

    parts = []
    if hours:
        parts.append(f"{hours} h")
    if minutes:
        if hours:
            # Compact form when combined with hours.
            parts.append(f"{minutes} min")
        else:
            # Minutes alone are spelled out, as the documented examples show.
            parts.append(f"{minutes} minute" if minutes == 1 else f"{minutes} minutes")
    return " ".join(parts)

# --- Precise DOM fallback for 'Temps total' / 'Temps de préparation' / 'Difficulté'

def _value_after_label(soup: BeautifulSoup, label: str, want_digits=False):
    """Return the text displayed next to *label*, or "" when none is found.

    The label is located as a text node equal to *label* (case-insensitive,
    surrounding whitespace ignored). Candidate elements are the span/div
    descendants of up to four of its ancestors (nearest ancestor first),
    followed by the next twelve span/div elements in document order. The first
    candidate whose text is non-empty, is not the label itself and, when
    *want_digits* is true, contains a digit, wins.
    """
    label_node = soup.find(string=re.compile(rf"^\s*{re.escape(label)}\s*$", re.I))
    if not label_node:
        return ""

    # Up to four ancestors of the label node, closest first.
    ancestors = []
    current = getattr(label_node, "parent", None)
    while current and len(ancestors) < 4:
        ancestors.append(current)
        current = getattr(current, "parent", None)

    visited = set()
    candidates = []

    def remember(elements):
        # Append unseen elements, preserving first-seen order.
        for element in elements:
            key = id(element)
            if key not in visited:
                visited.add(key)
                candidates.append(element)

    for ancestor in ancestors:
        remember(ancestor.find_all(["span", "div"], recursive=True))
    # Then a short forward scan in document order.
    remember(label_node.find_all_next(["span", "div"], limit=12))

    for element in candidates:
        text = element.get_text(" ", strip=True)
        if not text:
            continue
        is_label = re.fullmatch(rf"\s*{re.escape(label)}\s*", text, flags=re.I)
        lacks_digits = want_digits and not re.search(r"\d", text)
        if is_label or lacks_digits:
            continue
        return text
    return ""

def extract_meta_from_dom(soup: BeautifulSoup):
    """Read total time, preparation time and difficulty from the page's labels.

    Returns a ``(total, prep, difficulty)`` tuple of strings; each is "" when
    the matching label or value is not found. The two time values must
    contain a digit; the difficulty does not.
    """
    lookups = (
        ("Temps total", True),
        ("Temps de préparation", True),
        ("Difficulté", False),
    )
    total, prep, diff = (
        _value_after_label(soup, label, want_digits=needs_digits)
        for label, needs_digits in lookups
    )
    return total, prep, diff

def parse_recipe_jsonld_only(url: str):
    """Download one recipe page and extract its title, ingredients, timings and steps.

    Args:
        url: Address of the recipe page.

    Returns:
        A dict with keys ``name``, ``link``, ``ingredients`` (name ->
        ``{"qty", "unit"}``), ``total_time``, ``prep_time``, ``difficulty``
        and ``desc_part_1`` .. ``desc_part_6``. The first five instruction
        steps fill parts 1-5; every remaining step is joined into part 6.
        Missing values are empty strings.

    Raises:
        requests.RequestException: on network failure or a non-2xx response.
    """
    r = requests.get(url, headers=UA, timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")

    # Title: og:title when present, else the first <h1>, else "Recette".
    title = None
    og = soup.find(attrs={"property": "og:title"})
    if og and og.get("content"):
        title = clean_title(og["content"])
    if not title:
        h1 = soup.find("h1")
        title = clean_title(h1.get_text(strip=True) if h1 else "Recette")

    ingredients_obj = {}
    total_time = ""
    prep_time = ""
    difficulty = ""

    # JSON-LD blocks: only objects typed "Recipe" are used.
    for tag in soup.find_all("script", type="application/ld+json"):
        try:
            data = json.loads(tag.string)
        except (TypeError, ValueError):
            # TypeError: the tag has no single string child (tag.string is None);
            # ValueError: the payload is not valid JSON.
            continue
        objs = data if isinstance(data, list) else [data]
        for obj in objs:
            if not isinstance(obj, dict):
                continue
            t = obj.get("@type")
            if not ((isinstance(t, list) and "Recipe" in t) or t == "Recipe"):
                continue

            # 1) Durations from JSON-LD when present; first non-empty value wins.
            #    Non-string values are ignored rather than aborting the recipe.
            if not total_time:
                raw_total = obj.get("totalTime")
                total_time = iso_to_human(raw_total if isinstance(raw_total, str) else "")
            if not prep_time:
                raw_prep = obj.get("prepTime")
                prep_time = iso_to_human(raw_prep if isinstance(raw_prep, str) else "")
            # Difficulty is not read from JSON-LD; it comes from the DOM below.

            # 2) Ingredients; a repeated name overwrites the earlier entry.
            for raw in (obj.get("recipeIngredient") or []):
                qty_text, name = split_qty_name(str(raw))
                qty_num, unit = split_number_and_unit(qty_text)
                if name:
                    ingredients_obj[name] = {"qty": qty_num, "unit": unit or ""}

    # DOM fallback for whichever of time/difficulty is still missing.
    if not total_time or not prep_time or not difficulty:
        t_fallback, p_fallback, d_fallback = extract_meta_from_dom(soup)
        total_time = total_time or t_fallback
        prep_time = prep_time or p_fallback
        difficulty = difficulty or d_fallback

    # Instructions: parts 1-5 hold one step each, part 6 collects the rest.
    steps = get_instructions(soup)
    desc = {
        f"desc_part_{i}": (steps[i - 1] if i - 1 < len(steps) else "")
        for i in range(1, 6)
    }
    desc["desc_part_6"] = " ".join(steps[5:])  # "" when there are five steps or fewer

    return {
        "name": title,
        "link": url,
        "ingredients": ingredients_obj,
        "total_time": total_time,         # human-readable duration text
        "prep_time": prep_time,           # human-readable duration text
        "difficulty": difficulty,         # e.g. 'Intermédiaire'
        **desc
    }

def scrape_many_to_file(urls, out_path="../data/recettes_hellofresh.txt"):
    """Scrape every new URL in *urls* and persist all records as a JSON list.

    Records already stored in *out_path* are kept. A URL whose stored record
    was scraped successfully is skipped. A URL whose stored record is only a
    failure placeholder (``name`` is None) is attempted again, so a transient
    error does not exclude it forever. URLs are stripped of surrounding
    whitespace and trailing "/" and de-duplicated within the call.

    Args:
        urls: Iterable of recipe URL strings.
        out_path: JSON file to read existing records from and write to. Its
            parent directory is created when missing.

    Returns:
        ``(out_path, records)`` where *records* is the list written to disk.
    """
    out_file = Path(out_path)

    # Load what is already on disk, keyed by link.
    results = {}
    if out_file.exists():
        try:
            with open(out_file, encoding="utf-8") as f:
                old_data = json.load(f)
            for rec in old_data:
                if rec and rec.get("link"):
                    results[rec["link"]] = rec
        except Exception as e:
            print(f"[warn] Impossible de charger l’existant ({e})")

    # Links that were really scraped; failure placeholders carry name=None.
    done = {link for link, rec in results.items() if rec.get("name") is not None}

    # Fail before scraping, not after, if the destination cannot be created.
    out_file.parent.mkdir(parents=True, exist_ok=True)

    seen = set()
    for u in urls:
        url = u.strip().rstrip("/")
        if not url or url in seen:
            continue
        seen.add(url)

        if url in done:
            print(f"[skip] {url} déjà présent, on garde l’ancien")
            continue

        try:
            rec = parse_recipe_jsonld_only(url)
        except Exception as e:
            # Best effort: record a placeholder and carry on with the next URL.
            print(f"[warn] {url}: {e}")
            results.setdefault(url, {"name": None, "link": url, "ingredients": {}})
        else:
            results[url] = rec
            print(f"[ok] {url} ajouté")
        finally:
            # Throttle after every attempt, failures included.
            time.sleep(0.25)

    final_list = list(results.values())
    out_file.write_text(json.dumps(final_list, ensure_ascii=False, indent=2), encoding="utf-8")
    return out_path, final_list

# --- Example usage
# Load the list of recipe URLs from a JSON file (stored with a .txt extension),
# pass them to scrape_many_to_file, then print the output path and record count.
with open("../data/urls_hellofresh.txt", encoding="utf-8") as f: 
    urls = json.load(f) 
out_file, data = scrape_many_to_file(urls) 
print("Fichier écrit:", out_file) 
print(f"{len(data)} recettes sauvegardées")

[skip] https://www.hellofresh.fr/recipes/saumon-en-papillote-and-pistou-maison-625fbe3806486938dc2f4584 déjà présent, on garde l’ancien
[ok] https://www.hellofresh.fr/recipes/galette-complete-jambon-emmental-626fe9dca921ac38f30227a2 ajouté
[skip] https://www.hellofresh.fr/recipes/salade-de-chevre-nectarine-and-tomate-628b8b61c6ee1b00810d8902 déjà présent, on garde l’ancien
[skip] https://www.hellofresh.fr/recipes/curry-daubergine-rotie-and-pois-chiches-629e031506287fd74e0efc02 déjà présent, on garde l’ancien
[skip] https://www.hellofresh.fr/recipes/linguine-and-pesto-depinards-maison-62b26056828c7ddc7107e451 déjà présent, on garde l’ancien
[ok] https://www.hellofresh.fr/recipes/courgette-grillee-labneh-and-beurre-au-curcuma-62b26217932de228a600313c ajouté
[skip] https://www.hellofresh.fr/recipes/salade-couscous-perle-feta-rotie-and-grenade-62c31ec1b6367154fe0f1dc1 déjà présent, on garde l’ancien
[skip] https://www.hellofresh.fr/recipes/salade-de-couscous-perle-and-fromage-grille-62cc07