In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
import pandas as pd
import time
import os
import re
import json

# Path to the ChromeDriver executable.
# NOTE(review): hard-coded, machine-specific Windows path — adjust per environment.
chromedriver_path = r'C:\Users\pc\Desktop\scrp\chromedriver-win64\chromedriver.exe'

# Start a Chrome session through a Service wrapping that executable.
service = Service(chromedriver_path)
driver = webdriver.Chrome(service=service)

# Resume state: records of recipes handled by previous runs are kept in
# scrape.json; start from an empty list when that file does not exist yet.
scraped_articles_file = "scrape.json"
scraped_articles = []
if os.path.exists(scraped_articles_file):
    with open(scraped_articles_file, "r", encoding="utf-8") as f:
        scraped_articles = json.load(f)

# Ids of the recipes already recorded, kept as a set for fast membership tests.
visited_ids = set(entry.get("id") for entry in scraped_articles)

# Search terms to query on the site, one search per term.
countries = [
    "عسل",
    "وجبة سريعة",
    "كوكيز",
    "كسكس",
    "طاجين",
    "",
]

# Sanitize a string so it can be used as a folder or file name.
def clean_filename(s):
    """Return *s* reduced to characters that are safe in a file name.

    Emoji from a few common Unicode ranges are removed first; afterwards only
    alphanumeric characters, spaces and underscores are kept, and trailing
    whitespace is stripped (leading whitespace is left as is).
    """
    emoji_ranges = (
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags
    )
    without_emoji = re.sub("[" + emoji_ranges + "]+", "", s, flags=re.UNICODE)
    kept_chars = [
        ch for ch in without_emoji
        if ch.isalnum() or ch == " " or ch == "_"
    ]
    return "".join(kept_chars).rstrip()

# Persist the list of processed recipes to scrape.json right away.
def save_scraped_articles():
    """Write ``scraped_articles`` to ``scraped_articles_file`` as UTF-8 JSON.

    The data is dumped to a temporary sibling file which is then moved over
    the target with ``os.replace``. This function runs after every recipe, so
    writing in place would leave a truncated, unreadable resume file whenever
    the script is interrupted during the dump.

    Raises:
        OSError: if the temporary file cannot be written or moved.
        TypeError: if a record is not JSON-serializable.
    """
    tmp_path = scraped_articles_file + ".tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(scraped_articles, f, ensure_ascii=False, indent=4)
        os.replace(tmp_path, scraped_articles_file)
    except BaseException:
        # Do not leave a partial temporary file behind; the previous
        # scrape.json (if any) is still intact at this point.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise

# Maximum number of recipes loaded and processed for each search term.
MAX_ARTICLES_PER_TERM = 300
# Pause (seconds) after each page navigation and between two recipes.
REQUEST_DELAY = 3
# Pause (seconds) after each scroll, to let the page load more results.
SCROLL_DELAY = 15
# Popularity thresholds: a recipe is exported as soon as one of them is reached.
MIN_INTENT_COUNT = 10     # number shown in the "ينوون تحضير" span
MIN_COMMENT_COUNT = 10    # number shown in the "تعليقات" span
MIN_REACTION_TOTAL = 15   # sum of all reaction counters


def _collect_article_links(limit):
    """Scroll the current search page and return its (recipe id, link) pairs.

    Scrolling stops once at least *limit* recipe items are present or when a
    scroll does not add any new item. Every item found is returned; the
    caller decides how many of them to process.
    """
    prev_count = 0
    while True:
        current_count = len(driver.find_elements(By.CSS_SELECTOR, "li[id^='recipe_']"))
        print(f"Nombre d'articles chargés: {current_count}")
        if current_count >= limit:
            break
        if current_count == prev_count:
            print("Aucun nouvel article détecté lors du défilement supplémentaire.")
            break
        prev_count = current_count
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(SCROLL_DELAY)

    articles = []
    for li in driver.find_elements(By.CSS_SELECTOR, "li[id^='recipe_']"):
        try:
            article_id = li.get_attribute("id").replace("recipe_", "")
            link = li.find_element(By.TAG_NAME, "a").get_attribute("href")
            articles.append((article_id, link))
        except Exception as e:
            print(f"Erreur lors de la récupération du lien pour un article: {e}")
    return articles


def _extract_title():
    """Return the text of the page's first <h1>, or a placeholder if absent."""
    try:
        return driver.find_element(By.TAG_NAME, "h1").text.strip()
    except Exception:
        return "Titre non trouvé"


def _extract_main_image_url():
    """Return the ``src`` of the high-priority image, or None if not found."""
    try:
        img_elem = driver.find_element(By.CSS_SELECTOR, "img[fetchpriority='high']")
        return img_elem.get_attribute("src")
    except Exception:
        return None


def _extract_ingredients():
    """Return the text of each <li> of the ingredient list; [] on failure."""
    try:
        ingredients_div = driver.find_element(By.CSS_SELECTOR, "div.ingredient-list")
        items = ingredients_div.find_elements(By.TAG_NAME, "li")
        return [item.text.strip() for item in items]
    except Exception:
        return []


def _first_paragraph_text(element):
    """Return the stripped text of the first <p> under *element*, or ''."""
    p_elems = element.find_elements(By.TAG_NAME, "p")
    return p_elems[0].text.strip() if p_elems else ""


def _extract_steps():
    """Return the recipe steps of the current page as a flat list of strings.

    For each step, the text of its first paragraph is appended, followed by
    the image URL (or, failing that, the href) of every link it contains.
    Any failure while reading the step list as a whole yields [].
    """
    try:
        steps_ol = driver.find_element(By.CSS_SELECTOR, "ol.list-none")
        steps = []
        for step_li in steps_ol.find_elements(By.TAG_NAME, "li"):
            try:
                container = step_li.find_element(
                    By.CSS_SELECTOR, "div.w-full.grid.gap-sm.print\\:grid-flow-col")
                for child in container.find_elements(By.XPATH, "./*"):
                    text = _first_paragraph_text(child)
                    if text:
                        steps.append(text)
                    for a in child.find_elements(By.TAG_NAME, "a"):
                        try:
                            picture = a.find_element(By.TAG_NAME, "picture")
                            img_src = picture.find_element(By.TAG_NAME, "img").get_attribute("src")
                            if img_src:
                                steps.append(img_src)
                        except Exception:
                            # Link without a picture: keep its target instead.
                            href = a.get_attribute("href")
                            if href:
                                steps.append(href)
            except Exception:
                # Step without the expected container: fall back to the
                # first paragraph found directly under the <li>.
                text = _first_paragraph_text(step_li)
                if text:
                    steps.append(text)
        return steps
    except Exception:
        return []


def _count_in_span(label):
    """Return the first integer in the first <span> whose text contains *label*.

    Returns None when that span holds no digits. The Selenium exception is
    propagated when no such span exists.
    """
    elem = driver.find_element(By.XPATH, f"//span[contains(text(), '{label}')]")
    m = re.search(r"(\d+)", elem.text.strip())
    return int(m.group(1)) if m else None


def _meets_popularity_conditions():
    """Return True if the current recipe page reaches one popularity threshold.

    The checks run in order and stop at the first success:
    1. the "ينوون تحضير" counter is >= MIN_INTENT_COUNT;
    2. the "تعليقات" counter is >= MIN_COMMENT_COUNT;
    3. the sum of the reaction counters is >= MIN_REACTION_TOTAL.
    A check whose element is missing is reported and counts as not met.
    """
    # Condition 1: enough people intend to cook the recipe.
    try:
        intent_count = _count_in_span("ينوون تحضير")
        if intent_count is None:
            print("لم يتم العثور على رقم في شرط 'ينوون تحضير'.")
        elif intent_count >= MIN_INTENT_COUNT:
            print("شرط 1 تحقق: يوجد {} كوكباديين ينوون تحضير الوصفة.".format(intent_count))
            return True
    except Exception as e:
        print("شرط 1 غير متوفر:", e)

    # Condition 2: enough comments.
    try:
        comments_count = _count_in_span("تعليقات")
        if comments_count is None:
            print("لم يتم العثور على رقم في شرط 'تعليقات'.")
        elif comments_count >= MIN_COMMENT_COUNT:
            print("شرط 2 تحقق: يوجد {} تعليق.".format(comments_count))
            return True
    except Exception as e:
        print("شرط 2 غير متوفر:", e)

    # Condition 3: enough reactions in total.
    try:
        reactions_ul = driver.find_element(By.CSS_SELECTOR, "ul[data-controller='reactions']")
        total_reactions = 0
        for item in reactions_ul.find_elements(By.CSS_SELECTOR, "li.reaction"):
            try:
                count_elem = item.find_element(By.CSS_SELECTOR, "span[data-reactions-target='count']")
                total_reactions += int(count_elem.text.strip())
            except Exception:
                # A reaction without a numeric counter contributes nothing.
                continue
        if total_reactions >= MIN_REACTION_TOTAL:
            print("شرط 3 تحقق: مجموع التفاعلات هو {}.".format(total_reactions))
            return True
        print("مجموع التفاعلات أقل من المطلوب.")
    except Exception as e:
        print("شرط 3 غير متوفر:", e)

    return False


def _extract_preparation_time():
    """Return the preparation-time text of the current page, or None."""
    try:
        time_elem = driver.find_element(
            By.CSS_SELECTOR,
            "div.max-lg\\:hidden.print\\:block.text-cookpad-gray-600.text-cookpad-14.mb-rg span.mise-icon-text")
        return time_elem.text.strip()
    except Exception:
        return None


def _save_recipe_excel(folder, article_id, recipe_title, img_url,
                       preparation_time, ingredients, steps):
    """Write one recipe to an .xlsx file inside *folder* and return its path.

    The sheet starts with a row holding the title, preparation time and main
    image URL, followed by one row per ingredient/step pair (the shorter list
    is padded with None). The file is named after the cleaned title; the
    recipe id is used when the cleaned title is empty, and appended when a
    file of that name already exists, so that two recipes never overwrite
    each other.
    """
    row_count = max(len(ingredients), len(steps))
    padded_ingredients = list(ingredients) + [None] * (row_count - len(ingredients))
    padded_steps = list(steps) + [None] * (row_count - len(steps))
    df_rows = pd.DataFrame([
        {"titre": None, "ingrédient": ingredient, "étape": step, "img_link": None}
        for ingredient, step in zip(padded_ingredients, padded_steps)
    ])
    top_row = pd.DataFrame([{
        "titre": recipe_title,
        "temps": preparation_time,
        "ingrédient": None,
        "étape": None,
        "img_link": img_url
    }])
    final_df = pd.concat([top_row, df_rows], ignore_index=True)

    base_name = clean_filename(recipe_title) or f"recette_{article_id}"
    excel_path = os.path.join(folder, base_name + ".xlsx")
    if os.path.exists(excel_path):
        # Already-recorded recipes are never processed again, so an existing
        # file belongs to another recipe with the same cleaned title.
        excel_path = os.path.join(folder, f"{base_name}_{article_id}.xlsx")

    # The context manager closes the workbook even if the export fails.
    with pd.ExcelWriter(excel_path, engine='xlsxwriter') as writer:
        final_df.to_excel(writer, index=False, sheet_name='Recette')
        workbook = writer.book
        worksheet = writer.sheets['Recette']
        wrap_format = workbook.add_format({'text_wrap': True})
        # NOTE(review): with the column order titre/temps/ingrédient/étape/
        # img_link, column C is "ingrédient" — confirm whether the wrapped
        # column was meant to be "étape" (D).
        worksheet.set_column('C:C', 40, wrap_format)
    return excel_path


def _scrape_recipe(article_id, article_link, country_folder):
    """Visit one recipe page and return its record for scrape.json.

    The recipe is exported to Excel only when it meets a popularity
    condition; the record's "excel" and "raison" fields reflect the outcome.
    """
    driver.get(article_link)
    time.sleep(REQUEST_DELAY)

    recipe_title = _extract_title()
    print("Titre:", recipe_title)
    img_url = _extract_main_image_url()
    print("Image URL:", img_url)
    ingredients = _extract_ingredients()
    print("Ingrédients:", ingredients)
    steps = _extract_steps()
    print("Étapes (texte et images):", steps)

    record = {
        "id": article_id,
        "titre": recipe_title,
        "lien": article_link,
        "excel": None,
        "raison": ""
    }

    if _meets_popularity_conditions():
        preparation_time = _extract_preparation_time()
        print("Temps de préparation:", preparation_time)
        excel_path = _save_recipe_excel(
            country_folder, article_id, recipe_title, img_url,
            preparation_time, ingredients, steps)
        print("Données sauvegardées dans :", excel_path)
        record["excel"] = excel_path
        record["raison"] = "conditions remplies et données extraites"
    else:
        print("Les conditions ne sont pas remplies, la recette ne sera pas sauvegardée en Excel.")
        record["raison"] = "visité mais conditions non remplies"
    return record


# For each search term: run the search, collect the recipe links, then visit
# every recipe that is not recorded yet. The browser is closed even when an
# unexpected error interrupts the run.
try:
    for country in countries:
        country_folder = clean_filename(country)
        if not country_folder:
            # A term that cleans to an empty name cannot be used as an
            # output folder (os.makedirs("") raises), so it is skipped.
            print(f"\nTerme de recherche ignoré (nom de dossier vide): {country!r}")
            continue

        search_url = f"https://cookpad.com/sa/search/{country}?order=recent"
        print(f"\nRecherche pour {country}: {search_url}")
        driver.get(search_url)
        time.sleep(REQUEST_DELAY)

        # One output folder per search term.
        os.makedirs(country_folder, exist_ok=True)

        articles = _collect_article_links(MAX_ARTICLES_PER_TERM)
        print(f"Nombre d'articles trouvés pour {country}: {len(articles)}")

        for article_id, article_link in articles[:MAX_ARTICLES_PER_TERM]:
            # Skip recipes already recorded in scrape.json.
            if article_id in visited_ids:
                print(f"المقال ذو المعرف {article_id} قد تمت زيارته من قبل. يتم الانتقال للمقال التالي.")
                continue

            print(f"\nTraitement de l'article {article_id} de {country}...")
            record = _scrape_recipe(article_id, article_link, country_folder)

            scraped_articles.append(record)
            visited_ids.add(article_id)

            # Save after every recipe so an interrupted run can resume.
            save_scraped_articles()

            time.sleep(REQUEST_DELAY)
finally:
    driver.quit()
print("Extraction terminée.")


Recherche pour عسل: https://cookpad.com/sa/search/عسل?order=recent
Nombre d'articles chargés: 30
Nombre d'articles chargés: 210
Nombre d'articles chargés: 210
Aucun nouvel article détecté lors du défilement supplémentaire.
Nombre d'articles trouvés pour عسل: 210
المقال ذو المعرف 24657000 قد تمت زيارته من قبل. يتم الانتقال للمقال التالي.
المقال ذو المعرف 24639357 قد تمت زيارته من قبل. يتم الانتقال للمقال التالي.
المقال ذو المعرف 24629144 قد تمت زيارته من قبل. يتم الانتقال للمقال التالي.
المقال ذو المعرف 24629108 قد تمت زيارته من قبل. يتم الانتقال للمقال التالي.
المقال ذو المعرف 24506151 قد تمت زيارته من قبل. يتم الانتقال للمقال التالي.
المقال ذو المعرف 24501665 قد تمت زيارته من قبل. يتم الانتقال للمقال التالي.
المقال ذو المعرف 24447355 قد تمت زيارته من قبل. يتم الانتقال للمقال التالي.
المقال ذو المعرف 24424326 قد تمت زيارته من قبل. يتم الانتقال للمقال التالي.
المقال ذو المعرف 24301196 قد تمت زيارته من قبل. يتم الانتقال للمقال التالي.
المقال ذو المعرف 24239471 قد تمت زيارته من قبل. يتم 


Traitement de l'article 24346726 de وجبة سريعة...
