In [29]:
import os, re, time, random, html
import pandas as pd
import requests
from bs4 import BeautifulSoup

# SECURITY NOTE(review): a ScraperAPI key is hard-coded below, so it ends up in
# every copy of this notebook. It is kept only as a fallback so the notebook
# keeps running as before; rotate it and export SCRAPERAPI_KEY in the
# environment instead.
# setdefault (rather than a plain assignment) lets a key that is already set in
# the environment take precedence over the hard-coded fallback.
os.environ.setdefault("SCRAPERAPI_KEY", "d6ffba9297d69750c8e9b7ad61de158b")

SCRAPERAPI_KEY = os.getenv("SCRAPERAPI_KEY")
if not SCRAPERAPI_KEY:
    # Reachable when the environment defines the variable as an empty string.
    raise RuntimeError("SCRAPERAPI_KEY manquante (variable d‚Äôenvironnement).")

# URL that scraperapi_get() sends its GET requests to.
SCRAPER_ENDPOINT = "https://api.scraperapi.com/"

# ==========================
#  FONCTIONS UTILITAIRES
# ==========================

def scraperapi_get(url: str, timeout=60) -> str:
    """Fetch ``url`` through ScraperAPI and return the response body as text.

    Args:
        url: Address of the page to fetch; sent as the ``url`` query parameter.
        timeout: Timeout handed to ``requests.get``.

    Returns:
        The ``text`` attribute of the HTTP response.

    Raises:
        requests.HTTPError: Raised by ``raise_for_status`` on an error status.
    """
    query = dict(
        api_key=SCRAPERAPI_KEY,
        url=url,
        country_code="us",
        device_type="desktop",
    )
    response = requests.get(SCRAPER_ENDPOINT, timeout=timeout, params=query)
    response.raise_for_status()
    return response.text


def clean_text(s: str) -> str:
    """Return ``s`` with HTML entities decoded and whitespace collapsed.

    Any falsy input (``None``, ``""``) yields an empty string. Otherwise
    entities such as ``&amp;`` are unescaped and every run of whitespace is
    replaced by a single space, with no leading or trailing space.
    """
    if s:
        decoded = html.unescape(s)
        words = decoded.split()
        return " ".join(words).strip()
    return ""


def extract_asin_from_url(url: str) -> str:
    """Extract the 10-character ASIN from an Amazon URL.

    The ASIN is first looked up right after a ``/dp/``, ``/gp/offer-listing/``
    or ``/product/`` path segment. If none of these is present, the first
    standalone token of exactly 10 uppercase letters/digits is used.

    Args:
        url: The URL (or any string) to search.

    Returns:
        The ASIN, or ``""`` when nothing matches.
    """
    # The (?![A-Z0-9]) lookahead rejects a longer uppercase/digit run instead
    # of silently returning its first 10 characters as if it were an ASIN.
    m = re.search(
        r"/(?:dp|gp/offer-listing|product)/([A-Z0-9]{10})(?![A-Z0-9])", url
    )
    if m:
        return m.group(1)
    # Fallback: a token of exactly 10 characters, bounded on both sides so that
    # longer identifiers (session or tracking tokens, ...) are never truncated.
    m2 = re.search(r"(?<![A-Z0-9])([A-Z0-9]{10})(?![A-Z0-9])", url)
    return m2.group(1) if m2 else ""


def normalize_to_dp(url: str) -> str:
    """Rewrite an Amazon link to its canonical ``/dp/<ASIN>`` form.

    The input is converted to ``str``, HTML-unescaped and stripped. When an
    ASIN can be extracted from it, ``https://www.amazon.com/dp/<ASIN>`` is
    returned; otherwise the cleaned input is returned unchanged.
    """
    cleaned = html.unescape(str(url)).strip()
    asin = extract_asin_from_url(cleaned)
    return f"https://www.amazon.com/dp/{asin}" if asin else cleaned


def parse_amazon_product(html_text: str) -> dict:
    """Pull the title, a description and the main image URL out of product HTML.

    Args:
        html_text: Raw HTML of a product page.

    Returns:
        A dict with the keys ``title``, ``description`` and ``image_url``.
        Each value is ``""`` when the corresponding element is not found.
    """
    page = BeautifulSoup(html_text, "html.parser")

    # Title: primary selector first, then the alternative heading markup.
    heading = page.select_one("#productTitle") or page.select_one("h1#title span")
    product_title = clean_text(heading.get_text()) if heading else ""

    # Description: prefer the feature bullets, fall back to the description block.
    bullet_texts = []
    for span in page.select("#feature-bullets li span"):
        text = clean_text(span.get_text())
        if text:
            bullet_texts.append(text)

    description_node = page.select_one("#productDescription")
    long_description = clean_text(description_node.get_text()) if description_node else ""

    # At most 12 bullets, or at most 1500 characters of free text, to keep the
    # field from growing into a huge blob.
    product_description = (
        " | ".join(bullet_texts[:12]) if bullet_texts else long_description[:1500]
    )

    # Main image: Open Graph meta tag first, then the landing <img> element.
    og_tag = page.select_one('meta[property="og:image"]')
    image = og_tag["content"].strip() if og_tag and og_tag.get("content") else ""

    if not image:
        hero = page.select_one("img#landingImage")
        if hero:
            image = (hero.get("data-old-hires") or hero.get("src") or "").strip()

    return {
        "title": product_title,
        "description": product_description,
        "image_url": image,
    }


# ==========================
#  FONCTION PRINCIPALE
# ==========================

def enrich_df_with_scraping(df: pd.DataFrame, product_link_col: str = "Product Link") -> pd.DataFrame:
    """Scrape every product link of ``df`` and append the scraped fields.

    Each value of ``product_link_col`` is normalised with ``normalize_to_dp``,
    fetched with ``scraperapi_get`` and parsed with ``parse_amazon_product``.
    A row whose scraping fails is kept, with empty fields and the exception
    text in ``error``.

    Args:
        df: Input DataFrame (typically already cleaned / encoded).
        product_link_col: Name of the column holding the product links.

    Returns:
        A new DataFrame with a fresh 0..n-1 index containing all columns of
        ``df`` followed by five extra columns: ``product_url``, ``title``,
        ``description``, ``image_url`` and ``error``. The extra columns are
        present even when ``df`` has no rows.

    Raises:
        ValueError: If ``product_link_col`` is not a column of ``df``.
    """
    if product_link_col not in df.columns:
        raise ValueError(f"Colonne '{product_link_col}' introuvable dans le DataFrame.")

    scraped_columns = ["product_url", "title", "description", "image_url", "error"]
    links = df[product_link_col].astype(str).tolist()
    last_index = len(links) - 1
    results = []

    for i, raw_link in enumerate(links):
        url = normalize_to_dp(raw_link)

        try:
            html_page = scraperapi_get(url)
            parsed = parse_amazon_product(html_page)
        except Exception as e:
            # Deliberate best-effort boundary: one failing product must not
            # abort the whole run, so the failure is recorded on its row.
            print(f"[ERR] row {i+1}: {e}")
            results.append({
                "product_url": url,
                "title": "",
                "description": "",
                "image_url": "",
                "error": str(e),
            })
        else:
            # Kept outside the try so that exactly one result is appended per
            # input row, which the positional concat below relies on.
            results.append({
                "product_url": url,
                "title": parsed["title"],
                "description": parsed["description"],
                "image_url": parsed["image_url"],
                "error": "",   # no error
            })
            print(f"[OK] {i+1}/{len(links)} -> {parsed['title'][:70]}")

        # Short random pause between requests to avoid hammering the API;
        # nothing follows the last request, so no pause is needed there.
        if i < last_index:
            time.sleep(random.uniform(1.2, 2.5))

    # Explicit columns keep the output schema stable for an empty input.
    scraped_df = pd.DataFrame(results, columns=scraped_columns)

    # Align both frames on a fresh positional index before joining them.
    df = df.reset_index(drop=True)
    scraped_df = scraped_df.reset_index(drop=True)

    # Horizontal join: all original columns followed by the scraped ones.
    final_df = pd.concat([df, scraped_df], axis=1)

    return final_df


# ==========================
#  EXEMPLE D'UTILISATION
# ==========================

if __name__ == "__main__":
    # Example run starting from the raw CSV.
    # If a cleaned DataFrame is already in memory in the notebook, skip this
    # part and call enrich_df_with_scraping(df, ...) directly.

    # 1) Load the original CSV.
    df_original = pd.read_csv("Amazon_Best_Seller_2021_June 2.csv")

    # This is where the cleaning + one-hot encoding would go.
    # df_clean = ...  # all your transformations
    df_clean = df_original.copy()  # <- replace with YOUR already-cleaned df

    # 2) Enrich with the scraped data (one request per row of df_clean).
    df_enriched = enrich_df_with_scraping(df_clean, product_link_col="Product Link")

    # 3) Save everything into a single final file.
    df_enriched.to_csv("Amazon_Best_Seller_2021_June_2_FULL_enriched.csv",
                       index=False, encoding="utf-8")

    print("‚úÖ Fichier final cr√©√© : Amazon_Best_Seller_2021_June_2_FULL_enriched.csv")


[OK] 1/707 -> Fire TV Stick 4K streaming device with Alexa Voice Remote (includes TV
[OK] 2/707 -> Echo Dot (3rd Gen, 2018 release) - Smart speaker with Alexa - Charcoal
[OK] 3/707 -> Echo Dot (4th Gen, 2020 release) | Smart speaker with Alexa | Charcoal
[OK] 4/707 -> Roku Express | HD Roku Streaming Device with Simple Remote (no TV cont
[OK] 5/707 -> Echo Dot (4th Gen) | Charcoal with Sengled Bluetooth Color bulb | Alex
[OK] 6/707 -> Fire HD 8 tablet, 8" HD display, 32 GB, (2020 release), designed for p
[OK] 7/707 -> Blink Mini - Compact indoor plug-in smart security camera, 1080p HD vi
[OK] 8/707 -> Amazon Fire HD 10 tablet, 10.1", 1080p Full HD, 32 GB, (2021 release),
[OK] 9/707 -> Roku Streaming Stick+ | HD/4K/HDR Streaming Device with Long-range Wir
[OK] 10/707 -> Amazon Smart Plug | Works with Alexa | Simple setup, endless possibili
[OK] 11/707 -> Fire HD 8 Kids tablet, 8" HD display, ages 3-7, 32 GB, includes a 1-ye
[OK] 12/707 -> Blink Outdoor (3rd Gen) - wireless, weather-resi

In [32]:
import pandas as pd

# Load the enriched file written by the scraping step.
df = pd.read_csv("Amazon_Best_Seller_2021_June_2_FULL_enriched.csv")

print("Avant nettoyage :", df.shape)

# 1) Remove the product_url and error columns; errors="ignore" makes this a
#    no-op for any of them that is already absent.
df = df.drop(columns=["product_url", "error"], errors="ignore")

# 2) Keep only the rows whose title is neither NaN nor blank after stripping.
df = df[df["title"].notna() & (df["title"].str.strip() != "")]

print("Apr√®s nettoyage :", df.shape)

# Save the cleaned version.
df.to_csv("Amazon_Best_Seller_FINAL.csv", encoding="utf-8", index=False)

print("‚úÖ Fichier cr√©√© : Amazon_Best_Seller_FINAL.csv")
# ==========================

Avant nettoyage : (707, 13)
Apr√®s nettoyage : (589, 11)
‚úÖ Fichier cr√©√© : Amazon_Best_Seller_FINAL.csv
