In [3]:
from pathlib import Path

# Step up to the parent folder (here: NicheFinder)
root = Path.cwd().parent

# Print every entry of that main folder, tagged as a folder or a file
for entry in root.iterdir():
    line = (
        f"📁 Dossier : {entry.name}"
        if entry.is_dir()
        else f"📄 Fichier : {entry.name}"
    )
    print(line)

📁 Dossier : scraper
📁 Dossier : Streamlit
📄 Fichier : .DS_Store
📄 Fichier : uv.lock
📄 Fichier : pyproject.toml
📄 Fichier : amazon_objets_connectes_v38.csv
📁 Dossier : utils
📄 Fichier : README.md
📁 Dossier : .venv
📄 Fichier : df_clean_2.csv
📄 Fichier : .python-version
📁 Dossier : .git
📄 Fichier : main.py
📁 Dossier : cleaner
📁 Dossier : data
📁 Dossier : notebooks


In [7]:
from pathlib import Path

# Project root: the closest ancestor of the working directory that contains
# pyproject.toml. Falls back to the parent of the working directory when no
# such marker is found. This replaces a user-specific absolute path that only
# existed on one machine.
_cwd = Path.cwd()
chemin_principal = next(
    (
        dossier
        for dossier in (_cwd, *_cwd.parents)
        if (dossier / "pyproject.toml").is_file()
    ),
    _cwd.parent,
)

# Content of the bash script that launches the ETL.
# - `set -e` stops the script if activating the environment fails.
# - `cd "$(dirname "$0")"` makes the relative paths below resolve from the
#   script's own folder, regardless of where the script is invoked from.
# - The virtual environment is taken from the project's own `.venv` folder
#   instead of `~/.venv`.
script = """#!/bin/bash
# Script pour lancer l'ETL
set -e

# Se placer dans le dossier du script (racine du projet)
cd "$(dirname "$0")"

# Activation de l'environnement virtuel (à adapter si besoin)
source .venv/bin/activate

# Lancement du script ETL
python3 etl.py
"""

# Full path of the file to create
fichier_script = chemin_principal / "run_etl.sh"

# Write the file with an explicit encoding (the script contains accented
# characters), then mark it executable so `./run_etl.sh` works directly.
fichier_script.write_text(script, encoding="utf-8")
fichier_script.chmod(0o755)

print(f"✅ Script créé ici : {fichier_script}")

✅ Script créé ici : /Users/samiraedoube/Downloads/CV/Data/NicheFinder/NicheFinder/run_etl.sh


In [None]:
# Scraping

from datetime import datetime
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import re

# --- Parsing helpers ---------------------------------------------------------


def _parse_price(text):
    """Convert a price label such as "1 299,00 €" to a float, or None if unparseable."""
    # Keep digits and separators only: this drops the currency sign and any
    # kind of space (regular, no-break, narrow no-break) inside the number.
    cleaned = re.sub(r"[^\d,.]", "", text)
    if "," in cleaned:
        # With a decimal comma present, dots can only be thousands separators.
        cleaned = cleaned.replace(".", "")
    try:
        return float(cleaned.replace(",", "."))
    except ValueError:
        return None


def _parse_rating(text):
    """Return the first "d,d" number found in ``text`` as a float, else None."""
    match = re.search(r"(\d+,\d+)", text)
    return float(match.group(1).replace(",", ".")) if match else None


def _parse_votes(item):
    """Return the review count of a result block as an int, or None."""
    votes_elem = item.find("span", class_="a-size-base s-underline-text")
    if not votes_elem:
        # Fallback selector used when the primary span is absent.
        votes_elem = item.select_one("div.a-row.a-size-small span.a-size-base")
    if not votes_elem:
        return None
    digits = re.sub(r"[^\d]", "", votes_elem.text.strip())
    return int(digits) if digits else None


def _parse_sales(item):
    """Return the number from the "acheté" label of a result block, or None."""
    sales_elem = item.find("span", class_="a-size-base a-color-secondary")
    if not sales_elem or "acheté" not in sales_elem.text:
        return None
    # A single digit is enough to match (e.g. "5+").
    match = re.search(r"\d[\d\s]*", sales_elem.text)
    if not match:
        return None
    # `\s` covers every Unicode space, so no-break separators cannot reach int().
    return int(re.sub(r"\s", "", match.group(0)))


def _parse_item(item, rank):
    """Build the product record (dict) for one search-result block."""
    title_elem = item.h2
    title = title_elem.text.strip() if title_elem else None

    price_elem = item.select_one("span.a-price > span.a-offscreen")
    rating_elem = item.select_one("span.a-icon-alt")

    # `.get` avoids a KeyError when the <img> tag has no `src` attribute.
    image_elem = item.find("img")
    image_url = image_elem.get("src") if image_elem else None

    # Product URL
    url = None
    link_tag = item.select_one("a.a-link-normal.s-link-style.a-text-normal")
    if link_tag and link_tag.get("href"):
        url = "https://www.amazon.fr" + link_tag["href"]

    return {
        "title": title,
        # Brand: first word of the title
        "brand": title.split()[0] if title else None,
        "price": _parse_price(price_elem.text) if price_elem else None,
        "rating": _parse_rating(rating_elem.text) if rating_elem else None,
        "votes": _parse_votes(item),
        "sales_last_month": _parse_sales(item),
        "image_url": image_url,
        "url": url,
        # Prime (boolean): True when the Prime icon is present
        "prime": bool(item.select_one("i.a-icon-prime")),
        "category": "Objets connectés",
        "rank": rank,
        "scraped_at": datetime.now(),
    }


# --- Scraping ----------------------------------------------------------------

# Selenium configuration
options = Options()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

base_url = "https://www.amazon.fr/s?i=electronics&srs=4551203031&rh=n%3A4551203031&s=popularity-rank&fs=true&page={}"

products = []
page = 1
max_pages = 20  # Number of pages to scrape (adjust if needed)

driver = webdriver.Chrome(options=options)
try:
    while page <= max_pages:
        print(f"Scraping page {page}")
        driver.get(base_url.format(page))
        # Fixed pause before reading the page source (presumably to let the
        # results load — TODO confirm whether an explicit wait is preferable).
        time.sleep(3)

        soup = BeautifulSoup(driver.page_source, "html.parser")
        items = soup.find_all("div", {"data-component-type": "s-search-result"})

        # Continue the ranking from the number of products already collected.
        # Deriving the offset from the current page's item count produced
        # colliding or skipped ranks when pages had different sizes.
        for rank, item in enumerate(items, start=len(products) + 1):
            products.append(_parse_item(item, rank))

        page += 1
finally:
    # Always release the browser, even when a page fails mid-run.
    driver.quit()

# Save
dfv38 = pd.DataFrame(products)
dfv38.to_csv("amazon_objets_connectes_v38.csv", index=False)
print("✅ Données sauvegardées : amazon_objets_connectes_v38.csv")

In [None]:
#clean_data

import sys
from pathlib import Path

# Locate the project root so that `utils` can be imported.
try:
    # Running as a script: go up two levels from this file.
    root = Path(__file__).resolve().parent.parent
except NameError:
    # Running in a notebook/REPL, where `__file__` is not defined:
    # use the parent of the working directory instead.
    root = Path.cwd().parent

# Guard against duplicate entries when this cell is executed more than once.
if str(root) not in sys.path:
    sys.path.append(str(root))

from utils.io_utils import get_project_root, make_dir

import pandas as pd
from utils.io_utils import get_project_root, make_dir


def clean_amazon() -> Path:
    """Clean the raw scrape and write the result next to it.

    Reads ``data/raw_data.csv`` under the project root, drops fully duplicated
    rows and rows without a title, converts the ``price``, ``rating`` and
    ``votes`` columns (when present) to numbers with unparseable or missing
    values replaced by 0, then writes ``data/df_clean.csv``.

    Returns:
        Path of the cleaned CSV file.

    Raises:
        FileNotFoundError: if ``data/raw_data.csv`` does not exist.
    """
    data_dir = get_project_root() / "data"
    raw_path, clean_path = data_dir / "raw_data.csv", data_dir / "df_clean.csv"

    # Fail early with an explicit message when the scrape has not been run.
    if not raw_path.exists():
        raise FileNotFoundError(f"{raw_path} introuvable. Lance d’abord le scraping.")

    deduped = pd.read_csv(raw_path).drop_duplicates().dropna(subset=["title"])

    # Quick cleaning: coerce the numeric columns that are actually present.
    numeric_fixes = {
        name: pd.to_numeric(deduped[name], errors="coerce").fillna(0)
        for name in ("price", "rating", "votes")
        if name in deduped.columns
    }
    cleaned = deduped.assign(**numeric_fixes)

    cleaned.to_csv(clean_path, index=False)
    print(f"✅ Données nettoyées : {clean_path}")
    return clean_path