In [23]:
import requests
from bs4 import BeautifulSoup

def get_amazon_bestsellers(category_url, n=10):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36'
    }
    response = requests.get(category_url, headers=headers)
    if response.status_code != 200:
        print("Erreur lors de la récupération de la page")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    # Trouver tous les liens produits qui ont cette classe spécifique
    product_links = soup.select('a.a-link-normal.aok-block')[:n]

    bestsellers = []
    for a in product_links:
        href = a.get('href')
        full_link = 'https://www.amazon.fr' + href if href else None

        img = a.find('img')
        title = img['alt'] if img and 'alt' in img.attrs else "Titre inconnu"

        bestsellers.append({'title': title, 'link': full_link})

    return bestsellers

if __name__ == "__main__":
    url = "https://www.amazon.fr/gp/bestsellers/electronics/"
    bestsellers = get_amazon_bestsellers(url, n=20)

    for i, product in enumerate(bestsellers, 1):
        print(f"{i}. {product['title']}")
        print(f"   Lien: {product['link']}\n")

1. Apple AirTag
   Lien: https://www.amazon.fr/Apple-MX532ZMA-Nouveau-AirTag/dp/B0935DN1BN/ref=zg_bs_g_electronics_d_sccl_1/260-0894714-1452108?psc=1

2. Titre inconnu
   Lien: https://www.amazon.fr/Apple-MX532ZMA-Nouveau-AirTag/dp/B0935DN1BN/ref=zg_bs_g_electronics_d_sccl_1/260-0894714-1452108?psc=1

3. Titre inconnu
   Lien: https://www.amazon.fr/Apple-MX532ZMA-Nouveau-AirTag/dp/B0935DN1BN/ref=zg_bs_g_electronics_d_sccl_1/260-0894714-1452108?psc=1

4. Amazon Fire TV Stick HD (Nouvelle génération) | TV gratuite et en direct, télécommande vocale Alexa, contrôle de la maison co
   Lien: https://www.amazon.fr/fire-tv-stick-hd/dp/B0CQMWQDH4/ref=zg_bs_g_electronics_d_sccl_2/260-0894714-1452108?psc=1

5. Titre inconnu
   Lien: https://www.amazon.fr/fire-tv-stick-hd/dp/B0CQMWQDH4/ref=zg_bs_g_electronics_d_sccl_2/260-0894714-1452108?psc=1

6. Titre inconnu
   Lien: https://www.amazon.fr/fire-tv-stick-hd/dp/B0CQMWQDH4/ref=zg_bs_g_electronics_d_sccl_2/260-0894714-1452108?psc=1

7. Imou 2K(3MP) 

In [33]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import pandas as pd
import time

def scrape_aliexpress_selenium(url):
    options = Options()
    options.add_argument('--headless')  # exécution sans ouvrir la fenêtre du navigateur
    options.add_argument('--disable-gpu')
    driver = webdriver.Chrome(options=options)

    driver.get(url)
    time.sleep(5)  # attendre que la page charge JS (tu peux augmenter si nécessaire)

    products = []

    product_cards = driver.find_elements(By.CSS_SELECTOR, 'div.manhattan--content--1lP57Ag')
    if not product_cards:
        print("Aucun produit trouvé sur la page.")
        driver.quit()
        return pd.DataFrame()

    for card in product_cards:
        try:
            title = card.find_element(By.CSS_SELECTOR, 'a.manhattan--titleText--WccSjUS').text
        except:
            title = ''
        try:
            price = card.find_element(By.CSS_SELECTOR, 'div.manhattan--price-sale--1CCSZfK').text
        except:
            price = ''
        try:
            orders = card.find_element(By.CSS_SELECTOR, 'div.manhattan--order--1lP57Ag').text
        except:
            orders = ''

        products.append({'title': title, 'price': price, 'orders': orders})

    driver.quit()
    return pd.DataFrame(products)

url = "https://fr.aliexpress.com/w/wholesale-best-seller.html?page=2&g=y&SearchText=best+seller"
df = scrape_aliexpress_selenium(url)
print(df)

Aucun produit trouvé sur la page.
Empty DataFrame
Columns: []
Index: []


In [43]:
import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0'}
url = 'https://www.amazon.fr/gp/bestsellers/?ref_=nav_cs_bestsellers'

response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')

# On sélectionne tous les liens produits dans la grille des best sellers
product_links = soup.select('a.a-link-normal.aok-block')

# Affiche les 5 premiers produits
for link in product_links[:5]:
    title = link.get('title') or link.text.strip()  # certains liens ont le titre comme attribut
    href = link.get('href')
    full_link = f"https://www.amazon.fr{href}" if href else "Lien manquant"

    print(f"Produit : {title}")
    print(f"Lien : {full_link}\n")
print("Statut de la requête :", response.status_code)
print("Contenu partiel de la page :")
print(soup.prettify()[:2000]) 

Statut de la requête : 200
Contenu partiel de la page :
<!DOCTYPE html>
<!--[if lt IE 7]> <html lang="fr" class="a-no-js a-lt-ie9 a-lt-ie8 a-lt-ie7"> <![endif]-->
<!--[if IE 7]>    <html lang="fr" class="a-no-js a-lt-ie9 a-lt-ie8"> <![endif]-->
<!--[if IE 8]>    <html lang="fr" class="a-no-js a-lt-ie9"> <![endif]-->
<!--[if gt IE 8]><!-->
<html class="a-no-js" lang="fr">
 <!--<![endif]-->
 <head>
  <meta content="text/html; charset=utf-8" http-equiv="content-type"/>
  <meta charset="utf-8"/>
  <meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
  <title dir="ltr">
   Amazon.fr
  </title>
  <meta content="width=device-width" name="viewport"/>
  <link href="https://images-na.ssl-images-amazon.com/images/G/01/AUIClients/AmazonUI-3c913031596ca78a3768f4e934b1cc02ce238101.secure.min._V1_.css" rel="stylesheet"/>
  <script>
   if (true === true) {
    var ue_t0 = (+ new Date()),
        ue_csm = window,
        ue = { t0: ue_t0, d: function() { return (+new Date() - ue_t0); } },
  

In [45]:
import requests
from bs4 import BeautifulSoup

# En-têtes pour passer les protections de base
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
    'Accept-Language': 'fr-FR,fr;q=0.9'
}

url = 'https://www.amazon.fr/gp/bestsellers/luminaires-eclairage'  # Catégorie spécifique si possible

response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')
with open("amazon_test.html", "w", encoding="utf-8") as f:
    f.write(soup.prettify())
# Sélectionne tous les blocs produit
product_blocks = soup.select('div.zg-grid-general-faceout')

for product in product_blocks[:5]:  # Limité à 5 produits
    # Titre
    title_tag = product.select_one('.p13n-sc-truncate-desktop-type2')
    title = title_tag.get('title') if title_tag else "Titre non trouvé"

    # Lien
    link_tag = product.find('a', class_='a-link-normal aok-block')
    link = f"https://www.amazon.fr{link_tag['href']}" if link_tag and link_tag.get('href') else "Lien non trouvé"

    # Prix
    price_tag = product.select_one('.p13n-sc-price')
    price = price_tag.text.strip() if price_tag else "Prix non trouvé"

    print(f"Produit : {title}")
    print(f"Lien : {link}")
    print(f"Prix : {price}")
    print("-" * 60)

In [46]:
import requests
from bs4 import BeautifulSoup

# Fonction pour scraper une page
def scraper_page(url):
    response = requests.get(url)

# On vérifie que le site réponde avant d'aller plus loin
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        titles = soup.select('div.p13n-sc-truncate-desktop-type2.p13n-sc-truncated')

# On crée une liste vide pour stocker     
        data = []

# On récupére le texte
        for title in titles:
            print(title.text)
            data.append(title.text)
       
# On programme la sortie de la fonction avec le return : notre texte.
        return data # Sous un format liste
# Autrement, on affiche le message d'erreur.
    else:
        print(f"Erreur lors de la récupération de la page {url}.")
# On peut aussi traiter le message d'erreur avec un "try" "except" si on connais le message d'erreur. 

In [47]:
scraper_page('https://www.amazon.fr/gp/bestsellers/?ref_=nav_cs_bestsellers')

Erreur lors de la récupération de la page https://www.amazon.fr/gp/bestsellers/?ref_=nav_cs_bestsellers.


In [48]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import pandas as pd
import time

# Configuration Chrome
options = Options()
options.add_argument('--headless')  # pour ne pas ouvrir la fenêtre
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome(options=options)

base_url = "https://www.amazon.fr/gp/bestsellers/electronics/ref=zg_bs_pg_{}?_encoding=UTF8&pg={}"

all_data = []

# Scrape les 5 premières pages (change range si besoin)
for page in range(1, 6):
    print(f"Scraping page {page}...")
    url = base_url.format(page, page)
    driver.get(url)
    time.sleep(3)

    products = driver.find_elements(By.CSS_SELECTOR, "div.zg-grid-general-faceout")

    for product in products:
        try:
            title = product.find_element(By.CSS_SELECTOR, "._cDEzb_p13n-sc-css-line-clamp-3_g3dy1").text
        except:
            title = None
        try:
            price = product.find_element(By.CSS_SELECTOR, ".a-price > .a-offscreen").text.replace("€", "").replace(",", ".")
        except:
            price = None
        try:
            rating = product.find_element(By.CSS_SELECTOR, ".a-icon-alt").get_attribute("innerHTML").split(" ")[0]
        except:
            rating = None
        try:
            votes = product.find_element(By.CSS_SELECTOR, ".a-size-small").text.replace(" ", "").replace(",", "")
        except:
            votes = None
        try:
            link = product.find_element(By.TAG_NAME, "a").get_attribute("href")
        except:
            link = None

        all_data.append({
            "Titre": title,
            "Prix (€)": price,
            "Note": rating,
            "Nombre de votes": votes,
            "Lien": link
        })

driver.quit()

df = pd.DataFrame(all_data)
print(df.head())

# Facultatif : enregistrer dans un CSV
df.to_csv("amazon_bestsellers_hightech.csv", index=False)

Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
                                               Titre Prix (€) Note  \
0                                       Apple AirTag     None  4,6   
1  Imou 2K(3MP) Caméra Surveillance WiFi Intérieu...     None  4,4   
2  Tapo 2K(3MP) Caméra Surveillance WiFi intérieu...     None  4,6   
3  DURACELL CR2032 Piles Boutons au lithium 3V (l...     None  4,7   
4  Amazon Fire TV Stick HD (Nouvelle génération) ...     None  4,5   

  Nombre de votes                                               Lien  
0         127 835  https://www.amazon.fr/Apple-MX532ZMA-Nouveau-A...  
1          42 181  https://www.amazon.fr/Imou-Surveillance-Int%C3...  
2          34 195  https://www.amazon.fr/Tapo-Surveillance-int%C3...  
3         124 763  https://www.amazon.fr/DURACELL-CR2032-Piles-Bo...  
4          21 611  https://www.amazon.fr/fire-tv-stick-hd/dp/B0CQ...  


In [49]:
df

Unnamed: 0,Titre,Prix (€),Note,Nombre de votes,Lien
0,Apple AirTag,,46,127 835,https://www.amazon.fr/Apple-MX532ZMA-Nouveau-A...
1,Imou 2K(3MP) Caméra Surveillance WiFi Intérieu...,,44,42 181,https://www.amazon.fr/Imou-Surveillance-Int%C3...
2,Tapo 2K(3MP) Caméra Surveillance WiFi intérieu...,,46,34 195,https://www.amazon.fr/Tapo-Surveillance-int%C3...
3,DURACELL CR2032 Piles Boutons au lithium 3V (l...,,47,124 763,https://www.amazon.fr/DURACELL-CR2032-Piles-Bo...
4,Amazon Fire TV Stick HD (Nouvelle génération) ...,,45,21 611,https://www.amazon.fr/fire-tv-stick-hd/dp/B0CQ...
5,"Aioneus Chargeur USB C, 40W 4 Port Prise USBC ...",,45,12 323,https://www.amazon.fr/Aioneus-Chargeur-Secteur...
6,"INIU Batterie Externe, 22.5W 10000mAh Power Ba...",,46,25 917,https://www.amazon.fr/INIU-Batterie-10500mAh-C...
7,Amazon Basics Lot de 6 piles bouton CR2032 au ...,,47,198 995,https://www.amazon.fr/Amazon-Basics-bouton-lit...
8,Duracell Plus Piles AA (lot de 24) - Alcalines...,,47,40 482,https://www.amazon.fr/Piles-AA-Duracell-Plus-l...
9,"JBL GO 4, Enceinte Bluetooth ultra-portable, s...",,47,8 320,https://www.amazon.fr/JBL-Ultra-Portable-percu...


In [50]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
import pandas as pd
import time
import random

# Config Selenium
options = Options()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
driver = webdriver.Chrome(options=options)

all_data = []

# Amazon a généralement 50 pages max pour les best sellers
max_pages = 50

for page in range(1, max_pages + 1):
    print(f"Scraping page {page}...")
    url = f"https://www.amazon.fr/gp/bestsellers/electronics/ref=zg_bs_pg_{page}?_encoding=UTF8&pg={page}"
    driver.get(url)
    time.sleep(random.uniform(2.5, 4))  # délai anti-bot

    products = driver.find_elements(By.CSS_SELECTOR, "div.zg-grid-general-faceout")

    if not products:
        print("Fin détectée ou page vide.")
        break

    for product in products:
        try:
            title = product.find_element(By.CSS_SELECTOR, "._cDEzb_p13n-sc-css-line-clamp-3_g3dy1").text
        except NoSuchElementException:
            title = None

        try:
            price = product.find_element(By.CSS_SELECTOR, ".a-price .a-offscreen").text
            price = price.replace("€", "").replace(",", ".").strip()
        except NoSuchElementException:
            price = None

        try:
            rating = product.find_element(By.CSS_SELECTOR, ".a-icon-alt").get_attribute("innerHTML").split(" ")[0]
        except NoSuchElementException:
            rating = None

        try:
            votes = product.find_element(By.CSS_SELECTOR, ".a-size-small").text
            votes = votes.replace(" ", "").replace(",", "").replace(".", "").strip()
        except NoSuchElementException:
            votes = None

        try:
            link = product.find_element(By.TAG_NAME, "a").get_attribute("href")
        except NoSuchElementException:
            link = None

        all_data.append({
            "Titre": title,
            "Prix (€)": price,
            "Note": rating,
            "Nombre de votes": votes,
            "Lien": link
        })

    # Petite pause entre les pages
    time.sleep(random.uniform(1.5, 3))

driver.quit()

# Création DataFrame
df = pd.DataFrame(all_data)

# Nettoyage des doublons éventuels
df.drop_duplicates(subset=["Titre", "Lien"], inplace=True)

# Affichage
print(df.head())
print(f"Total produits scrappés : {len(df)}")

# Enregistrement facultatif
df.to_csv("amazon_bestsellers_hightech_full.csv", index=False)

Scraping page 1...
Scraping page 2...
Scraping page 3...
Fin détectée ou page vide.
                                               Titre Prix (€) Note  \
0                                       Apple AirTag     None  4,6   
1  Imou 2K(3MP) Caméra Surveillance WiFi Intérieu...     None  4,4   
2  Tapo 2K(3MP) Caméra Surveillance WiFi intérieu...     None  4,6   
3  DURACELL CR2032 Piles Boutons au lithium 3V (l...     None  4,7   
4  Amazon Fire TV Stick HD (Nouvelle génération) ...     None  4,5   

  Nombre de votes                                               Lien  
0         127 835  https://www.amazon.fr/Apple-MX532ZMA-Nouveau-A...  
1          42 181  https://www.amazon.fr/Imou-Surveillance-Int%C3...  
2          34 195  https://www.amazon.fr/Tapo-Surveillance-int%C3...  
3         124 763  https://www.amazon.fr/DURACELL-CR2032-Piles-Bo...  
4          21 611  https://www.amazon.fr/fire-tv-stick-hd/dp/B0CQ...  
Total produits scrappés : 60


In [54]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import time

headers = {'User-Agent': 'Mozilla/5.0'}

def scrape_category(url, category_name):
    products = []

    for page in range(1, 3):  # 2 pages = environ 50 produits
        paginated_url = f"{url}?pg={page}"
        response = requests.get(paginated_url, headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')

        items = soup.select('div.zg-grid-general-faceout')

        for rank, item in enumerate(items, start=(page - 1) * 25 + 1):
            title_tag = item.select_one('.p13n-sc-truncate-desktop-type2') or item.select_one('._cDEzb_p13n-sc-css-line-clamp-3_g3dy1')
            title = title_tag.get_text(strip=True) if title_tag else None

            price_tag = item.select_one('.a-price span.a-offscreen')
            price = float(price_tag.text.replace('€', '').replace(',', '.')) if price_tag else None

            rating_tag = item.select_one('.a-icon-alt')
            rating = float(rating_tag.text.split()[0].replace(',', '.')) if rating_tag else None

            reviews_tag = item.select_one('.a-size-small')
            num_reviews = int(reviews_tag.text.replace("\xa0", "").replace(" ", "").replace(",", "")) if reviews_tag else None


            link_tag = item.select_one('a.a-link-normal')
            url_product = "https://www.amazon.fr" + link_tag['href'] if link_tag else None

            prime_icon = item.select_one('.a-icon-prime')
            is_prime = True if prime_icon else False

            product = {
                "title": title,
                "price": price,
                "rating": rating,
                "num_reviews": num_reviews,
                "category": category_name,
                "url": url_product,
                "brand": None,  # À ajouter plus tard si possible
                "is_prime": is_prime,
                "bestseller_rank": rank,
                "scraped_at": datetime.now()
            }

            products.append(product)

        time.sleep(1)  # pour éviter les blocages Amazon

    return pd.DataFrame(products)

In [55]:
category_urls = {
    "High-tech": "https://www.amazon.fr/gp/bestsellers/electronics",
    "Beauté et Parfum": "https://www.amazon.fr/gp/bestsellers/beauty",
    "Cuisine et Maison": "https://www.amazon.fr/gp/bestsellers/kitchen",
    "Mode": "https://www.amazon.fr/gp/bestsellers/fashion",
    "Animalerie": "https://www.amazon.fr/gp/bestsellers/pet-supplies",
    "Jeux et Jouets": "https://www.amazon.fr/gp/bestsellers/toys",
    "Jeux vidéo": "https://www.amazon.fr/gp/bestsellers/videogames",
}

df_list = []

for cat_name, url in category_urls.items():
    print(f"Scraping {cat_name}...")
    df_cat = scrape_category(url, cat_name)
    df_list.append(df_cat)

# Fusionner tous les DataFrames
df_all = pd.concat(df_list, ignore_index=True)

# Sauvegarder en CSV
df_all.to_csv("amazon_bestsellers.csv", index=False)
print("Scraping terminé, données enregistrées.")

Scraping High-tech...
Scraping Beauté et Parfum...
Scraping Cuisine et Maison...
Scraping Mode...
Scraping Animalerie...
Scraping Jeux et Jouets...
Scraping Jeux vidéo...
Scraping terminé, données enregistrées.


In [57]:
import pandas as pd

df = pd.read_csv("amazon_bestsellers.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   title            59 non-null     object 
 1   price            0 non-null      float64
 2   rating           59 non-null     float64
 3   num_reviews      59 non-null     float64
 4   category         60 non-null     object 
 5   url              60 non-null     object 
 6   brand            0 non-null      float64
 7   is_prime         60 non-null     bool   
 8   bestseller_rank  60 non-null     int64  
 9   scraped_at       60 non-null     object 
dtypes: bool(1), float64(4), int64(1), object(4)
memory usage: 4.4+ KB


In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import time
import random

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
}

category_urls = {
    "High-tech": "https://www.idealo.fr/cat/3605/informatique.html",
    "Cuisine": "https://www.idealo.fr/cat/13489/cuisine.html",
    "Jeux video": "https://www.idealo.fr/cat/10678/jeux-video.html",
    "Beauté": "https://www.idealo.fr/cat/13492/beaute-sante.html"
}

def scrape_idealo_category(url, category_name):
    products = []

    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        print(f"Erreur lors de la requête pour {category_name}")
        return pd.DataFrame()

    soup = BeautifulSoup(response.content, "html.parser")
    items = soup.select("div.offerList-item")

    for item in items:
        title_tag = item.select_one(".offerList-title")
        title = title_tag.text.strip() if title_tag else None

        price_tag = item.select_one(".offerList-price")
        try:
            price = float(price_tag.text.replace("\xa0", "").replace("\u20ac", "").replace(",", ".")) if price_tag else None
        except:
            price = None

        product = {
            "title": title,
            "price": price,
            "category": category_name,
            "scraped_at": datetime.now()
        }
        products.append(product)

    return pd.DataFrame(products)

# Scraping principal
df_list = []
for cat, url in category_urls.items():
    print(f"Scraping {cat}...")
    df = scrape_idealo_category(url, cat)
    df_list.append(df)
    time.sleep(random.uniform(2, 5))  # Pause humaine entre les requêtes

# Fusion finale
df_all = pd.concat(df_list, ignore_index=True)
df_all.to_csv("idealo_products.csv", index=False)
print("Scraping terminé. Fichier enregistré : idealo_products.csv")

Scraping High-tech...


In [122]:
import pandas as pd

dfv2 = pd.read_csv("amazon_best_v2.csv")
dfv2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 218 entries, 0 to 217
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   title            193 non-null    object 
 1   price            0 non-null      float64
 2   rating           214 non-null    float64
 3   num_reviews      197 non-null    float64
 4   category         218 non-null    object 
 5   url              218 non-null    object 
 6   brand            0 non-null      float64
 7   is_prime         218 non-null    bool   
 8   bestseller_rank  218 non-null    int64  
 9   scraped_at       218 non-null    object 
dtypes: bool(1), float64(4), int64(1), object(4)
memory usage: 15.7+ KB


In [7]:
import requests
import pandas as pd
from datetime import datetime

# Remplacez par votre propre clé API SerpAPI
SERPAPI_KEY = "ed285c21a9caeffec3e5e686e5df49e86215b847ff360f9e253d8930cc3f96e6"  # <-- Remplace par ta vraie clé

# Amazon France Best Seller Category IDs valides
departments = {
    "High-tech": "13921051",
    "Cuisine & Maison": "57004031",
    "Beauté": "197858031",
    "Mode": "2454132031",
    "Jeux vidéo": "530490"
}

def scrape_amazon_bestsellers(api_key, category_id, category_name):
    url = "https://serpapi.com/search.json"
    params = {
        "engine": "amazon_bestsellers",
        "amazon_domain": "amazon.fr",
        "category_id": category_id,
        "api_key": api_key
    }

    response = requests.get(url, params=params)
    data = response.json()

    # Afficher les erreurs détaillées
    if "error" in data:
        print(f"❌ Erreur pour {category_name}: {data['error']}")
        return pd.DataFrame()

    products = []
    for item in data.get("bestsellers", []):
        products.append({
            "title": item.get("title"),
            "price": item.get("price"),
            "rating": item.get("rating"),
            "reviews": item.get("ratings_total"),
            "category": category_name,
            "url": item.get("link"),
            "scraped_at": datetime.now()
        })

    return pd.DataFrame(products)

# Lancer le scraping
all_data = []
for name, cat_id in departments.items():
    print(f"🔍 Scraping {name}...")
    df = scrape_amazon_bestsellers(SERPAPI_KEY, cat_id, name)
    print(f"✅ {name} : {len(df)} produits extraits\n")
    all_data.append(df)

# Fusionner et exporter
df_final = pd.concat(all_data, ignore_index=True)
df_final.to_csv("amazon_best_serpapi.csv", index=False)
print("🎉 Scraping Amazon terminé avec SerpAPI. Fichier : amazon_best_serpapi.csv")


🔍 Scraping High-tech...
❌ Erreur pour High-tech: Unsupported `amazon_bestsellers` search engine.
✅ High-tech : 0 produits extraits

🔍 Scraping Cuisine & Maison...
❌ Erreur pour Cuisine & Maison: Unsupported `amazon_bestsellers` search engine.
✅ Cuisine & Maison : 0 produits extraits

🔍 Scraping Beauté...
❌ Erreur pour Beauté: Unsupported `amazon_bestsellers` search engine.
✅ Beauté : 0 produits extraits

🔍 Scraping Mode...
❌ Erreur pour Mode: Unsupported `amazon_bestsellers` search engine.
✅ Mode : 0 produits extraits

🔍 Scraping Jeux vidéo...
❌ Erreur pour Jeux vidéo: Unsupported `amazon_bestsellers` search engine.
✅ Jeux vidéo : 0 produits extraits

🎉 Scraping Amazon terminé avec SerpAPI. Fichier : amazon_best_serpapi.csv


In [5]:
dffff = pd.read_csv("amazon_best_serpapi.csv")
dffff.info()

EmptyDataError: No columns to parse from file

In [16]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time
from datetime import datetime

# Configuration de Selenium (Chrome en headless)
options = Options()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Liste des URLs des best sellers Amazon France
category_urls = {
    "High-tech": "https://www.amazon.fr/gp/bestsellers/electronics",
    "Beauté": "https://www.amazon.fr/gp/bestsellers/beauty",
    "Cuisine": "https://www.amazon.fr/gp/bestsellers/kitchen",
    "Mode": "https://www.amazon.fr/gp/bestsellers/fashion",
    "Animalerie": "https://www.amazon.fr/gp/bestsellers/pet-supplies",
    "Jeux et Jouets": "https://www.amazon.fr/gp/bestsellers/toys",
    "Jeux vidéo": "https://www.amazon.fr/gp/bestsellers/videogames"
}

def scrape_category(url, category):
    driver.get(url)
    time.sleep(3)  # Laisse le temps à la page de charger

    items = driver.find_elements(By.CSS_SELECTOR, "div.zg-grid-general-faceout")
    products = []

    for rank, item in enumerate(items, start=1):
        try:
            title = item.find_element(By.CSS_SELECTOR, "._cDEzb_p13n-sc-css-line-clamp-3_g3dy1").text
        except:
            title = None

        try:
            price_text = item.find_element(By.CSS_SELECTOR, "span._cDEzb_p13n-sc-price_3mJ9Z").text
            price = float(price_text.replace("\u202f", "").replace("\xa0", "").replace("€", "").replace(",", ".").strip())
        except:
            price = None

        try:
            rating_text = item.find_element(By.CSS_SELECTOR, ".a-icon-alt").get_attribute("innerText")
            rating = float(rating_text.split()[0].replace(",", "."))
        except:
            rating = None

        try:
            reviews_tag = item.find_element(By.CSS_SELECTOR, "span.a-size-base").text
            reviews = int(reviews_tag.replace("\u202f", "").replace(",", "").strip())
        except:
            reviews = None

        try:
            url_product = item.find_element(By.CSS_SELECTOR, "a.a-link-normal").get_attribute("href")
        except:
            url_product = None

        products.append({
            "title": title,
            "price": price,
            "rating": rating,
            "reviews": reviews,
            "url": url_product,
            "category": category,
            "rank": rank,
            "scraped_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        })

    return pd.DataFrame(products)

# Scraping de toutes les catégories
df_list = []
for name, link in category_urls.items():
    print(f"Scraping {name}...")
    try:
        df = scrape_category(link, name)
        print(f"{name}: {len(df)} produits trouvés")
        df_list.append(df)
    except Exception as e:
        print(f"Erreur pour {name} : {e}")

# Fusion et export final
df_final = pd.concat(df_list, ignore_index=True)
df_final.to_csv("amazon_best_v3.csv", index=False)
driver.quit()
print("✅ Fichier amazon_best_v3.csv généré avec succès.")

Scraping High-tech...
High-tech: 30 produits trouvés
Scraping Beauté...
Beauté: 30 produits trouvés
Scraping Cuisine...
Cuisine: 30 produits trouvés
Scraping Mode...
Mode: 30 produits trouvés
Scraping Animalerie...
Animalerie: 30 produits trouvés
Scraping Jeux et Jouets...
Jeux et Jouets: 30 produits trouvés
Scraping Jeux vidéo...
Jeux vidéo: 30 produits trouvés
✅ Fichier amazon_best_v3.csv généré avec succès.


In [17]:
df_v3 = pd.read_csv('amazon_best_v3.csv')
df_v3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210 entries, 0 to 209
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   title       186 non-null    object 
 1   price       200 non-null    float64
 2   rating      206 non-null    float64
 3   reviews     0 non-null      float64
 4   url         210 non-null    object 
 5   category    210 non-null    object 
 6   rank        210 non-null    int64  
 7   scraped_at  210 non-null    object 
dtypes: float64(3), int64(1), object(4)
memory usage: 13.3+ KB


In [18]:
df_v3.head()

Unnamed: 0,title,price,rating,reviews,url,category,rank,scraped_at
0,Apple AirTag,29.99,4.6,,https://www.amazon.fr/Apple-MX532ZMA-Nouveau-A...,High-tech,1,2025-06-20 14:25:34
1,Imou 2K(3MP) Caméra Surveillance WiFi Intérieu...,18.99,4.4,,https://www.amazon.fr/Imou-Surveillance-Int%C3...,High-tech,2,2025-06-20 14:25:34
2,Tapo 2K(3MP) Caméra Surveillance WiFi intérieu...,18.99,4.6,,https://www.amazon.fr/Tapo-Surveillance-int%C3...,High-tech,3,2025-06-20 14:25:34
3,Amazon Fire TV Stick HD (Nouvelle génération) ...,44.99,4.5,,https://www.amazon.fr/fire-tv-stick-hd/dp/B0CQ...,High-tech,4,2025-06-20 14:25:34
4,DURACELL CR2032 Piles Boutons au lithium 3V (l...,4.75,4.7,,https://www.amazon.fr/DURACELL-CR2032-Piles-Bo...,High-tech,5,2025-06-20 14:25:34


In [19]:
df['reviews'].notnull().sum()


np.int64(0)

In [20]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time
from datetime import datetime

# Configuration de Selenium (Chrome en headless)
options = Options()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Liste des URLs des best sellers Amazon France (page 1 uniquement)
category_urls = {
    "High-tech": "https://www.amazon.fr/gp/bestsellers/electronics",
    "Beauté": "https://www.amazon.fr/gp/bestsellers/beauty",
    "Cuisine": "https://www.amazon.fr/gp/bestsellers/kitchen",
    "Mode": "https://www.amazon.fr/gp/bestsellers/fashion",
    "Animalerie": "https://www.amazon.fr/gp/bestsellers/pet-supplies",
    "Jeux et Jouets": "https://www.amazon.fr/gp/bestsellers/toys",
    "Jeux vidéo": "https://www.amazon.fr/gp/bestsellers/videogames"
}

def scrape_category(base_url, category):
    all_products = []
    for page in [1, 2]:
        url = f"{base_url}?pg={page}"
        driver.get(url)
        time.sleep(3)

        items = driver.find_elements(By.CSS_SELECTOR, "div.zg-grid-general-faceout")

        for rank, item in enumerate(items, start=1 + (page - 1) * 50):
            try:
                title = item.find_element(By.CSS_SELECTOR, "._cDEzb_p13n-sc-css-line-clamp-3_g3dy1").text
            except:
                title = None

            try:
                price_text = item.find_element(By.CSS_SELECTOR, "span._cDEzb_p13n-sc-price_3mJ9Z").text
                price = float(price_text.replace("\u202f", "").replace("\xa0", "").replace("€", "").replace(",", ".").strip())
            except:
                price = None

            try:
                rating_text = item.find_element(By.CSS_SELECTOR, ".a-icon-alt").get_attribute("innerText")
                rating = float(rating_text.split()[0].replace(",", "."))
            except:
                rating = None

            try:
                # essaie plusieurs formats pour les votes
                votes_tag = item.find_element(By.CSS_SELECTOR, "span.a-size-base").text
                votes = int(votes_tag.replace("\u202f", "").replace(",", "").strip())
            except:
                try:
                    votes_tag = item.find_element(By.CSS_SELECTOR, "span.a-size-small").text
                    votes = int(votes_tag.replace("\u202f", "").replace(",", "").strip())
                except:
                    votes = None

            try:
                url_product = item.find_element(By.CSS_SELECTOR, "a.a-link-normal").get_attribute("href")
            except:
                url_product = None

            all_products.append({
                "title": title,
                "price": price,
                "rating": rating,
                "votes": votes,
                "url": url_product,
                "category": category,
                "rank": rank,
                "scraped_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            })

    return pd.DataFrame(all_products)

# Scraping de toutes les catégories
df_list = []
for name, base_url in category_urls.items():
    print(f"Scraping {name}...")
    try:
        df = scrape_category(base_url, name)
        print(f"{name}: {len(df)} produits trouvés")
        df_list.append(df)
    except Exception as e:
        print(f"Erreur pour {name} : {e}")

# Fusion et export final
df_final = pd.concat(df_list, ignore_index=True)
df_final.to_csv("amazon_best_v4.csv", index=False)
driver.quit()
print("✅ Fichier amazon_best_v4.csv généré avec succès.")


Scraping High-tech...
High-tech: 60 produits trouvés
Scraping Beauté...
Beauté: 60 produits trouvés
Scraping Cuisine...
Cuisine: 60 produits trouvés
Scraping Mode...
Mode: 60 produits trouvés
Scraping Animalerie...
Animalerie: 60 produits trouvés
Scraping Jeux et Jouets...
Jeux et Jouets: 60 produits trouvés
Scraping Jeux vidéo...
Jeux vidéo: 60 produits trouvés
✅ Fichier amazon_best_v4.csv généré avec succès.


In [21]:
df_v4 = pd.read_csv('amazon_best_v4.csv')
df_v4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420 entries, 0 to 419
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   title       380 non-null    object 
 1   price       407 non-null    float64
 2   rating      415 non-null    float64
 3   votes       56 non-null     float64
 4   url         420 non-null    object 
 5   category    420 non-null    object 
 6   rank        420 non-null    int64  
 7   scraped_at  420 non-null    object 
dtypes: float64(3), int64(1), object(4)
memory usage: 26.4+ KB


In [22]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time
from datetime import datetime

# Configuration de Selenium (Chrome en headless)
options = Options()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Liste des URLs des best sellers Amazon France (page 1 uniquement)
category_urls = {
    "High-tech": "https://www.amazon.fr/gp/bestsellers/electronics",
    "Beauté": "https://www.amazon.fr/gp/bestsellers/beauty",
    "Cuisine": "https://www.amazon.fr/gp/bestsellers/kitchen",
    "Mode": "https://www.amazon.fr/gp/bestsellers/fashion",
    "Animalerie": "https://www.amazon.fr/gp/bestsellers/pet-supplies",
    "Jeux et Jouets": "https://www.amazon.fr/gp/bestsellers/toys",
    "Jeux vidéo": "https://www.amazon.fr/gp/bestsellers/videogames"
}

def scrape_category(base_url, category):
    all_products = []
    for page in [1, 2]:
        url = f"{base_url}?pg={page}"
        driver.get(url)
        time.sleep(3)

        items = driver.find_elements(By.CSS_SELECTOR, "div.zg-grid-general-faceout")

        for rank, item in enumerate(items, start=1 + (page - 1) * 50):
            try:
                title = item.find_element(By.CSS_SELECTOR, "._cDEzb_p13n-sc-css-line-clamp-3_g3dy1").text
            except:
                title = None

            try:
                price_text = item.find_element(By.CSS_SELECTOR, "span._cDEzb_p13n-sc-price_3mJ9Z").text
                price = float(price_text.replace("\u202f", "").replace("\xa0", "").replace("€", "").replace(",", ".").strip())
            except:
                price = None

            try:
                rating_text = item.find_element(By.CSS_SELECTOR, ".a-icon-alt").get_attribute("innerText")
                rating = float(rating_text.split()[0].replace(",", "."))
            except:
                rating = None

            try:
                votes_tag = item.find_element(By.XPATH, ".//span[contains(@aria-label, 'notation') or contains(@aria-label, 'avis') or contains(@aria-label, 'évaluations')]").get_attribute("aria-label")
                votes = int(votes_tag.split()[0].replace("\u202f", "").replace(",", "").strip())
            except:
                try:
                    votes_tag = item.find_element(By.CSS_SELECTOR, "span.a-size-small").text
                    votes = int(votes_tag.replace("\u202f", "").replace(",", "").strip())
                except:
                    votes = None

            try:
                url_product = item.find_element(By.CSS_SELECTOR, "a.a-link-normal").get_attribute("href")
            except:
                url_product = None

            all_products.append({
                "title": title,
                "price": price,
                "rating": rating,
                "votes": votes,
                "url": url_product,
                "category": category,
                "rank": rank,
                "scraped_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            })

    return pd.DataFrame(all_products)

# Scraping de toutes les catégories
df_list = []
for name, base_url in category_urls.items():
    print(f"Scraping {name}...")
    try:
        df = scrape_category(base_url, name)
        print(f"{name}: {len(df)} produits trouvés")
        df_list.append(df)
    except Exception as e:
        print(f"Erreur pour {name} : {e}")

# Fusion et export final
df_final = pd.concat(df_list, ignore_index=True)

# Ajout d'un score
df_final['score'] = df_final['rating'] * df_final['votes']

# Sauvegarde
df_final.to_csv("amazon_best_v5.csv", index=False)
driver.quit()
print("✅ Fichier amazon_best_v5.csv généré avec succès.")

Scraping High-tech...
High-tech: 60 produits trouvés
Scraping Beauté...
Beauté: 60 produits trouvés
Scraping Cuisine...
Cuisine: 60 produits trouvés
Scraping Mode...
Mode: 60 produits trouvés
Scraping Animalerie...
Animalerie: 60 produits trouvés
Scraping Jeux et Jouets...
Jeux et Jouets: 60 produits trouvés
Scraping Jeux vidéo...
Jeux vidéo: 60 produits trouvés
✅ Fichier amazon_best_v5.csv généré avec succès.


In [23]:
df_v5 = pd.read_csv('amazon_best_v4.csv')
df_v5.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420 entries, 0 to 419
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   title       380 non-null    object 
 1   price       407 non-null    float64
 2   rating      415 non-null    float64
 3   votes       56 non-null     float64
 4   url         420 non-null    object 
 5   category    420 non-null    object 
 6   rank        420 non-null    int64  
 7   scraped_at  420 non-null    object 
dtypes: float64(3), int64(1), object(4)
memory usage: 26.4+ KB


In [26]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time
from datetime import datetime

# Configuration de Selenium (Chrome en headless)
options = Options()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Liste des URLs des best sellers Amazon France (page 1 uniquement)
category_urls = {
    "High-tech": "https://www.amazon.fr/gp/bestsellers/electronics",
    "Beauté": "https://www.amazon.fr/gp/bestsellers/beauty",
    "Cuisine": "https://www.amazon.fr/gp/bestsellers/kitchen",
    "Mode": "https://www.amazon.fr/gp/bestsellers/fashion",
    "Animalerie": "https://www.amazon.fr/gp/bestsellers/pet-supplies",
    "Jeux et Jouets": "https://www.amazon.fr/gp/bestsellers/toys",
    "Jeux vidéo": "https://www.amazon.fr/gp/bestsellers/videogames"
}

def scrape_category(base_url, category):
    all_products = []
    for page in [1, 2]:
        url = f"{base_url}?pg={page}"
        driver.get(url)
        time.sleep(3)

        items = driver.find_elements(By.CSS_SELECTOR, "div.zg-grid-general-faceout")

        for rank, item in enumerate(items, start=1 + (page - 1) * 50):
            try:
                # Titre : essayer plusieurs structures
                try:
                    title = item.find_element(By.CSS_SELECTOR, "._cDEzb_p13n-sc-css-line-clamp-3_g3dy1").text
                except:
                    title = item.find_element(By.CSS_SELECTOR, "._cDEzb_p13n-sc-css-line-clamp-1_1Fn1y").text
            except:
                title = None

            try:
                price_text = item.find_element(By.CSS_SELECTOR, "span._cDEzb_p13n-sc-price_3mJ9Z").text
                price = float(price_text.replace("\u202f", "").replace("\xa0", "").replace("€", "").replace(",", ".").strip())
            except:
                price = None

            try:
                rating_text = item.find_element(By.CSS_SELECTOR, ".a-icon-alt").get_attribute("innerText")
                rating = float(rating_text.split()[0].replace(",", "."))
            except:
                rating = None

            try:
                # Votes : version fiable avec span.a-size-small (vue dans capture)
                votes_tag = item.find_element(By.CSS_SELECTOR, "span.a-size-small").text
                votes = int(votes_tag.replace("\u202f", "").replace(",", "").strip())
            except:
                try:
                    votes_tag = item.find_element(By.XPATH, ".//span[contains(@aria-label, 'notation') or contains(@aria-label, 'avis') or contains(@aria-label, 'évaluations')]").get_attribute("aria-label")
                    votes = int(votes_tag.split()[0].replace("\u202f", "").replace(",", "").strip())
                except:
                    votes = None

            try:
                url_product = item.find_element(By.CSS_SELECTOR, "a.a-link-normal").get_attribute("href")
            except:
                url_product = None

            all_products.append({
                "title": title,
                "price": price,
                "rating": rating,
                "votes": votes,
                "url": url_product,
                "category": category,
                "rank": rank,
                "scraped_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            })

    return pd.DataFrame(all_products)

# Scraping de toutes les catégories
df_list = []
for name, base_url in category_urls.items():
    print(f"Scraping {name}...")
    try:
        df = scrape_category(base_url, name)
        print(f"{name}: {len(df)} produits trouvés")
        df_list.append(df)
    except Exception as e:
        print(f"Erreur pour {name} : {e}")

# Fusion et export final
df_final = pd.concat(df_list, ignore_index=True)

# Ajout d'un score
df_final['score'] = df_final['rating'] * df_final['votes']

# Sauvegarde
df_final.to_csv("amazon_best_v6.csv", index=False)
driver.quit()
print("✅ Fichier amazon_best_v6.csv généré avec succès.")

Scraping High-tech...
High-tech: 0 produits trouvés
Scraping Beauté...
Beauté: 30 produits trouvés
Scraping Cuisine...
Cuisine: 60 produits trouvés
Scraping Mode...
Mode: 60 produits trouvés
Scraping Animalerie...
Animalerie: 60 produits trouvés
Scraping Jeux et Jouets...
Jeux et Jouets: 60 produits trouvés
Scraping Jeux vidéo...
Jeux vidéo: 60 produits trouvés
✅ Fichier amazon_best_v6.csv généré avec succès.


In [27]:
df_v6 = pd.read_csv('amazon_best_v6.csv')
df_v6.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 330 entries, 0 to 329
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   title       325 non-null    object 
 1   price       321 non-null    float64
 2   rating      326 non-null    float64
 3   votes       45 non-null     float64
 4   url         330 non-null    object 
 5   category    330 non-null    object 
 6   rank        330 non-null    int64  
 7   scraped_at  330 non-null    object 
 8   score       45 non-null     float64
dtypes: float64(4), int64(1), object(4)
memory usage: 23.3+ KB


In [28]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time
from datetime import datetime

# Configuration de Selenium (Chrome en headless)
options = Options()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Liste des URLs des best sellers Amazon France (page 1 uniquement)
category_urls = {
    "High-tech": "https://www.amazon.fr/gp/bestsellers/electronics",
    "Beauté": "https://www.amazon.fr/gp/bestsellers/beauty",
    "Cuisine": "https://www.amazon.fr/gp/bestsellers/kitchen",
    "Mode": "https://www.amazon.fr/gp/bestsellers/fashion",
    "Animalerie": "https://www.amazon.fr/gp/bestsellers/pet-supplies",
    "Jeux et Jouets": "https://www.amazon.fr/gp/bestsellers/toys",
    "Jeux vidéo": "https://www.amazon.fr/gp/bestsellers/videogames"
}

def scrape_category(base_url, category):
    all_products = []
    for page in [1, 2]:
        url = f"{base_url}?pg={page}"
        driver.get(url)
        time.sleep(3)

        items = driver.find_elements(By.CSS_SELECTOR, "div.zg-grid-general-faceout")

        if not items:
            print(f"❌ Aucun produit trouvé pour {category} page {page} !")

        for rank, item in enumerate(items, start=1 + (page - 1) * 50):
            try:
                title = item.find_element(By.CSS_SELECTOR, "._cDEzb_p13n-sc-css-line-clamp-3_g3dy1").text
            except:
                try:
                    title = item.find_element(By.CSS_SELECTOR, "._cDEzb_p13n-sc-css-line-clamp-1_1Fn1y").text
                except:
                    title = None

            try:
                price_text = item.find_element(By.CSS_SELECTOR, "span._cDEzb_p13n-sc-price_3mJ9Z").text
                price = float(price_text.replace("\u202f", "").replace("\xa0", "").replace("€", "").replace(",", ".").strip())
            except:
                price = None

            try:
                rating_text = item.find_element(By.CSS_SELECTOR, "span.a-icon-alt").get_attribute("innerText")
                rating = float(rating_text.split()[0].replace(",", "."))
            except:
                rating = None

            try:
                votes_tag = item.find_element(By.CSS_SELECTOR, "span.a-size-small").text
                votes = int(votes_tag.replace("\u202f", "").replace(",", "").strip())
            except:
                try:
                    votes_tag = item.find_element(By.XPATH, ".//span[contains(@aria-label, 'notation') or contains(@aria-label, 'avis') or contains(@aria-label, 'évaluations')]").get_attribute("aria-label")
                    votes = int(votes_tag.split()[0].replace("\u202f", "").replace(",", "").strip())
                except:
                    votes = None

            try:
                url_product = item.find_element(By.CSS_SELECTOR, "a.a-link-normal").get_attribute("href")
            except:
                url_product = None

            all_products.append({
                "title": title,
                "price": price,
                "rating": rating,
                "votes": votes,
                "url": url_product,
                "category": category,
                "rank": rank,
                "scraped_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            })

    return pd.DataFrame(all_products)

# Scraping de toutes les catégories
df_list = []
for name, base_url in category_urls.items():
    print(f"🔍 Scraping {name}...")
    try:
        df = scrape_category(base_url, name)
        print(f"✅ {name}: {len(df)} produits trouvés")
        df_list.append(df)
    except Exception as e:
        print(f"❌ Erreur pour {name} : {e}")

# Fusion et export final
df_final = pd.concat(df_list, ignore_index=True)

# Ajout d'un score
df_final['score'] = df_final['rating'] * df_final['votes']

# Sauvegarde
df_final.to_csv("amazon_best_v7.csv", index=False)
driver.quit()
print("✅ Fichier amazon_best_v7.csv généré avec succès.")

🔍 Scraping High-tech...
✅ High-tech: 60 produits trouvés
🔍 Scraping Beauté...
✅ Beauté: 60 produits trouvés
🔍 Scraping Cuisine...
✅ Cuisine: 60 produits trouvés
🔍 Scraping Mode...
✅ Mode: 60 produits trouvés
🔍 Scraping Animalerie...
✅ Animalerie: 60 produits trouvés
🔍 Scraping Jeux et Jouets...
✅ Jeux et Jouets: 60 produits trouvés
🔍 Scraping Jeux vidéo...
✅ Jeux vidéo: 60 produits trouvés
✅ Fichier amazon_best_v7.csv généré avec succès.


In [29]:
df_v7 = pd.read_csv('amazon_best_v7.csv')
df_v7.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420 entries, 0 to 419
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   title       414 non-null    object 
 1   price       407 non-null    float64
 2   rating      415 non-null    float64
 3   votes       56 non-null     float64
 4   url         420 non-null    object 
 5   category    420 non-null    object 
 6   rank        420 non-null    int64  
 7   scraped_at  420 non-null    object 
 8   score       56 non-null     float64
dtypes: float64(4), int64(1), object(4)
memory usage: 29.7+ KB


In [32]:
df_v7.sample(50)

Unnamed: 0,title,price,rating,votes,url,category,rank,scraped_at,score
389,Animal Crossing : New Horizons pour Nintendo S...,44.16,4.8,,https://www.amazon.fr/Animal-Crossing-Horizons...,Jeux vidéo,30,2025-06-20 14:57:43,
318,,,,,https://www.amazon.fr/Pok%C3%A9mon-JCC-Collect...,Jeux et Jouets,19,2025-06-20 14:57:24,
193,"SINOPHANT Legging Femmes Pantalon de Sport, Je...",11.99,4.5,,https://www.amazon.fr/SINOPHANT-Legging-Femmes...,Mode,14,2025-06-20 14:56:49,
204,QINCAO Boxers Homme Lot de 6 Coton Pas d'étiqu...,24.99,4.5,,https://www.amazon.fr/QINCAO-sous-v%C3%AAtemen...,Mode,25,2025-06-20 14:56:50,
70,GARNIER Ambre Solaire - Natural Bronzer - Mous...,9.49,4.2,,https://www.amazon.fr/Garnier-Ambre-Solaire-Na...,Beauté,11,2025-06-20 14:56:10,
219,Enlision Hommes Chemises en Lin Solid Henley S...,25.99,4.1,,https://www.amazon.fr/Enlision-Henley-Chemise-...,Mode,60,2025-06-20 14:56:56,
125,"Dreo 20dB ventilateur colonne silencieux, Vite...",89.99,4.6,,https://www.amazon.fr/Dreo-Ventilateur-tour-av...,Cuisine,6,2025-06-20 14:56:31,
390,"Ozeino Casque Gaming pour PS5 PC PS4, Casque G...",19.99,4.5,,https://www.amazon.fr/Ozeino-Couleurs-Transduc...,Jeux vidéo,51,2025-06-20 14:57:47,
191,Dim Chaussettes Ecodim Talon/Pointes Renforcés...,10.5,4.4,,https://www.amazon.fr/Dim-Ecodim-Chaussettes-p...,Mode,12,2025-06-20 14:56:48,
266,"FRONTLINE Spot on Chien - Anti Puces, Anti-Tiq...",18.74,4.3,,https://www.amazon.fr/FRONTLINE-Spot-Chien-Ant...,Animalerie,27,2025-06-20 14:57:07,


In [33]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time
from datetime import datetime
import re

# Configuration de Selenium (Chrome en headless)
options = Options()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Liste des URLs des best sellers Amazon France (page 1 uniquement)
category_urls = {
    "High-tech": "https://www.amazon.fr/gp/bestsellers/electronics",
    "Beauté": "https://www.amazon.fr/gp/bestsellers/beauty",
    "Cuisine": "https://www.amazon.fr/gp/bestsellers/kitchen",
    "Mode": "https://www.amazon.fr/gp/bestsellers/fashion",
    "Animalerie": "https://www.amazon.fr/gp/bestsellers/pet-supplies",
    "Jeux et Jouets": "https://www.amazon.fr/gp/bestsellers/toys",
    "Jeux vidéo": "https://www.amazon.fr/gp/bestsellers/videogames"
}

def scrape_category(base_url, category):
    all_products = []
    for page in [1, 2]:
        url = f"{base_url}?pg={page}"
        driver.get(url)
        time.sleep(3)

        items = driver.find_elements(By.CSS_SELECTOR, "div.zg-grid-general-faceout")

        if not items:
            print(f"❌ Aucun produit trouvé pour {category} page {page} !")

        for rank, item in enumerate(items, start=1 + (page - 1) * 50):
            try:
                title = item.find_element(By.CSS_SELECTOR, "._cDEzb_p13n-sc-css-line-clamp-3_g3dy1").text
            except:
                try:
                    title = item.find_element(By.CSS_SELECTOR, "._cDEzb_p13n-sc-css-line-clamp-1_1Fn1y").text
                except:
                    title = None

            try:
                price_text = item.find_element(By.CSS_SELECTOR, "span._cDEzb_p13n-sc-price_3mJ9Z").text
                price = float(price_text.replace("\u202f", "").replace("\xa0", "").replace("€", "").replace(",", ".").strip())
            except:
                price = None

            try:
                rating_text = item.find_element(By.CSS_SELECTOR, "span.a-icon-alt").get_attribute("innerText")
                rating = float(rating_text.split()[0].replace(",", "."))
            except:
                rating = None

            # ✅ Nouvelle extraction robuste des votes
            try:
                votes_tag = item.find_element(By.CSS_SELECTOR, "span.a-size-small").text
                votes = int(votes_tag.replace("\u202f", "").replace(",", "").strip())
            except:
                try:
                    aria_label = item.find_element(By.CSS_SELECTOR, "div.a-icon-row").get_attribute("aria-label")
                    match = re.search(r'([0-9\u202f\s]+) évaluations', aria_label)
                    if match:
                        votes = int(match.group(1).replace("\u202f", "").replace(" ", ""))
                    else:
                        votes = None
                except:
                    votes = None

            try:
                url_product = item.find_element(By.CSS_SELECTOR, "a.a-link-normal").get_attribute("href")
            except:
                url_product = None

            all_products.append({
                "title": title,
                "price": price,
                "rating": rating,
                "votes": votes,
                "url": url_product,
                "category": category,
                "rank": rank,
                "scraped_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            })

    return pd.DataFrame(all_products)

# Scraping de toutes les catégories
df_list = []
for name, base_url in category_urls.items():
    print(f"🔍 Scraping {name}...")
    try:
        df = scrape_category(base_url, name)
        print(f"✅ {name}: {len(df)} produits trouvés")
        df_list.append(df)
    except Exception as e:
        print(f"❌ Erreur pour {name} : {e}")

# Fusion et export final
df_final = pd.concat(df_list, ignore_index=True)

# Ajout d'un score pondéré
df_final['score'] = df_final['rating'] * df_final['votes']

# Sauvegarde
df_final.to_csv("amazon_best_v8.csv", index=False)
driver.quit()
print("✅ Fichier amazon_best_v8.csv généré avec succès.")


🔍 Scraping High-tech...
✅ High-tech: 60 produits trouvés
🔍 Scraping Beauté...
✅ Beauté: 60 produits trouvés
🔍 Scraping Cuisine...
✅ Cuisine: 60 produits trouvés
🔍 Scraping Mode...
✅ Mode: 60 produits trouvés
🔍 Scraping Animalerie...
✅ Animalerie: 60 produits trouvés
🔍 Scraping Jeux et Jouets...
✅ Jeux et Jouets: 60 produits trouvés
🔍 Scraping Jeux vidéo...
✅ Jeux vidéo: 60 produits trouvés
✅ Fichier amazon_best_v8.csv généré avec succès.


In [36]:
df_v8 = pd.read_csv('amazon_best_v8.csv')
df_v8.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420 entries, 0 to 419
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   title       414 non-null    object 
 1   price       407 non-null    float64
 2   rating      415 non-null    float64
 3   votes       56 non-null     float64
 4   url         420 non-null    object 
 5   category    420 non-null    object 
 6   rank        420 non-null    int64  
 7   scraped_at  420 non-null    object 
 8   score       56 non-null     float64
dtypes: float64(4), int64(1), object(4)
memory usage: 29.7+ KB


In [41]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time
from datetime import datetime
import re

# Configuration de Selenium (Chrome en headless)
options = Options()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Liste des URLs des best sellers Amazon France (page 1 uniquement)
category_urls = {
    "High-tech": "https://www.amazon.fr/gp/bestsellers/electronics",
    "Beauté": "https://www.amazon.fr/gp/bestsellers/beauty",
    "Cuisine": "https://www.amazon.fr/gp/bestsellers/kitchen",
    "Mode": "https://www.amazon.fr/gp/bestsellers/fashion",
    "Animalerie": "https://www.amazon.fr/gp/bestsellers/pet-supplies",
    "Jeux et Jouets": "https://www.amazon.fr/gp/bestsellers/toys",
    "Jeux vidéo": "https://www.amazon.fr/gp/bestsellers/videogames"
}

def scrape_category(base_url, category):
    all_products = []
    for page in [1, 2]:
        url = f"{base_url}?pg={page}"
        driver.get(url)
        time.sleep(3)

        items = driver.find_elements(By.CSS_SELECTOR, "div.zg-grid-general-faceout")

        if not items:
            print(f"❌ Aucun produit trouvé pour {category} page {page} !")

        for rank, item in enumerate(items, start=1 + (page - 1) * 50):
            try:
                title = item.find_element(By.CSS_SELECTOR, "._cDEzb_p13n-sc-css-line-clamp-3_g3dy1").text
            except:
                try:
                    title = item.find_element(By.CSS_SELECTOR, "._cDEzb_p13n-sc-css-line-clamp-1_1Fn1y").text
                except:
                    title = None

            try:
                price_text = item.find_element(By.CSS_SELECTOR, "span._cDEzb_p13n-sc-price_3mJ9Z").text
                price = float(price_text.replace("\u202f", "").replace("\xa0", "").replace("€", "").replace(",", ".").strip())
            except:
                price = None

            try:
                rating_text = item.find_element(By.CSS_SELECTOR, "span.a-icon-alt").get_attribute("innerText")
                rating = float(rating_text.split()[0].replace(",", "."))
            except:
                rating = None

            # ✅ Extraction du nombre de votes avec plusieurs tentatives
            votes = None
            try:
                # Méthode 1 : balise visible directe
                votes_tag = item.find_element(By.CSS_SELECTOR, "span.a-size-small").text
                votes = int(votes_tag.replace("\u202f", "").replace(",", "").strip())
            except:
                try:
                    # Méthode 2 : via aria-label complet
                    aria_label = item.find_element(By.CSS_SELECTOR, "div.a-icon-row").get_attribute("aria-label")
                    match = re.search(r'(\d[\d\u202f\s]*)\s*\u00e9valuations', aria_label.lower())
                    if match:
                        votes = int(match.group(1).replace("\u202f", "").replace(" ", ""))
                except:
                    pass

            try:
                url_product = item.find_element(By.CSS_SELECTOR, "a.a-link-normal").get_attribute("href")
            except:
                url_product = None

            all_products.append({
                "title": title,
                "price": price,
                "rating": rating,
                "votes": votes,
                "url": url_product,
                "category": category,
                "rank": rank,
                "scraped_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            })

    return pd.DataFrame(all_products)

# Scraping de toutes les catégories
df_list = []
for name, base_url in category_urls.items():
    print(f"🔍 Scraping {name}...")
    try:
        df = scrape_category(base_url, name)
        print(f"✅ {name}: {len(df)} produits trouvés")
        df_list.append(df)
    except Exception as e:
        print(f"❌ Erreur pour {name} : {e}")

# Fusion et export final
df_final = pd.concat(df_list, ignore_index=True)

# Ajout d'un score pondéré si les deux colonnes existent
df_final['score'] = df_final.apply(lambda row: row['rating'] * row['votes'] if pd.notnull(row['votes']) and pd.notnull(row['rating']) else None, axis=1)

# Sauvegarde
df_final.to_csv("amazon_best_v10.csv", index=False)
with open("amazon_best_v10.html", "w", encoding="utf-8") as f:
    f.write(driver.page_source)
driver.quit()

print("✅ Fichier amazon_best_v10.csv généré avec succès.")

🔍 Scraping High-tech...
✅ High-tech: 60 produits trouvés
🔍 Scraping Beauté...
✅ Beauté: 60 produits trouvés
🔍 Scraping Cuisine...
✅ Cuisine: 60 produits trouvés
🔍 Scraping Mode...
✅ Mode: 60 produits trouvés
🔍 Scraping Animalerie...
✅ Animalerie: 60 produits trouvés
🔍 Scraping Jeux et Jouets...
✅ Jeux et Jouets: 60 produits trouvés
🔍 Scraping Jeux vidéo...
✅ Jeux vidéo: 60 produits trouvés
✅ Fichier amazon_best_v10.csv généré avec succès.


In [43]:
df_v10 = pd.read_csv('amazon_best_v10.csv')
df_v10.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420 entries, 0 to 419
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   title       415 non-null    object 
 1   price       406 non-null    float64
 2   rating      416 non-null    float64
 3   votes       56 non-null     float64
 4   url         420 non-null    object 
 5   category    420 non-null    object 
 6   rank        420 non-null    int64  
 7   scraped_at  420 non-null    object 
 8   score       56 non-null     float64
dtypes: float64(4), int64(1), object(4)
memory usage: 29.7+ KB


In [44]:
df_v10.head()

Unnamed: 0,title,price,rating,votes,url,category,rank,scraped_at,score
0,Apple AirTag,29.99,4.6,,https://www.amazon.fr/Apple-MX532ZMA-Nouveau-A...,High-tech,1,2025-06-20 16:08:23,
1,Imou 2K(3MP) Caméra Surveillance WiFi Intérieu...,18.99,4.4,,https://www.amazon.fr/Imou-Surveillance-Int%C3...,High-tech,2,2025-06-20 16:08:23,
2,Tapo 2K(3MP) Caméra Surveillance WiFi intérieu...,18.99,4.6,,https://www.amazon.fr/Tapo-Surveillance-int%C3...,High-tech,3,2025-06-20 16:08:24,
3,Amazon Fire TV Stick HD (Nouvelle génération) ...,44.99,4.6,,https://www.amazon.fr/fire-tv-stick-hd/dp/B0CQ...,High-tech,4,2025-06-20 16:08:24,
4,DURACELL CR2032 Piles Boutons au lithium 3V (l...,4.75,4.7,,https://www.amazon.fr/DURACELL-CR2032-Piles-Bo...,High-tech,5,2025-06-20 16:08:24,


In [55]:
import time
from datetime import datetime
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

# Configuration du navigateur
options = Options()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

url = "https://www.amazon.fr/s?i=electronics&srs=4551203031&rh=n%3A4551203031&s=popularity-rank&fs=true&ref=lp_4551203031_sar"
driver.get(url)
time.sleep(3)

data = []
page = 1

while True:
    print(f"Scraping page {page}...")
    cards = driver.find_elements(By.XPATH, '//div[contains(@data-component-type, "s-search-result")]')

    for rank, card in enumerate(cards, start=1):
        try:
            title = card.find_element(By.XPATH, ".//h2//span").text.strip()
        except:
            title = None

        try:
            price_text = card.find_element(By.CSS_SELECTOR, "span.a-price > span.a-offscreen").text.strip()
            price = float(price_text.replace(" ", "").replace(" ", "").replace("EUR", "").replace(",", "."))
        except:
            price = None

        try:
            rating_text = card.find_element(By.CSS_SELECTOR, "span.a-icon-alt").get_attribute("innerHTML")
            rating = float(rating_text.split()[0].replace(',', '.'))
        except:
            rating = None

        try:
            votes_text = card.find_element(By.XPATH, ".//span[@class='a-size-base' or @class='a-size-small']").text
            votes = int(votes_text.replace("\u202f", "").replace(",", "").replace(" ", ""))
        except:
            votes = None

        try:
            score = round(rating * votes, 1) if rating and votes else None
        except:
            score = None

        try:
            brand = card.find_element(By.XPATH, ".//span[@class='a-size-base-plus a-color-base']").text
        except:
            brand = None

        try:
            product_url = card.find_element(By.XPATH, ".//h2/a").get_attribute("href")
        except:
            product_url = None

        try:
            image_url = card.find_element(By.XPATH, ".//img").get_attribute("src")
        except:
            image_url = None

        data.append({
            "title": title,
            "price": price,
            "rating": rating,
            "votes": votes,
            "score": score,
            "brand": brand,
            "url": product_url,
            "image_url": image_url,
            "category": "High-Tech > Objets connectés",
            "rank": rank,
            "scraped_at": datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        })

    # Tentative d'aller à la page suivante
    try:
        next_button = driver.find_element(By.XPATH, '//a[contains(@class,"s-pagination-next")]')
        if 'disabled' in next_button.get_attribute("class"):
            break
        else:
            driver.execute_script("arguments[0].scrollIntoView();", next_button)
            time.sleep(2)
            next_button.click()
            page += 1
            time.sleep(3)
    except:
        print("Fin de pagination ou erreur")
        break

# Fermeture du navigateur
driver.quit()

# Enregistrement des données
df = pd.DataFrame(data)
df.to_csv("amazon_hightech_objets_connectes_v12.csv", index=False)
print(df.info())


Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
Scraping page 11...
Scraping page 12...
Scraping page 13...
Scraping page 14...
Scraping page 15...
Scraping page 16...
Scraping page 17...
Scraping page 18...
Scraping page 19...
Scraping page 20...
Scraping page 21...
Scraping page 22...
Scraping page 23...
Scraping page 24...
Scraping page 25...
Scraping page 26...
Scraping page 27...
Scraping page 28...
Scraping page 29...
Scraping page 30...
Scraping page 31...
Scraping page 32...
Scraping page 33...
Scraping page 34...
Scraping page 35...
Scraping page 36...
Scraping page 37...
Scraping page 38...
Scraping page 39...
Scraping page 40...
Scraping page 41...
Scraping page 42...
Scraping page 43...
Scraping page 44...
Scraping page 45...
Scraping page 46...
Scraping page 47...
Scraping page 48...
Scraping page 49...
Scraping page 50...
Scraping 

In [None]:
df_v12 = pd.read_csv('amazon_hightech_objets_connectes_v12.csv')
df_v12.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 135 entries, 0 to 134
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   title       0 non-null      float64
 1   price       118 non-null    float64
 2   rating      0 non-null      float64
 3   votes       46 non-null     float64
 4   score       0 non-null      float64
 5   brand       0 non-null      float64
 6   url         0 non-null      float64
 7   image_url   120 non-null    object 
 8   category    135 non-null    object 
 9   rank        135 non-null    int64  
 10  scraped_at  135 non-null    object 
dtypes: float64(7), int64(1), object(3)
memory usage: 11.7+ KB


In [13]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

# Configuration Chrome non-headless (affichage navigateur)
options = webdriver.ChromeOptions()
# options.add_argument('--headless')  # à désactiver temporairement
options.add_argument('--start-maximized')

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# URL Amazon - Objets connectés
base_url = "https://www.amazon.fr/s?i=electronics&srs=4551203031"

# Données collectées
data = []

# Nombre de pages à parcourir
N_PAGES = 3  # Ajustable

for page in range(1, N_PAGES + 1):
    print(f"📄 Scraping page {page}")
    url = f"{base_url}&page={page}"
    driver.get(url)
    time.sleep(3)

    try:
        # Attendre la présence des résultats
        WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.XPATH, "//div[@data-component-type='s-search-result']"))
        )
    except:
        print("❌ Résultats introuvables")
        continue

    products = driver.find_elements(By.XPATH, "//div[@data-component-type='s-search-result']")

    for p in products:
        try:
            title = p.find_element(By.XPATH, ".//h2//span").text
        except:
            title = None
        try:
            price = p.find_element(By.XPATH, ".//span[@class='a-price-whole']").text
            price = float(price.replace(',', '.').replace('€', '').strip())
        except:
            price = None
        try:
            rating = p.find_element(By.XPATH, ".//span[@class='a-icon-alt']").get_attribute("innerHTML")
            rating = float(rating.split()[0].replace(',', '.'))
        except:
            rating = None
        try:
            votes = p.find_element(By.XPATH, ".//span[@class='a-size-base']").text
            votes = int(votes.replace('\u202f', '').replace(' ', '').replace(',', ''))
        except:
            votes = None
        try:
            image = p.find_element(By.TAG_NAME, "img").get_attribute("src")
        except:
            image = None
        try:
            sales = p.find_element(By.XPATH, ".//span[contains(text(), 'acheté')]").text
        except:
            sales = None

        data.append({
            'title': title,
            'price': price,
            'rating': rating,
            'votes': votes,
            'image_url': image,
            'sales_last_month': sales
        })

    time.sleep(2)

driver.quit()

# Conversion en DataFrame
dfv21 = pd.DataFrame(data)

# Filtrer les produits avec un titre pour éviter les lignes vides
dfv21 = dfv21[dfv21['title'].notnull()]

# Sauvegarde CSV
dfv21.to_csv("amazon_objets_connectes_v21.csv", index=False)
print("✅ Données sauvegardées : amazon_objets_connectes_v21.csv")

📄 Scraping page 1
📄 Scraping page 2
📄 Scraping page 3
✅ Données sauvegardées : amazon_objets_connectes_v21.csv


In [15]:
dfv21 = pd.read_csv('amazon_objets_connectes_v21.csv')
dfv21.isna().sum()

title                0
price                0
rating               3
votes               72
image_url            0
sales_last_month     8
dtype: int64

In [16]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import time
import pandas as pd

# Configuration du navigateur
options = Options()
options.add_argument("--headless")  # si tu veux voir le navigateur, supprime cette ligne
options.add_argument("--window-size=1920,1080")
driver = webdriver.Chrome(options=options)

# URL de la catégorie objets connectés
url = "https://www.amazon.fr/s?i=electronics&rh=n%3A4551203031&fs=true"

# Lancement
driver.get(url)
time.sleep(3)

# Données à collecter
products = []

# Nombre de pages à parcourir
nb_pages = 10

for page in range(1, nb_pages + 1):
    print(f"Scraping page {page}")
    time.sleep(3)

    items = driver.find_elements(By.XPATH, '//div[@data-component-type="s-search-result"]')
    for item in items:
        try:
            title = item.find_element(By.CSS_SELECTOR, "h2 span").text
        except:
            title = None
        try:
            price = item.find_element(By.CSS_SELECTOR, ".a-price-whole").text + item.find_element(By.CSS_SELECTOR, ".a-price-fraction").text
        except:
            price = None
        try:
            rating = item.find_element(By.CSS_SELECTOR, "i span").get_attribute("innerHTML").split(" ")[0]
        except:
            rating = None
        try:
            votes = item.find_element(By.CSS_SELECTOR, 'span.a-size-base.s-underline-text').text.replace(" ", "").replace(",", "")
        except:
            votes = None
        try:
            sales_text = item.find_element(By.XPATH, ".//span[contains(text(),'acheté au cours du mois')]").text
        except:
            sales_text = None
        try:
            img_url = item.find_element(By.CSS_SELECTOR, "img.s-image").get_attribute("src")
        except:
            img_url = None

        products.append({
            "title": title,
            "price": price,
            "rating": rating,
            "votes": votes,
            "sales_last_month": sales_text,
            "image_url": img_url
        })

    # Passer à la page suivante
    try:
        next_btn = driver.find_element(By.CSS_SELECTOR, 'a.s-pagination-next')
        driver.execute_script("arguments[0].click();", next_btn)
    except:
        print("Plus de pages.")
        break

driver.quit()

# Création du DataFrame
dfv22 = pd.DataFrame(products)
dfv22.to_csv("amazon_objets_connectes_v22.csv", index=False)
print("✅ Données sauvegardées dans amazon_objets_connectes_v22.csv")

Scraping page 1
Scraping page 2
Scraping page 3
Scraping page 4
Scraping page 5
Scraping page 6
Scraping page 7
Scraping page 8
Scraping page 9
Scraping page 10
✅ Données sauvegardées dans amazon_objets_connectes_v22.csv


In [19]:
dfv22 = pd.read_csv('amazon_objets_connectes_v22.csv')
dfv22.head()

Unnamed: 0,title,price,rating,votes,sales_last_month,image_url
0,Garmin – Forerunner 55 - Montre GPS multi-acti...,13899.0,46,6 532,,https://m.media-amazon.com/images/I/61eTqEILa9...
1,Garmin Forerunner 255 - Montre GPS Multisports...,20399.0,45,1 690,,https://m.media-amazon.com/images/I/71tjy7Umf0...
2,"Garmin Forerunner 255, Music - Montre GPS Mult...",24400.0,45,616,,https://m.media-amazon.com/images/I/51BxNNStky...
3,"Insta360 X5 - caméra d'action 360° 8K étanche,...",58999.0,45,557,,https://m.media-amazon.com/images/I/71YjsbkkO9...
4,Garmin Forerunner 255 – GPS-Laufuhr mit indivi...,20399.0,46,662,,https://m.media-amazon.com/images/I/51oM3U58x2...


In [20]:
import time
import pandas as pd
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

# Configuration du driver Chrome
options = Options()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# URL catégorie Objets Connectés
base_url = "https://www.amazon.fr/s?i=electronics&srs=4551203031&rh=n%3A4551203031&s=popularity-rank&fs=true&ref=lp_4551203031_sar"

driver.get(base_url)
time.sleep(3)

# Clic sur le bouton "Voir tous les résultats"
try:
    see_all_button = driver.find_element(By.XPATH, "//span[text()='Voir tous les résultats']/ancestor::a")
    see_all_button.click()
    time.sleep(3)
except:
    pass  # s'il n'existe pas

products = []
page = 1
max_pages = 10  # modifie selon ton besoin

while page <= max_pages:
    print(f"Scraping page {page}...")
    soup = BeautifulSoup(driver.page_source, "html.parser")
    items = driver.find_elements(By.CSS_SELECTOR, 'div.s-result-item[data-component-type="s-search-result"]')

    for rank, item in enumerate(items, start=1):
        try:
            title = item.find_element(By.CSS_SELECTOR, "h2").text
        except:
            title = None

        try:
            price_whole = item.find_element(By.CSS_SELECTOR, ".a-price-whole").text.replace(" ", "").replace(",", "")
            price_fraction = item.find_element(By.CSS_SELECTOR, ".a-price-fraction").text
            price = float(price_whole + price_fraction) / 100
        except:
            price = None

        try:
            rating = item.find_element(By.CSS_SELECTOR, "span.a-icon-alt").get_attribute("innerHTML").split(" ")[0].replace(",", ".")
            rating = float(rating)
        except:
            rating = None

        try:
            votes = item.find_element(By.CSS_SELECTOR, "span[aria-label$='évaluations']").text.replace(" ", "").replace(",", "")
            votes = int(votes)
        except:
            votes = None

        try:
            score = rating * votes if rating and votes else None
        except:
            score = None

        try:
            image_url = item.find_element(By.CSS_SELECTOR, "img.s-image").get_attribute("src")
        except:
            image_url = None

        try:
            sales_text = item.find_element(By.XPATH, ".//span[contains(text(),'au cours du dernier mois')]").text
        except:
            sales_text = None

        products.append({
            "title": title,
            "price": price,
            "rating": rating,
            "votes": votes,
            "score": score,
            "sales_last_month": sales_text,
            "image_url": image_url,
            "category": "Objets connectés",
            "rank": rank + (page - 1) * len(items),
            "scraped_at": datetime.now()
        })

    # Passage à la page suivante
    try:
        next_button = driver.find_element(By.CSS_SELECTOR, "a.s-pagination-next")
        if "s-pagination-disabled" in next_button.get_attribute("class"):
            break
        next_button.click()
        time.sleep(3)
        page += 1
    except:
        break

driver.quit()

# Création du DataFrame
dfv23 = pd.DataFrame(products)
dfv23.to_csv("amazon_objets_connectes_v23.csv", index=False)
print("✅ Données sauvegardées dans amazon_objets_connectes_v23.csv")

Scraping page 1...
✅ Données sauvegardées dans amazon_objets_connectes_v23.csv


In [21]:
dfv23 = pd.read_csv('amazon_objets_connectes_v23.csv')
dfv23.head()

Unnamed: 0,title,price,rating,votes,score,sales_last_month,image_url,category,rank,scraped_at
0,Garmin – Forerunner 55 - Montre GPS multi-acti...,138.99,4.6,,,,https://m.media-amazon.com/images/I/61eTqEILa9...,Objets connectés,1,2025-06-24 10:09:32.114658
1,Garmin Forerunner 255 - Montre GPS Multisports...,203.99,4.5,,,,https://m.media-amazon.com/images/I/71tjy7Umf0...,Objets connectés,2,2025-06-24 10:09:32.193033
2,"Garmin Forerunner 255, Music - Montre GPS Mult...",244.0,4.5,,,,https://m.media-amazon.com/images/I/51BxNNStky...,Objets connectés,3,2025-06-24 10:09:32.265688
3,"Insta360 X5 - caméra d'action 360° 8K étanche,...",589.99,4.5,,,,https://m.media-amazon.com/images/I/71YjsbkkO9...,Objets connectés,4,2025-06-24 10:09:32.418453
4,Garmin Forerunner 255 – GPS-Laufuhr mit indivi...,203.99,4.6,,,,https://m.media-amazon.com/images/I/51oM3U58x2...,Objets connectés,5,2025-06-24 10:09:32.507953


In [22]:
from datetime import datetime
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import re

# Configuration Selenium
options = Options()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

driver = webdriver.Chrome(options=options)

base_url = "https://www.amazon.fr/s?i=electronics&srs=4551203031&rh=n%3A4551203031&s=popularity-rank&fs=true&page={}"

products = []
page = 1
max_pages = 3

while page <= max_pages:
    print(f"Scraping page {page}")
    driver.get(base_url.format(page))
    time.sleep(3)
    
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    items = soup.find_all("div", {"data-component-type": "s-search-result"})
    
    for rank, item in enumerate(items, start=1 + (page - 1) * len(items)):
        title_elem = item.h2
        title = title_elem.text.strip() if title_elem else None

        price_whole = item.select_one('span.a-price > span.a-offscreen')
        price = None
        if price_whole:
            price = price_whole.text.strip().replace("€", "").replace(",", ".")
            try:
                price = float(price)
            except:
                price = None

        rating_tag = item.select_one("span.a-icon-alt")
        rating = None
        if rating_tag:
            rating_match = re.search(r"(\d+,\d+)", rating_tag.text)
            if rating_match:
                rating = float(rating_match.group(1).replace(",", "."))

        votes_elem = item.find("span", {"class": "a-size-base s-underline-text"})
        votes = None
        if votes_elem:
            votes_text = votes_elem.text.strip().replace("\u202f", "").replace(" ", "").replace(" ", "").replace(".", "")
            if votes_text.isdigit():
                votes = int(votes_text)

        sales_elem = item.find("span", class_="a-size-base a-color-secondary")
        sales = None
        if sales_elem and "acheté" in sales_elem.text:
            sales_match = re.search(r"(\d[\d\s]+)", sales_elem.text)
            if sales_match:
                sales = int(sales_match.group(1).replace(" ", "").replace("\u202f", ""))

        image_elem = item.find("img")
        image_url = image_elem['src'] if image_elem else None

        products.append({
            "title": title,
            "price": price,
            "rating": rating,
            "votes": votes,
            "sales_last_month": sales,
            "image_url": image_url,
            "category": "Objets connectés",
            "rank": rank,
            "scraped_at": datetime.now()
        })
    
    page += 1

driver.quit()

dfv24 = pd.DataFrame(products)
dfv24.to_csv("amazon_objets_connectes_v24.csv", index=False)
print("✅ Données sauvegardées : amazon_objets_connectes_v24.csv")

Scraping page 1
Scraping page 2
Scraping page 3
✅ Données sauvegardées : amazon_objets_connectes_v24.csv


In [27]:
dfv24 = pd.read_csv('amazon_objets_connectes_v24.csv')
dfv24.sample(20)

Unnamed: 0,title,price,rating,votes,sales_last_month,image_url,category,rank,scraped_at
68,"Ecouteurs Bluetooth Sans Fil, Écouteurs Blueto...",21.99,4.4,,100.0,https://m.media-amazon.com/images/I/719kzqTTcg...,Objets connectés,69,2025-06-24 10:15:27.539893
56,"SUUNTO Race Montre de Sport Homme Femme, Track...",419.0,4.4,400.0,,https://m.media-amazon.com/images/I/71ejASC5Cd...,Objets connectés,57,2025-06-24 10:15:27.534400
29,Soundcore Space One Casque Bletooth sans fil a...,72.99,4.3,,200.0,https://m.media-amazon.com/images/I/51jCt-yAPO...,Objets connectés,30,2025-06-24 10:15:22.671245
31,AKASO Caméra Sport 4K Etanche WiFi Action Camé...,119.99,4.5,,100.0,https://m.media-amazon.com/images/I/71OUG6j-XG...,Objets connectés,32,2025-06-24 10:15:22.672056
61,Creative Labs Pebble 4.4W Noir haut-parleur - ...,25.32,4.5,,100.0,https://m.media-amazon.com/images/I/51o6ogGlR5...,Objets connectés,62,2025-06-24 10:15:27.536881
71,"WOLFANG Caméra Sport 4K 60FPS, GA300 24MP Gran...",59.99,4.2,,50.0,https://m.media-amazon.com/images/I/71MZ7LJGUm...,Objets connectés,72,2025-06-24 10:15:27.540881
38,"Ecouteurs Bluetooth sans Fil, AOVOCE Casque Bl...",20.99,4.4,,300.0,https://m.media-amazon.com/images/I/7192lowvvh...,Objets connectés,39,2025-06-24 10:15:22.677215
7,"Garmin Forerunner 255, Music - Montre GPS Mult...",243.99,4.5,165.0,200.0,https://m.media-amazon.com/images/I/51LVczCIWJ...,Objets connectés,8,2025-06-24 10:15:18.148242
69,Cambridge Audio Melomania M100 Earbuds - Casqu...,129.0,4.1,543.0,50.0,https://m.media-amazon.com/images/I/61T99yMIYa...,Objets connectés,70,2025-06-24 10:15:27.540213
17,"DJI Bundle Aventure Osmo Action 5 Pro, Caméra ...",478.98,4.5,,50.0,https://m.media-amazon.com/images/I/71FdBD3y0r...,Objets connectés,18,2025-06-24 10:15:18.153747


In [28]:
from datetime import datetime
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import re

# Configuration Selenium
options = Options()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

driver = webdriver.Chrome(options=options)

base_url = "https://www.amazon.fr/s?i=electronics&srs=4551203031&rh=n%3A4551203031&s=popularity-rank&fs=true&page={}"

products = []
page = 1
max_pages = 3

while page <= max_pages:
    print(f"Scraping page {page}")
    driver.get(base_url.format(page))
    time.sleep(3)
    
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    items = soup.find_all("div", {"data-component-type": "s-search-result"})
    
    for rank, item in enumerate(items, start=1 + (page - 1) * len(items)):
        title_elem = item.h2
        title = title_elem.text.strip() if title_elem else None

        price_whole = item.select_one('span.a-price > span.a-offscreen')
        price = None
        if price_whole:
            price = price_whole.text.strip().replace("€", "").replace(",", ".")
            try:
                price = float(price)
            except:
                price = None

        rating_tag = item.select_one("span.a-icon-alt")
        rating = None
        if rating_tag:
            rating_match = re.search(r"(\d+,\d+)", rating_tag.text)
            if rating_match:
                rating = float(rating_match.group(1).replace(",", "."))

        # ✅ Nouvelle méthode robuste pour récupérer les votes
        votes = None
        votes_elem = item.select_one('span[aria-label*="évaluations"]')
        if votes_elem:
            votes_text = votes_elem.get("aria-label")
            votes_match = re.search(r"([\d\s ]+)évaluations", votes_text)
            if votes_match:
                votes_clean = votes_match.group(1).replace(" ", "").replace("\u202f", "").replace(" ", "")
                if votes_clean.isdigit():
                    votes = int(votes_clean)

        sales_elem = item.find("span", class_="a-size-base a-color-secondary")
        sales = None
        if sales_elem and "acheté" in sales_elem.text:
            sales_match = re.search(r"(\d[\d\s]+)", sales_elem.text)
            if sales_match:
                sales = int(sales_match.group(1).replace(" ", "").replace("\u202f", ""))

        image_elem = item.find("img")
        image_url = image_elem['src'] if image_elem else None

        products.append({
            "title": title,
            "price": price,
            "rating": rating,
            "votes": votes,
            "sales_last_month": sales,
            "image_url": image_url,
            "category": "Objets connectés",
            "rank": rank,
            "scraped_at": datetime.now()
        })
    
    page += 1

driver.quit()

dfv25 = pd.DataFrame(products)
dfv25.to_csv("amazon_objets_connectes_v25.csv", index=False)
print("✅ Données sauvegardées : amazon_objets_connectes_v25.csv")

Scraping page 1
Scraping page 2
Scraping page 3
✅ Données sauvegardées : amazon_objets_connectes_v25.csv


In [29]:
dfv25 = pd.read_csv('amazon_objets_connectes_v25.csv')
dfv25.sample(20)

Unnamed: 0,title,price,rating,votes,sales_last_month,image_url,category,rank,scraped_at
34,"DJI Bundle Essentiel Osmo Action 5 Pro, Caméra...",369.0,4.5,,50.0,https://m.media-amazon.com/images/I/71LN4iT1Pq...,Objets connectés,35,2025-06-24 10:24:00.988655
30,Apexcam M80 Caméra Sport 4K Stabilisateur 60FP...,53.49,4.6,,200.0,https://m.media-amazon.com/images/I/71MqVAkobQ...,Objets connectés,31,2025-06-24 10:24:00.980277
71,"WOLFANG Caméra Sport 4K 60FPS, GA300 24MP Gran...",59.99,4.2,,50.0,https://m.media-amazon.com/images/I/71MZ7LJGUm...,Objets connectés,72,2025-06-24 10:24:05.859738
42,"Ecouteurs Bluetooth sans Fil, Casque Bluetooth...",23.98,4.5,,300.0,https://m.media-amazon.com/images/I/71-m3Etug6...,Objets connectés,43,2025-06-24 10:24:01.005541
47,"HoomBand Casque Audio Bluetooth, Sommeil Bande...",69.0,4.2,,100.0,https://m.media-amazon.com/images/I/916q9FD91Z...,Objets connectés,48,2025-06-24 10:24:01.017771
36,SANOTO Casque Conduction Osseuse Open Ear Casq...,49.99,4.2,,300.0,https://m.media-amazon.com/images/I/513AJLSmNM...,Objets connectés,37,2025-06-24 10:24:00.992355
14,"Ecouteurs Bluetooth sans Fil, kauguo Oreillett...",19.99,4.7,,500.0,https://m.media-amazon.com/images/I/61AYm-VTJp...,Objets connectés,15,2025-06-24 10:23:56.326602
2,"Garmin Forerunner 255, Music - Montre GPS Mult...",244.0,4.5,,700.0,https://m.media-amazon.com/images/I/51BxNNStky...,Objets connectés,3,2025-06-24 10:23:56.318853
63,GoPro Ensemble d'accessoires HERO12 – Comprend...,406.48,4.6,,,https://m.media-amazon.com/images/I/61FHaxNrUM...,Objets connectés,64,2025-06-24 10:24:05.853585
9,GoPro HERO13 Black - Caméra d'action étanche a...,423.59,4.5,,100.0,https://m.media-amazon.com/images/I/615eTO83jL...,Objets connectés,10,2025-06-24 10:23:56.323466


In [30]:
from datetime import datetime
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import re

# Configuration Selenium
options = Options()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

driver = webdriver.Chrome(options=options)

base_url = "https://www.amazon.fr/s?i=electronics&srs=4551203031&rh=n%3A4551203031&s=popularity-rank&fs=true&page={}"

products = []
page = 1
max_pages = 10

while page <= max_pages:
    print(f"🔍 Scraping page {page}")
    driver.get(base_url.format(page))
    time.sleep(3)

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    items = soup.find_all("div", {"data-component-type": "s-search-result"})

    for rank, item in enumerate(items, start=1 + (page - 1) * len(items)):
        title_elem = item.h2
        title = title_elem.text.strip() if title_elem else None

        # Lien du produit
        url = None
        if title_elem:
            link = title_elem.find("a", href=True)
            if link:
                url = "https://www.amazon.fr" + link["href"]

        # Prix
        price = None
        price_whole = item.select_one('span.a-price > span.a-offscreen')
        if price_whole:
            try:
                price = float(price_whole.text.strip().replace("€", "").replace(",", "."))
            except:
                price = None

        # Note
        rating = None
        rating_tag = item.select_one("span.a-icon-alt")
        if rating_tag:
            rating_match = re.search(r"(\d+,\d+)", rating_tag.text)
            if rating_match:
                rating = float(rating_match.group(1).replace(",", "."))

        # Votes (robuste)
        votes = None
        aria_elem = item.select_one('span[aria-label*="évaluations"]')
        if aria_elem:
            match = re.search(r"([\d\s ]+)évaluations", aria_elem.get("aria-label", ""))
            if match:
                votes = int(match.group(1).replace(" ", "").replace("\u202f", "").replace(" ", ""))

        if votes is None:
            raw_votes = item.select_one('span.a-size-base.s-underline-text')
            if raw_votes:
                clean = raw_votes.text.strip().replace(" ", "").replace("\u202f", "").replace(" ", "")
                if clean.isdigit():
                    votes = int(clean)

        # Ventes estimées
        sales = None
        sales_elem = item.find("span", class_="a-size-base a-color-secondary")
        if sales_elem and "acheté" in sales_elem.text:
            match = re.search(r"(\d[\d\s]+)", sales_elem.text)
            if match:
                sales = int(match.group(1).replace(" ", "").replace("\u202f", ""))

        # Image
        image_elem = item.find("img")
        image_url = image_elem['src'] if image_elem else None

        # Marque (naïf : premier mot du titre)
        brand = title.split()[0] if title else None

        products.append({
            "title": title,
            "brand": brand,
            "price": price,
            "rating": rating,
            "votes": votes,
            "sales_last_month": sales,
            "image_url": image_url,
            "url": url,
            "category": "Objets connectés",
            "rank": rank,
            "scraped_at": datetime.now()
        })

    page += 1

driver.quit()

# Sauvegarde CSV
dfv26 = pd.DataFrame(products)
dfv26.to_csv("amazon_objets_connectes_v26.csv", index=False)
print("✅ Données sauvegardées : amazon_objets_connectes_v26.csv")

🔍 Scraping page 1
🔍 Scraping page 2
🔍 Scraping page 3
🔍 Scraping page 4
🔍 Scraping page 5
🔍 Scraping page 6
🔍 Scraping page 7
🔍 Scraping page 8
🔍 Scraping page 9
🔍 Scraping page 10
✅ Données sauvegardées : amazon_objets_connectes_v26.csv


In [None]:
dfv26 = pd.read_csv('amazon_objets_connectes_v26.csv')
dfv26.sample(50)

In [49]:
from datetime import datetime
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import re

# Configuration Selenium
options = Options()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

driver = webdriver.Chrome(options=options)

base_url = "https://www.amazon.fr/s?i=electronics&srs=4551203031&rh=n%3A4551203031&s=popularity-rank&fs=true&page={}"

products = []
page = 1
max_pages = 100

while page <= max_pages:
    print(f"Scraping page {page}")
    driver.get(base_url.format(page))
    time.sleep(3)

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    items = soup.find_all("div", {"data-component-type": "s-search-result"})

    for rank, item in enumerate(items, start=1 + (page - 1) * len(items)):
        title_elem = item.h2
        title = title_elem.text.strip() if title_elem else None

        price_whole = item.select_one('span.a-price > span.a-offscreen')
        price = None
        if price_whole:
            price = price_whole.text.strip().replace("€", "").replace(",", ".")
            try:
                price = float(price)
            except:
                price = None

        rating_tag = item.select_one("span.a-icon-alt")
        rating = None
        if rating_tag:
            rating_match = re.search(r"(\d+,\d+)", rating_tag.text)
            if rating_match:
                rating = float(rating_match.group(1).replace(",", "."))

        # ✅ Bloc votes amélioré
        votes = None
        votes_text = None
        votes_elem = item.find("span", class_="a-size-base s-underline-text")
        if votes_elem:
            votes_text = votes_elem.text.strip()
        else:
            alt_votes = item.select_one("div.a-row.a-size-small span.a-size-base")
            if alt_votes:
                votes_text = alt_votes.text.strip()
        if votes_text:
            cleaned = re.sub(r"[^\d]", "", votes_text)
            if cleaned.isdigit():
                votes = int(cleaned)

        sales_elem = item.find("span", class_="a-size-base a-color-secondary")
        sales = None
        if sales_elem and "acheté" in sales_elem.text:
            sales_match = re.search(r"(\d[\d\s]+)", sales_elem.text)
            if sales_match:
                sales = int(sales_match.group(1).replace(" ", "").replace("\u202f", ""))

        image_elem = item.find("img")
        image_url = image_elem['src'] if image_elem else None

        brand = None
        if title:
            brand = title.split()[0]

        products.append({
            "title": title,
            "brand": brand,
            "price": price,
            "rating": rating,
            "votes": votes,
            "sales_last_month": sales,
            "image_url": image_url,
            "category": "Objets connectés",
            "rank": rank,
            "scraped_at": datetime.now()
        })

    page += 1

driver.quit()

dfv27 = pd.DataFrame(products)
dfv27.to_csv("amazon_objets_connectes_v27.csv", index=False)
print("✅ Données sauvegardées : amazon_objets_connectes_v27.csv")


Scraping page 1
Scraping page 2
Scraping page 3
Scraping page 4
Scraping page 5
Scraping page 6
Scraping page 7
Scraping page 8
Scraping page 9
Scraping page 10
Scraping page 11
Scraping page 12
Scraping page 13
Scraping page 14
Scraping page 15
Scraping page 16
Scraping page 17
Scraping page 18
Scraping page 19
Scraping page 20
Scraping page 21
Scraping page 22
Scraping page 23
Scraping page 24
Scraping page 25
Scraping page 26
Scraping page 27
Scraping page 28
Scraping page 29
Scraping page 30
Scraping page 31
Scraping page 32
Scraping page 33
Scraping page 34
Scraping page 35
Scraping page 36
Scraping page 37
Scraping page 38
Scraping page 39
Scraping page 40
Scraping page 41
Scraping page 42
Scraping page 43
Scraping page 44
Scraping page 45
Scraping page 46
Scraping page 47
Scraping page 48
Scraping page 49
Scraping page 50
Scraping page 51
Scraping page 52
Scraping page 53
Scraping page 54
Scraping page 55
Scraping page 56
Scraping page 57
Scraping page 58
Scraping page 59
Scrapi

In [50]:
dfv27 = pd.read_csv('amazon_objets_connectes_v27.csv')
dfv27.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2280 entries, 0 to 2279
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   title             2280 non-null   object 
 1   brand             2280 non-null   object 
 2   price             2140 non-null   float64
 3   rating            2082 non-null   float64
 4   votes             2082 non-null   float64
 5   sales_last_month  142 non-null    float64
 6   image_url         2280 non-null   object 
 7   category          2280 non-null   object 
 8   rank              2280 non-null   int64  
 9   scraped_at        2280 non-null   object 
dtypes: float64(4), int64(1), object(5)
memory usage: 178.3+ KB


In [41]:
from datetime import datetime
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import re

# Configuration Selenium
options = Options()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

driver = webdriver.Chrome(options=options)

base_url = "https://www.amazon.fr/s?i=electronics&srs=4551203031&rh=n%3A4551203031&s=popularity-rank&fs=true&page={}"

products = []
page = 1
max_pages = 30  # Ajuste selon ton besoin

while page <= max_pages:
    print(f"Scraping page {page}")
    driver.get(base_url.format(page))
    time.sleep(3)

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    items = soup.find_all("div", {"data-component-type": "s-search-result"})

    for rank, item in enumerate(items, start=1 + (page - 1) * len(items)):
        title_elem = item.h2
        title = title_elem.text.strip() if title_elem else None

        brand = title.split()[0] if title else None

        price_whole = item.select_one('span.a-price > span.a-offscreen')
        price = None
        if price_whole:
            price = price_whole.text.strip().replace("€", "").replace(",", ".")
            try:
                price = float(price)
            except:
                price = None

        rating_tag = item.select_one("span.a-icon-alt")
        rating = None
        if rating_tag:
            rating_match = re.search(r"(\d+,\d+)", rating_tag.text)
            if rating_match:
                rating = float(rating_match.group(1).replace(",", "."))

        votes_elem = item.find("span", {"class": "a-size-base s-underline-text"})
        votes = None
        if votes_elem:
            votes_text = votes_elem.text.strip().replace("\u202f", "").replace(" ", "").replace(" ", "").replace(".", "")
            if votes_text.isdigit():
                votes = int(votes_text)

        # 🔥 Nouvelle méthode plus fiable pour détecter les ventes
        sales = None
        sales_elem_candidates = item.find_all(string=re.compile(r"acheté"))
        for candidate in sales_elem_candidates:
            if "acheté" in candidate:
                sales_match = re.search(r"(\d[\d\s]+)", candidate)
                if sales_match:
                    try:
                        sales = int(sales_match.group(1).replace(" ", "").replace("\u202f", ""))
                        break
                    except:
                        continue

        image_elem = item.find("img")
        image_url = image_elem['src'] if image_elem else None

        products.append({
            "title": title,
            "brand": brand,
            "price": price,
            "rating": rating,
            "votes": votes,
            "sales_last_month": sales,
            "image_url": image_url,
            "category": "Objets connectés",
            "rank": rank,
            "scraped_at": datetime.now()
        })

    page += 1

driver.quit()

dfv28 = pd.DataFrame(products)
dfv28.to_csv("amazon_objets_connectes_v28.csv", index=False)
print("✅ Données sauvegardées : amazon_objets_connectes_v28.csv")


Scraping page 1
Scraping page 2
Scraping page 3
Scraping page 4
Scraping page 5
Scraping page 6
Scraping page 7
Scraping page 8
Scraping page 9
Scraping page 10
Scraping page 11
Scraping page 12
Scraping page 13
Scraping page 14
Scraping page 15
Scraping page 16
Scraping page 17
Scraping page 18
Scraping page 19
Scraping page 20
Scraping page 21
Scraping page 22
Scraping page 23
Scraping page 24
Scraping page 25
Scraping page 26
Scraping page 27
Scraping page 28
Scraping page 29
Scraping page 30
✅ Données sauvegardées : amazon_objets_connectes_v28.csv


In [42]:
dfv28 = pd.read_csv('amazon_objets_connectes_v28.csv')
dfv28.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 720 entries, 0 to 719
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   title             720 non-null    object 
 1   brand             720 non-null    object 
 2   price             692 non-null    float64
 3   rating            702 non-null    float64
 4   votes             329 non-null    float64
 5   sales_last_month  146 non-null    float64
 6   image_url         720 non-null    object 
 7   category          720 non-null    object 
 8   rank              720 non-null    int64  
 9   scraped_at        720 non-null    object 
dtypes: float64(4), int64(1), object(5)
memory usage: 56.4+ KB


In [43]:
from datetime import datetime
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import re

# Configuration Selenium
options = Options()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

driver = webdriver.Chrome(options=options)

base_url = "https://www.amazon.fr/s?i=electronics&srs=4551203031&rh=n%3A4551203031&s=popularity-rank&fs=true&page={}"

products = []
page = 1
max_pages = 10  # ← adapte si besoin

while page <= max_pages:
    print(f"Scraping page {page}")
    driver.get(base_url.format(page))
    time.sleep(3)
    
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    items = soup.find_all("div", {"data-component-type": "s-search-result"})
    
    for rank, item in enumerate(items, start=1 + (page - 1) * len(items)):
        title_elem = item.h2
        title = title_elem.text.strip() if title_elem else None

        price_whole = item.select_one('span.a-price > span.a-offscreen')
        price = None
        if price_whole:
            price = price_whole.text.strip().replace("€", "").replace(",", ".")
            try:
                price = float(price)
            except:
                price = None

        rating_tag = item.select_one("span.a-icon-alt")
        rating = None
        if rating_tag:
            rating_match = re.search(r"(\d+,\d+)", rating_tag.text)
            if rating_match:
                rating = float(rating_match.group(1).replace(",", "."))

        votes_elem = item.find("span", {"class": "a-size-base s-underline-text"})
        votes = None
        if votes_elem:
            votes_text = votes_elem.text.strip().replace("\u202f", "").replace(" ", "").replace(" ", "").replace(".", "")
            if votes_text.isdigit():
                votes = int(votes_text)

        # 🔥 Version robuste de extraction "ventes dernier mois"
        sales = None
        sales_elem_candidates = item.find_all(string=re.compile(r"(acheté|vendu)", re.IGNORECASE))
        for candidate in sales_elem_candidates:
            text = candidate.strip()
            if "mois dernier" in text:
                sales_match = re.search(r"(\d[\d\s]*)", text)
                if sales_match:
                    try:
                        sales = int(sales_match.group(1).replace(" ", "").replace("\u202f", ""))
                    except:
                        sales = None
                elif "Plus de" in text:
                    approx_match = re.search(r"Plus de (\d+)", text)
                    if approx_match:
                        try:
                            sales = int(approx_match.group(1))
                        except:
                            sales = None
                break

        image_elem = item.find("img")
        image_url = image_elem['src'] if image_elem else None

        products.append({
            "title": title,
            "price": price,
            "rating": rating,
            "votes": votes,
            "sales_last_month": sales,
            "image_url": image_url,
            "category": "Objets connectés",
            "rank": rank,
            "scraped_at": datetime.now()
        })
    
    page += 1

driver.quit()

dfv29 = pd.DataFrame(products)
dfv29.to_csv("amazon_objets_connectes_v29.csv", index=False)
print("✅ Données sauvegardées : amazon_objets_connectes_v29.csv")


Scraping page 1
Scraping page 2
Scraping page 3
Scraping page 4
Scraping page 5
Scraping page 6
Scraping page 7
Scraping page 8
Scraping page 9
Scraping page 10
✅ Données sauvegardées : amazon_objets_connectes_v29.csv


In [44]:
dfv29 = pd.read_csv('amazon_objets_connectes_v29.csv')
dfv29.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   title             240 non-null    object 
 1   price             235 non-null    float64
 2   rating            234 non-null    float64
 3   votes             105 non-null    float64
 4   sales_last_month  138 non-null    float64
 5   image_url         240 non-null    object 
 6   category          240 non-null    object 
 7   rank              240 non-null    int64  
 8   scraped_at        240 non-null    object 
dtypes: float64(4), int64(1), object(4)
memory usage: 17.0+ KB


In [47]:
from datetime import datetime
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import re

# Configuration de Selenium
options = Options()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

driver = webdriver.Chrome(options=options)

base_url = "https://www.amazon.fr/s?i=electronics&srs=4551203031&rh=n%3A4551203031&s=popularity-rank&fs=true&page={}"
products = []
page = 1
max_pages = 100  # augmente si besoin

while page <= max_pages:
    print(f"Scraping page {page}")
    driver.get(base_url.format(page))
    time.sleep(3)

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    items = soup.find_all("div", {"data-component-type": "s-search-result"})

    for rank, item in enumerate(items, start=1 + (page - 1) * len(items)):
        title_elem = item.h2
        title = title_elem.text.strip() if title_elem else None

        brand = None  # inchangé de v27

        price_elem = item.select_one('span.a-price > span.a-offscreen')
        price = None
        if price_elem:
            try:
                price = float(price_elem.text.replace("€", "").replace(",", "."))
            except:
                pass

        rating_elem = item.select_one("span.a-icon-alt")
        rating = None
        if rating_elem:
            m = re.search(r"(\d+,\d+)", rating_elem.text)
            if m:
                rating = float(m.group(1).replace(",", "."))

        votes_elem = item.find("span", class_="a-size-base s-underline-text")
        votes = None
        if votes_elem:
            raw = votes_elem.text.strip().replace("\u202f", "").replace(" ", "").replace(" ", "")
            if raw.isdigit():
                votes = int(raw)

        # ✅ Optimisation de sales_last_month uniquement
        sales = None
        candidates = item.find_all("span", class_="a-size-base a-color-secondary")
        for cand in candidates:
            txt = cand.get_text(strip=True)
            if "acheté" in txt.lower() or "vendu" in txt.lower():
                match = re.search(r"(\d[\d\s]*)", txt)
                if match:
                    try:
                        sales = int(match.group(1).replace(" ", "").replace("\u202f", ""))
                        break
                    except:
                        pass
                elif "Plus de" in txt:
                    match = re.search(r"Plus de (\d+)", txt)
                    if match:
                        try:
                            sales = int(match.group(1))
                            break
                        except:
                            pass

        image_elem = item.find("img")
        image_url = image_elem['src'] if image_elem else None

        products.append({
            "title": title,
            "brand": brand,
            "price": price,
            "rating": rating,
            "votes": votes,
            "sales_last_month": sales,
            "image_url": image_url,
            "category": "Objets connectés",
            "rank": rank,
            "scraped_at": datetime.now()
        })

    page += 1

driver.quit()

df = pd.DataFrame(products)
df.to_csv("amazon_objets_connectes_v30.csv", index=False)
print("✅ Données sauvegardées : amazon_objets_connectes_v30.csv")


Scraping page 1
Scraping page 2
Scraping page 3
Scraping page 4
Scraping page 5
Scraping page 6
Scraping page 7
Scraping page 8
Scraping page 9
Scraping page 10
Scraping page 11
Scraping page 12
Scraping page 13
Scraping page 14
Scraping page 15
Scraping page 16
Scraping page 17
Scraping page 18
Scraping page 19
Scraping page 20
Scraping page 21
Scraping page 22
Scraping page 23
Scraping page 24
Scraping page 25
Scraping page 26
Scraping page 27
Scraping page 28
Scraping page 29
Scraping page 30
Scraping page 31
Scraping page 32
Scraping page 33
Scraping page 34
Scraping page 35
Scraping page 36
Scraping page 37
Scraping page 38
Scraping page 39
Scraping page 40
Scraping page 41
Scraping page 42
Scraping page 43
Scraping page 44
Scraping page 45
Scraping page 46
Scraping page 47
Scraping page 48
Scraping page 49
Scraping page 50
Scraping page 51
Scraping page 52
Scraping page 53
Scraping page 54
Scraping page 55
Scraping page 56
Scraping page 57
Scraping page 58
Scraping page 59
Scrapi

In [48]:
dfv30 = pd.read_csv('amazon_objets_connectes_v30.csv')
dfv30.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2400 entries, 0 to 2399
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   title             2400 non-null   object 
 1   brand             0 non-null      float64
 2   price             2249 non-null   float64
 3   rating            2186 non-null   float64
 4   votes             1432 non-null   float64
 5   sales_last_month  142 non-null    float64
 6   image_url         2400 non-null   object 
 7   category          2400 non-null   object 
 8   rank              2400 non-null   int64  
 9   scraped_at        2400 non-null   object 
dtypes: float64(5), int64(1), object(4)
memory usage: 187.6+ KB


In [51]:
df = dfv27
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2280 entries, 0 to 2279
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   title             2280 non-null   object 
 1   brand             2280 non-null   object 
 2   price             2140 non-null   float64
 3   rating            2082 non-null   float64
 4   votes             2082 non-null   float64
 5   sales_last_month  142 non-null    float64
 6   image_url         2280 non-null   object 
 7   category          2280 non-null   object 
 8   rank              2280 non-null   int64  
 9   scraped_at        2280 non-null   object 
dtypes: float64(4), int64(1), object(5)
memory usage: 178.3+ KB


In [53]:
import re

def extract_asin(url):
    """Extrait l'ASIN à partir d'une URL Amazon"""
    if not url:
        return None
    match = re.search(r"/dp/([A-Z0-9]{10})", url)
    if match:
        return match.group(1)
    return None


In [54]:
url = "https://www.amazon.fr/dp/B07HFWPJX4/"
print(extract_asin(url))  # Résultat attendu : B07HFWPJX4


B07HFWPJX4


In [57]:
from datetime import datetime
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import re

# Fonction pour parser les ventes "1k" => 1000
def parse_sales_number(text):
    text = text.lower().replace(" ", "").replace(",", ".").replace(" ", "")
    match = re.search(r"([\d\.]+)(k?)", text)
    if match:
        number = float(match.group(1))
        if match.group(2) == 'k':
            return int(number * 1000)
        return int(number)
    return None

# Config Selenium
options = Options()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome(options=options)

base_url = "https://www.amazon.fr/s?i=electronics&srs=4551203031&rh=n%3A4551203031&s=popularity-rank&fs=true&page={}"
products = []
page = 1
max_pages = 20

while page <= max_pages:
    print(f"Scraping page {page}")
    driver.get(base_url.format(page))
    time.sleep(3)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    items = soup.find_all("div", {"data-component-type": "s-search-result"})

    for rank, item in enumerate(items, start=1 + (page - 1) * len(items)):
        title_elem = item.h2
        title = title_elem.text.strip() if title_elem else None

        price = None
        price_elem = item.select_one('span.a-price > span.a-offscreen')
        if price_elem:
            try:
                price = float(price_elem.text.strip().replace("€", "").replace(",", "."))
            except:
                price = None

        rating = None
        rating_tag = item.select_one("span.a-icon-alt")
        if rating_tag:
            rating_match = re.search(r"(\d+,\d+)", rating_tag.text)
            if rating_match:
                rating = float(rating_match.group(1).replace(",", "."))

        votes = None
        votes_text = None
        votes_elem = item.find("span", class_="a-size-base s-underline-text")
        if votes_elem:
            votes_text = votes_elem.text.strip()
        else:
            alt_votes = item.select_one("div.a-row.a-size-small span.a-size-base")
            if alt_votes:
                votes_text = alt_votes.text.strip()
        if votes_text:
            cleaned = re.sub(r"[^\d]", "", votes_text)
            if cleaned.isdigit():
                votes = int(cleaned)

        # ✅ Sales fix: gère k / chiffres
        sales_elem = item.find("span", class_="a-size-base a-color-secondary")
        sales = None
        if sales_elem and "acheté" in sales_elem.text:
            sales = parse_sales_number(sales_elem.text)

        image_elem = item.find("img")
        image_url = image_elem['src'] if image_elem else None

        brand = title.split()[0] if title else None

        url_elem = item.find("a", class_="a-link-normal", href=True)
        url = f"https://www.amazon.fr{url_elem['href'].split('?')[0]}" if url_elem else None

        # ✅ Ajout colonne pour traitement futur
        has_reviews = bool(rating and votes and votes > 0)

        products.append({
            "title": title,
            "brand": brand,
            "price": price,
            "rating": rating,
            "votes": votes,
            "sales_last_month": sales,
            "image_url": image_url,
            "url": url,
            "has_reviews": has_reviews,
            "category": "Objets connectés",
            "rank": rank,
            "scraped_at": datetime.now()
        })

    page += 1

driver.quit()

dfv41 = pd.DataFrame(products)
dfv41.to_csv("amazon_objets_connectes_v41.csv", index=False)
print("✅ Données sauvegardées dans amazon_objets_connectes_v41.csv")


Scraping page 1
Scraping page 2
Scraping page 3
Scraping page 4
Scraping page 5
Scraping page 6
Scraping page 7
Scraping page 8
Scraping page 9
Scraping page 10
Scraping page 11
Scraping page 12
Scraping page 13
Scraping page 14
Scraping page 15
Scraping page 16
Scraping page 17
Scraping page 18
Scraping page 19
Scraping page 20
✅ Données sauvegardées dans amazon_objets_connectes_v41.csv


In [59]:
dfv41 = pd.read_csv('amazon_objets_connectes_v41.csv')
dfv41.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 480 entries, 0 to 479
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   title             480 non-null    object 
 1   brand             480 non-null    object 
 2   price             461 non-null    float64
 3   rating            474 non-null    float64
 4   votes             474 non-null    float64
 5   sales_last_month  142 non-null    float64
 6   image_url         480 non-null    object 
 7   url               480 non-null    object 
 8   has_reviews       480 non-null    bool   
 9   category          480 non-null    object 
 10  rank              480 non-null    int64  
 11  scraped_at        480 non-null    object 
dtypes: bool(1), float64(4), int64(1), object(6)
memory usage: 41.8+ KB


In [60]:
from datetime import datetime
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import re
import unicodedata

options = Options()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

driver = webdriver.Chrome(options=options)

base_url = "https://www.amazon.fr/s?i=electronics&srs=4551203031&rh=n%3A4551203031&s=popularity-rank&fs=true&page={}"
products = []
max_pages = 10  # nombre de pages à scrapper

for page in range(1, max_pages + 1):
    driver.get(base_url.format(page))
    time.sleep(2)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    items = soup.find_all("div", {"data-component-type": "s-search-result"})

    for rank, item in enumerate(items, start=1 + (page - 1) * len(items)):
        title_elem = item.h2
        title = title_elem.text.strip() if title_elem else None

        price_elem = item.select_one('span.a-price > span.a-offscreen')
        price = float(price_elem.text.strip().replace("€", "").replace(",", ".")) if price_elem else None

        rating = None
        rating_elem = item.select_one("span.a-icon-alt")
        if rating_elem:
            match = re.search(r"(\d+,\d+)", rating_elem.text)
            if match:
                rating = float(match.group(1).replace(",", "."))

        votes = None
        votes_elem = item.find("span", class_="a-size-base s-underline-text")
        if votes_elem:
            raw_votes = votes_elem.text.strip().replace(" ", "").replace("\u202f", "")
            if raw_votes.replace(".", "").isdigit():
                votes = int(raw_votes.replace(".", ""))

        sales = None
        sales_elem = item.find("span", class_="a-size-base a-color-secondary")
        if sales_elem:
            sales_text = unicodedata.normalize("NFKD", sales_elem.get_text(strip=True))
            if "acheté" in sales_text and "mois" in sales_text:
                match = re.search(r"(\d[\d\s\u202f\u00a0kK]*)", sales_text)
                if match:
                    raw_number = match.group(1).replace(" ", "").replace("\u202f", "").replace("\xa0", "").lower()
                    if "k" in raw_number:
                        try:
                            sales = int(float(raw_number.replace("k", "")) * 1000)
                        except:
                            sales = None
                    else:
                        try:
                            sales = int(raw_number)
                        except:
                            sales = None

        image_elem = item.find("img")
        image_url = image_elem['src'] if image_elem else None

        brand = title.split()[0] if title else None

        products.append({
            "title": title,
            "brand": brand,
            "price": price,
            "rating": rating,
            "votes": votes,
            "sales_last_month": sales,
            "image_url": image_url,
            "category": "Objets connectés",
            "rank": rank,
            "scraped_at": datetime.now()
        })

driver.quit()

df = pd.DataFrame(products)
df.to_csv("amazon_objets_connectes_v42.csv", index=False)
print("✅ Fichier amazon_objets_connectes_v42.csv généré avec succès.")


✅ Fichier amazon_objets_connectes_v42.csv généré avec succès.


In [61]:
dfv42 = pd.read_csv('amazon_objets_connectes_v42.csv')
dfv42.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   title             240 non-null    object 
 1   brand             240 non-null    object 
 2   price             233 non-null    float64
 3   rating            235 non-null    float64
 4   votes             104 non-null    float64
 5   sales_last_month  0 non-null      float64
 6   image_url         240 non-null    object 
 7   category          240 non-null    object 
 8   rank              240 non-null    int64  
 9   scraped_at        240 non-null    object 
dtypes: float64(4), int64(1), object(5)
memory usage: 18.9+ KB


In [62]:
from datetime import datetime
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import re

# Fonction pour convertir "1k" ou "1.2k" en 1000 ou 1200
def parse_sales_number(text):
    text = text.lower().replace(" ", "").replace(",", ".").replace(" ", "")
    match = re.search(r"([\d\.]+)(k?)", text)
    if match:
        number = float(match.group(1))
        if match.group(2) == 'k':
            return int(number * 1000)
        return int(number)
    return None

# Configuration Selenium
options = Options()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome(options=options)

base_url = "https://www.amazon.fr/s?i=electronics&srs=4551203031&rh=n%3A4551203031&s=popularity-rank&fs=true&page={}"
products = []
page = 1
max_pages = 10

while page <= max_pages:
    print(f"Scraping page {page}")
    driver.get(base_url.format(page))
    time.sleep(3)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    items = soup.find_all("div", {"data-component-type": "s-search-result"})

    for rank, item in enumerate(items, start=1 + (page - 1) * len(items)):
        title_elem = item.h2
        title = title_elem.text.strip() if title_elem else None

        price = None
        price_elem = item.select_one('span.a-price > span.a-offscreen')
        if price_elem:
            try:
                price = float(price_elem.text.strip().replace("€", "").replace(",", "."))
            except:
                price = None

        rating = None
        rating_tag = item.select_one("span.a-icon-alt")
        if rating_tag:
            rating_match = re.search(r"(\d+,\d+)", rating_tag.text)
            if rating_match:
                rating = float(rating_match.group(1).replace(",", "."))

        votes = None
        votes_text = None
        votes_elem = item.find("span", class_="a-size-base s-underline-text")
        if votes_elem:
            votes_text = votes_elem.text.strip()
        else:
            alt_votes = item.select_one("div.a-row.a-size-small span.a-size-base")
            if alt_votes:
                votes_text = alt_votes.text.strip()
        if votes_text:
            cleaned = re.sub(r"[^\d]", "", votes_text)
            if cleaned.isdigit():
                votes = int(cleaned)

        # ✅ Gestion propre des ventes (avec "k")
        sales_elem = item.find("span", class_="a-size-base a-color-secondary")
        sales = None
        if sales_elem and "acheté" in sales_elem.text:
            sales = parse_sales_number(sales_elem.text)

        image_elem = item.find("img")
        image_url = image_elem['src'] if image_elem else None

        brand = title.split()[0] if title else None

        url_elem = item.find("a", class_="a-link-normal", href=True)
        url = f"https://www.amazon.fr{url_elem['href'].split('?')[0]}" if url_elem else None

        # ✅ Booléen pour filtrer plus tard les produits avec avis
        has_reviews = bool(rating and votes and votes > 0)

        products.append({
            "title": title,
            "brand": brand,
            "price": price,
            "rating": rating,
            "votes": votes,
            "sales_last_month": sales,
            "image_url": image_url,
            "url": url,
            "has_reviews": has_reviews,
            "category": "Objets connectés",
            "rank": rank,
            "scraped_at": datetime.now()
        })

    page += 1

driver.quit()

dfv42 = pd.DataFrame(products)
dfv42.to_csv("amazon_objets_connectes_v42.csv", index=False)
print("✅ Données sauvegardées dans amazon_objets_connectes_v42.csv")


Scraping page 1
Scraping page 2
Scraping page 3
Scraping page 4
Scraping page 5
Scraping page 6
Scraping page 7
Scraping page 8
Scraping page 9
Scraping page 10
✅ Données sauvegardées dans amazon_objets_connectes_v42.csv


In [63]:
dfv42 = pd.read_csv('amazon_objets_connectes_v42.csv')
dfv42.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   title             240 non-null    object 
 1   brand             240 non-null    object 
 2   price             233 non-null    float64
 3   rating            235 non-null    float64
 4   votes             235 non-null    float64
 5   sales_last_month  138 non-null    float64
 6   image_url         240 non-null    object 
 7   url               240 non-null    object 
 8   has_reviews       240 non-null    bool   
 9   category          240 non-null    object 
 10  rank              240 non-null    int64  
 11  scraped_at        240 non-null    object 
dtypes: bool(1), float64(4), int64(1), object(6)
memory usage: 21.0+ KB


In [64]:
from datetime import datetime
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import re

# Fonction pour parser les ventes "1k" => 1000
def parse_sales_number(text):
    text = text.lower().replace("\u202f", "").replace(",", ".").replace(" ", "")
    match = re.search(r"([\d\.]+)(k?)", text)
    if match:
        number = float(match.group(1))
        if match.group(2) == 'k':
            return int(number * 1000)
        return int(number)
    return None

# Config Selenium
options = Options()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome(options=options)

base_url = "https://www.amazon.fr/s?i=electronics&srs=4551203031&rh=n%3A4551203031&s=popularity-rank&fs=true&page={}"
products = []
page = 1
max_pages = 20

while page <= max_pages:
    print(f"Scraping page {page}")
    driver.get(base_url.format(page))
    time.sleep(3)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    items = soup.find_all("div", {"data-component-type": "s-search-result"})

    for rank, item in enumerate(items, start=1 + (page - 1) * len(items)):
        title_elem = item.h2
        title = title_elem.text.strip() if title_elem else None

        price = None
        price_elem = item.select_one('span.a-price > span.a-offscreen')
        if price_elem:
            try:
                price = float(price_elem.text.strip().replace("€", "").replace(",", "."))
            except:
                price = None

        rating = None
        rating_tag = item.select_one("span.a-icon-alt")
        if rating_tag:
            rating_match = re.search(r"(\d+,\d+)", rating_tag.text)
            if rating_match:
                rating = float(rating_match.group(1).replace(",", "."))

        votes = None
        votes_text = None
        votes_elem = item.find("span", class_="a-size-base s-underline-text")
        if votes_elem:
            votes_text = votes_elem.text.strip()
        else:
            alt_votes = item.select_one("div.a-row.a-size-small span.a-size-base")
            if alt_votes:
                votes_text = alt_votes.text.strip()
        if votes_text:
            cleaned = re.sub(r"[^\d]", "", votes_text)
            if cleaned.isdigit():
                votes = int(cleaned)

        # ✅ Sales fix: gère k / chiffres avec logs
        sales_elem = item.find("span", class_="a-size-base a-color-secondary")
        sales = None
        if sales_elem:
            raw_sales_text = sales_elem.text.lower()
            if "acheté" in raw_sales_text:
                print("🔍 Acheté détecté brut:", raw_sales_text)
                sales = parse_sales_number(raw_sales_text)

        image_elem = item.find("img")
        image_url = image_elem['src'] if image_elem else None

        brand = title.split()[0] if title else None

        url_elem = item.find("a", class_="a-link-normal", href=True)
        url = f"https://www.amazon.fr{url_elem['href'].split('?')[0]}" if url_elem else None

        # ✅ Ajout colonne pour traitement futur
        has_reviews = bool(rating and votes and votes > 0)

        products.append({
            "title": title,
            "brand": brand,
            "price": price,
            "rating": rating,
            "votes": votes,
            "sales_last_month": sales,
            "image_url": image_url,
            "url": url,
            "has_reviews": has_reviews,
            "category": "Objets connectés",
            "rank": rank,
            "scraped_at": datetime.now()
        })

    page += 1

driver.quit()

dfv42 = pd.DataFrame(products)
dfv42.to_csv("amazon_objets_connectes_v42.csv", index=False)
print("✅ Données sauvegardées dans amazon_objets_connectes_v42.csv")


Scraping page 1
🔍 Acheté détecté brut: plus de 1 k achetés au cours du mois dernier
🔍 Acheté détecté brut: plus de 700 achetés au cours du mois dernier
🔍 Acheté détecté brut: plus de 300 achetés au cours du mois dernier
🔍 Acheté détecté brut: plus de 700 achetés au cours du mois dernier
🔍 Acheté détecté brut: plus de 400 achetés au cours du mois dernier
🔍 Acheté détecté brut: plus de 300 achetés au cours du mois dernier
🔍 Acheté détecté brut: plus de 300 achetés au cours du mois dernier
🔍 Acheté détecté brut: plus de 200 achetés au cours du mois dernier
🔍 Acheté détecté brut: plus de 200 achetés au cours du mois dernier
🔍 Acheté détecté brut: plus de 100 achetés au cours du mois dernier
🔍 Acheté détecté brut: plus de 700 achetés au cours du mois dernier
🔍 Acheté détecté brut: plus de 100 achetés au cours du mois dernier
🔍 Acheté détecté brut: plus de 400 achetés au cours du mois dernier
🔍 Acheté détecté brut: plus de 500 achetés au cours du mois dernier
🔍 Acheté détecté brut: plus de 2

In [65]:
dfv42 = pd.read_csv('amazon_objets_connectes_v42.csv')
dfv42.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 408 entries, 0 to 407
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   title             408 non-null    object 
 1   brand             408 non-null    object 
 2   price             393 non-null    float64
 3   rating            403 non-null    float64
 4   votes             403 non-null    float64
 5   sales_last_month  142 non-null    float64
 6   image_url         408 non-null    object 
 7   url               408 non-null    object 
 8   has_reviews       408 non-null    bool   
 9   category          408 non-null    object 
 10  rank              408 non-null    int64  
 11  scraped_at        408 non-null    object 
dtypes: bool(1), float64(4), int64(1), object(6)
memory usage: 35.6+ KB


In [None]:
df_clean.to