In [23]:
import requests
from bs4 import BeautifulSoup

def get_amazon_bestsellers(category_url, n=10):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36'
    }
    response = requests.get(category_url, headers=headers)
    if response.status_code != 200:
        print("Erreur lors de la r√©cup√©ration de la page")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    # Trouver tous les liens produits qui ont cette classe sp√©cifique
    product_links = soup.select('a.a-link-normal.aok-block')[:n]

    bestsellers = []
    for a in product_links:
        href = a.get('href')
        full_link = 'https://www.amazon.fr' + href if href else None

        img = a.find('img')
        title = img['alt'] if img and 'alt' in img.attrs else "Titre inconnu"

        bestsellers.append({'title': title, 'link': full_link})

    return bestsellers

if __name__ == "__main__":
    url = "https://www.amazon.fr/gp/bestsellers/electronics/"
    bestsellers = get_amazon_bestsellers(url, n=20)

    for i, product in enumerate(bestsellers, 1):
        print(f"{i}. {product['title']}")
        print(f"   Lien: {product['link']}\n")

1. Apple AirTag
   Lien: https://www.amazon.fr/Apple-MX532ZMA-Nouveau-AirTag/dp/B0935DN1BN/ref=zg_bs_g_electronics_d_sccl_1/260-0894714-1452108?psc=1

2. Titre inconnu
   Lien: https://www.amazon.fr/Apple-MX532ZMA-Nouveau-AirTag/dp/B0935DN1BN/ref=zg_bs_g_electronics_d_sccl_1/260-0894714-1452108?psc=1

3. Titre inconnu
   Lien: https://www.amazon.fr/Apple-MX532ZMA-Nouveau-AirTag/dp/B0935DN1BN/ref=zg_bs_g_electronics_d_sccl_1/260-0894714-1452108?psc=1

4. Amazon Fire TV Stick HD (Nouvelle g√©n√©ration) | TV gratuite et en direct, t√©l√©commande vocale Alexa, contr√¥le de la maison co
   Lien: https://www.amazon.fr/fire-tv-stick-hd/dp/B0CQMWQDH4/ref=zg_bs_g_electronics_d_sccl_2/260-0894714-1452108?psc=1

5. Titre inconnu
   Lien: https://www.amazon.fr/fire-tv-stick-hd/dp/B0CQMWQDH4/ref=zg_bs_g_electronics_d_sccl_2/260-0894714-1452108?psc=1

6. Titre inconnu
   Lien: https://www.amazon.fr/fire-tv-stick-hd/dp/B0CQMWQDH4/ref=zg_bs_g_electronics_d_sccl_2/260-0894714-1452108?psc=1

7. Imou 2K(

In [33]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import pandas as pd
import time

def scrape_aliexpress_selenium(url):
    options = Options()
    options.add_argument('--headless')  # ex√©cution sans ouvrir la fen√™tre du navigateur
    options.add_argument('--disable-gpu')
    driver = webdriver.Chrome(options=options)

    driver.get(url)
    time.sleep(5)  # attendre que la page charge JS (tu peux augmenter si n√©cessaire)

    products = []

    product_cards = driver.find_elements(By.CSS_SELECTOR, 'div.manhattan--content--1lP57Ag')
    if not product_cards:
        print("Aucun produit trouv√© sur la page.")
        driver.quit()
        return pd.DataFrame()

    for card in product_cards:
        try:
            title = card.find_element(By.CSS_SELECTOR, 'a.manhattan--titleText--WccSjUS').text
        except:
            title = ''
        try:
            price = card.find_element(By.CSS_SELECTOR, 'div.manhattan--price-sale--1CCSZfK').text
        except:
            price = ''
        try:
            orders = card.find_element(By.CSS_SELECTOR, 'div.manhattan--order--1lP57Ag').text
        except:
            orders = ''

        products.append({'title': title, 'price': price, 'orders': orders})

    driver.quit()
    return pd.DataFrame(products)

url = "https://fr.aliexpress.com/w/wholesale-best-seller.html?page=2&g=y&SearchText=best+seller"
df = scrape_aliexpress_selenium(url)
print(df)

Aucun produit trouv√© sur la page.
Empty DataFrame
Columns: []
Index: []


In [43]:
import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0'}
url = 'https://www.amazon.fr/gp/bestsellers/?ref_=nav_cs_bestsellers'

response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')

# On s√©lectionne tous les liens produits dans la grille des best sellers
product_links = soup.select('a.a-link-normal.aok-block')

# Affiche les 5 premiers produits
for link in product_links[:5]:
    title = link.get('title') or link.text.strip()  # certains liens ont le titre comme attribut
    href = link.get('href')
    full_link = f"https://www.amazon.fr{href}" if href else "Lien manquant"

    print(f"Produit : {title}")
    print(f"Lien : {full_link}\n")
print("Statut de la requ√™te :", response.status_code)
print("Contenu partiel de la page :")
print(soup.prettify()[:2000]) 

Statut de la requ√™te : 200
Contenu partiel de la page :
<!DOCTYPE html>
<!--[if lt IE 7]> <html lang="fr" class="a-no-js a-lt-ie9 a-lt-ie8 a-lt-ie7"> <![endif]-->
<!--[if IE 7]>    <html lang="fr" class="a-no-js a-lt-ie9 a-lt-ie8"> <![endif]-->
<!--[if IE 8]>    <html lang="fr" class="a-no-js a-lt-ie9"> <![endif]-->
<!--[if gt IE 8]><!-->
<html class="a-no-js" lang="fr">
 <!--<![endif]-->
 <head>
  <meta content="text/html; charset=utf-8" http-equiv="content-type"/>
  <meta charset="utf-8"/>
  <meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
  <title dir="ltr">
   Amazon.fr
  </title>
  <meta content="width=device-width" name="viewport"/>
  <link href="https://images-na.ssl-images-amazon.com/images/G/01/AUIClients/AmazonUI-3c913031596ca78a3768f4e934b1cc02ce238101.secure.min._V1_.css" rel="stylesheet"/>
  <script>
   if (true === true) {
    var ue_t0 = (+ new Date()),
        ue_csm = window,
        ue = { t0: ue_t0, d: function() { return (+new Date() - ue_t0); } },
 

In [45]:
import requests
from bs4 import BeautifulSoup

# En-t√™tes pour passer les protections de base
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
    'Accept-Language': 'fr-FR,fr;q=0.9'
}

url = 'https://www.amazon.fr/gp/bestsellers/luminaires-eclairage'  # Cat√©gorie sp√©cifique si possible

response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')
with open("amazon_test.html", "w", encoding="utf-8") as f:
    f.write(soup.prettify())
# S√©lectionne tous les blocs produit
product_blocks = soup.select('div.zg-grid-general-faceout')

for product in product_blocks[:5]:  # Limit√© √† 5 produits
    # Titre
    title_tag = product.select_one('.p13n-sc-truncate-desktop-type2')
    title = title_tag.get('title') if title_tag else "Titre non trouv√©"

    # Lien
    link_tag = product.find('a', class_='a-link-normal aok-block')
    link = f"https://www.amazon.fr{link_tag['href']}" if link_tag and link_tag.get('href') else "Lien non trouv√©"

    # Prix
    price_tag = product.select_one('.p13n-sc-price')
    price = price_tag.text.strip() if price_tag else "Prix non trouv√©"

    print(f"Produit : {title}")
    print(f"Lien : {link}")
    print(f"Prix : {price}")
    print("-" * 60)

In [46]:
import requests
from bs4 import BeautifulSoup

# Fonction pour scraper une page
def scraper_page(url):
    response = requests.get(url)

# On v√©rifie que le site r√©ponde avant d'aller plus loin
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        titles = soup.select('div.p13n-sc-truncate-desktop-type2.p13n-sc-truncated')

# On cr√©e une liste vide pour stocker     
        data = []

# On r√©cup√©re le texte
        for title in titles:
            print(title.text)
            data.append(title.text)
       
# On programme la sortie de la fonction avec le return : notre texte.
        return data # Sous un format liste
# Autrement, on affiche le message d'erreur.
    else:
        print(f"Erreur lors de la r√©cup√©ration de la page {url}.")
# On peut aussi traiter le message d'erreur avec un "try" "except" si on connais le message d'erreur. 

In [47]:
scraper_page('https://www.amazon.fr/gp/bestsellers/?ref_=nav_cs_bestsellers')

Erreur lors de la r√©cup√©ration de la page https://www.amazon.fr/gp/bestsellers/?ref_=nav_cs_bestsellers.


In [48]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import pandas as pd
import time

# Configuration Chrome
options = Options()
options.add_argument('--headless')  # pour ne pas ouvrir la fen√™tre
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome(options=options)

base_url = "https://www.amazon.fr/gp/bestsellers/electronics/ref=zg_bs_pg_{}?_encoding=UTF8&pg={}"

all_data = []

# Scrape les 5 premi√®res pages (change range si besoin)
for page in range(1, 6):
    print(f"Scraping page {page}...")
    url = base_url.format(page, page)
    driver.get(url)
    time.sleep(3)

    products = driver.find_elements(By.CSS_SELECTOR, "div.zg-grid-general-faceout")

    for product in products:
        try:
            title = product.find_element(By.CSS_SELECTOR, "._cDEzb_p13n-sc-css-line-clamp-3_g3dy1").text
        except:
            title = None
        try:
            price = product.find_element(By.CSS_SELECTOR, ".a-price > .a-offscreen").text.replace("‚Ç¨", "").replace(",", ".")
        except:
            price = None
        try:
            rating = product.find_element(By.CSS_SELECTOR, ".a-icon-alt").get_attribute("innerHTML").split(" ")[0]
        except:
            rating = None
        try:
            votes = product.find_element(By.CSS_SELECTOR, ".a-size-small").text.replace("‚ÄØ", "").replace(",", "")
        except:
            votes = None
        try:
            link = product.find_element(By.TAG_NAME, "a").get_attribute("href")
        except:
            link = None

        all_data.append({
            "Titre": title,
            "Prix (‚Ç¨)": price,
            "Note": rating,
            "Nombre de votes": votes,
            "Lien": link
        })

driver.quit()

df = pd.DataFrame(all_data)
print(df.head())

# Facultatif : enregistrer dans un CSV
df.to_csv("amazon_bestsellers_hightech.csv", index=False)

Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
                                               Titre Prix (‚Ç¨) Note  \
0                                       Apple AirTag     None  4,6   
1  Imou 2K(3MP) Cam√©ra Surveillance WiFi Int√©rieu...     None  4,4   
2  Tapo 2K(3MP) Cam√©ra Surveillance WiFi int√©rieu...     None  4,6   
3  DURACELL CR2032 Piles Boutons au lithium 3V (l...     None  4,7   
4  Amazon Fire TV Stick HD (Nouvelle g√©n√©ration) ...     None  4,5   

  Nombre de votes                                               Lien  
0         127 835  https://www.amazon.fr/Apple-MX532ZMA-Nouveau-A...  
1          42 181  https://www.amazon.fr/Imou-Surveillance-Int%C3...  
2          34 195  https://www.amazon.fr/Tapo-Surveillance-int%C3...  
3         124 763  https://www.amazon.fr/DURACELL-CR2032-Piles-Bo...  
4          21 611  https://www.amazon.fr/fire-tv-stick-hd/dp/B0CQ...  


In [49]:
df

Unnamed: 0,Titre,Prix (‚Ç¨),Note,Nombre de votes,Lien
0,Apple AirTag,,46,127 835,https://www.amazon.fr/Apple-MX532ZMA-Nouveau-A...
1,Imou 2K(3MP) Cam√©ra Surveillance WiFi Int√©rieu...,,44,42 181,https://www.amazon.fr/Imou-Surveillance-Int%C3...
2,Tapo 2K(3MP) Cam√©ra Surveillance WiFi int√©rieu...,,46,34 195,https://www.amazon.fr/Tapo-Surveillance-int%C3...
3,DURACELL CR2032 Piles Boutons au lithium 3V (l...,,47,124 763,https://www.amazon.fr/DURACELL-CR2032-Piles-Bo...
4,Amazon Fire TV Stick HD (Nouvelle g√©n√©ration) ...,,45,21 611,https://www.amazon.fr/fire-tv-stick-hd/dp/B0CQ...
5,"Aioneus Chargeur USB C, 40W 4 Port Prise USBC ...",,45,12 323,https://www.amazon.fr/Aioneus-Chargeur-Secteur...
6,"INIU Batterie Externe, 22.5W 10000mAh Power Ba...",,46,25 917,https://www.amazon.fr/INIU-Batterie-10500mAh-C...
7,Amazon Basics Lot de 6 piles bouton CR2032 au ...,,47,198 995,https://www.amazon.fr/Amazon-Basics-bouton-lit...
8,Duracell Plus Piles AA (lot de 24) - Alcalines...,,47,40 482,https://www.amazon.fr/Piles-AA-Duracell-Plus-l...
9,"JBL GO 4, Enceinte Bluetooth ultra-portable, s...",,47,8 320,https://www.amazon.fr/JBL-Ultra-Portable-percu...


In [50]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
import pandas as pd
import time
import random

# Config Selenium
options = Options()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
driver = webdriver.Chrome(options=options)

all_data = []

# Amazon a g√©n√©ralement 50 pages max pour les best sellers
max_pages = 50

for page in range(1, max_pages + 1):
    print(f"Scraping page {page}...")
    url = f"https://www.amazon.fr/gp/bestsellers/electronics/ref=zg_bs_pg_{page}?_encoding=UTF8&pg={page}"
    driver.get(url)
    time.sleep(random.uniform(2.5, 4))  # d√©lai anti-bot

    products = driver.find_elements(By.CSS_SELECTOR, "div.zg-grid-general-faceout")

    if not products:
        print("Fin d√©tect√©e ou page vide.")
        break

    for product in products:
        try:
            title = product.find_element(By.CSS_SELECTOR, "._cDEzb_p13n-sc-css-line-clamp-3_g3dy1").text
        except NoSuchElementException:
            title = None

        try:
            price = product.find_element(By.CSS_SELECTOR, ".a-price .a-offscreen").text
            price = price.replace("‚Ç¨", "").replace(",", ".").strip()
        except NoSuchElementException:
            price = None

        try:
            rating = product.find_element(By.CSS_SELECTOR, ".a-icon-alt").get_attribute("innerHTML").split(" ")[0]
        except NoSuchElementException:
            rating = None

        try:
            votes = product.find_element(By.CSS_SELECTOR, ".a-size-small").text
            votes = votes.replace("‚ÄØ", "").replace(",", "").replace(".", "").strip()
        except NoSuchElementException:
            votes = None

        try:
            link = product.find_element(By.TAG_NAME, "a").get_attribute("href")
        except NoSuchElementException:
            link = None

        all_data.append({
            "Titre": title,
            "Prix (‚Ç¨)": price,
            "Note": rating,
            "Nombre de votes": votes,
            "Lien": link
        })

    # Petite pause entre les pages
    time.sleep(random.uniform(1.5, 3))

driver.quit()

# Cr√©ation DataFrame
df = pd.DataFrame(all_data)

# Nettoyage des doublons √©ventuels
df.drop_duplicates(subset=["Titre", "Lien"], inplace=True)

# Affichage
print(df.head())
print(f"Total produits scrapp√©s : {len(df)}")

# Enregistrement facultatif
df.to_csv("amazon_bestsellers_hightech_full.csv", index=False)

Scraping page 1...
Scraping page 2...
Scraping page 3...
Fin d√©tect√©e ou page vide.
                                               Titre Prix (‚Ç¨) Note  \
0                                       Apple AirTag     None  4,6   
1  Imou 2K(3MP) Cam√©ra Surveillance WiFi Int√©rieu...     None  4,4   
2  Tapo 2K(3MP) Cam√©ra Surveillance WiFi int√©rieu...     None  4,6   
3  DURACELL CR2032 Piles Boutons au lithium 3V (l...     None  4,7   
4  Amazon Fire TV Stick HD (Nouvelle g√©n√©ration) ...     None  4,5   

  Nombre de votes                                               Lien  
0         127 835  https://www.amazon.fr/Apple-MX532ZMA-Nouveau-A...  
1          42 181  https://www.amazon.fr/Imou-Surveillance-Int%C3...  
2          34 195  https://www.amazon.fr/Tapo-Surveillance-int%C3...  
3         124 763  https://www.amazon.fr/DURACELL-CR2032-Piles-Bo...  
4          21 611  https://www.amazon.fr/fire-tv-stick-hd/dp/B0CQ...  
Total produits scrapp√©s : 60


In [54]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import time

headers = {'User-Agent': 'Mozilla/5.0'}

def scrape_category(url, category_name):
    products = []

    for page in range(1, 3):  # 2 pages = environ 50 produits
        paginated_url = f"{url}?pg={page}"
        response = requests.get(paginated_url, headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')

        items = soup.select('div.zg-grid-general-faceout')

        for rank, item in enumerate(items, start=(page - 1) * 25 + 1):
            title_tag = item.select_one('.p13n-sc-truncate-desktop-type2') or item.select_one('._cDEzb_p13n-sc-css-line-clamp-3_g3dy1')
            title = title_tag.get_text(strip=True) if title_tag else None

            price_tag = item.select_one('.a-price span.a-offscreen')
            price = float(price_tag.text.replace('‚Ç¨', '').replace(',', '.')) if price_tag else None

            rating_tag = item.select_one('.a-icon-alt')
            rating = float(rating_tag.text.split()[0].replace(',', '.')) if rating_tag else None

            reviews_tag = item.select_one('.a-size-small')
            num_reviews = int(reviews_tag.text.replace("\xa0", "").replace(" ", "").replace(",", "")) if reviews_tag else None


            link_tag = item.select_one('a.a-link-normal')
            url_product = "https://www.amazon.fr" + link_tag['href'] if link_tag else None

            prime_icon = item.select_one('.a-icon-prime')
            is_prime = True if prime_icon else False

            product = {
                "title": title,
                "price": price,
                "rating": rating,
                "num_reviews": num_reviews,
                "category": category_name,
                "url": url_product,
                "brand": None,  # √Ä ajouter plus tard si possible
                "is_prime": is_prime,
                "bestseller_rank": rank,
                "scraped_at": datetime.now()
            }

            products.append(product)

        time.sleep(1)  # pour √©viter les blocages Amazon

    return pd.DataFrame(products)

In [55]:
category_urls = {
    "High-tech": "https://www.amazon.fr/gp/bestsellers/electronics",
    "Beaut√© et Parfum": "https://www.amazon.fr/gp/bestsellers/beauty",
    "Cuisine et Maison": "https://www.amazon.fr/gp/bestsellers/kitchen",
    "Mode": "https://www.amazon.fr/gp/bestsellers/fashion",
    "Animalerie": "https://www.amazon.fr/gp/bestsellers/pet-supplies",
    "Jeux et Jouets": "https://www.amazon.fr/gp/bestsellers/toys",
    "Jeux vid√©o": "https://www.amazon.fr/gp/bestsellers/videogames",
}

df_list = []

for cat_name, url in category_urls.items():
    print(f"Scraping {cat_name}...")
    df_cat = scrape_category(url, cat_name)
    df_list.append(df_cat)

# Fusionner tous les DataFrames
df_all = pd.concat(df_list, ignore_index=True)

# Sauvegarder en CSV
df_all.to_csv("amazon_bestsellers.csv", index=False)
print("Scraping termin√©, donn√©es enregistr√©es.")

Scraping High-tech...
Scraping Beaut√© et Parfum...
Scraping Cuisine et Maison...
Scraping Mode...
Scraping Animalerie...
Scraping Jeux et Jouets...
Scraping Jeux vid√©o...
Scraping termin√©, donn√©es enregistr√©es.


In [57]:
import pandas as pd

df = pd.read_csv("amazon_bestsellers.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   title            59 non-null     object 
 1   price            0 non-null      float64
 2   rating           59 non-null     float64
 3   num_reviews      59 non-null     float64
 4   category         60 non-null     object 
 5   url              60 non-null     object 
 6   brand            0 non-null      float64
 7   is_prime         60 non-null     bool   
 8   bestseller_rank  60 non-null     int64  
 9   scraped_at       60 non-null     object 
dtypes: bool(1), float64(4), int64(1), object(4)
memory usage: 4.4+ KB


In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import time
import random

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
}

category_urls = {
    "High-tech": "https://www.idealo.fr/cat/3605/informatique.html",
    "Cuisine": "https://www.idealo.fr/cat/13489/cuisine.html",
    "Jeux video": "https://www.idealo.fr/cat/10678/jeux-video.html",
    "Beaut√©": "https://www.idealo.fr/cat/13492/beaute-sante.html"
}

def scrape_idealo_category(url, category_name):
    products = []

    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        print(f"Erreur lors de la requ√™te pour {category_name}")
        return pd.DataFrame()

    soup = BeautifulSoup(response.content, "html.parser")
    items = soup.select("div.offerList-item")

    for item in items:
        title_tag = item.select_one(".offerList-title")
        title = title_tag.text.strip() if title_tag else None

        price_tag = item.select_one(".offerList-price")
        try:
            price = float(price_tag.text.replace("\xa0", "").replace("\u20ac", "").replace(",", ".")) if price_tag else None
        except:
            price = None

        product = {
            "title": title,
            "price": price,
            "category": category_name,
            "scraped_at": datetime.now()
        }
        products.append(product)

    return pd.DataFrame(products)

# Scraping principal
df_list = []
for cat, url in category_urls.items():
    print(f"Scraping {cat}...")
    df = scrape_idealo_category(url, cat)
    df_list.append(df)
    time.sleep(random.uniform(2, 5))  # Pause humaine entre les requ√™tes

# Fusion finale
df_all = pd.concat(df_list, ignore_index=True)
df_all.to_csv("idealo_products.csv", index=False)
print("Scraping termin√©. Fichier enregistr√© : idealo_products.csv")

Scraping High-tech...


In [122]:
import pandas as pd

dfv2 = pd.read_csv("amazon_best_v2.csv")
dfv2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 218 entries, 0 to 217
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   title            193 non-null    object 
 1   price            0 non-null      float64
 2   rating           214 non-null    float64
 3   num_reviews      197 non-null    float64
 4   category         218 non-null    object 
 5   url              218 non-null    object 
 6   brand            0 non-null      float64
 7   is_prime         218 non-null    bool   
 8   bestseller_rank  218 non-null    int64  
 9   scraped_at       218 non-null    object 
dtypes: bool(1), float64(4), int64(1), object(4)
memory usage: 15.7+ KB


In [7]:
import requests
import pandas as pd
from datetime import datetime

# Remplacez par votre propre cl√© API SerpAPI
SERPAPI_KEY = "ed285c21a9caeffec3e5e686e5df49e86215b847ff360f9e253d8930cc3f96e6"  # <-- Remplace par ta vraie cl√©

# Amazon France Best Seller Category IDs valides
departments = {
    "High-tech": "13921051",
    "Cuisine & Maison": "57004031",
    "Beaut√©": "197858031",
    "Mode": "2454132031",
    "Jeux vid√©o": "530490"
}

def scrape_amazon_bestsellers(api_key, category_id, category_name):
    url = "https://serpapi.com/search.json"
    params = {
        "engine": "amazon_bestsellers",
        "amazon_domain": "amazon.fr",
        "category_id": category_id,
        "api_key": api_key
    }

    response = requests.get(url, params=params)
    data = response.json()

    # Afficher les erreurs d√©taill√©es
    if "error" in data:
        print(f"‚ùå Erreur pour {category_name}: {data['error']}")
        return pd.DataFrame()

    products = []
    for item in data.get("bestsellers", []):
        products.append({
            "title": item.get("title"),
            "price": item.get("price"),
            "rating": item.get("rating"),
            "reviews": item.get("ratings_total"),
            "category": category_name,
            "url": item.get("link"),
            "scraped_at": datetime.now()
        })

    return pd.DataFrame(products)

# Lancer le scraping
all_data = []
for name, cat_id in departments.items():
    print(f"üîç Scraping {name}...")
    df = scrape_amazon_bestsellers(SERPAPI_KEY, cat_id, name)
    print(f"‚úÖ {name} : {len(df)} produits extraits\n")
    all_data.append(df)

# Fusionner et exporter
df_final = pd.concat(all_data, ignore_index=True)
df_final.to_csv("amazon_best_serpapi.csv", index=False)
print("üéâ Scraping Amazon termin√© avec SerpAPI. Fichier : amazon_best_serpapi.csv")


üîç Scraping High-tech...
‚ùå Erreur pour High-tech: Unsupported `amazon_bestsellers` search engine.
‚úÖ High-tech : 0 produits extraits

üîç Scraping Cuisine & Maison...
‚ùå Erreur pour Cuisine & Maison: Unsupported `amazon_bestsellers` search engine.
‚úÖ Cuisine & Maison : 0 produits extraits

üîç Scraping Beaut√©...
‚ùå Erreur pour Beaut√©: Unsupported `amazon_bestsellers` search engine.
‚úÖ Beaut√© : 0 produits extraits

üîç Scraping Mode...
‚ùå Erreur pour Mode: Unsupported `amazon_bestsellers` search engine.
‚úÖ Mode : 0 produits extraits

üîç Scraping Jeux vid√©o...
‚ùå Erreur pour Jeux vid√©o: Unsupported `amazon_bestsellers` search engine.
‚úÖ Jeux vid√©o : 0 produits extraits

üéâ Scraping Amazon termin√© avec SerpAPI. Fichier : amazon_best_serpapi.csv


In [5]:
dffff = pd.read_csv("amazon_best_serpapi.csv")
dffff.info()

EmptyDataError: No columns to parse from file

In [16]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time
from datetime import datetime

# Configuration de Selenium (Chrome en headless)
options = Options()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Liste des URLs des best sellers Amazon France
category_urls = {
    "High-tech": "https://www.amazon.fr/gp/bestsellers/electronics",
    "Beaut√©": "https://www.amazon.fr/gp/bestsellers/beauty",
    "Cuisine": "https://www.amazon.fr/gp/bestsellers/kitchen",
    "Mode": "https://www.amazon.fr/gp/bestsellers/fashion",
    "Animalerie": "https://www.amazon.fr/gp/bestsellers/pet-supplies",
    "Jeux et Jouets": "https://www.amazon.fr/gp/bestsellers/toys",
    "Jeux vid√©o": "https://www.amazon.fr/gp/bestsellers/videogames"
}

def scrape_category(url, category):
    driver.get(url)
    time.sleep(3)  # Laisse le temps √† la page de charger

    items = driver.find_elements(By.CSS_SELECTOR, "div.zg-grid-general-faceout")
    products = []

    for rank, item in enumerate(items, start=1):
        try:
            title = item.find_element(By.CSS_SELECTOR, "._cDEzb_p13n-sc-css-line-clamp-3_g3dy1").text
        except:
            title = None

        try:
            price_text = item.find_element(By.CSS_SELECTOR, "span._cDEzb_p13n-sc-price_3mJ9Z").text
            price = float(price_text.replace("\u202f", "").replace("\xa0", "").replace("‚Ç¨", "").replace(",", ".").strip())
        except:
            price = None

        try:
            rating_text = item.find_element(By.CSS_SELECTOR, ".a-icon-alt").get_attribute("innerText")
            rating = float(rating_text.split()[0].replace(",", "."))
        except:
            rating = None

        try:
            reviews_tag = item.find_element(By.CSS_SELECTOR, "span.a-size-base").text
            reviews = int(reviews_tag.replace("\u202f", "").replace(",", "").strip())
        except:
            reviews = None

        try:
            url_product = item.find_element(By.CSS_SELECTOR, "a.a-link-normal").get_attribute("href")
        except:
            url_product = None

        products.append({
            "title": title,
            "price": price,
            "rating": rating,
            "reviews": reviews,
            "url": url_product,
            "category": category,
            "rank": rank,
            "scraped_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        })

    return pd.DataFrame(products)

# Scraping de toutes les cat√©gories
df_list = []
for name, link in category_urls.items():
    print(f"Scraping {name}...")
    try:
        df = scrape_category(link, name)
        print(f"{name}: {len(df)} produits trouv√©s")
        df_list.append(df)
    except Exception as e:
        print(f"Erreur pour {name} : {e}")

# Fusion et export final
df_final = pd.concat(df_list, ignore_index=True)
df_final.to_csv("amazon_best_v3.csv", index=False)
driver.quit()
print("‚úÖ Fichier amazon_best_v3.csv g√©n√©r√© avec succ√®s.")

Scraping High-tech...
High-tech: 30 produits trouv√©s
Scraping Beaut√©...
Beaut√©: 30 produits trouv√©s
Scraping Cuisine...
Cuisine: 30 produits trouv√©s
Scraping Mode...
Mode: 30 produits trouv√©s
Scraping Animalerie...
Animalerie: 30 produits trouv√©s
Scraping Jeux et Jouets...
Jeux et Jouets: 30 produits trouv√©s
Scraping Jeux vid√©o...
Jeux vid√©o: 30 produits trouv√©s
‚úÖ Fichier amazon_best_v3.csv g√©n√©r√© avec succ√®s.


In [17]:
df_v3 = pd.read_csv('amazon_best_v3.csv')
df_v3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210 entries, 0 to 209
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   title       186 non-null    object 
 1   price       200 non-null    float64
 2   rating      206 non-null    float64
 3   reviews     0 non-null      float64
 4   url         210 non-null    object 
 5   category    210 non-null    object 
 6   rank        210 non-null    int64  
 7   scraped_at  210 non-null    object 
dtypes: float64(3), int64(1), object(4)
memory usage: 13.3+ KB


In [18]:
df_v3.head()

Unnamed: 0,title,price,rating,reviews,url,category,rank,scraped_at
0,Apple AirTag,29.99,4.6,,https://www.amazon.fr/Apple-MX532ZMA-Nouveau-A...,High-tech,1,2025-06-20 14:25:34
1,Imou 2K(3MP) Cam√©ra Surveillance WiFi Int√©rieu...,18.99,4.4,,https://www.amazon.fr/Imou-Surveillance-Int%C3...,High-tech,2,2025-06-20 14:25:34
2,Tapo 2K(3MP) Cam√©ra Surveillance WiFi int√©rieu...,18.99,4.6,,https://www.amazon.fr/Tapo-Surveillance-int%C3...,High-tech,3,2025-06-20 14:25:34
3,Amazon Fire TV Stick HD (Nouvelle g√©n√©ration) ...,44.99,4.5,,https://www.amazon.fr/fire-tv-stick-hd/dp/B0CQ...,High-tech,4,2025-06-20 14:25:34
4,DURACELL CR2032 Piles Boutons au lithium 3V (l...,4.75,4.7,,https://www.amazon.fr/DURACELL-CR2032-Piles-Bo...,High-tech,5,2025-06-20 14:25:34


In [19]:
df['reviews'].notnull().sum()


np.int64(0)

In [20]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time
from datetime import datetime

# Configuration de Selenium (Chrome en headless)
options = Options()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Liste des URLs des best sellers Amazon France (page 1 uniquement)
category_urls = {
    "High-tech": "https://www.amazon.fr/gp/bestsellers/electronics",
    "Beaut√©": "https://www.amazon.fr/gp/bestsellers/beauty",
    "Cuisine": "https://www.amazon.fr/gp/bestsellers/kitchen",
    "Mode": "https://www.amazon.fr/gp/bestsellers/fashion",
    "Animalerie": "https://www.amazon.fr/gp/bestsellers/pet-supplies",
    "Jeux et Jouets": "https://www.amazon.fr/gp/bestsellers/toys",
    "Jeux vid√©o": "https://www.amazon.fr/gp/bestsellers/videogames"
}

def scrape_category(base_url, category):
    all_products = []
    for page in [1, 2]:
        url = f"{base_url}?pg={page}"
        driver.get(url)
        time.sleep(3)

        items = driver.find_elements(By.CSS_SELECTOR, "div.zg-grid-general-faceout")

        for rank, item in enumerate(items, start=1 + (page - 1) * 50):
            try:
                title = item.find_element(By.CSS_SELECTOR, "._cDEzb_p13n-sc-css-line-clamp-3_g3dy1").text
            except:
                title = None

            try:
                price_text = item.find_element(By.CSS_SELECTOR, "span._cDEzb_p13n-sc-price_3mJ9Z").text
                price = float(price_text.replace("\u202f", "").replace("\xa0", "").replace("‚Ç¨", "").replace(",", ".").strip())
            except:
                price = None

            try:
                rating_text = item.find_element(By.CSS_SELECTOR, ".a-icon-alt").get_attribute("innerText")
                rating = float(rating_text.split()[0].replace(",", "."))
            except:
                rating = None

            try:
                # essaie plusieurs formats pour les votes
                votes_tag = item.find_element(By.CSS_SELECTOR, "span.a-size-base").text
                votes = int(votes_tag.replace("\u202f", "").replace(",", "").strip())
            except:
                try:
                    votes_tag = item.find_element(By.CSS_SELECTOR, "span.a-size-small").text
                    votes = int(votes_tag.replace("\u202f", "").replace(",", "").strip())
                except:
                    votes = None

            try:
                url_product = item.find_element(By.CSS_SELECTOR, "a.a-link-normal").get_attribute("href")
            except:
                url_product = None

            all_products.append({
                "title": title,
                "price": price,
                "rating": rating,
                "votes": votes,
                "url": url_product,
                "category": category,
                "rank": rank,
                "scraped_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            })

    return pd.DataFrame(all_products)

# Scraping de toutes les cat√©gories
df_list = []
for name, base_url in category_urls.items():
    print(f"Scraping {name}...")
    try:
        df = scrape_category(base_url, name)
        print(f"{name}: {len(df)} produits trouv√©s")
        df_list.append(df)
    except Exception as e:
        print(f"Erreur pour {name} : {e}")

# Fusion et export final
df_final = pd.concat(df_list, ignore_index=True)
df_final.to_csv("amazon_best_v4.csv", index=False)
driver.quit()
print("‚úÖ Fichier amazon_best_v4.csv g√©n√©r√© avec succ√®s.")


Scraping High-tech...
High-tech: 60 produits trouv√©s
Scraping Beaut√©...
Beaut√©: 60 produits trouv√©s
Scraping Cuisine...
Cuisine: 60 produits trouv√©s
Scraping Mode...
Mode: 60 produits trouv√©s
Scraping Animalerie...
Animalerie: 60 produits trouv√©s
Scraping Jeux et Jouets...
Jeux et Jouets: 60 produits trouv√©s
Scraping Jeux vid√©o...
Jeux vid√©o: 60 produits trouv√©s
‚úÖ Fichier amazon_best_v4.csv g√©n√©r√© avec succ√®s.


In [21]:
df_v4 = pd.read_csv('amazon_best_v4.csv')
df_v4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420 entries, 0 to 419
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   title       380 non-null    object 
 1   price       407 non-null    float64
 2   rating      415 non-null    float64
 3   votes       56 non-null     float64
 4   url         420 non-null    object 
 5   category    420 non-null    object 
 6   rank        420 non-null    int64  
 7   scraped_at  420 non-null    object 
dtypes: float64(3), int64(1), object(4)
memory usage: 26.4+ KB


In [22]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time
from datetime import datetime

# Configuration de Selenium (Chrome en headless)
options = Options()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Liste des URLs des best sellers Amazon France (page 1 uniquement)
category_urls = {
    "High-tech": "https://www.amazon.fr/gp/bestsellers/electronics",
    "Beaut√©": "https://www.amazon.fr/gp/bestsellers/beauty",
    "Cuisine": "https://www.amazon.fr/gp/bestsellers/kitchen",
    "Mode": "https://www.amazon.fr/gp/bestsellers/fashion",
    "Animalerie": "https://www.amazon.fr/gp/bestsellers/pet-supplies",
    "Jeux et Jouets": "https://www.amazon.fr/gp/bestsellers/toys",
    "Jeux vid√©o": "https://www.amazon.fr/gp/bestsellers/videogames"
}

def scrape_category(base_url, category):
    all_products = []
    for page in [1, 2]:
        url = f"{base_url}?pg={page}"
        driver.get(url)
        time.sleep(3)

        items = driver.find_elements(By.CSS_SELECTOR, "div.zg-grid-general-faceout")

        for rank, item in enumerate(items, start=1 + (page - 1) * 50):
            try:
                title = item.find_element(By.CSS_SELECTOR, "._cDEzb_p13n-sc-css-line-clamp-3_g3dy1").text
            except:
                title = None

            try:
                price_text = item.find_element(By.CSS_SELECTOR, "span._cDEzb_p13n-sc-price_3mJ9Z").text
                price = float(price_text.replace("\u202f", "").replace("\xa0", "").replace("‚Ç¨", "").replace(",", ".").strip())
            except:
                price = None

            try:
                rating_text = item.find_element(By.CSS_SELECTOR, ".a-icon-alt").get_attribute("innerText")
                rating = float(rating_text.split()[0].replace(",", "."))
            except:
                rating = None

            try:
                votes_tag = item.find_element(By.XPATH, ".//span[contains(@aria-label, 'notation') or contains(@aria-label, 'avis') or contains(@aria-label, '√©valuations')]").get_attribute("aria-label")
                votes = int(votes_tag.split()[0].replace("\u202f", "").replace(",", "").strip())
            except:
                try:
                    votes_tag = item.find_element(By.CSS_SELECTOR, "span.a-size-small").text
                    votes = int(votes_tag.replace("\u202f", "").replace(",", "").strip())
                except:
                    votes = None

            try:
                url_product = item.find_element(By.CSS_SELECTOR, "a.a-link-normal").get_attribute("href")
            except:
                url_product = None

            all_products.append({
                "title": title,
                "price": price,
                "rating": rating,
                "votes": votes,
                "url": url_product,
                "category": category,
                "rank": rank,
                "scraped_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            })

    return pd.DataFrame(all_products)

# Scraping de toutes les cat√©gories
df_list = []
for name, base_url in category_urls.items():
    print(f"Scraping {name}...")
    try:
        df = scrape_category(base_url, name)
        print(f"{name}: {len(df)} produits trouv√©s")
        df_list.append(df)
    except Exception as e:
        print(f"Erreur pour {name} : {e}")

# Fusion et export final
df_final = pd.concat(df_list, ignore_index=True)

# Ajout d'un score
df_final['score'] = df_final['rating'] * df_final['votes']

# Sauvegarde
df_final.to_csv("amazon_best_v5.csv", index=False)
driver.quit()
print("‚úÖ Fichier amazon_best_v5.csv g√©n√©r√© avec succ√®s.")

Scraping High-tech...
High-tech: 60 produits trouv√©s
Scraping Beaut√©...
Beaut√©: 60 produits trouv√©s
Scraping Cuisine...
Cuisine: 60 produits trouv√©s
Scraping Mode...
Mode: 60 produits trouv√©s
Scraping Animalerie...
Animalerie: 60 produits trouv√©s
Scraping Jeux et Jouets...
Jeux et Jouets: 60 produits trouv√©s
Scraping Jeux vid√©o...
Jeux vid√©o: 60 produits trouv√©s
‚úÖ Fichier amazon_best_v5.csv g√©n√©r√© avec succ√®s.


In [23]:
df_v5 = pd.read_csv('amazon_best_v4.csv')
df_v5.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420 entries, 0 to 419
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   title       380 non-null    object 
 1   price       407 non-null    float64
 2   rating      415 non-null    float64
 3   votes       56 non-null     float64
 4   url         420 non-null    object 
 5   category    420 non-null    object 
 6   rank        420 non-null    int64  
 7   scraped_at  420 non-null    object 
dtypes: float64(3), int64(1), object(4)
memory usage: 26.4+ KB


In [26]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time
from datetime import datetime

# Configuration de Selenium (Chrome en headless)
options = Options()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Liste des URLs des best sellers Amazon France (page 1 uniquement)
category_urls = {
    "High-tech": "https://www.amazon.fr/gp/bestsellers/electronics",
    "Beaut√©": "https://www.amazon.fr/gp/bestsellers/beauty",
    "Cuisine": "https://www.amazon.fr/gp/bestsellers/kitchen",
    "Mode": "https://www.amazon.fr/gp/bestsellers/fashion",
    "Animalerie": "https://www.amazon.fr/gp/bestsellers/pet-supplies",
    "Jeux et Jouets": "https://www.amazon.fr/gp/bestsellers/toys",
    "Jeux vid√©o": "https://www.amazon.fr/gp/bestsellers/videogames"
}

def scrape_category(base_url, category):
    all_products = []
    for page in [1, 2]:
        url = f"{base_url}?pg={page}"
        driver.get(url)
        time.sleep(3)

        items = driver.find_elements(By.CSS_SELECTOR, "div.zg-grid-general-faceout")

        for rank, item in enumerate(items, start=1 + (page - 1) * 50):
            try:
                # Titre : essayer plusieurs structures
                try:
                    title = item.find_element(By.CSS_SELECTOR, "._cDEzb_p13n-sc-css-line-clamp-3_g3dy1").text
                except:
                    title = item.find_element(By.CSS_SELECTOR, "._cDEzb_p13n-sc-css-line-clamp-1_1Fn1y").text
            except:
                title = None

            try:
                price_text = item.find_element(By.CSS_SELECTOR, "span._cDEzb_p13n-sc-price_3mJ9Z").text
                price = float(price_text.replace("\u202f", "").replace("\xa0", "").replace("‚Ç¨", "").replace(",", ".").strip())
            except:
                price = None

            try:
                rating_text = item.find_element(By.CSS_SELECTOR, ".a-icon-alt").get_attribute("innerText")
                rating = float(rating_text.split()[0].replace(",", "."))
            except:
                rating = None

            try:
                # Votes : version fiable avec span.a-size-small (vue dans capture)
                votes_tag = item.find_element(By.CSS_SELECTOR, "span.a-size-small").text
                votes = int(votes_tag.replace("\u202f", "").replace(",", "").strip())
            except:
                try:
                    votes_tag = item.find_element(By.XPATH, ".//span[contains(@aria-label, 'notation') or contains(@aria-label, 'avis') or contains(@aria-label, '√©valuations')]").get_attribute("aria-label")
                    votes = int(votes_tag.split()[0].replace("\u202f", "").replace(",", "").strip())
                except:
                    votes = None

            try:
                url_product = item.find_element(By.CSS_SELECTOR, "a.a-link-normal").get_attribute("href")
            except:
                url_product = None

            all_products.append({
                "title": title,
                "price": price,
                "rating": rating,
                "votes": votes,
                "url": url_product,
                "category": category,
                "rank": rank,
                "scraped_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            })

    return pd.DataFrame(all_products)

# Scraping de toutes les cat√©gories
df_list = []
for name, base_url in category_urls.items():
    print(f"Scraping {name}...")
    try:
        df = scrape_category(base_url, name)
        print(f"{name}: {len(df)} produits trouv√©s")
        df_list.append(df)
    except Exception as e:
        print(f"Erreur pour {name} : {e}")

# Fusion et export final
df_final = pd.concat(df_list, ignore_index=True)

# Ajout d'un score
df_final['score'] = df_final['rating'] * df_final['votes']

# Sauvegarde
df_final.to_csv("amazon_best_v6.csv", index=False)
driver.quit()
print("‚úÖ Fichier amazon_best_v6.csv g√©n√©r√© avec succ√®s.")

Scraping High-tech...
High-tech: 0 produits trouv√©s
Scraping Beaut√©...
Beaut√©: 30 produits trouv√©s
Scraping Cuisine...
Cuisine: 60 produits trouv√©s
Scraping Mode...
Mode: 60 produits trouv√©s
Scraping Animalerie...
Animalerie: 60 produits trouv√©s
Scraping Jeux et Jouets...
Jeux et Jouets: 60 produits trouv√©s
Scraping Jeux vid√©o...
Jeux vid√©o: 60 produits trouv√©s
‚úÖ Fichier amazon_best_v6.csv g√©n√©r√© avec succ√®s.


In [27]:
df_v6 = pd.read_csv('amazon_best_v6.csv')
df_v6.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 330 entries, 0 to 329
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   title       325 non-null    object 
 1   price       321 non-null    float64
 2   rating      326 non-null    float64
 3   votes       45 non-null     float64
 4   url         330 non-null    object 
 5   category    330 non-null    object 
 6   rank        330 non-null    int64  
 7   scraped_at  330 non-null    object 
 8   score       45 non-null     float64
dtypes: float64(4), int64(1), object(4)
memory usage: 23.3+ KB


In [28]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time
from datetime import datetime

# Configuration de Selenium (Chrome en headless)
options = Options()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Liste des URLs des best sellers Amazon France (page 1 uniquement)
category_urls = {
    "High-tech": "https://www.amazon.fr/gp/bestsellers/electronics",
    "Beaut√©": "https://www.amazon.fr/gp/bestsellers/beauty",
    "Cuisine": "https://www.amazon.fr/gp/bestsellers/kitchen",
    "Mode": "https://www.amazon.fr/gp/bestsellers/fashion",
    "Animalerie": "https://www.amazon.fr/gp/bestsellers/pet-supplies",
    "Jeux et Jouets": "https://www.amazon.fr/gp/bestsellers/toys",
    "Jeux vid√©o": "https://www.amazon.fr/gp/bestsellers/videogames"
}

def scrape_category(base_url, category):
    all_products = []
    for page in [1, 2]:
        url = f"{base_url}?pg={page}"
        driver.get(url)
        time.sleep(3)

        items = driver.find_elements(By.CSS_SELECTOR, "div.zg-grid-general-faceout")

        if not items:
            print(f"‚ùå Aucun produit trouv√© pour {category} page {page} !")

        for rank, item in enumerate(items, start=1 + (page - 1) * 50):
            try:
                title = item.find_element(By.CSS_SELECTOR, "._cDEzb_p13n-sc-css-line-clamp-3_g3dy1").text
            except:
                try:
                    title = item.find_element(By.CSS_SELECTOR, "._cDEzb_p13n-sc-css-line-clamp-1_1Fn1y").text
                except:
                    title = None

            try:
                price_text = item.find_element(By.CSS_SELECTOR, "span._cDEzb_p13n-sc-price_3mJ9Z").text
                price = float(price_text.replace("\u202f", "").replace("\xa0", "").replace("‚Ç¨", "").replace(",", ".").strip())
            except:
                price = None

            try:
                rating_text = item.find_element(By.CSS_SELECTOR, "span.a-icon-alt").get_attribute("innerText")
                rating = float(rating_text.split()[0].replace(",", "."))
            except:
                rating = None

            try:
                votes_tag = item.find_element(By.CSS_SELECTOR, "span.a-size-small").text
                votes = int(votes_tag.replace("\u202f", "").replace(",", "").strip())
            except:
                try:
                    votes_tag = item.find_element(By.XPATH, ".//span[contains(@aria-label, 'notation') or contains(@aria-label, 'avis') or contains(@aria-label, '√©valuations')]").get_attribute("aria-label")
                    votes = int(votes_tag.split()[0].replace("\u202f", "").replace(",", "").strip())
                except:
                    votes = None

            try:
                url_product = item.find_element(By.CSS_SELECTOR, "a.a-link-normal").get_attribute("href")
            except:
                url_product = None

            all_products.append({
                "title": title,
                "price": price,
                "rating": rating,
                "votes": votes,
                "url": url_product,
                "category": category,
                "rank": rank,
                "scraped_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            })

    return pd.DataFrame(all_products)

# Scraping de toutes les cat√©gories
df_list = []
for name, base_url in category_urls.items():
    print(f"üîç Scraping {name}...")
    try:
        df = scrape_category(base_url, name)
        print(f"‚úÖ {name}: {len(df)} produits trouv√©s")
        df_list.append(df)
    except Exception as e:
        print(f"‚ùå Erreur pour {name} : {e}")

# Fusion et export final
df_final = pd.concat(df_list, ignore_index=True)

# Ajout d'un score
df_final['score'] = df_final['rating'] * df_final['votes']

# Sauvegarde
df_final.to_csv("amazon_best_v7.csv", index=False)
driver.quit()
print("‚úÖ Fichier amazon_best_v7.csv g√©n√©r√© avec succ√®s.")

üîç Scraping High-tech...
‚úÖ High-tech: 60 produits trouv√©s
üîç Scraping Beaut√©...
‚úÖ Beaut√©: 60 produits trouv√©s
üîç Scraping Cuisine...
‚úÖ Cuisine: 60 produits trouv√©s
üîç Scraping Mode...
‚úÖ Mode: 60 produits trouv√©s
üîç Scraping Animalerie...
‚úÖ Animalerie: 60 produits trouv√©s
üîç Scraping Jeux et Jouets...
‚úÖ Jeux et Jouets: 60 produits trouv√©s
üîç Scraping Jeux vid√©o...
‚úÖ Jeux vid√©o: 60 produits trouv√©s
‚úÖ Fichier amazon_best_v7.csv g√©n√©r√© avec succ√®s.


In [29]:
df_v7 = pd.read_csv('amazon_best_v7.csv')
df_v7.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420 entries, 0 to 419
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   title       414 non-null    object 
 1   price       407 non-null    float64
 2   rating      415 non-null    float64
 3   votes       56 non-null     float64
 4   url         420 non-null    object 
 5   category    420 non-null    object 
 6   rank        420 non-null    int64  
 7   scraped_at  420 non-null    object 
 8   score       56 non-null     float64
dtypes: float64(4), int64(1), object(4)
memory usage: 29.7+ KB


In [32]:
df_v7.sample(50)

Unnamed: 0,title,price,rating,votes,url,category,rank,scraped_at,score
389,Animal Crossing : New Horizons pour Nintendo S...,44.16,4.8,,https://www.amazon.fr/Animal-Crossing-Horizons...,Jeux vid√©o,30,2025-06-20 14:57:43,
318,,,,,https://www.amazon.fr/Pok%C3%A9mon-JCC-Collect...,Jeux et Jouets,19,2025-06-20 14:57:24,
193,"SINOPHANT Legging Femmes Pantalon de Sport, Je...",11.99,4.5,,https://www.amazon.fr/SINOPHANT-Legging-Femmes...,Mode,14,2025-06-20 14:56:49,
204,QINCAO Boxers Homme Lot de 6 Coton Pas d'√©tiqu...,24.99,4.5,,https://www.amazon.fr/QINCAO-sous-v%C3%AAtemen...,Mode,25,2025-06-20 14:56:50,
70,GARNIER Ambre Solaire - Natural Bronzer - Mous...,9.49,4.2,,https://www.amazon.fr/Garnier-Ambre-Solaire-Na...,Beaut√©,11,2025-06-20 14:56:10,
219,Enlision Hommes Chemises en Lin Solid Henley S...,25.99,4.1,,https://www.amazon.fr/Enlision-Henley-Chemise-...,Mode,60,2025-06-20 14:56:56,
125,"Dreo 20dB ventilateur colonne silencieux, Vite...",89.99,4.6,,https://www.amazon.fr/Dreo-Ventilateur-tour-av...,Cuisine,6,2025-06-20 14:56:31,
390,"Ozeino Casque Gaming pour PS5 PC PS4, Casque G...",19.99,4.5,,https://www.amazon.fr/Ozeino-Couleurs-Transduc...,Jeux vid√©o,51,2025-06-20 14:57:47,
191,Dim Chaussettes Ecodim Talon/Pointes Renforc√©s...,10.5,4.4,,https://www.amazon.fr/Dim-Ecodim-Chaussettes-p...,Mode,12,2025-06-20 14:56:48,
266,"FRONTLINE Spot on Chien - Anti Puces, Anti-Tiq...",18.74,4.3,,https://www.amazon.fr/FRONTLINE-Spot-Chien-Ant...,Animalerie,27,2025-06-20 14:57:07,


In [33]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time
from datetime import datetime
import re

# Configuration de Selenium (Chrome en headless)
options = Options()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Liste des URLs des best sellers Amazon France (page 1 uniquement)
category_urls = {
    "High-tech": "https://www.amazon.fr/gp/bestsellers/electronics",
    "Beaut√©": "https://www.amazon.fr/gp/bestsellers/beauty",
    "Cuisine": "https://www.amazon.fr/gp/bestsellers/kitchen",
    "Mode": "https://www.amazon.fr/gp/bestsellers/fashion",
    "Animalerie": "https://www.amazon.fr/gp/bestsellers/pet-supplies",
    "Jeux et Jouets": "https://www.amazon.fr/gp/bestsellers/toys",
    "Jeux vid√©o": "https://www.amazon.fr/gp/bestsellers/videogames"
}

def scrape_category(base_url, category):
    all_products = []
    for page in [1, 2]:
        url = f"{base_url}?pg={page}"
        driver.get(url)
        time.sleep(3)

        items = driver.find_elements(By.CSS_SELECTOR, "div.zg-grid-general-faceout")

        if not items:
            print(f"‚ùå Aucun produit trouv√© pour {category} page {page} !")

        for rank, item in enumerate(items, start=1 + (page - 1) * 50):
            try:
                title = item.find_element(By.CSS_SELECTOR, "._cDEzb_p13n-sc-css-line-clamp-3_g3dy1").text
            except:
                try:
                    title = item.find_element(By.CSS_SELECTOR, "._cDEzb_p13n-sc-css-line-clamp-1_1Fn1y").text
                except:
                    title = None

            try:
                price_text = item.find_element(By.CSS_SELECTOR, "span._cDEzb_p13n-sc-price_3mJ9Z").text
                price = float(price_text.replace("\u202f", "").replace("\xa0", "").replace("‚Ç¨", "").replace(",", ".").strip())
            except:
                price = None

            try:
                rating_text = item.find_element(By.CSS_SELECTOR, "span.a-icon-alt").get_attribute("innerText")
                rating = float(rating_text.split()[0].replace(",", "."))
            except:
                rating = None

            # ‚úÖ Nouvelle extraction robuste des votes
            try:
                votes_tag = item.find_element(By.CSS_SELECTOR, "span.a-size-small").text
                votes = int(votes_tag.replace("\u202f", "").replace(",", "").strip())
            except:
                try:
                    aria_label = item.find_element(By.CSS_SELECTOR, "div.a-icon-row").get_attribute("aria-label")
                    match = re.search(r'([0-9\u202f\s]+) √©valuations', aria_label)
                    if match:
                        votes = int(match.group(1).replace("\u202f", "").replace(" ", ""))
                    else:
                        votes = None
                except:
                    votes = None

            try:
                url_product = item.find_element(By.CSS_SELECTOR, "a.a-link-normal").get_attribute("href")
            except:
                url_product = None

            all_products.append({
                "title": title,
                "price": price,
                "rating": rating,
                "votes": votes,
                "url": url_product,
                "category": category,
                "rank": rank,
                "scraped_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            })

    return pd.DataFrame(all_products)

# Scraping de toutes les cat√©gories
df_list = []
for name, base_url in category_urls.items():
    print(f"üîç Scraping {name}...")
    try:
        df = scrape_category(base_url, name)
        print(f"‚úÖ {name}: {len(df)} produits trouv√©s")
        df_list.append(df)
    except Exception as e:
        print(f"‚ùå Erreur pour {name} : {e}")

# Fusion et export final
df_final = pd.concat(df_list, ignore_index=True)

# Ajout d'un score pond√©r√©
df_final['score'] = df_final['rating'] * df_final['votes']

# Sauvegarde
df_final.to_csv("amazon_best_v8.csv", index=False)
driver.quit()
print("‚úÖ Fichier amazon_best_v8.csv g√©n√©r√© avec succ√®s.")


üîç Scraping High-tech...
‚úÖ High-tech: 60 produits trouv√©s
üîç Scraping Beaut√©...
‚úÖ Beaut√©: 60 produits trouv√©s
üîç Scraping Cuisine...
‚úÖ Cuisine: 60 produits trouv√©s
üîç Scraping Mode...
‚úÖ Mode: 60 produits trouv√©s
üîç Scraping Animalerie...
‚úÖ Animalerie: 60 produits trouv√©s
üîç Scraping Jeux et Jouets...
‚úÖ Jeux et Jouets: 60 produits trouv√©s
üîç Scraping Jeux vid√©o...
‚úÖ Jeux vid√©o: 60 produits trouv√©s
‚úÖ Fichier amazon_best_v8.csv g√©n√©r√© avec succ√®s.


In [36]:
df_v8 = pd.read_csv('amazon_best_v8.csv')
df_v8.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420 entries, 0 to 419
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   title       414 non-null    object 
 1   price       407 non-null    float64
 2   rating      415 non-null    float64
 3   votes       56 non-null     float64
 4   url         420 non-null    object 
 5   category    420 non-null    object 
 6   rank        420 non-null    int64  
 7   scraped_at  420 non-null    object 
 8   score       56 non-null     float64
dtypes: float64(4), int64(1), object(4)
memory usage: 29.7+ KB


In [41]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time
from datetime import datetime
import re

# Configuration de Selenium (Chrome en headless)
options = Options()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Liste des URLs des best sellers Amazon France (page 1 uniquement)
category_urls = {
    "High-tech": "https://www.amazon.fr/gp/bestsellers/electronics",
    "Beaut√©": "https://www.amazon.fr/gp/bestsellers/beauty",
    "Cuisine": "https://www.amazon.fr/gp/bestsellers/kitchen",
    "Mode": "https://www.amazon.fr/gp/bestsellers/fashion",
    "Animalerie": "https://www.amazon.fr/gp/bestsellers/pet-supplies",
    "Jeux et Jouets": "https://www.amazon.fr/gp/bestsellers/toys",
    "Jeux vid√©o": "https://www.amazon.fr/gp/bestsellers/videogames"
}

def scrape_category(base_url, category):
    all_products = []
    for page in [1, 2]:
        url = f"{base_url}?pg={page}"
        driver.get(url)
        time.sleep(3)

        items = driver.find_elements(By.CSS_SELECTOR, "div.zg-grid-general-faceout")

        if not items:
            print(f"‚ùå Aucun produit trouv√© pour {category} page {page} !")

        for rank, item in enumerate(items, start=1 + (page - 1) * 50):
            try:
                title = item.find_element(By.CSS_SELECTOR, "._cDEzb_p13n-sc-css-line-clamp-3_g3dy1").text
            except:
                try:
                    title = item.find_element(By.CSS_SELECTOR, "._cDEzb_p13n-sc-css-line-clamp-1_1Fn1y").text
                except:
                    title = None

            try:
                price_text = item.find_element(By.CSS_SELECTOR, "span._cDEzb_p13n-sc-price_3mJ9Z").text
                price = float(price_text.replace("\u202f", "").replace("\xa0", "").replace("‚Ç¨", "").replace(",", ".").strip())
            except:
                price = None

            try:
                rating_text = item.find_element(By.CSS_SELECTOR, "span.a-icon-alt").get_attribute("innerText")
                rating = float(rating_text.split()[0].replace(",", "."))
            except:
                rating = None

            # ‚úÖ Extraction du nombre de votes avec plusieurs tentatives
            votes = None
            try:
                # M√©thode 1 : balise visible directe
                votes_tag = item.find_element(By.CSS_SELECTOR, "span.a-size-small").text
                votes = int(votes_tag.replace("\u202f", "").replace(",", "").strip())
            except:
                try:
                    # M√©thode 2 : via aria-label complet
                    aria_label = item.find_element(By.CSS_SELECTOR, "div.a-icon-row").get_attribute("aria-label")
                    match = re.search(r'(\d[\d\u202f\s]*)\s*\u00e9valuations', aria_label.lower())
                    if match:
                        votes = int(match.group(1).replace("\u202f", "").replace(" ", ""))
                except:
                    pass

            try:
                url_product = item.find_element(By.CSS_SELECTOR, "a.a-link-normal").get_attribute("href")
            except:
                url_product = None

            all_products.append({
                "title": title,
                "price": price,
                "rating": rating,
                "votes": votes,
                "url": url_product,
                "category": category,
                "rank": rank,
                "scraped_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            })

    return pd.DataFrame(all_products)

# Scraping de toutes les cat√©gories
df_list = []
for name, base_url in category_urls.items():
    print(f"üîç Scraping {name}...")
    try:
        df = scrape_category(base_url, name)
        print(f"‚úÖ {name}: {len(df)} produits trouv√©s")
        df_list.append(df)
    except Exception as e:
        print(f"‚ùå Erreur pour {name} : {e}")

# Fusion et export final
df_final = pd.concat(df_list, ignore_index=True)

# Ajout d'un score pond√©r√© si les deux colonnes existent
df_final['score'] = df_final.apply(lambda row: row['rating'] * row['votes'] if pd.notnull(row['votes']) and pd.notnull(row['rating']) else None, axis=1)

# Sauvegarde
df_final.to_csv("amazon_best_v10.csv", index=False)
with open("amazon_best_v10.html", "w", encoding="utf-8") as f:
    f.write(driver.page_source)
driver.quit()

print("‚úÖ Fichier amazon_best_v10.csv g√©n√©r√© avec succ√®s.")

üîç Scraping High-tech...
‚úÖ High-tech: 60 produits trouv√©s
üîç Scraping Beaut√©...
‚úÖ Beaut√©: 60 produits trouv√©s
üîç Scraping Cuisine...
‚úÖ Cuisine: 60 produits trouv√©s
üîç Scraping Mode...
‚úÖ Mode: 60 produits trouv√©s
üîç Scraping Animalerie...
‚úÖ Animalerie: 60 produits trouv√©s
üîç Scraping Jeux et Jouets...
‚úÖ Jeux et Jouets: 60 produits trouv√©s
üîç Scraping Jeux vid√©o...
‚úÖ Jeux vid√©o: 60 produits trouv√©s
‚úÖ Fichier amazon_best_v10.csv g√©n√©r√© avec succ√®s.


In [43]:
df_v10 = pd.read_csv('amazon_best_v10.csv')
df_v10.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420 entries, 0 to 419
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   title       415 non-null    object 
 1   price       406 non-null    float64
 2   rating      416 non-null    float64
 3   votes       56 non-null     float64
 4   url         420 non-null    object 
 5   category    420 non-null    object 
 6   rank        420 non-null    int64  
 7   scraped_at  420 non-null    object 
 8   score       56 non-null     float64
dtypes: float64(4), int64(1), object(4)
memory usage: 29.7+ KB


In [44]:
df_v10.head()

Unnamed: 0,title,price,rating,votes,url,category,rank,scraped_at,score
0,Apple AirTag,29.99,4.6,,https://www.amazon.fr/Apple-MX532ZMA-Nouveau-A...,High-tech,1,2025-06-20 16:08:23,
1,Imou 2K(3MP) Cam√©ra Surveillance WiFi Int√©rieu...,18.99,4.4,,https://www.amazon.fr/Imou-Surveillance-Int%C3...,High-tech,2,2025-06-20 16:08:23,
2,Tapo 2K(3MP) Cam√©ra Surveillance WiFi int√©rieu...,18.99,4.6,,https://www.amazon.fr/Tapo-Surveillance-int%C3...,High-tech,3,2025-06-20 16:08:24,
3,Amazon Fire TV Stick HD (Nouvelle g√©n√©ration) ...,44.99,4.6,,https://www.amazon.fr/fire-tv-stick-hd/dp/B0CQ...,High-tech,4,2025-06-20 16:08:24,
4,DURACELL CR2032 Piles Boutons au lithium 3V (l...,4.75,4.7,,https://www.amazon.fr/DURACELL-CR2032-Piles-Bo...,High-tech,5,2025-06-20 16:08:24,


In [55]:
import time
from datetime import datetime
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

# Configuration du navigateur
options = Options()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

url = "https://www.amazon.fr/s?i=electronics&srs=4551203031&rh=n%3A4551203031&s=popularity-rank&fs=true&ref=lp_4551203031_sar"
driver.get(url)
time.sleep(3)

data = []
page = 1

while True:
    print(f"Scraping page {page}...")
    cards = driver.find_elements(By.XPATH, '//div[contains(@data-component-type, "s-search-result")]')

    for rank, card in enumerate(cards, start=1):
        try:
            title = card.find_element(By.XPATH, ".//h2//span").text.strip()
        except:
            title = None

        try:
            price_text = card.find_element(By.CSS_SELECTOR, "span.a-price > span.a-offscreen").text.strip()
            price = float(price_text.replace("‚ÄØ", "").replace(" ", "").replace("EUR", "").replace(",", "."))
        except:
            price = None

        try:
            rating_text = card.find_element(By.CSS_SELECTOR, "span.a-icon-alt").get_attribute("innerHTML")
            rating = float(rating_text.split()[0].replace(',', '.'))
        except:
            rating = None

        try:
            votes_text = card.find_element(By.XPATH, ".//span[@class='a-size-base' or @class='a-size-small']").text
            votes = int(votes_text.replace("\u202f", "").replace(",", "").replace(" ", ""))
        except:
            votes = None

        try:
            score = round(rating * votes, 1) if rating and votes else None
        except:
            score = None

        try:
            brand = card.find_element(By.XPATH, ".//span[@class='a-size-base-plus a-color-base']").text
        except:
            brand = None

        try:
            product_url = card.find_element(By.XPATH, ".//h2/a").get_attribute("href")
        except:
            product_url = None

        try:
            image_url = card.find_element(By.XPATH, ".//img").get_attribute("src")
        except:
            image_url = None

        data.append({
            "title": title,
            "price": price,
            "rating": rating,
            "votes": votes,
            "score": score,
            "brand": brand,
            "url": product_url,
            "image_url": image_url,
            "category": "High-Tech > Objets connect√©s",
            "rank": rank,
            "scraped_at": datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        })

    # Tentative d'aller √† la page suivante
    try:
        next_button = driver.find_element(By.XPATH, '//a[contains(@class,"s-pagination-next")]')
        if 'disabled' in next_button.get_attribute("class"):
            break
        else:
            driver.execute_script("arguments[0].scrollIntoView();", next_button)
            time.sleep(2)
            next_button.click()
            page += 1
            time.sleep(3)
    except:
        print("Fin de pagination ou erreur")
        break

# Fermeture du navigateur
driver.quit()

# Enregistrement des donn√©es
df = pd.DataFrame(data)
df.to_csv("amazon_hightech_objets_connectes_v12.csv", index=False)
print(df.info())


Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
Scraping page 11...
Scraping page 12...
Scraping page 13...
Scraping page 14...
Scraping page 15...
Scraping page 16...
Scraping page 17...
Scraping page 18...
Scraping page 19...
Scraping page 20...
Scraping page 21...
Scraping page 22...
Scraping page 23...
Scraping page 24...
Scraping page 25...
Scraping page 26...
Scraping page 27...
Scraping page 28...
Scraping page 29...
Scraping page 30...
Scraping page 31...
Scraping page 32...
Scraping page 33...
Scraping page 34...
Scraping page 35...
Scraping page 36...
Scraping page 37...
Scraping page 38...
Scraping page 39...
Scraping page 40...
Scraping page 41...
Scraping page 42...
Scraping page 43...
Scraping page 44...
Scraping page 45...
Scraping page 46...
Scraping page 47...
Scraping page 48...
Scraping page 49...
Scraping page 50...
Scraping 

In [None]:
df_v12 = pd.read_csv('amazon_hightech_objets_connectes_v12.csv')
df_v12.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 135 entries, 0 to 134
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   title       0 non-null      float64
 1   price       118 non-null    float64
 2   rating      0 non-null      float64
 3   votes       46 non-null     float64
 4   score       0 non-null      float64
 5   brand       0 non-null      float64
 6   url         0 non-null      float64
 7   image_url   120 non-null    object 
 8   category    135 non-null    object 
 9   rank        135 non-null    int64  
 10  scraped_at  135 non-null    object 
dtypes: float64(7), int64(1), object(3)
memory usage: 11.7+ KB
