# Scraper le prix neuf des voitures

## 1. Importer la liste des modèles


In [81]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.service import Service
from webdriver_manager.microsoft import EdgeChromiumDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.action_chains import ActionChains
from datetime import datetime
import sys
import os
import glob
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from src.scraping.scraping import init_driver
from src.scraping.scraping import accept_popup_general

In [2]:
# Load the processed list of car models; the path is relative to the notebook's working directory.
df_model = pd.read_csv('../data/processed_data/modeles_voitures.csv')

In [3]:
df_model.head()

Unnamed: 0,marque,modele,finition,annee
0,Abarth,Abarth 124 Spider,1.4 Turbo Turismo BVA,2019.0
1,Abarth,Abarth 500,1.4 Turbo T-Jet 595,2020.0
2,Abarth,Abarth 500,1.4 Turbo T-Jet 595,2022.0
3,Abarth,Abarth 500,1.4 Turbo T-Jet 595 Pista,2018.0
4,Abarth,Abarth 500,1.4 Turbo T-Jet 595 Turismo,2020.0


In [4]:
# Check missing values
df_model.isnull().sum()

marque      1
modele      1
finition    1
annee       1
dtype: int64

In [5]:
# Drop every row that contains at least one missing value.
df_model = df_model.dropna()

In [6]:
# "annee" is displayed as a float (e.g. 2019.0) in the preview above; now that rows
# with missing values have been dropped it can be cast to a plain integer year.
df_model["annee"] = df_model["annee"].astype(int)

In [7]:
df_model

Unnamed: 0,marque,modele,finition,annee
0,Abarth,Abarth 124 Spider,1.4 Turbo Turismo BVA,2019
1,Abarth,Abarth 500,1.4 Turbo T-Jet 595,2020
2,Abarth,Abarth 500,1.4 Turbo T-Jet 595,2022
3,Abarth,Abarth 500,1.4 Turbo T-Jet 595 Pista,2018
4,Abarth,Abarth 500,1.4 Turbo T-Jet 595 Turismo,2020
...,...,...,...,...
1608,Volvo,Volvo V40,2.0 D2 Inscription,2018
1609,Volvo,Volvo V40 Cross Country,Cross Country 1.5 T3 Momentum Geartronic 6,2019
1610,Volvo,Volvo XC40,1.5 T3 R-Design Geartronic 8,2019
1611,Volvo,Volvo XC40,2.0 D4 AWD AdBlue R-Design Geartronic 8,2019


In [8]:
# Unique (marque, modele, annee) combinations, sorted, with a fresh 0..n-1 index.
df = (
    df_model[["marque", "modele", "annee"]]
    .drop_duplicates()
    .sort_values(by=["marque", "modele", "annee"])
    .reset_index(drop=True)
)

In [9]:
df.head()

Unnamed: 0,marque,modele,annee
0,Abarth,Abarth 124 Spider,2019
1,Abarth,Abarth 500,2018
2,Abarth,Abarth 500,2019
3,Abarth,Abarth 500,2020
4,Abarth,Abarth 500,2022


In [10]:
# Show the distinct brand labels found in the dataset, then how many there are.
print(df["marque"].unique())
print(len(df["marque"].unique()))

['Abarth' 'Alfa' 'Audi' 'BMW' 'Citroen' 'Cupra' 'DS' 'Dacia' 'Fiat' 'Ford'
 'Honda' 'Hyundai' 'Infiniti' 'Jaguar' 'Jeep' 'Kia' 'Land' 'Lexus' 'MG'
 'MINI' 'Mazda' 'Mercedes-Benz' 'Mitsubishi' 'Nissan' 'Opel' 'Peugeot'
 'Renault' 'Seat' 'Skoda' 'Smart' 'Suzuki' 'Toyota' 'Volkswagen' 'Volvo']
34


In [11]:
# Align the "marque" labels with the brand names used on the Caradisiac website.

# Brands whose dataset label is already the label expected on the website.
marques_inchangees = [
    'Abarth', 'Audi', 'BMW', 'Citroen', 'Cupra', 'DS', 'Dacia', 'Fiat', 'Ford',
    'Honda', 'Hyundai', 'Infiniti', 'Jaguar', 'Jeep', 'Kia', 'Lexus', 'MG',
    'MINI', 'Mazda', 'Mitsubishi', 'Nissan', 'Opel', 'Peugeot', 'Renault',
    'Seat', 'Skoda', 'Smart', 'Suzuki', 'Toyota', 'Volkswagen', 'Volvo',
]
dict_name_marque = {marque: marque for marque in marques_inchangees}

# Brands whose dataset label differs from the website label.
dict_name_marque.update({
    'Alfa': 'Alfa Romeo',
    'Land': 'Land Rover',
    'Mercedes-Benz': 'Mercedes',
})

df['marque'] = df['marque'].replace(dict_name_marque)

# Check the result: distinct labels after renaming, and their count.
print(df["marque"].unique())
print(len(df["marque"].unique()))

['Abarth' 'Alfa Romeo' 'Audi' 'BMW' 'Citroen' 'Cupra' 'DS' 'Dacia' 'Fiat'
 'Ford' 'Honda' 'Hyundai' 'Infiniti' 'Jaguar' 'Jeep' 'Kia' 'Land Rover'
 'Lexus' 'MG' 'MINI' 'Mazda' 'Mercedes' 'Mitsubishi' 'Nissan' 'Opel'
 'Peugeot' 'Renault' 'Seat' 'Skoda' 'Smart' 'Suzuki' 'Toyota' 'Volkswagen'
 'Volvo']
34


In [12]:
# Count the rows of df (one per marque/modele/annee combination) for each brand,
# then order the result alphabetically by brand name.
freq_marque = df["marque"].value_counts().reset_index().sort_values(by='marque')
freq_marque

Unnamed: 0,marque,count
25,Abarth,7
21,Alfa Romeo,10
5,Audi,45
8,BMW,35
1,Citroen,53
28,Cupra,4
19,DS,13
15,Dacia,19
12,Fiat,29
6,Ford,37


## 2. Essai avec requests et BeautifulSoup

In [10]:
# Function to scrape car catalogue prices from Caradisiac
def clean_model_name(model):
    """Turn a model label into a lower-case, hyphen-separated URL slug.

    Every character that is neither a word character nor whitespace is
    removed (this includes hyphens, so "T-Roc" becomes "troc"), the text is
    stripped and lower-cased, and each run of whitespace becomes one hyphen.

    Args:
        model: Model label, e.g. "Abarth 124 Spider".

    Returns:
        The slug, e.g. "abarth-124-spider".
    """
    # Drop punctuation first: removing a symbol that sits between two spaces
    # leaves a double space behind, which the next step collapses.
    without_punctuation = re.sub(r'[^\w\s]', '', model).strip().lower()
    # One hyphen per whitespace run, so the slug never contains "--".
    return re.sub(r'\s+', '-', without_punctuation)

def scrape_caradisiac_price(modele, annee):
    """Fetch the 'listingTab' table of a Caradisiac model/year page.

    The URL is guessed from the model name. If the first guess does not answer
    with HTTP 200, a few alternative slugs are tried (underscores removed,
    "-2e-generation" suffix, "-1" to "-5" suffixes) and the first one that
    answers 200 is used.

    Args:
        modele: Model label; it is converted with clean_model_name().
        annee: Year appended to the URL.

    Returns:
        A DataFrame holding the table rows, with the table headers as columns
        plus a 'url' column, or None when no page or table could be read.
    """
    base_url = "https://www.caradisiac.com/fiches-techniques/modele--"
    # Without a timeout, requests.get() can block forever on a stalled connection.
    request_timeout = 30

    # Build search URL
    clean_model = clean_model_name(modele)
    search_url = f"{base_url}{clean_model}/{str(annee)}"

    print(f"Search url original pattern is: {search_url}")

    try:
        # First attempt with the original pattern (the delay spares the server)
        time.sleep(5)
        response = requests.get(search_url, timeout=request_timeout)

        if response.status_code != 200:
            # Try alternative URL patterns
            stem = clean_model.replace('_', '')
            alternative_models = (
                [stem, f"{stem}-2e-generation"]
                + [f"{stem}-{i}" for i in range(1, 6)]
            )

            for alt_model in alternative_models:
                time.sleep(5)
                alt_url = f"{base_url}{alt_model}/{str(annee)}"
                response = requests.get(alt_url, timeout=request_timeout)
                if response.status_code == 200:
                    search_url = alt_url
                    break
            else:
                # No candidate worked: do not parse an error page.
                print(f"Error scraping {modele}: no URL pattern returned HTTP 200")
                return None

        # `response` already holds the page of the retained URL, so it is
        # parsed directly instead of being downloaded a second time.
        print(f"L'url final est: {search_url}")
        soup = BeautifulSoup(response.text, 'html.parser')

        # The versions are listed in the first table with class 'listingTab'
        table = soup.find('table', class_='listingTab')
        if table is None:
            print(f"Error scraping {modele}: no 'listingTab' table at {search_url}")
            return None

        rows = table.find_all('tr')
        if not rows:
            print(f"Error scraping {modele}: empty 'listingTab' table at {search_url}")
            return None

        # Headers come from the first row, data from the remaining ones
        headers = [th.get_text(strip=True) for th in rows[0].find_all('th')]
        data = []
        for row in rows[1:]:
            row_data = [col.get_text(strip=True) for col in row.find_all('td')]
            if row_data:  # Only add non-empty rows
                data.append(row_data)

        df_versions = pd.DataFrame(data, columns=headers)
        df_versions['url'] = search_url
        return df_versions
    except Exception as e:
        # Best effort: any network or parsing failure is reported and turned
        # into None so that a batch run can continue with the next model.
        print(f"Error scraping {modele}: {str(e)}")
        return None

In [11]:
# # Create an empty DataFrame to store all results
# df_all = pd.DataFrame()

# # Add progress tracking
# total_models = len(df)
# print(f"Starting to scrape {total_models} models...")

# for idx, row in df.iterrows():
#     try:
#         print(f"Processing {idx+1}/{total_models}: {row['modele']} ({row['annee']})")
#         df_version = scrape_caradisiac_price(row['modele'], row['annee'])
        
#         if df_version is not None:
#             # Add model and year columns to identify the source
#             df_version.insert(0, 'source_model', row['modele'])
#             df_version.insert(1, 'source_year', row['annee'])
#             df_all = pd.concat([df_all, df_version], ignore_index=True)
            
#     except Exception as e:
#         print(f"Error processing {row['modele']} ({row['annee']}): {str(e)}")
#         continue

# print(f"\nScraping completed. Total entries collected: {len(df_all)}")

## 3. Essai avec Selenium pour retrouver les bonnes URL


In [None]:
def select_option_contain(select_element, partial_text):
    """Select the dropdown option that best matches ``partial_text``.

    Accented letters in ``partial_text`` are first replaced by their base
    letter. Options whose ``value`` attribute is missing or blank
    (placeholders) are ignored. Matching is case-insensitive and done in two
    passes:

    1. exact match between the text and an option label;
    2. otherwise, among the options where one string contains the other, the
       option sharing the most characters with the text (first one wins ties).

    Args:
        select_element: The <select> element, as accepted by selenium's Select.
        partial_text: Text to look for among the option labels.

    Returns:
        A tuple ``(option_label, match_type)`` where match_type is
        "Exact match" or "Partial match", or None when nothing matches.
        Callers that unpack the result must therefore handle None.
    """
    select = Select(select_element)

    accents = {
            'a': r'[àáâãäå]',
            'e': r'[èéêë]',
            'i': r'[ìíîï]',
            'o': r'[òóôõö]',
            'u': r'[ùúûü]',
            'c': r'[ç]',
            'n': r'[ñ]',
            'A': r'[ÀÁÂÃÄÅ]',
            'E': r'[ÈÉÊË]',
            'I': r'[ÌÍÎÏ]',
            'O': r'[ÒÓÔÕÖ]',
            'U': r'[ÙÚÛÜ]',
            'C': r'[Ç]',
            'N': r'[Ñ]',
        }
    # Replace accented letters by their unaccented equivalent
    for remplacement, pattern in accents.items():
        partial_text = re.sub(pattern, remplacement, partial_text)
    print(f"Partial text: {partial_text}")

    wanted = partial_text.strip().lower()

    # Read the options once. get_attribute() returns None when the attribute
    # is absent, which is treated like a blank placeholder.
    candidates = [
        (option, option.text)
        for option in select.options
        if (option.get_attribute("value") or "").strip() != ""
    ]

    # First check for exact matches
    for _, label in candidates:
        if wanted == label.strip().lower():
            select.select_by_visible_text(label)
            print(f"Found exact match: {label}")
            return label, "Exact match"

    # If no exact match found, keep the containing/contained option that has
    # the most characters in common with the searched text
    best_label = None
    max_common_chars = 0
    for _, label in candidates:
        option_text = label.strip().lower()
        common_chars = sum(1 for c in option_text if c in wanted)
        is_partial = (wanted in option_text) or (option_text in wanted)
        if is_partial and common_chars > max_common_chars:
            max_common_chars = common_chars
            best_label = label

    if best_label is not None:
        select.select_by_visible_text(best_label)
        print(f"Found best partial match: {best_label}")
        return best_label, "Partial match"

    print(f"No match found for: {partial_text}")
    return None


In [14]:
def clean_name(model):
    """Build a lower-case, hyphen-separated slug from a label.

    Accented letters listed below are replaced by their base letter, every
    character other than word characters, whitespace, '-' and '!' is removed,
    the result is stripped and lower-cased, and each space becomes a hyphen.
    """
    variants_by_letter = {
        'a': 'àáâãäå', 'e': 'èéêë', 'i': 'ìíîï', 'o': 'òóôõö', 'u': 'ùúûü',
        'c': 'ç', 'n': 'ñ',
        'A': 'ÀÁÂÃÄÅ', 'E': 'ÈÉÊË', 'I': 'ÌÍÎÏ', 'O': 'ÒÓÔÕÖ', 'U': 'ÙÚÛÜ',
        'C': 'Ç', 'N': 'Ñ',
    }
    # accented character -> plain letter
    plain_letter = {
        accented: plain
        for plain, variants in variants_by_letter.items()
        for accented in variants
    }

    # One pass over the text: each accented character is swapped for its base letter.
    any_accent = '[' + ''.join(plain_letter) + ']'
    unaccented = re.sub(any_accent, lambda hit: plain_letter[hit.group(0)], model)

    # Keep only word characters, whitespace, hyphens and exclamation marks.
    filtered = re.sub(r'[^\w\s\-!]', '', unaccented)

    return filtered.strip().lower().replace(' ', '-')

In [14]:
# list_models = pd.unique(df['modele'])
# print(list_models)
# print([i for i in list_models if "!" in i ])

# list_models_cleaned = [clean_name(model) for model in list_models]
# print(list_models_cleaned)

# list_models_test = [any(char in i for char in 'éèêëàâäôöûüçñ') for i in list_models_cleaned]
# print([i for i in list_models_test if i == True])

In [75]:
def collect_prix_neuf(url, marque, modele, annee):
    """Collect the 'listingTab' tables of every version of a model for a year.

    A browser is opened on ``url`` and the 'brands', 'models' and 'year'
    dropdowns are filled with the best matching options. Every non-empty
    option of the 'modelscomm' dropdown is then treated as a version: its page
    URL is built from the cleaned brand and version labels, the page is opened
    and its table is read.

    Args:
        url: Root URL of the search page; version pages are built under it.
        marque: Brand label to look for in the 'brands' dropdown.
        modele: Model label to look for in the 'models' dropdown.
        annee: Year to look for in the 'year' dropdown (converted to str).

    Returns:
        A DataFrame with the rows of all version tables plus the columns
        'url', 'option_*_select' and 'match_type_*' describing what was
        selected; an empty DataFrame when no version is listed; None when an
        error occurs while selecting options or reading a table.

    Raises:
        Any exception raised while loading the page, accepting the pop-up or
        reloading is propagated. The browser is closed in every case.
    """
    # Initialise the driver
    driver = init_driver()
    try:
        driver.get(url)

        # Accept pop-up
        accept_popup_general(driver, "#didomi-notice-agree-button")
        time.sleep(2)
        driver.execute_script("document.location.reload()")
        wait = WebDriverWait(driver, 10)

        # A trailing slash on `url` would otherwise produce "//modele--..."
        root_url = url.rstrip('/')

        try:
            # Select the dropdown options: brand, model, year.
            # The pauses give the page time to refresh the next dropdown.

            # Brand
            time.sleep(2)
            marque_dropdown = driver.find_element(By.ID, 'brands')
            option_marque, match_type_marque = select_option_contain(marque_dropdown, marque)
            print(f"Marque: {option_marque}")

            # Model
            time.sleep(2)
            modele_dropdown = driver.find_element(By.ID, 'models')
            option_modele, match_type_modele = select_option_contain(modele_dropdown, modele)
            print(f"Modèle: {option_modele}")

            # Year
            time.sleep(2)
            annee_dropdown = driver.find_element(By.ID, 'year')
            option_year, match_type_year = select_option_contain(annee_dropdown, str(annee))

            # Versions: every option of the last dropdown except blank placeholders.
            # Only the labels are kept because the page is left in the loop below.
            time.sleep(2)
            modelscomm_dropdown = driver.find_element(By.ID, 'modelscomm')
            select_modelscomm = Select(modelscomm_dropdown)
            versions = [
                option.text
                for option in select_modelscomm.options
                if option.get_attribute("value").strip() != ""
            ]
            print(f"Liste des versions: {versions}")

            # The brand slug is the same for every version
            option_marque_cleaned = clean_name(option_marque)

            version_frames = []
            for version in versions:
                option_version = clean_name(version)

                search_url_version = f"{root_url}/modele--{option_marque_cleaned}-{option_version}/{option_year}"
                print(f"Extraire la fiche technique de: {version} - année {option_year}")
                print(f"Search url est : {search_url_version}")

                driver.get(search_url_version)

                # Wait for the "listingTab" table
                table = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'table.listingTab')))

                # All rows, header row included
                rows = table.find_elements(By.TAG_NAME, 'tr')
                header_texts = [header.text for header in rows[0].find_elements(By.TAG_NAME, 'th')]

                # Data rows; rows without any <td> cell are skipped
                data = []
                for row in rows[1:]:
                    row_data = [cell.text for cell in row.find_elements(By.TAG_NAME, 'td')]
                    if row_data:
                        data.append(row_data)

                df_version = pd.DataFrame(data, columns=header_texts)

                # Columns recording what was selected and how it was matched
                df_version['url'] = driver.current_url
                df_version['option_marque_select'] = option_marque
                df_version['option_modele_select'] = option_modele
                df_version['option_year_select'] = option_year
                df_version['match_type_marque'] = match_type_marque
                df_version['match_type_modele'] = match_type_modele
                df_version['match_type_year'] = match_type_year

                version_frames.append(df_version)

            if not version_frames:
                return pd.DataFrame()
            # Concatenate once instead of growing a DataFrame inside the loop
            return pd.concat(version_frames, ignore_index=True)
        except Exception as e:
            print (f"URL is not correct")
            print (f"Error is: {e}")
            return None
    finally:
        # Always release the browser, including when the page load or the
        # pop-up handling fails before any scraping starts.
        driver.quit()

In [None]:
# url = "https://www.caradisiac.com/fiches-techniques"
# marque = "BMW"
# modele = "BMW Série 1"
# annee = "2021"
# df_test = collect_prix_neuf (url, marque, modele, annee)

Partial text: BMW
Found exact match: BMW
Marque: BMW
Partial text: BMW Serie 1
Found best partial match: SERIE 1
Modèle: SERIE 1
Partial text: 2021
Found exact match: 2021
Liste des versions: ['SERIE 1 F20 5 PORTES', 'SERIE 1 F21 3 PORTES', 'SERIE 1 F40']
Extraire la fiche technique de: SERIE 1 F20 5 PORTES - année 2021
Search url est : https://www.caradisiac.com/fiches-techniques/modele--bmw-serie-1-f20-5-portes/2021
Extraire la fiche technique de: SERIE 1 F21 3 PORTES - année 2021
Search url est : https://www.caradisiac.com/fiches-techniques/modele--bmw-serie-1-f21-3-portes/2021
Extraire la fiche technique de: SERIE 1 F40 - année 2021
Search url est : https://www.caradisiac.com/fiches-techniques/modele--bmw-serie-1-f40/2021


In [32]:
def scraping_prix_neuf(df, csv_path_ok, csv_path_ko):
    """Scrape every (marque, modele, annee) row of ``df`` and export the results.

    Each row is passed to collect_prix_neuf(). Successful results get two
    leading columns, 'source_model' and 'source_year', and are written to
    ``csv_path_ok``. Rows for which collect_prix_neuf() returns None or raises
    are written to ``csv_path_ko`` with the columns 'modele' and 'annee'.

    Args:
        df: DataFrame with the columns 'marque', 'modele' and 'annee'.
        csv_path_ok: Path of the CSV receiving the collected data.
        csv_path_ko: Path of the CSV receiving the models that failed.

    Returns:
        None. The results are only written to the two CSV files.
    """
    collected = []   # one DataFrame per successfully scraped row
    failures = []    # one {'modele', 'annee'} record per failed row

    # Add progress tracking
    total_models = len(df)
    print(f"⏳ Starting to scrape {total_models} models & years...")

    # url
    url = "https://www.caradisiac.com/fiches-techniques"

    # The counter comes from enumerate so that it stays correct whatever the
    # index labels of `df` are (e.g. a slice that was not re-indexed).
    for position, (_, row) in enumerate(df.iterrows(), start=1):
        try:
            print(f"--> 🔄Processing {position}/{total_models}: {row['modele']} ({row['annee']})")
            df_version = collect_prix_neuf(url, row['marque'], row['modele'], row['annee'])

            if df_version is not None:
                # Add model and year columns to identify the source
                df_version.insert(0, 'source_model', row['modele'])
                df_version.insert(1, 'source_year', row['annee'])
                collected.append(df_version)
                print(f"✅Successfully collected for {row['modele']} ({row['annee']})")
            else:
                print(f"❌Error processing {row['modele']} ({row['annee']})")
                failures.append({'modele': row['modele'], 'annee': row['annee']})
        except Exception as e:
            print(f"❌Error processing {row['modele']} ({row['annee']}): {str(e)}")
            # A model that raised must also appear in the error file,
            # otherwise it is silently missing from both exports.
            failures.append({'modele': row['modele'], 'annee': row['annee']})
            continue

    # Concatenate once instead of growing a DataFrame inside the loop
    df_all = pd.concat(collected, ignore_index=True) if collected else pd.DataFrame()
    # Explicit columns keep a readable header even when nothing failed
    df_error = pd.DataFrame(failures, columns=['modele', 'annee'])

    print(f"\n 🆗 Scraping completed. Total entries collected: {len(df_all)}")
    # Export the data to CSV files
    df_all.to_csv(csv_path_ok, index=False, encoding='utf-8-sig')
    print(f"\n ⚠️ Total models not found: {len(df_error)}")
    df_error.to_csv(csv_path_ko, index=False, encoding='utf-8-sig')
    return None

In [77]:
# Split the original DataFrame into several parts so that the scraping can be run in batches
def split_dataframe_by_size(df, nb_lines, csv_root):
    """Cut ``df`` into consecutive chunks of ``nb_lines`` rows and save each one.

    Chunk number k (starting at 1) is written to
    ``<csv_root>/split_car_models_<k>.csv`` without the index. The last chunk
    may be shorter. ``csv_root`` is created when it does not exist.

    Args:
        df: DataFrame to split.
        nb_lines: Maximum number of rows per chunk; must be a positive integer.
        csv_root: Directory receiving the CSV files.

    Returns:
        The list of chunks, in order.

    Raises:
        ValueError: If ``nb_lines`` is not strictly positive.
    """
    if nb_lines <= 0:
        # A zero step makes range() fail and a negative one yields no chunk at all.
        raise ValueError(f"nb_lines must be a positive integer, got {nb_lines}")

    list_dfs = [df[i:i+nb_lines] for i in range(0, len(df), nb_lines)]

    # Sanity check: the chunks together must hold as many rows as the source
    somme_originale = len(df)
    somme_lignes = sum(len(split) for split in list_dfs)

    print(f"Somme des lignes de la base brute = {somme_originale}")
    print(f"Somme des lignes de tous les splits dataframe = {somme_lignes}")
    print(f'# Data split: {len(list_dfs)}')

    # to_csv() does not create missing directories
    os.makedirs(csv_root, exist_ok=True)

    # Save each split dataframe to CSV
    for i, split_df in enumerate(list_dfs):
        csv_path = os.path.join(csv_root, f'split_car_models_{i+1}.csv')
        split_df.to_csv(csv_path, index=False)
        print(f'Split {i+1} saved to {csv_path}')

    return list_dfs

In [79]:
list_df_split = split_dataframe_by_size(df, 50, "../data/raw_data")

Somme des lignes de la base brute = 752
Somme des lignes de tous les splits dataframe = 752
# Data split: 16
Split 1 saved to ../data/raw_data/split_car_models_1.csv
Split 2 saved to ../data/raw_data/split_car_models_2.csv
Split 3 saved to ../data/raw_data/split_car_models_3.csv
Split 4 saved to ../data/raw_data/split_car_models_4.csv
Split 5 saved to ../data/raw_data/split_car_models_5.csv
Split 6 saved to ../data/raw_data/split_car_models_6.csv
Split 7 saved to ../data/raw_data/split_car_models_7.csv
Split 8 saved to ../data/raw_data/split_car_models_8.csv
Split 9 saved to ../data/raw_data/split_car_models_9.csv
Split 10 saved to ../data/raw_data/split_car_models_10.csv
Split 11 saved to ../data/raw_data/split_car_models_11.csv
Split 12 saved to ../data/raw_data/split_car_models_12.csv
Split 13 saved to ../data/raw_data/split_car_models_13.csv
Split 14 saved to ../data/raw_data/split_car_models_14.csv
Split 15 saved to ../data/raw_data/split_car_models_15.csv
Split 16 saved to ../dat

In [None]:
# Process by pack: scrape each split file into its own output file, so that an
# interrupted run can be resumed without redoing the packs already exported.
path = "../data/raw_data"
pattern = "split_car_models_*.csv"
files = glob.glob(os.path.join(path, pattern))
nb_split_df = len(files)

# glob returns the paths in arbitrary order, so numbering the packs with
# enumerate() would not tie pack N to split N and the "already exists" check
# could skip the wrong split. The pack number is read from the file name.
numbered_files = []
for path_split in files:
    found = re.search(r"split_car_models_(\d+)\.csv$", os.path.basename(path_split))
    if found:
        numbered_files.append((int(found.group(1)), path_split))

# Numeric sort: split 2 is processed before split 10
for split_number, path_split in sorted(numbered_files):
    csv_path_ok = '../data/processed_data/prix_neuf_voitures_pack' + str(split_number) + '.csv'

    # Only execute if output file doesn't exist
    if not os.path.exists(csv_path_ok):
        data = pd.read_csv(path_split)
        csv_path_ko = '../data/processed_data/prix_neuf_voitures_pack' + str(split_number) + '_error.csv'

        # Run scraping
        scraping_prix_neuf(data, csv_path_ok, csv_path_ko)
    else:
        print(f"File {csv_path_ok} already exists, skipping...")

⏳ Starting to scrape 50 models & years...
--> 🔄Processing 1/50: Abarth 124 Spider (2019)
Liste des versions: ['124 (2E GENERATION) SPIDER']
Extraire la fiche technique de: 124 (2E GENERATION) SPIDER - année 2019
Search url est : https://www.caradisiac.com/fiches-techniques/modele--abarth-124-2e-generation-spider/2019
✅Successfully collected for Abarth 124 Spider (2019)
--> 🔄Processing 2/50: Abarth 500 (2018)
Liste des versions: ['500 (2E GENERATION)', '500 C (2E GENERATION)']
Extraire la fiche technique de: 500 (2E GENERATION) - année 2018
Search url est : https://www.caradisiac.com/fiches-techniques/modele--abarth-500-2e-generation/2018
Extraire la fiche technique de: 500 C (2E GENERATION) - année 2018
Search url est : https://www.caradisiac.com/fiches-techniques/modele--abarth-500-c-2e-generation/2018
✅Successfully collected for Abarth 500 (2018)
--> 🔄Processing 3/50: Abarth 500 (2019)
Liste des versions: ['500 (2E GENERATION)', '500 C (2E GENERATION)']
Extraire la fiche technique de

In [None]:
# Liste des modèles qui ne sont pas collectés

# 

In [None]:
# url = "https://www.caradisiac.com/fiches-techniques"
# marque = "BMW"
# modele = "BMW Série 1"
# annee = "2019"
# df_test1 = collect_prix_neuf (url, marque, modele, annee)