# Scraper le prix neuf des voitures

## 1. Importer les librairies

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.service import Service
from webdriver_manager.microsoft import EdgeChromiumDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from datetime import datetime
import sys
import os
import glob
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from src.scraping.scraping_annonces import init_driver
from src.scraping.scraping_annonces import accept_popup_general

In [2]:
data_path = "../data/raw_data/autohero.csv"
df_annonces = pd.read_csv(data_path, sep=",", encoding="utf-8")

In [3]:
df_annonces.head()

Unnamed: 0,scraped_at,modele,finition,prix,annee_mise_en_circulation,kilometrage,carburant,transmission,puissance,nb_ancien_proprietaire,classe_vehicule,nb_porte,nb_place,couleur,sellerie,classe_emission,emission_CO2,crit_air,usage_commerciale_anterieure,url_annonce
0,2025-04-09,Ford Fiesta,1.0 EcoBoost ST-Line X,13 190 €,15.05.2020,69 301 km,Essence,Boite de vitesse manuelle,95 CV / 70 kW,3,Citadine,5.0,5.0,Gris,Tissu (Sellerie d'origine),EURO 6,,Crit'Air 1,,https://www.autohero.com/fr/ford-fiesta/id/516...
1,2025-04-09,Toyota ProAce,Combi Long 1.5 D-4D Dynamic,23 990 €,29.04.2021,71 887 km,Diesel,Boite de vitesse manuelle,120 CV / 88 kW,2,Monospace,4.0,9.0,Gris,Tissu (Sellerie d'origine),EURO 6,170 g/km,Crit'Air 2,Oui,https://www.autohero.com/fr/toyota-pro-ace/id/...
2,2025-04-09,Mercedes-Benz GLA,250 e AMG Line 8G-DCT,32 490 €,23.10.2020,59 649 km,Hybride,Double embrayage / DCT,218 CV / 160 kW,2,SUV,5.0,5.0,Gris,Mi-cuir (Sellerie d'origine),EURO 6,32 g/km,Crit'Air 1,Oui,https://www.autohero.com/fr/mercedes-benz-gla/...
3,2025-04-09,BMW X1,sDrive18i xLine DKG7,27 890 €,05.05.2021,37 869 km,Essence,Double embrayage / DCT,136 CV / 100 kW,2,SUV,5.0,5.0,Noir,Mi-cuir (Sellerie d'origine),EURO 6,148 g/km,Crit'Air 1,Non,https://www.autohero.com/fr/bmw-x-1/id/490f203...
4,2025-04-09,Peugeot 3008,1.5 Blue-HDi Crossway EAT8,19 090 €,31.12.2019,58 958 km,Diesel,Boite de vitesse automatique,130 CV / 96 kW,3,SUV,5.0,5.0,Blanc,Mi-cuir (Sellerie d'origine),EURO 6,98 g/km,Crit'Air 2,Oui,https://www.autohero.com/fr/peugeot-3008/id/df...


In [4]:
df_annonces.shape

(2352, 20)

Il y a 2352 lignes et 22 colonnes

In [5]:
df_annonces.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2352 entries, 0 to 2351
Data columns (total 20 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   scraped_at                    2352 non-null   object 
 1   modele                        2342 non-null   object 
 2   finition                      2342 non-null   object 
 3   prix                          2342 non-null   object 
 4   annee_mise_en_circulation     2342 non-null   object 
 5   kilometrage                   2342 non-null   object 
 6   carburant                     2342 non-null   object 
 7   transmission                  2342 non-null   object 
 8   puissance                     2342 non-null   object 
 9   nb_ancien_proprietaire        2342 non-null   object 
 10  classe_vehicule               2342 non-null   object 
 11  nb_porte                      2342 non-null   float64
 12  nb_place                      2342 non-null   float64
 13  cou

In [6]:
# Extraire l'année de mise en circulation et la convertir en int
df_annonces["annee"] = (df_annonces["annee_mise_en_circulation"]
               .str.extract(r'(\d{4})')[0]
               .astype('Int64'))                                  

In [7]:
# Extraire la marque de la voiture et l'insérer à côté de la colonne "modele"
# en utilisant la première partie de la chaîne avant l'espace
df_annonces.insert(df_annonces.columns.get_loc("modele"), "marque", df_annonces["modele"].str.split(" ").str[0])

## 2. Base des modèles à récupérer le prix neuf  
On a besoin de:
- Nom de modèle  
- Finition  
- Année  

In [8]:
# Liste des Modèle, finition et année
df_model = df_annonces[["marque", "modele", "finition", "annee"]].value_counts().reset_index(name="nb_occurrences")
df_model = df_model.sort_values(by=["marque", "modele", "finition", "annee"])
df_model.reset_index(drop=True, inplace=True)
df_model.to_csv("../data/processed_data/modeles_voitures.csv", index=False)

In [9]:
# df_model = pd.read_csv('../data/processed_data/modeles_voitures.csv')

In [10]:
df_model

Unnamed: 0,marque,modele,finition,annee,nb_occurrences
0,Abarth,Abarth 124 Spider,1.4 Turbo Turismo BVA,2019,1
1,Abarth,Abarth 500,1.4 Turbo T-Jet 595,2020,2
2,Abarth,Abarth 500,1.4 Turbo T-Jet 595,2022,1
3,Abarth,Abarth 500,1.4 Turbo T-Jet 595 Pista,2018,1
4,Abarth,Abarth 500,1.4 Turbo T-Jet 595 Turismo,2020,1
...,...,...,...,...,...
1608,Volvo,Volvo V40,2.0 D2 Inscription,2018,1
1609,Volvo,Volvo V40 Cross Country,Cross Country 1.5 T3 Momentum Geartronic 6,2019,1
1610,Volvo,Volvo XC40,1.5 T3 R-Design Geartronic 8,2019,1
1611,Volvo,Volvo XC40,2.0 D4 AWD AdBlue R-Design Geartronic 8,2019,1


In [11]:
# Check missing values
df_model.isnull().sum()

marque            0
modele            0
finition          0
annee             0
nb_occurrences    0
dtype: int64

In [12]:
# Supprimer missing values
df_model = df_model.dropna()

In [13]:
df_model["annee"] = df_model["annee"].astype(int)

In [14]:
# Dimension
df_model.shape

(1613, 5)

In [15]:
# Liste des modèles et années
# Somme des occurrences par modèle et année, supprimer la colonne "finition"
df = df_model.drop(columns=["finition"])
df = df.groupby(["marque", "modele", "annee"]).sum().reset_index()

df = df.sort_values(by=["marque", "modele", "annee"])
df.reset_index(drop=True, inplace=True)

In [16]:
print(df["marque"].unique())
print(len(df["marque"].unique()))

['Abarth' 'Alfa' 'Audi' 'BMW' 'Citroen' 'Cupra' 'DS' 'Dacia' 'Fiat' 'Ford'
 'Honda' 'Hyundai' 'Infiniti' 'Jaguar' 'Jeep' 'Kia' 'Land' 'Lexus' 'MG'
 'MINI' 'Mazda' 'Mercedes-Benz' 'Mitsubishi' 'Nissan' 'Opel' 'Peugeot'
 'Renault' 'Seat' 'Skoda' 'Smart' 'Suzuki' 'Toyota' 'Volkswagen' 'Volvo']
34


In [17]:
# Rename "marque" to match with the right name used in the caradisiac website
dict_name_marque = {'Abarth' : 'Abarth', 'Alfa' : 'Alfa Romeo', 'Audi' : 'Audi', 
                    'BMW' : 'BMW', 
                    'Citroen' : 'Citroen', 'Cupra' : 'Cupra', 
                    'DS' : 'DS', 'Dacia' : 'Dacia',
                    'Fiat' : 'Fiat', 'Ford' : 'Ford', 
                    'Honda' : 'Honda', 'Hyundai' : 'Hyundai', 
                    'Infiniti' : 'Infiniti', 
                    'Jaguar' : 'Jaguar', 'Jeep' : 'Jeep',
                    'Kia' : 'Kia', 
                    'Land' : 'Land Rover', 'Lexus' : 'Lexus', 
                    'MG' : "MG", 'MINI' : 'MINI', 'Mazda' : 'Mazda', 'Mercedes-Benz' : 'Mercedes', 'Mitsubishi' : 'Mitsubishi', 
                    'Nissan' : 'Nissan', 
                    'Opel' : 'Opel', 
                    'Peugeot' : 'Peugeot', 
                    'Renault' : 'Renault', 
                    'Seat' : 'Seat', 'Skoda' : 'Skoda', 'Smart' : 'Smart', 'Suzuki' : 'Suzuki', 
                    'Toyota' : 'Toyota', 
                    'Volkswagen' : 'Volkswagen', 'Volvo' : 'Volvo'
                    }

df['marque'] = df['marque'].replace(dict_name_marque)
print(df["marque"].unique())
print(len(df["marque"].unique()))

['Abarth' 'Alfa Romeo' 'Audi' 'BMW' 'Citroen' 'Cupra' 'DS' 'Dacia' 'Fiat'
 'Ford' 'Honda' 'Hyundai' 'Infiniti' 'Jaguar' 'Jeep' 'Kia' 'Land Rover'
 'Lexus' 'MG' 'MINI' 'Mazda' 'Mercedes' 'Mitsubishi' 'Nissan' 'Opel'
 'Peugeot' 'Renault' 'Seat' 'Skoda' 'Smart' 'Suzuki' 'Toyota' 'Volkswagen'
 'Volvo']
34


In [18]:
freq_marque = df["marque"].value_counts().reset_index().sort_values(by='marque')
freq_marque

Unnamed: 0,marque,count
25,Abarth,7
21,Alfa Romeo,10
5,Audi,45
8,BMW,35
1,Citroen,53
28,Cupra,4
19,DS,13
15,Dacia,19
12,Fiat,29
6,Ford,37


## 2. Essai avec request et BeautifulSoup

In [19]:
# Function to scrape car catalogue prices from Caradisiac
def clean_model_name(model):
    # Remove special characters and normalize spaces
    model_v2 = re.sub(r'[^\w\s]', '', model).strip().lower()
    model_v3 = model_v2.replace(' ', '-')
    return model_v3

def scrape_caradisiac_price(modele, annee):
    base_url = "https://www.caradisiac.com/fiches-techniques/modele--"
    
    # Build search URL
    clean_model = clean_model_name(modele)
    search_url = f"{base_url}{clean_model}/{str(annee)}"
    
    print(f"Search url original pattern is: {search_url}")
    
    try:
        # First attempt with original pattern
        time.sleep(5)
        response = requests.get(search_url)
        #print(f"Code Status: {response.status_code}")
        if response.status_code != 200:
            # Try alternative URL patterns
            alternative_models = [
                clean_model.replace('_', ''), # Remove "_"
                '-'.join([clean_model.replace('_', ''), "2e", "generation"])  # join with version
            ] + ['-'.join([clean_model.replace('_', ''), str(i)]) for i in range(1,6)]
            
            #print(f"Alternative urls: {alternative_models}")
            
            for alt_model in alternative_models:
                time.sleep(5)
                alt_url = f"{base_url}{alt_model}/{str(annee)}"
                response = requests.get(alt_url)
                if response.status_code == 200:
                    search_url = alt_url
                    break
        # Add delay to be respectful to the server
        time.sleep(5)
        
        # Make request
        print(f"L'url final est: {search_url}")
        response = requests.get(search_url)
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Find all car versions
        versions = soup.find_all('table', class_='listingTab')

        # Extract all rows from the table
        table = versions[0]  # Since versions contains only one table
        rows = table.find_all('tr')

        # Initialize lists to store data
        data = []
        headers = []

        # Get headers from first row
        header_row = rows[0]
        headers = [th.get_text(strip=True) for th in header_row.find_all('th')]

        # Get data from remaining rows
        for row in rows[1:]:
            cols = row.find_all('td')
            row_data = [col.get_text(strip=True) for col in cols]
            if row_data:  # Only add non-empty rows
                data.append(row_data)
        # Create DataFrame
        df = pd.DataFrame(data, columns=headers)
        df['url'] = search_url
        return df    
    except Exception as e:
        print(f"Error scraping {modele}: {str(e)}")
        return None

In [20]:
# # Create an empty DataFrame to store all results
# df_all = pd.DataFrame()

# # Add progress tracking
# total_models = len(df)
# print(f"Starting to scrape {total_models} models...")

# for idx, row in df.iterrows():
#     try:
#         print(f"Processing {idx+1}/{total_models}: {row['modele']} ({row['annee']})")
#         df_version = scrape_caradisiac_price(row['modele'], row['annee'])
        
#         if df_version is not None:
#             # Add model and year columns to identify the source
#             df_version.insert(0, 'source_model', row['modele'])
#             df_version.insert(1, 'source_year', row['annee'])
#             df_all = pd.concat([df_all, df_version], ignore_index=True)
            
#     except Exception as e:
#         print(f"Error processing {row['modele']} ({row['annee']}): {str(e)}")
#         continue

# print(f"\nScraping completed. Total entries collected: {len(df_all)}")

## 3. Essai avec Selenium pour retrouver les bons urls


In [21]:
def select_option_contain (select_element, partial_text):
    select = Select(select_element)

    #print(f"Partial text est: {partial_text}")
    accents = {
            'a': r'[àáâãäå]',
            'e': r'[èéêë]',
            'i': r'[ìíîï]',
            'o': r'[òóôõö]',
            'u': r'[ùúûü]',
            'c': r'[ç]',
            'n': r'[ñ]',
            'A': r'[ÀÁÂÃÄÅ]',
            'E': r'[ÈÉÊË]',
            'I': r'[ÌÍÎÏ]',
            'O': r'[ÒÓÔÕÖ]',
            'U': r'[ÙÚÛÜ]',
            'C': r'[Ç]',
            'N': r'[Ñ]',
        }
    # Remove special characters with accent
    for remplacement, pattern in accents.items():
        partial_text = re.sub(pattern, remplacement, partial_text)
    print(f"Partial text: {partial_text}")

    # First check for exact matches
    for option in select.options:
        #if option.get_attribute("value").strip() != "" and partial_text.strip().lower() == option.text.strip().lower():
        if (option.get_attribute("value").strip() != "" and 
            partial_text.lower() == option.text.lower()
            ):
            select.select_by_visible_text(option.text)
            print(f"Found exact match: {option.text}")
            option_select = "Very Exact match"
            return option.text, option_select
            break
        
    # Check for exact matches after removing space and "-"
    for option in select.options:
        if (option.get_attribute("value").strip() != "" and 
            re.sub(r"[\s\-]", "", partial_text.strip().lower()) == re.sub(r"[\s\-]", "", option.text.strip().lower())
            ): # remove any space and "-"
            select.select_by_visible_text(option.text)
            print(f"Found exact match: {option.text}")
            option_select = "Exact match"
            return option.text, option_select
            break
             
    # If no exact match found, check for partial matches
    best_match = None
    max_common_chars = 0
    
    for option in select.options:
        if option.get_attribute("value").strip() != "":
            option_text = re.sub(r"[\s\-]", "", option.text.strip().lower()) # remove any space
            partial_text_lower = re.sub(r"[\s\-]", "", partial_text.strip().lower()) # remove any space
            
            # Count common characters
            common_chars = sum(1 for c in option_text if c in partial_text_lower)
            
            # Check if this is a partial match and has more common characters
            if ((partial_text_lower in option_text) or (option_text in partial_text_lower)) and common_chars > max_common_chars:
                max_common_chars = common_chars
                best_match = option
    # Retain the best partial match = the option that has more common characters with the partial text
    if best_match is not None:
        select.select_by_visible_text(best_match.text)
        print(f"Found best partial match: {best_match.text}")
        option_select = "Partial match"
        return best_match.text, option_select


In [22]:
def clean_name(model):
    # Remove special characters and normalize spaces
    accents = {
            'a': r'[àáâãäå]',
            'e': r'[èéêë]',
            'i': r'[ìíîï]',
            'o': r'[òóôõö]',
            'u': r'[ùúûü]',
            'c': r'[ç]',
            'n': r'[ñ]',
            'A': r'[ÀÁÂÃÄÅ]',
            'E': r'[ÈÉÊË]',
            'I': r'[ÌÍÎÏ]',
            'O': r'[ÒÓÔÕÖ]',
            'U': r'[ÙÚÛÜ]',
            'C': r'[Ç]',
            'N': r'[Ñ]',
        }

    for remplacement, pattern in accents.items():
        model = re.sub(pattern, remplacement, model)
    model = re.sub(r'[^\w\s\-!]', '', model).strip().lower()

    model = model.replace(' ', '-')
    
    return model

In [23]:
# list_models = pd.unique(df['modele'])
# print(list_models)
# print([i for i in list_models if "!" in i ])

# list_models_cleaned = [clean_name(model) for model in list_models]
# print(list_models_cleaned)

# list_models_test = [any(char in i for char in 'éèêëàâäôöûüçñ') for i in list_models_cleaned]
# print([i for i in list_models_test if i == True])

In [24]:
def collect_prix_neuf (url, marque, modele, annee, wait_sec=10):
    '''
    Guessing the convenient urls
    Arguments:

    '''

    try:
        # Initialiser le driver
        driver = init_driver()
        driver.get(url)
        time.sleep(2)
        wait = WebDriverWait(driver, wait_sec)
        
        # Accept pop-up
        #accept_popup_general(driver, "#didomi-notice-agree-button")
        accept_popup_general(driver, ".didomi-continue-without-agreeing")
        time.sleep(5)
        #driver.execute_script("document.location.reload()")
        #driver.refresh()
        
        df_all_versions = pd.DataFrame()
        
        # Sélectionner les options de dropdowns: Marques, Gammes, Annees, Modeles

        # Marque: if contains marque
        time.sleep(2)
        #marque_dropdown = wait.until(EC.presence_of_element_located((By.ID, 'brands')))
        marque_dropdown = driver.find_element(By.ID, 'brands')
        #option_marque =  Select(marque_dropdown).select_by_visible_text(marque)
        option_marque, match_type_marque = select_option_contain (marque_dropdown, marque)
        print(f"Marque: {option_marque}")

        # Gammes: if contains modele
        #modele_dropdown = wait.until(EC.presence_of_element_located((By.ID, 'models')))
        time.sleep(2)
        modele_dropdown = driver.find_element(By.ID, 'models')
        #print(f"Modèle initial est: {modele}")
        option_modele, match_type_modele = select_option_contain (modele_dropdown, modele)
        print(f"Modèle: {option_modele}")
        
        # Annees = annee
        #annee_dropdown = wait.until(EC.presence_of_element_located((By.ID, 'year')))
        time.sleep(2)
        annee_dropdown = driver.find_element(By.ID, 'year')
        option_year, match_type_year = select_option_contain(annee_dropdown, str(annee))
        #print(f"Liste année: {[option.text for option in Select(annee_dropdown).options]}")
        #print(f"Année choisie: {option_year}")
        
        # Modeles: extract all (for i in range modeles.options)
        #modelscomm_dropdown = wait.until(EC.presence_of_element_located((By.ID, 'modelscomm')))
        time.sleep(2)
        modelscomm_dropdown = driver.find_element(By.ID, 'modelscomm')
        select_modelscomm = Select(modelscomm_dropdown)

        # Exclure version vide
        versions = [option.text for option in select_modelscomm.options if option.get_attribute("value").strip() != ""]
        print(f"Liste des versions: {versions}")
       
        if versions:  # This checks if the list is not empty
            for version in versions:
                data = []
                #select_modelscomm.select_by_visible_text(version.text)
                option_marque_cleaned = clean_name(option_marque)
                option_version = clean_name(version)
                
                search_url_version = f"{url}/modele--{option_marque_cleaned}-{option_version}/{option_year}"
                print(f"Extraire la fiche technique de: {version} - année {option_year}")
                print(f"Search url est : {search_url_version}")

                try:
                    driver.get(search_url_version)
                    time.sleep(2)
                    
                    # Récupérer le tableau "listingTab"
                    table = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'table.listingTab')))

                    # Get all rows including header row
                    rows = table.find_elements(By.TAG_NAME, 'tr')

                    # Get headers from first row
                    headers = rows[0].find_elements(By.TAG_NAME, 'th')
                    header_texts = [header.text for header in headers]

                    # Get data from remaining rows
                    for row in rows[1:]:
                        cells = row.find_elements(By.TAG_NAME, 'td')
                        row_data = [cell.text for cell in cells]
                        if row_data:
                            data.append(row_data)

                    # Sauvegarder le tableau dans un dataframe
                    df = pd.DataFrame(data, columns=header_texts)

                    # Création des colonnes pour identifier le type de matching
                    df['url'] = driver.current_url
                    df['option_marque_select'] = option_marque
                    df['option_modele_select'] = option_modele
                    df['option_year_select'] = option_year
                    df['match_type_marque'] = match_type_marque
                    df['match_type_modele'] = match_type_modele
                    df['match_type_year'] = match_type_year

                    #print(f"URL used is: {driver.current_url}")

                    #print(df.head())
                    
                    df_all_versions = pd.concat([df_all_versions, df], ignore_index=True)
                    #driver.close()
                    #print(f"df all version est: {df_all_versions.head()}")
                except Exception: # If error then continue with a new iteration
                    continue
        # Close the driver    
        driver.quit() 
        return df_all_versions
    except Exception as e:
        print (f"URL is not correct")
        print (f"Error is: {e}")
        driver.quit()
        return None

In [25]:
# url = "https://www.caradisiac.com/fiches-techniques"
# marque = "FIAT"
# modele = "Fiat 500X"
# annee = "2017"
# df_test = collect_prix_neuf (url, marque, modele, annee)

In [44]:
def scraping_prix_neuf (df, csv_path_ok, csv_path_ko, wait_time = 10):
    # Create an empty DataFrame to store all results
    df_all = pd.DataFrame()

    df_error = pd.DataFrame()

    # Add progress tracking
    total_models = len(df)
    print(f"⏳ Starting to scrape {total_models} models & years...")

    # url
    url = "https://www.caradisiac.com/fiches-techniques"

    for idx, row in df.iterrows():
        try:
            print(f"--> 🔄Processing {idx+1}/{total_models}: {row['modele']} ({row['annee']})")
            df_version = collect_prix_neuf (url, row['marque'], row['modele'], row['annee'], wait_time)
            
            if df_version is not None:
                # Add model and year columns to identify the source
                df_version.insert(0, 'source_model', row['modele'])
                df_version.insert(1, 'source_year', row['annee'])
                df_all = pd.concat([df_all, df_version], ignore_index=True)
                print(f"✅Successfully collected for {row['modele']} ({row['annee']})")
            else:
                print(f"❌Error processing {row['modele']} ({row['annee']})")
                df_error = pd.concat([df_error, pd.DataFrame({'modele': [row['modele']], 'annee': [row['annee']]})], ignore_index=True)    
        except Exception as e:
            print(f"❌Error processing {row['modele']} ({row['annee']}): {str(e)}")
            continue

    print(f"\n 🆗 Scraping completed. Total entries collected: {len(df_all)}")
    # Print le nombre de couple modèles & années trouvés, sans doublons
    if not df_all.empty:
        print(f"Total unique models & years found: {len(df_all[['source_model', 'source_year']].drop_duplicates())}")
    # Exporter les données dans un fichier CSV
    df_all.to_csv(csv_path_ok, index=False, encoding='utf-8-sig')
    print(f"⚠️ Total models & years not found: {len(df_error)}")
    df_error.to_csv(csv_path_ko, index=False, encoding='utf-8-sig')
    return None

In [27]:
# Fonction pour diviser le df original en plusieurs parties => pour lancer le scraping en plusieurs patchs
def split_dataframe_by_size(df, nb_lines, csv_root):
    list_dfs = [df[i:i+nb_lines] for i in range(0, len(df), nb_lines)]

    somme_originale = len(df)
    somme_lignes = sum(len(split) for split in list_dfs)

    print(f"Somme des lignes de la base brute = {somme_originale}")
    print(f"Somme des lignes de tous les splits dataframe = {somme_lignes}")
    print(f'# Data split: {len(list_dfs)}')

    # Save each split dataframe to CSV
    for i, split_df in enumerate(list_dfs):
        csv_path = f'{csv_root}/split_car_models_{i+1}.csv'
        split_df.to_csv(csv_path, index=False)
        print(f'Split {i+1} saved to {csv_path}')
    
    return list_dfs

In [28]:
list_df_split = split_dataframe_by_size(df, 50, "../data/raw_data")

Somme des lignes de la base brute = 752
Somme des lignes de tous les splits dataframe = 752
# Data split: 16
Split 1 saved to ../data/raw_data/split_car_models_1.csv
Split 2 saved to ../data/raw_data/split_car_models_2.csv
Split 3 saved to ../data/raw_data/split_car_models_3.csv
Split 4 saved to ../data/raw_data/split_car_models_4.csv
Split 5 saved to ../data/raw_data/split_car_models_5.csv
Split 6 saved to ../data/raw_data/split_car_models_6.csv
Split 7 saved to ../data/raw_data/split_car_models_7.csv
Split 8 saved to ../data/raw_data/split_car_models_8.csv
Split 9 saved to ../data/raw_data/split_car_models_9.csv
Split 10 saved to ../data/raw_data/split_car_models_10.csv
Split 11 saved to ../data/raw_data/split_car_models_11.csv
Split 12 saved to ../data/raw_data/split_car_models_12.csv
Split 13 saved to ../data/raw_data/split_car_models_13.csv
Split 14 saved to ../data/raw_data/split_car_models_14.csv
Split 15 saved to ../data/raw_data/split_car_models_15.csv
Split 16 saved to ../dat

### First attempt

In [29]:
# Process by pack
path = "../data/raw_data"
pattern = "split_car_models_*.csv"
files = glob.glob(os.path.join(path, pattern))
nb_split_df = len(files)

for i, path_split in enumerate(files):
    csv_path_ok = '../data/raw_data/prix_neuf_voitures_pack' + str(i+1) + '.csv'
    
    # Only execute if output file doesn't exist
    if not os.path.exists(csv_path_ok):
        data = pd.read_csv(path_split)
        csv_path_ko = '../data/raw_data/prix_neuf_voitures_pack' + str(i+1) + '_error.csv'

        # Run scraping
        scraping_prix_neuf(data, csv_path_ok, csv_path_ko)
    else:
        print(f"File {csv_path_ok} already exists, skipping...")

File ../data/raw_data/prix_neuf_voitures_pack1.csv already exists, skipping...
File ../data/raw_data/prix_neuf_voitures_pack2.csv already exists, skipping...
File ../data/raw_data/prix_neuf_voitures_pack3.csv already exists, skipping...
File ../data/raw_data/prix_neuf_voitures_pack4.csv already exists, skipping...
File ../data/raw_data/prix_neuf_voitures_pack5.csv already exists, skipping...
File ../data/raw_data/prix_neuf_voitures_pack6.csv already exists, skipping...
File ../data/raw_data/prix_neuf_voitures_pack7.csv already exists, skipping...
File ../data/raw_data/prix_neuf_voitures_pack8.csv already exists, skipping...
File ../data/raw_data/prix_neuf_voitures_pack9.csv already exists, skipping...
File ../data/raw_data/prix_neuf_voitures_pack10.csv already exists, skipping...
File ../data/raw_data/prix_neuf_voitures_pack11.csv already exists, skipping...
File ../data/raw_data/prix_neuf_voitures_pack12.csv already exists, skipping...
File ../data/raw_data/prix_neuf_voitures_pack13.c

In [30]:
# Liste des modèles & années à collecter
df['modele_annee'] = df['modele'] + ' ' + df['annee'].astype(str)
unique_modele_annee = sorted(pd.unique(df["modele_annee"]))
print(f"{len(unique_modele_annee)} modèles et années à collecter: \n {unique_modele_annee}")

# Liste des modèles & années qui n'ont pas été collectés
path = "../data/raw_data"
pattern = "prix_neuf_voitures_pack*.csv"
files = [file for file in glob.glob(os.path.join(path, pattern)) if "error" not in file]
print(files)
df_prix_neuf = pd.DataFrame()
for file in files:
    df_ = pd.read_csv(file)
    # Ajouter le nom du fichier à la colonne "source_file"
    df_['source_file'] = file.split('/')[-1]
    df_prix_neuf = pd.concat([df_prix_neuf, df_], ignore_index=True)
df_prix_neuf['modele_annee'] = df_prix_neuf['source_model'] + ' ' + df_prix_neuf['source_year'].astype(str)
unique_modele_annee_collected = sorted(pd.unique(df_prix_neuf["modele_annee"]))
print(f"{len(unique_modele_annee_collected)} modèles et années collectés: \n {unique_modele_annee_collected}")

# Liste des modèles & années qui ont été collectés (sans doublons)
df_prix_neuf['modele_annee_option'] = df_prix_neuf['option_marque_select'] +  ' ' + df_prix_neuf['option_modele_select'] + ' ' + df_prix_neuf['option_year_select'].astype(str)
unique_modele_annee_option = sorted(pd.unique(df_prix_neuf["modele_annee_option"]))
print(f"{len(unique_modele_annee_option)} modèles et années sélectionnés: \n {unique_modele_annee_option}")

unique_modele_annee_final = df_prix_neuf[['source_model', 'source_year', 'modele_annee_option']].drop_duplicates()
unique_modele_annee_final['duplicate_source_models'] = unique_modele_annee_final.groupby('modele_annee_option')['source_model'].transform(lambda x: '#'.join(x) if len(x) > 1 else None)
df_doublons = unique_modele_annee_final[['modele_annee_option', 'duplicate_source_models']].drop_duplicates()
df_doublons = df_doublons[df_doublons['duplicate_source_models'].notnull()]
df_doublons

752 modèles et années à collecter: 
 ['Abarth 124 Spider 2019', 'Abarth 500 2018', 'Abarth 500 2019', 'Abarth 500 2020', 'Abarth 500 2022', 'Abarth 595 2017', 'Abarth 595C 2020', 'Alfa Romeo Giulia 2020', 'Alfa Romeo Giulia 2021', 'Alfa Romeo Giulietta 2017', 'Alfa Romeo Giulietta 2018', 'Alfa Romeo Giulietta 2019', 'Alfa Romeo Giulietta 2021', 'Alfa Romeo Stelvio 2017', 'Alfa Romeo Stelvio 2018', 'Alfa Romeo Tonale 2023', 'Alfa Romeo Tonale 2024', 'Audi A1 2017', 'Audi A1 2018', 'Audi A1 2019', 'Audi A1 2020', 'Audi A1 2021', 'Audi A1 2023', 'Audi A1 citycarver 2021', 'Audi A3 2017', 'Audi A3 2018', 'Audi A3 2019', 'Audi A3 2020', 'Audi A3 2021', 'Audi A3 2022', 'Audi A3 2023', 'Audi A3 Limousine 2017', 'Audi A3 Limousine 2019', 'Audi A3 Limousine 2021', 'Audi A4 2018', 'Audi A4 2019', 'Audi A4 2020', 'Audi A4 2021', 'Audi A4 2022', 'Audi A5 Sportback 2020', 'Audi A5 Sportback 2022', 'Audi A6 2018', 'Audi Q2 2017', 'Audi Q2 2018', 'Audi Q2 2019', 'Audi Q2 2020', 'Audi Q2 2022', 'Audi 

709 modèles et années collectés: 
 ['Abarth 124 Spider 2019', 'Abarth 500 2018', 'Abarth 500 2019', 'Abarth 500 2020', 'Abarth 500 2022', 'Alfa Romeo Giulia 2020', 'Alfa Romeo Giulia 2021', 'Alfa Romeo Giulietta 2017', 'Alfa Romeo Giulietta 2018', 'Alfa Romeo Giulietta 2019', 'Alfa Romeo Giulietta 2021', 'Alfa Romeo Stelvio 2017', 'Alfa Romeo Stelvio 2018', 'Alfa Romeo Tonale 2023', 'Alfa Romeo Tonale 2024', 'Audi A1 2017', 'Audi A1 2018', 'Audi A1 2019', 'Audi A1 2020', 'Audi A1 2021', 'Audi A1 2023', 'Audi A1 citycarver 2021', 'Audi A3 2017', 'Audi A3 2018', 'Audi A3 2019', 'Audi A3 2020', 'Audi A3 2021', 'Audi A3 2022', 'Audi A3 2023', 'Audi A3 Limousine 2017', 'Audi A3 Limousine 2019', 'Audi A3 Limousine 2021', 'Audi A4 2018', 'Audi A4 2019', 'Audi A4 2020', 'Audi A4 2021', 'Audi A4 2022', 'Audi A5 Sportback 2020', 'Audi A5 Sportback 2022', 'Audi A6 2018', 'Audi Q2 2017', 'Audi Q2 2018', 'Audi Q2 2019', 'Audi Q2 2020', 'Audi Q2 2022', 'Audi Q2 2023', 'Audi Q3 2017', 'Audi Q3 2018',

Unnamed: 0,modele_annee_option,duplicate_source_models
1393,AUDI A1 2021,Audi A1#Audi A1 citycarver
1530,AUDI A3 2017,Audi A3#Audi A3 Limousine
2495,AUDI A3 2019,Audi A3#Audi A3 Limousine
3006,AUDI A3 2021,Audi A3#Audi A3 Limousine
9818,FIAT 500 2017,Fiat 500#Fiat 500C
10012,FIAT 500 2018,Fiat 500#Fiat 500C
10225,FIAT 500 2019,Fiat 500#Fiat 500C
10382,FIAT 500 2020,Fiat 500#Fiat 500C
10496,FIAT 500 2021,Fiat 500#Fiat 500C
18116,HYUNDAI I30 2019,Hyundai i30#Hyundai i30 Fastback


### Second attempt

In [31]:
# Liste des modèles & années restant à collecter
remaining_to_collect = set(unique_modele_annee) - set(unique_modele_annee_collected)
print(len(remaining_to_collect))
print(sorted(remaining_to_collect))

43
['Abarth 595 2017', 'Abarth 595C 2020', 'Citroen C4 Grand Picasso 2017', 'Citroen C4 Grand Picasso 2019', 'Citroen C4 Picasso 2019', 'Citroen DS3 2018', 'Citroen DS3 Cabrio 2017', 'Citroen DS3 Cabrio 2019', 'Citroen DS4 2018', 'Citroen DS4 Crossback 2018', 'Citroen DS5 2017', 'Citroen DS5 2018', 'Ford Tourneo 2020', "Kia cee'd 2018", "Kia cee'd 2019", "Kia cee'd 2021", "Kia cee'd 2022", "Kia cee'd 2023", "Kia pro_cee'd 2017", "Kia pro_cee'd 2019", "Kia pro_cee'd 2020", "Kia pro_cee'd 2021", 'Land Rover Evoque 2017', 'Land Rover Evoque 2018', 'Land Rover Evoque 2020', 'Mercedes-Benz Classe GLB 2020', 'Mercedes-Benz Classe GLB 2021', 'Mercedes-Benz Classe GLB 2022', 'Mercedes-Benz Classe GLC 2017', 'Mercedes-Benz Classe GLC 2018', 'Mercedes-Benz Classe GLC 2019', 'Mercedes-Benz Classe GLC 2020', 'Mercedes-Benz Classe GLC 2021', 'Mercedes-Benz Classe GLE 2018', 'Opel Crossland X 2023', 'Renault Captur 2020', 'Seat Ibiza 2020', 'Seat Ibiza 2021', 'Seat Ibiza 2022', 'Seat Leon 2018', 'Se

In [32]:
# Dataframe des modèles restant à collecter
df2 = df[df['modele_annee'].isin(remaining_to_collect)]
df2 = df2.reset_index(drop=True)

In [34]:
df2

Unnamed: 0,marque,modele,annee,nb_occurrences,modele_annee
0,Abarth,Abarth 595,2017,1,Abarth 595 2017
1,Abarth,Abarth 595C,2020,1,Abarth 595C 2020
2,Citroen,Citroen C4 Grand Picasso,2017,1,Citroen C4 Grand Picasso 2017
3,Citroen,Citroen C4 Grand Picasso,2019,1,Citroen C4 Grand Picasso 2019
4,Citroen,Citroen C4 Picasso,2019,1,Citroen C4 Picasso 2019
5,Citroen,Citroen DS3,2018,2,Citroen DS3 2018
6,Citroen,Citroen DS3 Cabrio,2017,2,Citroen DS3 Cabrio 2017
7,Citroen,Citroen DS3 Cabrio,2019,1,Citroen DS3 Cabrio 2019
8,Citroen,Citroen DS4,2018,1,Citroen DS4 2018
9,Citroen,Citroen DS4 Crossback,2018,2,Citroen DS4 Crossback 2018


In [35]:
# Il y avait peut-être des erreurs lors du 1er scraping à cause du serveur trop chargé
# 2è essai...
csv_path_ok = '../data/raw_data/prix_neuf_voitures_essaie_2.csv'
if not os.path.exists(csv_path_ok):
    csv_path_ko = '../data/raw_data/prix_neuf_voitures_essaie_2_error.csv'
    scraping_prix_neuf (df2, csv_path_ok, csv_path_ko)
else:
    print(f"File {csv_path_ok} already exists, skipping...")

⏳ Starting to scrape 43 models & years...
--> 🔄Processing 1/43: Abarth 595 (2017)
Partial text: Abarth
Found exact match: ABARTH
Marque: ABARTH
Partial text: Abarth 595
URL is not correct
Error is: cannot unpack non-iterable NoneType object
❌Error processing Abarth 595 (2017)
--> 🔄Processing 2/43: Abarth 595C (2020)
Partial text: Abarth
Found exact match: ABARTH
Marque: ABARTH
Partial text: Abarth 595C
URL is not correct
Error is: cannot unpack non-iterable NoneType object
❌Error processing Abarth 595C (2020)
--> 🔄Processing 3/43: Citroen C4 Grand Picasso (2017)
Partial text: Citroen
Found exact match: CITROEN
Marque: CITROEN
Partial text: Citroen C4 Grand Picasso
Found best partial match: PICASSO
Modèle: PICASSO
Partial text: 2017
URL is not correct
Error is: cannot unpack non-iterable NoneType object
❌Error processing Citroen C4 Grand Picasso (2017)
--> 🔄Processing 4/43: Citroen C4 Grand Picasso (2019)
Partial text: Citroen
Found exact match: CITROEN
Marque: CITROEN
Partial text: Cit

In [37]:
# Importer les résultats
df2_ok = pd.read_csv('../data/raw_data/prix_neuf_voitures_essaie_2.csv')
df2_ok

Unnamed: 0,source_model,source_year,Versions,Portes,Energie,Boite,CO2\n(g/km),Prix,url,option_marque_select,option_modele_select,option_year_select,match_type_marque,match_type_modele,match_type_year
0,Renault Captur,2020,(2) 0.9 TCE 90 BUSINESS,5.0,Ess.,Mécanique,125 (nedc),20 500 €,https://www.caradisiac.com/fiches-techniques/m...,RENAULT,CAPTUR,2020,Very Exact match,Partial match,Very Exact match
1,Renault Captur,2020,(2) 0.9 TCE 90 INTENS,5.0,Ess.,Mécanique,125 (nedc),22 000 €,https://www.caradisiac.com/fiches-techniques/m...,RENAULT,CAPTUR,2020,Very Exact match,Partial match,Very Exact match
2,Renault Captur,2020,(2) 0.9 TCE 90 LIFE,5.0,Ess.,Mécanique,125 (nedc),18 000 €,https://www.caradisiac.com/fiches-techniques/m...,RENAULT,CAPTUR,2020,Very Exact match,Partial match,Very Exact match
3,Renault Captur,2020,(2) 0.9 TCE 90 SUNSET,5.0,Ess.,Mécanique,125 (nedc),20 400 €,https://www.caradisiac.com/fiches-techniques/m...,RENAULT,CAPTUR,2020,Very Exact match,Partial match,Very Exact match
4,Renault Captur,2020,(2) 0.9 TCE 90 ZEN,5.0,Ess.,Mécanique,125 (nedc),19 900 €,https://www.caradisiac.com/fiches-techniques/m...,RENAULT,CAPTUR,2020,Very Exact match,Partial match,Very Exact match
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
780,Seat Leon,2020,IV SPORTSTOURER 2.0 TDI 150 S&S STYLE BUSINESS...,5.0,Dies.,Automatique,120 (wltp),34 360 €,https://www.caradisiac.com/fiches-techniques/m...,SEAT,LEON,2020,Very Exact match,Partial match,Very Exact match
781,Seat Leon,2020,IV SPORTSTOURER 2.0 TDI 150 S&S STYLE DSG,5.0,Dies.,Automatique,120 (wltp),33 660 €,https://www.caradisiac.com/fiches-techniques/m...,SEAT,LEON,2020,Very Exact match,Partial match,Very Exact match
782,Seat Leon,2020,IV SPORTSTOURER 2.0 TDI 150 S&S XCELLENCE DSG,5.0,Dies.,Automatique,123 (wltp),36 110 €,https://www.caradisiac.com/fiches-techniques/m...,SEAT,LEON,2020,Very Exact match,Partial match,Very Exact match
783,Seat Leon,2020,IV SPORTSTOURER 2.0 TDI 150 S&S XCELLENCE ONE DSG,5.0,Dies.,Automatique,NC,33 750 €,https://www.caradisiac.com/fiches-techniques/m...,SEAT,LEON,2020,Very Exact match,Partial match,Very Exact match


In [38]:
# Liste des modèles qui sont collectés avec la 2è essaie
df2_ok['modele_annee'] = df2_ok['source_model'] + ' ' + df2_ok['source_year'].astype(str)
unique_modele_annee_collected_sup = sorted(pd.unique(df2_ok["modele_annee"]))
print(f"{len(unique_modele_annee_collected_sup)} modèles et années collectés avec la 2è essaie: \n {unique_modele_annee_collected_sup}")

# Liste des modèles qui ne sont toujours pas collectés avec la 2è essaie
remaining_to_collect_sup = sorted(set(remaining_to_collect) - set(unique_modele_annee_collected_sup))
print(f"{len(remaining_to_collect_sup)} modèles et années restant à collecter: \n {remaining_to_collect_sup}")

7 modèles et années collectés avec la 2è essaie: 
 ['Renault Captur 2020', 'Seat Ibiza 2020', 'Seat Ibiza 2021', 'Seat Ibiza 2022', 'Seat Leon 2018', 'Seat Leon 2019', 'Seat Leon 2020']
36 modèles et années restant à collecter: 
 ['Abarth 595 2017', 'Abarth 595C 2020', 'Citroen C4 Grand Picasso 2017', 'Citroen C4 Grand Picasso 2019', 'Citroen C4 Picasso 2019', 'Citroen DS3 2018', 'Citroen DS3 Cabrio 2017', 'Citroen DS3 Cabrio 2019', 'Citroen DS4 2018', 'Citroen DS4 Crossback 2018', 'Citroen DS5 2017', 'Citroen DS5 2018', 'Ford Tourneo 2020', "Kia cee'd 2018", "Kia cee'd 2019", "Kia cee'd 2021", "Kia cee'd 2022", "Kia cee'd 2023", "Kia pro_cee'd 2017", "Kia pro_cee'd 2019", "Kia pro_cee'd 2020", "Kia pro_cee'd 2021", 'Land Rover Evoque 2017', 'Land Rover Evoque 2018', 'Land Rover Evoque 2020', 'Mercedes-Benz Classe GLB 2020', 'Mercedes-Benz Classe GLB 2021', 'Mercedes-Benz Classe GLB 2022', 'Mercedes-Benz Classe GLC 2017', 'Mercedes-Benz Classe GLC 2018', 'Mercedes-Benz Classe GLC 2019'

### Third attempt

In [42]:
df3 = df[df['modele_annee'].isin(remaining_to_collect_sup)]
df3 = df3.reset_index(drop=True)
df3

Unnamed: 0,marque,modele,annee,nb_occurrences,modele_annee
0,Abarth,Abarth 595,2017,1,Abarth 595 2017
1,Abarth,Abarth 595C,2020,1,Abarth 595C 2020
2,Citroen,Citroen C4 Grand Picasso,2017,1,Citroen C4 Grand Picasso 2017
3,Citroen,Citroen C4 Grand Picasso,2019,1,Citroen C4 Grand Picasso 2019
4,Citroen,Citroen C4 Picasso,2019,1,Citroen C4 Picasso 2019
5,Citroen,Citroen DS3,2018,2,Citroen DS3 2018
6,Citroen,Citroen DS3 Cabrio,2017,2,Citroen DS3 Cabrio 2017
7,Citroen,Citroen DS3 Cabrio,2019,1,Citroen DS3 Cabrio 2019
8,Citroen,Citroen DS4,2018,1,Citroen DS4 2018
9,Citroen,Citroen DS4 Crossback,2018,2,Citroen DS4 Crossback 2018


In [45]:
# 3è essai avec wait time plus élevé
csv_path_ok = '../data/raw_data/prix_neuf_voitures_essaie_3.csv'
if not os.path.exists(csv_path_ok):
    csv_path_ko = '../data/raw_data/prix_neuf_voitures_essaie_3_error.csv'
    scraping_prix_neuf (df3, csv_path_ok, csv_path_ko, 20)
else:
    print(f"File {csv_path_ok} already exists, skipping...")

⏳ Starting to scrape 36 models & years...
--> 🔄Processing 1/36: Abarth 595 (2017)
Partial text: Abarth
Found exact match: ABARTH
Marque: ABARTH
Partial text: Abarth 595
URL is not correct
Error is: cannot unpack non-iterable NoneType object
❌Error processing Abarth 595 (2017)
--> 🔄Processing 2/36: Abarth 595C (2020)
Partial text: Abarth
Found exact match: ABARTH
Marque: ABARTH
Partial text: Abarth 595C
URL is not correct
Error is: cannot unpack non-iterable NoneType object
❌Error processing Abarth 595C (2020)
--> 🔄Processing 3/36: Citroen C4 Grand Picasso (2017)
Partial text: Citroen
Found exact match: CITROEN
Marque: CITROEN
Partial text: Citroen C4 Grand Picasso
Found best partial match: PICASSO
Modèle: PICASSO
Partial text: 2017
URL is not correct
Error is: cannot unpack non-iterable NoneType object
❌Error processing Citroen C4 Grand Picasso (2017)
--> 🔄Processing 4/36: Citroen C4 Grand Picasso (2019)
Partial text: Citroen
Found exact match: CITROEN
Marque: CITROEN
Partial text: Cit

### Fourth attempt

In [62]:
unique_modele_annee_collected_sup_2 = []
try:
    df3_ok = pd.read_csv('../data/raw_data/prix_neuf_voitures_essaie_3.csv')
    # Liste des modèles qui sont collectés avec la 3è essaie 
    df3_ok['modele_annee'] = df3_ok['source_model'] + ' ' + df3_ok['source_year'].astype(str)
    unique_modele_annee_collected_sup_2 = sorted(pd.unique(df3_ok["modele_annee"]))
    print(f"{len(unique_modele_annee_collected_sup_2)} modèles et années collectés avec la 3è essaie: \n {unique_modele_annee_collected_sup_2}")
except Exception:
    df3_ok = pd.DataFrame()
    print("Data is empty")


# Liste des modèles qui ne sont toujours pas collectés avec la 3è essaie
remaining_to_collect_sup_2 = sorted(set(remaining_to_collect_sup) - set(unique_modele_annee_collected_sup_2))
print(f"{len(remaining_to_collect_sup_2)} modèles et années restant à collecter: \n {remaining_to_collect_sup_2}")

Data is empty
36 modèles et années restant à collecter: 
 ['Abarth 595 2017', 'Abarth 595C 2020', 'Citroen C4 Grand Picasso 2017', 'Citroen C4 Grand Picasso 2019', 'Citroen C4 Picasso 2019', 'Citroen DS3 2018', 'Citroen DS3 Cabrio 2017', 'Citroen DS3 Cabrio 2019', 'Citroen DS4 2018', 'Citroen DS4 Crossback 2018', 'Citroen DS5 2017', 'Citroen DS5 2018', 'Ford Tourneo 2020', "Kia cee'd 2018", "Kia cee'd 2019", "Kia cee'd 2021", "Kia cee'd 2022", "Kia cee'd 2023", "Kia pro_cee'd 2017", "Kia pro_cee'd 2019", "Kia pro_cee'd 2020", "Kia pro_cee'd 2021", 'Land Rover Evoque 2017', 'Land Rover Evoque 2018', 'Land Rover Evoque 2020', 'Mercedes-Benz Classe GLB 2020', 'Mercedes-Benz Classe GLB 2021', 'Mercedes-Benz Classe GLB 2022', 'Mercedes-Benz Classe GLC 2017', 'Mercedes-Benz Classe GLC 2018', 'Mercedes-Benz Classe GLC 2019', 'Mercedes-Benz Classe GLC 2020', 'Mercedes-Benz Classe GLC 2021', 'Mercedes-Benz Classe GLE 2018', 'Opel Crossland X 2023', 'Suzuki SX4 S-Cross 2021']


In [49]:
df4 = df3[df3['modele_annee'].isin(remaining_to_collect_sup_2)].reset_index(drop=True)
# liste des modèles
list_modele_df4 = pd.unique(df4['modele'])
print(list_modele_df4)

['Abarth 595' 'Abarth 595C' 'Citroen C4 Grand Picasso'
 'Citroen C4 Picasso' 'Citroen DS3' 'Citroen DS3 Cabrio' 'Citroen DS4'
 'Citroen DS4 Crossback' 'Citroen DS5' 'Ford Tourneo' "Kia cee'd"
 "Kia pro_cee'd" 'Land Rover Evoque' 'Mercedes-Benz Classe GLB'
 'Mercedes-Benz Classe GLC' 'Mercedes-Benz Classe GLE' 'Opel Crossland X'
 'Suzuki SX4 S-Cross']


In [50]:
# Corriger manuellement le nom du modèle/marque pour qu'il soit aligné avec celui utilisé par le site
dict_model_corr = {

 'Abarth 595' : '500',
 'Abarth 595C' : '500',
 'Citroen C4 Grand Picasso' : 'C4 PICASSO',
 'Citroen C4 Picasso' : 'C4 PICASSO', 
 'Citroen DS3' : 'DS3',
 'Citroen DS3 Cabrio' : 'DS3',
 'Citroen DS4' : 'DS4',
 'Citroen DS4 Crossback' : 'DS4 CROSSBACK',
 'Citroen DS5' : 'DS5',
 'Ford Tourneo' : 'TOURNEO COURIER',
 "Kia cee'd" : 'CEED',
 "Kia pro_cee'd": 'PROCEED',
 'Land Rover Evoque' : 'RANGE ROVER EVOQUE',
 'Mercedes-Benz Classe GLB' : 'GLB',
 'Mercedes-Benz Classe GLC' : 'GLC',
 'Mercedes-Benz Classe GLE' : 'GLE',
 'Opel Crossland X' : 'CROSSLAND' ,
 'Suzuki SX4 S-Cross' : 'S-CROSS'
}

In [51]:
# Create columns to keep original "marque" & "modele" name
df4["marque_init"] = df4["marque"]
df4['modele_init'] = df4['modele']

# Replace the "modele" name
df4['modele'] = df4['modele'].replace(dict_model_corr)
df4.head()

Unnamed: 0,marque,modele,annee,nb_occurrences,modele_annee,marque_init,modele_init
0,Abarth,500,2017,1,Abarth 595 2017,Abarth,Abarth 595
1,Abarth,500,2020,1,Abarth 595C 2020,Abarth,Abarth 595C
2,Citroen,C4 PICASSO,2017,1,Citroen C4 Grand Picasso 2017,Citroen,Citroen C4 Grand Picasso
3,Citroen,C4 PICASSO,2019,1,Citroen C4 Grand Picasso 2019,Citroen,Citroen C4 Grand Picasso
4,Citroen,C4 PICASSO,2019,1,Citroen C4 Picasso 2019,Citroen,Citroen C4 Picasso


In [43]:
#df4[(df4["modele"].str.contains("DS", na=False)) & (df4["marque"].str.contains("Citroen", na=False))]

In [52]:
# Rename "marque" Citroen = DS if "modele" contains DS and "marque" contains Citroen
df4.loc[(df4["modele"].str.contains("DS", na=False)) 
        & (df4["marque"].str.contains("Citroen", na=False)), "marque"] = "DS"
df4['modele_annee_upd'] = df4['modele']+ ' ' + df4['annee'].astype(str)
list_to_collect_4th = sorted(pd.unique(df4['modele_annee_upd']))
print(f"{len(list_to_collect_4th)} à collecter: \n {list_to_collect_4th}")

35 à collecter: 
 ['500 2017', '500 2020', 'C4 PICASSO 2017', 'C4 PICASSO 2019', 'CEED 2018', 'CEED 2019', 'CEED 2021', 'CEED 2022', 'CEED 2023', 'CROSSLAND 2023', 'DS3 2017', 'DS3 2018', 'DS3 2019', 'DS4 2018', 'DS4 CROSSBACK 2018', 'DS5 2017', 'DS5 2018', 'GLB 2020', 'GLB 2021', 'GLB 2022', 'GLC 2017', 'GLC 2018', 'GLC 2019', 'GLC 2020', 'GLC 2021', 'GLE 2018', 'PROCEED 2017', 'PROCEED 2019', 'PROCEED 2020', 'PROCEED 2021', 'RANGE ROVER EVOQUE 2017', 'RANGE ROVER EVOQUE 2018', 'RANGE ROVER EVOQUE 2020', 'S-CROSS 2021', 'TOURNEO COURIER 2020']


In [45]:
# url = "https://www.caradisiac.com/fiches-techniques"
# marque = "KIA"
# modele = "PROCEED"
# annee = "2021"
# df_test = collect_prix_neuf (url, marque, modele, annee)

In [55]:
# 4è essai
csv_path_ok = '../data/raw_data/prix_neuf_voitures_essaie_4.csv'
if not os.path.exists(csv_path_ok):
    csv_path_ko = '../data/raw_data/prix_neuf_voitures_essaie_4_error.csv'
    scraping_prix_neuf (df4, csv_path_ok, csv_path_ko, 20)
else:
    print(f"File {csv_path_ok} already exists, skipping...")

⏳ Starting to scrape 36 models & years...
--> 🔄Processing 1/36: 500 (2017)
Partial text: Abarth
Found exact match: ABARTH
Marque: ABARTH
Partial text: 500
Found exact match: 500
Modèle: 500
Partial text: 2017
Found exact match: 2017
Liste des versions: ['500 (2E GENERATION)', '500 C (2E GENERATION)']
Extraire la fiche technique de: 500 (2E GENERATION) - année 2017
Search url est : https://www.caradisiac.com/fiches-techniques/modele--abarth-500-2e-generation/2017
Extraire la fiche technique de: 500 C (2E GENERATION) - année 2017
Search url est : https://www.caradisiac.com/fiches-techniques/modele--abarth-500-c-2e-generation/2017
✅Successfully collected for 500 (2017)
--> 🔄Processing 2/36: 500 (2020)
Partial text: Abarth
Found exact match: ABARTH
Marque: ABARTH
Partial text: 500
Found exact match: 500
Modèle: 500
Partial text: 2020
Found exact match: 2020
Liste des versions: ['500 (2E GENERATION)', '500 C (2E GENERATION)']
Extraire la fiche technique de: 500 (2E GENERATION) - année 2020


In [56]:
# Import the result from 4th attempt
df4_ok = pd.read_csv(csv_path_ok)
df4_ok

# Liste des modèles qui sont collectés avec la 4è essaie
df4_ok['modele_annee'] = df4_ok['source_model'] + ' ' + df4_ok['source_year'].astype(str)
unique_modele_annee_collected_sup_3 = sorted(pd.unique(df4_ok["modele_annee"]))
print(f"{len(unique_modele_annee_collected_sup_3)} modèles et années collectés avec la 4è essaie: \n {unique_modele_annee_collected_sup_3}")

# Liste des modèles qui ne sont toujours pas collectés avec la 3è essaie
remaining_to_collect_sup_3 = sorted(set(list_to_collect_4th) - set(unique_modele_annee_collected_sup_3))
print(f"{len(remaining_to_collect_sup_3)} modèles et années restant à collecter: \n {remaining_to_collect_sup_3}")

33 modèles et années collectés avec la 4è essaie: 
 ['500 2017', '500 2020', 'C4 PICASSO 2017', 'CEED 2018', 'CEED 2019', 'CEED 2021', 'CEED 2022', 'CEED 2023', 'CROSSLAND 2023', 'DS3 2017', 'DS3 2018', 'DS3 2019', 'DS4 2018', 'DS4 CROSSBACK 2018', 'DS5 2017', 'DS5 2018', 'GLB 2020', 'GLB 2021', 'GLB 2022', 'GLC 2017', 'GLC 2018', 'GLC 2019', 'GLC 2020', 'GLC 2021', 'GLE 2018', 'PROCEED 2019', 'PROCEED 2020', 'PROCEED 2021', 'RANGE ROVER EVOQUE 2017', 'RANGE ROVER EVOQUE 2018', 'RANGE ROVER EVOQUE 2020', 'S-CROSS 2021', 'TOURNEO COURIER 2020']
2 modèles et années restant à collecter: 
 ['C4 PICASSO 2019', 'PROCEED 2017']


In [75]:
df4_ok

Unnamed: 0,source_model,source_year,Versions,Portes,Energie,Boite,CO2\n(g/km),Prix,url,option_marque_select,option_modele_select,option_year_select,match_type_marque,match_type_modele,match_type_year,modele_annee
0,500,2017,II (2) 1.4 TURBO 16V T-JET 145 595,3.0,Ess.,Mécanique,163 (wltp),20 790 €,https://www.caradisiac.com/fiches-techniques/m...,ABARTH,500,2017,Very Exact match,Very Exact match,Very Exact match,500 2017
1,500,2017,II (2) 1.4 TURBO 16V T-JET 145 595 MSQ,3.0,Ess.,Automatique,134 (nedc),20 400 €,https://www.caradisiac.com/fiches-techniques/m...,ABARTH,500,2017,Very Exact match,Very Exact match,Very Exact match,500 2017
2,500,2017,II (2) 1.4 TURBO 16V T-JET 165 595 TURISMO,3.0,Ess.,Mécanique,167 (wltp),23 890 €,https://www.caradisiac.com/fiches-techniques/m...,ABARTH,500,2017,Very Exact match,Very Exact match,Very Exact match,500 2017
3,500,2017,II (2) 1.4 TURBO 16V T-JET 165 695 XSR YAMAHA,3.0,Ess.,Mécanique,139 (nedc),27 800 €,https://www.caradisiac.com/fiches-techniques/m...,ABARTH,500,2017,Very Exact match,Very Exact match,Very Exact match,500 2017
4,500,2017,II (2) 1.4 TURBO T-JET 160 595 PISTA,3.0,Ess.,Mécanique,139 (nedc),21 500 €,https://www.caradisiac.com/fiches-techniques/m...,ABARTH,500,2017,Very Exact match,Very Exact match,Very Exact match,500 2017
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2108,S-CROSS,2021,1.4 BOOSTERJET HYBRID STYLE,5.0,Ess.,Mécanique,122 (wltp),32 240 €,https://www.caradisiac.com/fiches-techniques/m...,SUZUKI,S-CROSS,2021,Very Exact match,Very Exact match,Very Exact match,S-CROSS 2021
2109,S-CROSS,2021,1.4 BOOSTERJET HYBRID STYLE 2020,5.0,Ess.,Mécanique,123 (wltp),27 640 €,https://www.caradisiac.com/fiches-techniques/m...,SUZUKI,S-CROSS,2021,Very Exact match,Very Exact match,Very Exact match,S-CROSS 2021
2110,S-CROSS,2021,1.4 BOOSTERJET HYBRID STYLE ALLGRIP,5.0,Ess.,Mécanique,133 (wltp),29 640 €,https://www.caradisiac.com/fiches-techniques/m...,SUZUKI,S-CROSS,2021,Very Exact match,Very Exact match,Very Exact match,S-CROSS 2021
2111,S-CROSS,2021,1.4 BOOSTERJET HYBRID STYLE ALLGRIP AUTO,5.0,Ess.,Automatique,142 (wltp),31 340 €,https://www.caradisiac.com/fiches-techniques/m...,SUZUKI,S-CROSS,2021,Very Exact match,Very Exact match,Very Exact match,S-CROSS 2021


In [57]:
df4.head()

Unnamed: 0,marque,modele,annee,nb_occurrences,modele_annee,marque_init,modele_init,modele_annee_upd
0,Abarth,500,2017,1,Abarth 595 2017,Abarth,Abarth 595,500 2017
1,Abarth,500,2020,1,Abarth 595C 2020,Abarth,Abarth 595C,500 2020
2,Citroen,C4 PICASSO,2017,1,Citroen C4 Grand Picasso 2017,Citroen,Citroen C4 Grand Picasso,C4 PICASSO 2017
3,Citroen,C4 PICASSO,2019,1,Citroen C4 Grand Picasso 2019,Citroen,Citroen C4 Grand Picasso,C4 PICASSO 2019
4,Citroen,C4 PICASSO,2019,1,Citroen C4 Picasso 2019,Citroen,Citroen C4 Picasso,C4 PICASSO 2019


In [58]:
# Nombre d'occurences par modèles et années non collectés
df4_ko = df4[df4['modele_annee_upd'].isin(remaining_to_collect_sup_3)]
df4_ko = df4_ko.reset_index(drop=True)
df4_ko.groupby('modele_annee_upd')['nb_occurrences'].sum().reset_index()

Unnamed: 0,modele_annee_upd,nb_occurrences
0,C4 PICASSO 2019,2
1,PROCEED 2017,1


### Final

In [63]:
# Concatenate the dataframes
df_combined = pd.concat([df_prix_neuf, df2_ok, df3_ok, df4_ok], ignore_index=True)

# Display the combined dataframe
df_combined.head()

# Remove duplicates
df_combined_nodup = df_combined.drop_duplicates(subset=['source_model', 'option_marque_select', 'option_modele_select', 'option_year_select', 'Versions'])
list_final = sorted(pd.unique(df_combined_nodup["modele_annee"]))

print(len(list_final))

749


In [76]:
df_combined_nodup.head()

Unnamed: 0,source_model,Versions,Portes,Energie,Boite,CO2\n(g/km),Prix,url,option_marque_select,option_year_select,source_file,Version_selected
0,Abarth 124 Spider,II 1.4 TURBO 170,2.0,Ess.,Mécanique,146 (nedc),34 500 €,https://www.caradisiac.com/fiches-techniques/m...,ABARTH,2019,raw_data\prix_neuf_voitures_pack1.csv,abarth 124 2e generation spider
1,Abarth 124 Spider,II 1.4 TURBO 170 GT,2.0,Ess.,Mécanique,146 (nedc),40 900 €,https://www.caradisiac.com/fiches-techniques/m...,ABARTH,2019,raw_data\prix_neuf_voitures_pack1.csv,abarth 124 2e generation spider
2,Abarth 124 Spider,II 1.4 TURBO 170 GT BVA,2.0,Ess.,Automatique,161 (nedc),42 900 €,https://www.caradisiac.com/fiches-techniques/m...,ABARTH,2019,raw_data\prix_neuf_voitures_pack1.csv,abarth 124 2e generation spider
3,Abarth 124 Spider,II 1.4 TURBO 170 TURISMO,2.0,Ess.,Mécanique,146 (nedc),37 500 €,https://www.caradisiac.com/fiches-techniques/m...,ABARTH,2019,raw_data\prix_neuf_voitures_pack1.csv,abarth 124 2e generation spider
4,Abarth 124 Spider,II 1.4 TURBO 170 TURISMO BVA,2.0,Ess.,Automatique,161 (nedc),39 500 €,https://www.caradisiac.com/fiches-techniques/m...,ABARTH,2019,raw_data\prix_neuf_voitures_pack1.csv,abarth 124 2e generation spider


In [77]:
# Remove missing price
df_combined_nodup = df_combined_nodup[df_combined_nodup["Prix"].notna()]
df_combined_nodup['Prix'].describe()

count        56680
unique        5878
top       28 900 €
freq           137
Name: Prix, dtype: object

In [66]:
# Checking missing price is well removed
missing_price = df_combined_nodup['Prix'].isnull().sum()
print(missing_price)

0


In [67]:
# Extract the version from the URL
df_combined_nodup['Version_selected'] = df_combined_nodup['url'].str.extract(r'--(.*?)/')
df_combined_nodup['Version_selected'] = df_combined_nodup['Version_selected'].str.replace('-', ' ', regex=False)

In [68]:
df_combined_nodup['option_marque_select'] = df_combined_nodup['option_marque_select'].str.upper()
df_combined_nodup['Versions'] = df_combined_nodup['Versions'].str.upper()
df_combined_nodup.drop(columns=['source_year', 'option_modele_select',
                                'match_type_marque', 'match_type_modele', 'match_type_year',
                                'modele_annee', 'modele_annee_option',  ], inplace=True)

In [69]:
df_combined_nodup.head()

Unnamed: 0,source_model,Versions,Portes,Energie,Boite,CO2\n(g/km),Prix,url,option_marque_select,option_year_select,source_file,Version_selected
0,Abarth 124 Spider,II 1.4 TURBO 170,2.0,Ess.,Mécanique,146 (nedc),34 500 €,https://www.caradisiac.com/fiches-techniques/m...,ABARTH,2019,raw_data\prix_neuf_voitures_pack1.csv,abarth 124 2e generation spider
1,Abarth 124 Spider,II 1.4 TURBO 170 GT,2.0,Ess.,Mécanique,146 (nedc),40 900 €,https://www.caradisiac.com/fiches-techniques/m...,ABARTH,2019,raw_data\prix_neuf_voitures_pack1.csv,abarth 124 2e generation spider
2,Abarth 124 Spider,II 1.4 TURBO 170 GT BVA,2.0,Ess.,Automatique,161 (nedc),42 900 €,https://www.caradisiac.com/fiches-techniques/m...,ABARTH,2019,raw_data\prix_neuf_voitures_pack1.csv,abarth 124 2e generation spider
3,Abarth 124 Spider,II 1.4 TURBO 170 TURISMO,2.0,Ess.,Mécanique,146 (nedc),37 500 €,https://www.caradisiac.com/fiches-techniques/m...,ABARTH,2019,raw_data\prix_neuf_voitures_pack1.csv,abarth 124 2e generation spider
4,Abarth 124 Spider,II 1.4 TURBO 170 TURISMO BVA,2.0,Ess.,Automatique,161 (nedc),39 500 €,https://www.caradisiac.com/fiches-techniques/m...,ABARTH,2019,raw_data\prix_neuf_voitures_pack1.csv,abarth 124 2e generation spider


In [70]:
df_combined_nodup.shape

(56680, 12)

In [71]:
# Exporter les données dans un fichier CSV pour l'utilisation ultérieure
df_combined_nodup.to_csv('../data/raw_data/prix_neuf_voitures_vf.csv', index=False, encoding='utf-8-sig')
print("Data exported successfully to ../data/raw_data/prix_neuf_voitures.csv")

Data exported successfully to ../data/raw_data/prix_neuf_voitures.csv


In [73]:
df_combined_nodup[df_combined_nodup['source_model'].str.upper().str.contains("CROSSLAND", na=False)]

Unnamed: 0,source_model,Versions,Portes,Energie,Boite,CO2\n(g/km),Prix,url,option_marque_select,option_year_select,source_file,Version_selected
32153,Opel Crossland X,1.2 81 EDITION,5.0,Ess.,Mécanique,116 (nedc),18 500 €,https://www.caradisiac.com/fiches-techniques/m...,OPEL,2017,raw_data\prix_neuf_voitures_pack2.csv,opel crossland x
32154,Opel Crossland X,1.2 81 INNOVATION,5.0,Ess.,Mécanique,116 (nedc),20 000 €,https://www.caradisiac.com/fiches-techniques/m...,OPEL,2017,raw_data\prix_neuf_voitures_pack2.csv,opel crossland x
32155,Opel Crossland X,1.2 ECOTEC TURBO 110 EDITION,5.0,Ess.,Mécanique,109 (nedc),20 000 €,https://www.caradisiac.com/fiches-techniques/m...,OPEL,2017,raw_data\prix_neuf_voitures_pack2.csv,opel crossland x
32156,Opel Crossland X,1.2 ECOTEC TURBO 110 INNOVATION,5.0,Ess.,Mécanique,111 (nedc),21 500 €,https://www.caradisiac.com/fiches-techniques/m...,OPEL,2017,raw_data\prix_neuf_voitures_pack2.csv,opel crossland x
32157,Opel Crossland X,1.2 TURBO 110 EDITION AUTOMATIQUE,5.0,Ess.,Automatique,123 (nedc),21 300 €,https://www.caradisiac.com/fiches-techniques/m...,OPEL,2017,raw_data\prix_neuf_voitures_pack2.csv,opel crossland x
...,...,...,...,...,...,...,...,...,...,...,...,...
62354,CROSSLAND,(2) CROSSLAND 1.2 TURBO 110 ELEGANCE BUSINESS ...,5.0,Ess.,Mécanique,131 (wltp),27 900 €,https://www.caradisiac.com/fiches-techniques/m...,OPEL,2023,,opel crossland
62355,CROSSLAND,(2) CROSSLAND 1.2 TURBO 130 ELEGANCE BUSINESS ...,5.0,Ess.,Automatique,141 (wltp),30 500 €,https://www.caradisiac.com/fiches-techniques/m...,OPEL,2023,,opel crossland
62356,CROSSLAND,(2) CROSSLAND 1.2 TURBO 130 ELEGANCE BVA6,5.0,Ess.,Automatique,142 (wltp),29 300 €,https://www.caradisiac.com/fiches-techniques/m...,OPEL,2023,,opel crossland
62357,CROSSLAND,(2) CROSSLAND 1.5 D 110 ELEGANCE BUSINESS BVM6,5.0,Dies.,Mécanique,127 (wltp),29 600 €,https://www.caradisiac.com/fiches-techniques/m...,OPEL,2023,,opel crossland


In [78]:
df_combined_nodup['source_model'].isnull().sum()

np.int64(0)

In [None]:
# df_model[(df_model['modele'].str.lower().str.contains("ds", na=False)) & (df_model['marque'].str.lower().str.contains('citroen', na=False))]

In [None]:
# Rename "marque" Citroen = DS if "modele" contains DS and "marque" contains Citroen
# df_model.loc[(df_model['marque'].str.lower().str.contains('citroen', na=False)) & (df_model['modele'].str.lower().str.contains('ds', na=False)), "marque"] = "DS"

In [None]:
# Checking again, should be empty
# df_model[(df_model['modele'].str.lower().str.contains("ds", na=False)) & (df_model['marque'].str.lower().str.contains('citroen', na=False))]

In [None]:
# Some cleaning
# df_model['marque'] = df_model['marque'].str.upper()
# df_model['finition'] = df_model['finition'].str.upper()

In [None]:
# Merging df_model with df_combined_nodup to get the listing price
# df_merge_1 = df_model.merge(df_combined_nodup, how="left", left_on=['marque', 'modele', 'annee'], right_on = ['option_marque_select', 'source_model', 'option_year_select'] )

In [None]:
# Checking the merge "exactly" or "partially"
# df_merge_1['match_type'] = df_merge_1.apply(lambda row: 'Exact' if row['finition'] == row['Versions'] else 'Partial', axis=1)

In [None]:
# df_merge_1[df_merge_1['finition'].str.upper().str.contains("SPORTBACK 30 TFSI", na=False)]

In [None]:
# df_combined_nodup[(df_combined_nodup['option_marque_select'].str.upper().str.contains("AUDI", na=False))
#                   & (df_combined_nodup['source_year'].astype(str) == "2023")
#                   ]