In [1]:
# Import necessary libraries
import pickle
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from tqdm import tqdm
from bs4 import BeautifulSoup
import time

In [2]:
# Define makes and their respective ids

makes = {
    "Audi": "1900", # done
    "Volkswagen": "25200", # done
    "Skoda": "22900", # done
    "Seat": "22500", # in progress
    # "BMW": "3500", # done
    # "Mercedes-Benz": "17200", # in progress
}

In [3]:
def scrape_page_for_make(make, page_number):
    # Create a new browser instance
    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
    # Create the base url
    link = f"https://suchen.mobile.de/fahrzeuge/search.html?dam=false&isSearchRequest=true&p=%3A30000&s=Car&sb=rel&vc=Car&ms={make}&lang=en"

    if page_number > 1:
        link += f"&pageNumber={page_number}"
    print(f"Starting to scrape {link}")

    try:
        driver.get(link)

        # Wait for the page to load
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )

        # Try to handle the cookie consent pop-up
        try:
            cookie_button = WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable(
                    (By.XPATH, "//button[contains(text(), 'Accept')]")
                )
            )
            cookie_button.click()
            print("Cookie consent accepted.")
        except Exception as e:
            print("No cookie consent pop-up or unable to locate it:", e)

        # Pause to let the page fully load
        time.sleep(1)

        listings_source = driver.page_source
        listings_soup = BeautifulSoup(listings_source, "html.parser")
        print("Done scraping base link")

    finally:
        # Always close the driver
        driver.quit()

    return listings_soup

In [4]:
def extract_listing_links_from_soup(soup):
    listings_article = soup.find_all("article", attrs={"data-testid": "result-list-container"})[0]
    listings = listings_article.find_all("div", recursive=False)[1:]
    listings_links = [
        listing.find_all("a")[0]["href"] for listing in listings if listing.find_all("a")
    ]
    listings_links = [
        "https://suchen.mobile.de" + link for link in listings_links
    ]
    return listings_links

In [5]:
# Scrape the first 25 pages for each make
make_links = {}
for make, make_id in makes.items():
    make_links[make] = []
    for page_number in range(1, 51):
        soup = scrape_page_for_make(make_id, page_number)
        links = extract_listing_links_from_soup(soup)
        make_links[make].extend(links)
        time.sleep(1)

Starting to scrape https://suchen.mobile.de/fahrzeuge/search.html?dam=false&isSearchRequest=true&p=%3A30000&s=Car&sb=rel&vc=Car&ms=1900&lang=en
Cookie consent accepted.
Done scraping base link
Starting to scrape https://suchen.mobile.de/fahrzeuge/search.html?dam=false&isSearchRequest=true&p=%3A30000&s=Car&sb=rel&vc=Car&ms=1900&lang=en&pageNumber=2
Cookie consent accepted.
Done scraping base link
Starting to scrape https://suchen.mobile.de/fahrzeuge/search.html?dam=false&isSearchRequest=true&p=%3A30000&s=Car&sb=rel&vc=Car&ms=1900&lang=en&pageNumber=3
Cookie consent accepted.
Done scraping base link
Starting to scrape https://suchen.mobile.de/fahrzeuge/search.html?dam=false&isSearchRequest=true&p=%3A30000&s=Car&sb=rel&vc=Car&ms=1900&lang=en&pageNumber=4
Cookie consent accepted.
Done scraping base link
Starting to scrape https://suchen.mobile.de/fahrzeuge/search.html?dam=false&isSearchRequest=true&p=%3A30000&s=Car&sb=rel&vc=Car&ms=1900&lang=en&pageNumber=5
Cookie consent accepted.
Done sc

In [12]:
# Save the links to a pickle file
with open("./data/make_links.pkl", "wb") as f:
    pickle.dump(make_links, f)

In [7]:
def extract_tech_data_from_listing_link(listing_link):
    # Create a new browser instance for the listing
    driver = webdriver.Chrome(
        service=ChromeService(ChromeDriverManager().install())
    )
    try:
        driver.get(listing_link)

        # Wait for the page to load
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )

        # Try to handle the cookie consent pop-up
        try:
            cookie_button = WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable(
                    (By.XPATH, "//button[contains(text(), 'Accept')]")
                )
            )
            cookie_button.click()
            # print("Cookie consent accepted.")
        except Exception as e:
            print("No cookie consent pop-up or unable to locate it:", e)

        # Pause to let the page fully load
        time.sleep(1)

        # Scrape the page content
        car_page = driver.page_source
        car_soup = BeautifulSoup(car_page, "html.parser")
        # print("Done scraping car page")

    finally:
        # Always close the driver
        driver.quit()
    #Extract the price
    try:
        car_price_div = car_soup.find_all("div", attrs={"data-testid": "vip-price-label"})[0]
        car_price = car_price_div.find_all("div")[0].text
        # Extract the technical data
        car_data = car_soup.find_all("article", attrs={"data-testid": "vip-technical-data-box"})[0]
        car_data = car_data.find_all("dl")[0]
        # extract all dt and dd tags
        car_data = car_data.find_all(["dt", "dd"])
        # zip them together
        car_data = list(zip(car_data[::2], car_data[1::2]))
        # make technical_data a dictionary
        car_data = {dt.text: dd.text for dt, dd in car_data}
        return car_data, car_price
    except Exception as e:
        print(f"Error extracting technical data for {listing_link}: {e}")
        return None, None

In [8]:
technical_data = {}
for make, links in make_links.items():
    technical_data[make] = []
    for link in tqdm(links):
        data, price = extract_tech_data_from_listing_link(link)
        if data is None or price is None:
            time.sleep(1)
            continue
        data = dict(data)
        data["price"] = price
        technical_data[make].append(data)
        time.sleep(1)

100%|██████████| 1002/1002 [1:59:26<00:00,  7.15s/it]
100%|██████████| 1002/1002 [2:08:56<00:00,  7.72s/it] 
  6%|▋         | 63/1002 [08:35<2:14:15,  8.58s/it]

Error extracting technical data for https://suchen.mobile.de/fahrzeuge/details.html?id=413155842&dam=false&isSearchRequest=true&lang=en&ms=22900%3B%3B%3B&p=%3A30000&pageNumber=4&ref=srp&refId=62ce5451-36e3-c4b9-07e4-9da877dca15a&s=Car&sb=rel&searchId=62ce5451-36e3-c4b9-07e4-9da877dca15a&vc=Car: list index out of range


  7%|▋         | 66/1002 [08:59<2:05:52,  8.07s/it]

Error extracting technical data for https://suchen.mobile.de/fahrzeuge/details.html?id=413155899&dam=false&isSearchRequest=true&lang=en&ms=22900%3B%3B%3B&p=%3A30000&pageNumber=4&ref=srp&refId=62ce5451-36e3-c4b9-07e4-9da877dca15a&s=Car&sb=rel&searchId=62ce5451-36e3-c4b9-07e4-9da877dca15a&vc=Car: list index out of range


100%|██████████| 1002/1002 [1:54:30<00:00,  6.86s/it] 
 92%|█████████▏| 917/1002 [1:44:14<09:16,  6.55s/it]

Error extracting technical data for https://suchen.mobile.de/fahrzeuge/details.html?id=400037678&dam=false&isSearchRequest=true&lang=en&ms=22500%3B%3B%3B&p=%3A30000&pageNumber=46&ref=srp&refId=2eca9031-dd3d-7ecb-f4d5-a7891e010ed7&s=Car&sb=rel&searchId=2eca9031-dd3d-7ecb-f4d5-a7891e010ed7&vc=Car: list index out of range


100%|██████████| 1002/1002 [1:53:52<00:00,  6.82s/it]


In [11]:
with open("./data/technical_data.pkl", "wb") as f:
    pickle.dump(technical_data, f)