In [2]:
import os
import re
import time
import random
import requests
import shutil
import undetected_chromedriver as uc

from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# --- Constants and URLs ---
# Search-results URL. The two `filter` query parameters carry JSON objects:
# modelYear with range start "2015" and an empty end, and sellerType "Privat".
START_URL = (
    "https://www.blocket.se/bilar/sok?"
    "filter=%7B\"key\"%3A\"modelYear\"%2C\"range\"%3A%7B\"start\"%3A\"2015\"%2C\"end\"%3A\"\"%7D%7D&"
    "filter=%7B\"key\"%3A\"sellerType\"%2C\"values\"%3A%5B\"Privat\"%5D%7D"
)
CARS_FOLDER = "cars_test"          # root output folder; one sub-folder per ad
LINKS_FILE = "car_links_test.txt"  # collected ad URLs, one per line

# Machine-specific settings. The defaults are the original hard-coded values;
# set the SCRAPER_* environment variables to run on another machine without
# editing this cell.
CHROME_PATH = os.environ.get(
    "SCRAPER_CHROME_PATH",
    r"C:\Users\gisse\Downloads\chrome-win64\chrome-win64\chrome.exe",
)
DRIVER_PATH = os.environ.get(
    "SCRAPER_DRIVER_PATH",
    r"C:\Users\gisse\Downloads\chromedriver-win64\chromedriver-win64\chromedriver.exe",
)
# Passed to undetected_chromedriver as `version_main`.
CHROME_VERSION = int(os.environ.get("SCRAPER_CHROME_VERSION", "135"))

# --- Chrome Options and Driver Setup ---
options = uc.ChromeOptions()
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_argument("--start-maximized")
options.binary_location = CHROME_PATH

# NOTE: this starts a real browser as soon as the cell runs; every function
# below uses this module-level `driver`.
driver = uc.Chrome(
    driver_executable_path=DRIVER_PATH,
    browser_executable_path=CHROME_PATH,
    version_main=CHROME_VERSION,
    options=options
)

# Shared explicit wait with a 20-second timeout, available at module level.
wait = WebDriverWait(driver, 20)

# --- Utilities ---
def sanitize_filename(s):
    """Return `s` with every run of path-unsafe characters collapsed to one underscore.

    The characters replaced are: backslash, slash, colon, asterisk, question
    mark, double quote, angle brackets and pipe. Consecutive occurrences are
    replaced by a single underscore, not one per character.
    """
    unsafe_run = re.compile(r'[\\/:*?\"<>|]+')
    return unsafe_run.sub('_', s)

def bypass_cookies():
    """Best-effort dismissal of the consent dialog on the current page.

    Waits up to 5 seconds for an iframe whose id starts with
    ``sp_message_iframe_``, switches into it, and clicks the button with class
    ``sp_choice_type_11`` (presumably the "accept" choice -- verify against the
    live page). If the dialog never appears or the click fails, the error is
    ignored so scraping can continue.

    The driver is always switched back to the top-level document, whether the
    click succeeded, failed, or the call was interrupted.
    """
    try:
        WebDriverWait(driver, 5).until(
            EC.frame_to_be_available_and_switch_to_it(
                (By.CSS_SELECTOR, "iframe[id^='sp_message_iframe_']")
            )
        )
        WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "button.sp_choice_type_11"))
        ).click()
    except Exception:
        # Deliberately best-effort: no dialog (timeout) or a failed click is
        # not an error. `Exception` rather than a bare `except` so that a
        # kernel interrupt (KeyboardInterrupt) is not swallowed.
        pass
    finally:
        # Leave the iframe context no matter what happened above.
        driver.switch_to.default_content()

def extract_price():
    """Return the price shown on the current ad page as a string of digits.

    Reads the text of the first element matching
    ``div[class*='Price__StyledPrice']`` and strips every non-digit character.

    Returns:
        The digits of the price, or ``"N/A"`` when the element is missing,
        cannot be read, or contains no digits.
    """
    try:
        el = driver.find_element(By.CSS_SELECTOR, "div[class*='Price__StyledPrice']")
        return re.sub(r"\D", "", el.text.strip()) or "N/A"
    except Exception:
        # Missing or stale element: fall back to a placeholder. Not a bare
        # `except`, so KeyboardInterrupt still propagates.
        return "N/A"

def get_ad_details():
    """Return ``(title, price)`` for the ad page currently loaded in the driver.

    Waits up to 10 seconds for an ``<article>`` element to be present before
    reading anything.

    Returns:
        A tuple of the stripped text of the first ``<h1>`` (or
        ``"UnknownTitle"`` if it cannot be read) and the result of
        ``extract_price()``.

    Raises:
        Whatever ``WebDriverWait.until`` raises when no ``<article>`` appears
        within the timeout; that wait is intentionally not guarded here.
    """
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "article"))
    )
    try:
        title = driver.find_element(By.CSS_SELECTOR, "h1").text.strip()
    except Exception:
        # Not a bare `except`, so a kernel interrupt is not swallowed.
        title = "UnknownTitle"
    return title, extract_price()

def save_carousel_images(folder, max_clicks=25, request_timeout=30):
    """Download the carousel images of the current ad page into `folder`.

    Image URLs are read from the inline ``background-image: url("...")`` style
    of ``<div>`` elements inside ``<article>``. After each pass the "next"
    button is clicked to reveal more slides. Collection stops when a pass
    finds no new URL, the button is missing/disabled/hidden, or `max_clicks`
    passes have run.

    Images are written as ``img_1.jpg``, ``img_2.jpg``, ... in the order the
    URLs were first seen.

    Args:
        folder: Existing directory to write the images into.
        max_clicks: Upper bound on collection passes (default 25).
        request_timeout: Per-request timeout in seconds for each download
            (default 30), so one unresponsive image cannot stall the run.

    A failed download is reported with ``print`` and skipped; no file is
    created for it.
    """
    CSS_DIVS = "article div[style*='background-image']"
    # NOTE(review): the trailing `.cIKvvT` looks like a build-generated class
    # hash; this selector may stop matching when the site is redeployed.
    CSS_NEXT_BTN = "button.SliderControls__StyledButton-sc-1dbsnpt-4.cIKvvT"

    # A dict is used as an insertion-ordered set so image numbering follows
    # discovery order instead of arbitrary set order.
    image_urls = {}

    for _ in range(max_clicks):
        known_before = len(image_urls)
        for div in driver.find_elements(By.CSS_SELECTOR, CSS_DIVS):
            style = div.get_attribute("style") or ""
            match = re.search(r'url\("([^"]+)"\)', style)
            if match:
                image_urls.setdefault(match.group(1), None)

        # Nothing new appeared since the previous pass: carousel exhausted.
        if len(image_urls) == known_before:
            break

        try:
            btn = driver.find_element(By.CSS_SELECTOR, CSS_NEXT_BTN)
            if not (btn.is_enabled() and btn.is_displayed()):
                break
            btn.click()
            time.sleep(1)  # give the next slide time to render
        except Exception:
            # No usable "next" button; keep whatever was collected so far.
            break

    for i, url in enumerate(image_urls, 1):
        img_path = os.path.join(folder, f"img_{i}.jpg")
        try:
            # Download first and verify the status, so that a failed request
            # neither hangs forever nor leaves an empty/HTML ".jpg" behind.
            response = requests.get(url, timeout=request_timeout)
            response.raise_for_status()
            with open(img_path, "wb") as f:
                f.write(response.content)
        except Exception as e:
            print(f"Failed to save image from {url}: {e}")

def save_parameters(folder):
    """Write each labelled parameter of the current ad to its own text file.

    Waits up to 5 seconds for ``div[class*='ExpandableContent__Content']``,
    then pairs the ``ParamsWithIcons__StyledLabel`` elements with the
    ``ParamsWithIcons__StyledParamValue`` elements by position. For every pair
    a file ``<sanitized label>.txt`` containing the value is written into
    `folder`. Empty labels become ``NoLabel_<index>`` and empty values
    ``N/A``. If the two lists differ in length the surplus is ignored.

    This step is best-effort: any failure is reported with a warning and the
    function returns normally.
    """
    try:
        WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "div[class*='ExpandableContent__Content']"))
        )
        labels = driver.find_elements(By.CSS_SELECTOR, "div[class*='ParamsWithIcons__StyledLabel']")
        values = driver.find_elements(By.CSS_SELECTOR, "div[class*='ParamsWithIcons__StyledParamValue']")
        for i, (label_el, value_el) in enumerate(zip(labels, values)):
            lbl = label_el.text.strip() or f"NoLabel_{i}"
            val = value_el.text.strip() or "N/A"
            fn = sanitize_filename(lbl)
            with open(os.path.join(folder, f"{fn}.txt"), "w", encoding="utf-8") as ff:
                ff.write(val)
    except Exception as e:
        # Still non-fatal, but no longer silent: a missing parameter block or
        # a write error is now visible in the cell output.
        print(f"[WARN] Could not save parameters into {folder}: {e}")

def _pick_ad_folder(base_name, url):
    """Return a folder path under CARS_FOLDER that is free or already owned by `url`.

    A folder is "owned" by the URL stored in its ``url.txt``. If
    ``<base_name>`` is owned by a different URL, ``<base_name>_2``,
    ``<base_name>_3``, ... are tried in turn.
    """
    attempt = 1
    while True:
        name = base_name if attempt == 1 else f"{base_name}_{attempt}"
        candidate = os.path.join(CARS_FOLDER, name)
        try:
            with open(os.path.join(candidate, "url.txt"), "r", encoding="utf-8") as f:
                owner = f.read().strip()
        except (OSError, UnicodeDecodeError):
            # No readable marker: the folder is unclaimed.
            return candidate
        if not owner or owner == url:
            return candidate
        attempt += 1


def scrape_single_ad(url):
    """Load one ad page and store its price, URL, images and parameters.

    The output folder is ``CARS_FOLDER/<sanitized title, max 80 chars>``
    (``NoTitle`` if the title is empty). Ads with identical titles would
    otherwise share one folder and overwrite each other, so when that folder
    already holds a ``url.txt`` for a different URL a numeric suffix is
    appended.

    Files written: ``price.txt``, ``url.txt``, then whatever
    ``save_carousel_images`` and ``save_parameters`` produce.

    Args:
        url: Address of the ad page to scrape.
    """
    driver.get(url)
    bypass_cookies()
    title, price = get_ad_details()
    folder_name = sanitize_filename(title)[:80] or "NoTitle"
    folder = _pick_ad_folder(folder_name, url)
    os.makedirs(folder, exist_ok=True)

    with open(os.path.join(folder, "price.txt"), "w", encoding="utf-8") as f:
        f.write(price)
    with open(os.path.join(folder, "url.txt"), "w", encoding="utf-8") as f:
        f.write(url)

    save_carousel_images(folder)
    save_parameters(folder)

def collect_latest_links(n=50, max_scrolls=10, scroll_pause=2):
    """Collect up to `n` ad links from the search page and save them to LINKS_FILE.

    Opens START_URL, dismisses the consent dialog, then repeatedly reads the
    anchors matching ``div.list.w-full > div > a`` and scrolls to the bottom of
    the page. Only hrefs starting with ``https://www.blocket.se/annons/`` are
    kept. Links are stored in the order they appear on the page, so taking the
    first `n` really means the first `n` listed.

    Args:
        n: Maximum number of links to collect.
        max_scrolls: Maximum number of read-then-scroll rounds (default 10).
        scroll_pause: Seconds to wait after each scroll (default 2).

    Returns:
        The list of collected links, in page order (at most `n` items). The
        same links are written to LINKS_FILE, one per line.
    """
    print(f"[INFO] Collecting latest {n} car ad links...")
    driver.get(START_URL)
    bypass_cookies()

    # dict as an insertion-ordered set: de-duplicates while keeping page order.
    found = {}

    for _ in range(max_scrolls):
        for ad in driver.find_elements(By.CSS_SELECTOR, "div.list.w-full > div > a"):
            href = ad.get_attribute("href")
            if href and href.startswith("https://www.blocket.se/annons/"):
                found.setdefault(href, None)
                if len(found) >= n:
                    break
        if len(found) >= n:
            break
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(scroll_pause)

    links = list(found)[:n]
    with open(LINKS_FILE, "w", encoding="utf-8") as f:
        f.writelines(link + "\n" for link in links)

    print(f"[INFO] Collected {len(links)} links into {LINKS_FILE}")
    return links

def _index_scraped_ads(cars_folder):
    """Map each URL recorded in ``<cars_folder>/*/url.txt`` to its folder path."""
    index = {}
    if not os.path.isdir(cars_folder):
        return index
    for entry in os.scandir(cars_folder):
        if not entry.is_dir():
            continue
        try:
            with open(os.path.join(entry.path, "url.txt"), "r", encoding="utf-8") as f:
                recorded = f.read().strip()
        except (OSError, UnicodeDecodeError):
            continue  # no marker (or unreadable): treat as not scraped
        if recorded:
            index[recorded] = entry.path
    return index


def scrape_from_file():
    """Scrape every ad listed in LINKS_FILE that has not been scraped yet.

    An ad counts as already scraped when some sub-folder of CARS_FOLDER holds
    a ``url.txt`` with exactly that link. Deciding by URL rather than by page
    title means:

    * already-scraped ads are skipped without loading their page;
    * new ads are loaded once (by ``scrape_single_ad``), not twice;
    * two different ads with the same title are not mistaken for each other.

    Errors for a single link are printed and the loop moves on. A summary
    line with the scraped/skipped counts is printed at the end. If LINKS_FILE
    does not exist, an error is printed and nothing else happens.
    """
    if not os.path.exists(LINKS_FILE):
        print(f"[ERROR] File {LINKS_FILE} not found.")
        return

    with open(LINKS_FILE, "r", encoding="utf-8") as f:
        all_links = [line.strip() for line in f if line.strip()]

    print(f"[INFO] Found {len(all_links)} links in {LINKS_FILE}. Starting scrape...")
    os.makedirs(CARS_FOLDER, exist_ok=True)

    already_scraped = _index_scraped_ads(CARS_FOLDER)
    skipped = 0
    scraped = 0

    for link in all_links:
        folder = already_scraped.get(link)
        if folder is not None:
            skipped += 1
            print(f"[SKIP] Already scraped: {folder}")
            continue

        try:
            scrape_single_ad(link)
        except Exception as e:
            print(f"[ERROR] {link} -> {e}")
            continue

        scraped += 1
        # Refresh the index so a link repeated later in the file is skipped.
        already_scraped = _index_scraped_ads(CARS_FOLDER)

    print(f"[DONE] Scraped {scraped} new ads, skipped {skipped} existing ones.")

def remove_dirs_with_slap(cars_folder="cars_test"):
    """Delete scraped ad folders whose ``Biltyp.txt`` names an excluded vehicle type.

    Walks `cars_folder`; every directory containing a ``Biltyp.txt`` is
    counted. If that file's text contains any of "Släp", "Slap", "Husvagn",
    "Vagn" or "Husbil" (case-sensitive substring match), the whole directory
    is removed. Failures for a single folder are printed and do not stop the
    walk. A summary of removed vs. kept folders is printed at the end.

    Args:
        cars_folder: Root directory to clean up.
    """
    keywords = ("Släp", "Slap", "Husvagn", "Vagn", "Husbil")
    removed_count = 0
    total_count = 0
    for root, dirs, files in os.walk(cars_folder):
        if "Biltyp.txt" not in files:
            continue
        total_count += 1
        file_path = os.path.join(root, "Biltyp.txt")
        try:
            # Read and CLOSE the file before deleting its folder: on Windows a
            # directory containing a still-open file cannot be removed.
            with open(file_path, "r", encoding="utf-8") as f:
                content = f.read()
            if any(word in content for word in keywords):
                print(f"[REMOVE] {root}")
                shutil.rmtree(root)
                removed_count += 1
                # The sub-tree is gone; stop os.walk from descending into it.
                dirs[:] = []
        except Exception as e:
            print(f"[ERROR] Failed to read or delete {file_path}: {e}")
    print(f"[CLEANUP DONE] Removed {removed_count} folders. {total_count - removed_count} kept.")

def main():
    """Run the full pipeline: collect links, scrape new ads, drop excluded folders.

    The browser is always shut down at the end, including when one of the
    steps raises, so a failed run does not leave a Chrome process behind.
    """
    try:
        collect_latest_links(n=50)
        scrape_from_file()
        remove_dirs_with_slap()
    finally:
        driver.quit()

if __name__ == "__main__":
    main()


[INFO] Collecting latest 50 car ad links...
[INFO] Collected 40 links into car_links_test.txt
[INFO] Found 40 links in car_links_test.txt. Starting scrape...
[SKIP] Already scraped: cars_test\Volvo V90 Cross Country D5 AWD Geartronic Momentum Euro 6
[SKIP] Already scraped: cars_test\Toyota Corolla Hybrid e-CVT Executive Euro 6
[SKIP] Already scraped: cars_test\Citroën
[SKIP] Already scraped: cars_test\BMW X5 xDrive40e laddhybrid M Sport H_K Läder Navi
[SKIP] Already scraped: cars_test\Kia Sportage PHEV Advance Euro 6
[SKIP] Already scraped: cars_test\Volvo V90 D3 Business, Kinetic Euro 6
[SKIP] Already scraped: cars_test\Kia Picanto
[SKIP] Already scraped: cars_test\Volkswagen Tiguan Allspace 2.0 TDI 4Motion R-Line 7-Sits
[SKIP] Already scraped: cars_test\Nissan Qashqai 1.6 dCi XTRONIC-CVT Euro 5
[SKIP] Already scraped: cars_test\Hyundai Kona Electric 64 kWh Advanced, Trend
[SKIP] Already scraped: cars_test\Toyota Prius+ Hybrid CVT Euro 6
[SKIP] Already scraped: cars_test\BMW 330e Tour

In [None]:
! pip install undetected-chromedriver selenium requests ipykernel

Collecting undetected-chromedriver
  Using cached undetected_chromedriver-3.5.5-py3-none-any.whl
Collecting selenium
  Downloading selenium-4.31.0-py3-none-any.whl (9.4 MB)
Collecting requests
  Using cached requests-2.32.3-py3-none-any.whl (64 kB)
Collecting websockets
  Downloading websockets-15.0.1-cp39-cp39-win_amd64.whl (176 kB)
Collecting urllib3[socks]<3,>=1.26
  Using cached urllib3-2.4.0-py3-none-any.whl (128 kB)
Collecting trio-websocket~=0.9
  Downloading trio_websocket-0.12.2-py3-none-any.whl (21 kB)
Collecting trio~=0.17
  Downloading trio-0.29.0-py3-none-any.whl (492 kB)
Collecting certifi>=2021.10.8
  Using cached certifi-2025.1.31-py3-none-any.whl (166 kB)
Collecting websocket-client~=1.8
  Downloading websocket_client-1.8.0-py3-none-any.whl (58 kB)
Collecting idna<4,>=2.5
  Using cached idna-3.10-py3-none-any.whl (70 kB)
Collecting charset-normalizer<4,>=2
  Using cached charset_normalizer-3.4.1-cp39-cp39-win_amd64.whl (102 kB)
Collecting sniffio>=1.3.0
  Downloading s

You should consider upgrading via the 'C:\Users\gisse\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip' command.
