## Import Libraries

In [None]:
import time
import os
import requests
from selenium import webdriver # type: ignore
from selenium.webdriver.chrome.service import Service # type: ignore
from selenium.webdriver.common.by import By # type: ignore
import time
import random
import pandas as pd
import concurrent.futures

## Number of pages

In [None]:
# Temporary browser session used only to read the number of result pages.
driver = webdriver.Chrome()

try:
    url = 'https://www.schadeautos.nl/en/search/damaged/passenger-cars/1/1/0/0/0/0/1/0'
    driver.get(url)

    # Get the maximum number of pages: the href of the selected pagination
    # link is split on "/" and its last segment is parsed as the page count.
    # NOTE(review): the absolute XPath (li[13]) is tied to the current page
    # layout; confirm it still points at the last-page link.
    max_page = driver.find_element(By.XPATH, '/html/body/section[2]/div/div/div[2]/div/div[1]/ul/li[13]/a').get_attribute('href')
    max_page = int(max_page.split("/")[-1])
finally:
    # The scraping cells below start their own driver, so close this session
    # (even when the lookup fails) instead of leaving a Chrome process behind.
    driver.quit()

car_posts = []
i = 1

In [None]:
print(f"Total pages: {max_page}")

## Scrape the links of the posts from all pages

In [None]:
car_data = []

In [None]:
driver = webdriver.Chrome()

In [None]:
import concurrent.futures
import threading

# Create a lock
lock = threading.Lock()

def scrape_cars_posts(page):
    """Collect the title and link of every car post on one results page.

    Args:
        page: Page index substituted into the last segment of the search URL.

    Side effects:
        Appends one ``{'title': ..., 'link': ...}`` dict per post to the
        module-level ``car_data`` list and prints a per-page summary.
    """
    url = f'https://www.schadeautos.nl/en/search/damaged/passenger-cars/1/1/0/0/0/0/1/{page}'

    # The module-level ``driver`` is one browser session shared by every worker
    # thread. Navigating and reading the page must happen as a single locked
    # step; otherwise another thread can call ``driver.get`` in between and this
    # call would read (or crash on) elements of a different page.
    # Truly parallel scraping would need one driver per worker.
    with lock:
        driver.get(url)
        car_items = driver.find_elements(By.CSS_SELECTOR, '.car-inner.flexinner')

        # Local list to store car data for this page
        car_data_batch = []
        for item in car_items:
            # Each post keeps its title text and target URL in the <a> inside its <h2>
            car_link = item.find_element(By.TAG_NAME, 'h2').find_element(By.TAG_NAME, 'a')
            car_data_batch.append({
                'title': car_link.text,
                'link': car_link.get_attribute("href"),
            })

        # Still under the lock: publish this page's results to the shared list
        car_data.extend(car_data_batch)

    print(f"Scraped {len(car_data_batch)} car posts from page {page}")


In [None]:


# Scraping car posts links for each page using multithreading.
# Every page is submitted as its own future and its result is checked:
# executor.map only re-raises a worker's exception when its results are
# iterated, so discarding them would hide every page that failed.
with concurrent.futures.ThreadPoolExecutor() as executor:
    page_futures = {
        executor.submit(scrape_cars_posts, page): page
        for page in range(1, max_page + 1)
    }
    for future in concurrent.futures.as_completed(page_futures):
        try:
            future.result()
        except Exception as exc:
            # Report the failed page and keep going with the remaining ones
            print(f"Failed to scrape page {page_futures[future]}: {exc!r}")

In [None]:
# Make sure the output folder exists first: to_csv does not create missing
# directories, and failing here would lose the results of the whole scrape.
os.makedirs('./CSVs', exist_ok=True)

# Turn the collected {'title', 'link'} dicts into a table and persist it
car_data = pd.DataFrame(car_data)
car_data.to_csv('./CSVs/car_data_new.csv', index=False)

In [None]:
car_data

In [None]:
import pandas as pd
df = pd.read_csv('./CSVs/car_data_new.csv')
df

In [None]:
print(f"Total cars: {len(df['link'])}")

## Keep only the links that match the makes in the list

In [None]:
import pandas as pd

# Read the CSV file into a DataFrame
# NOTE(review): the other cells write and read './CSVs/car_data_new.csv';
# this cell reads 'car_data.csv', which nothing in this notebook produces —
# confirm this is the intended input.
df = pd.read_csv("./CSVs/car_data.csv")

# List of makes to filter (converted to lowercase)
# Entries are compared exactly against the lowercased first word of each title
# (see extract_make below), so a title whose first word is e.g. "Mercedes-Benz"
# or "Citroen" would NOT match "mercedes" / "citroën" — TODO verify against the data.
makes_to_keep = ["dacia", "peugeot", "citroën", "renault", "ford", "toyota", 
                 "honda", "hyundai", "audi", "bmw", "volkswagen", "kia", 
                 "chevrolet", "mercedes", "nissan", "fiat"]

def extract_make(details):
    """Return the lowercased first word of ``details`` as the car make.

    ``None`` is returned when ``details`` is missing (NaN/None) or when it
    contains fewer than two whitespace-separated words.
    """
    if pd.isna(details):
        return None

    tokens = details.split()
    # A single word (or an empty string) is not treated as "make + model"
    return tokens[0].lower() if len(tokens) > 1 else None

# Derive every post's make from its title and store it in a new "Make" column
df["Make"] = df["title"].map(extract_make)

# Keep only the posts whose make appears in makes_to_keep
# (both sides are already lowercase, so the match is case-insensitive)
filtered_df = df.loc[df["Make"].isin(makes_to_keep)]

# Write the filtered posts to their own CSV file, then display them
filtered_df.to_csv("./CSVs/filtered_car_posts.csv", index=False)
filtered_df


In [None]:
dropped_rows = len(df)-len(filtered_df)
dropped_rows

In [None]:
filter_dacia = filtered_df[filtered_df['Make'] == 'dacia']
filter_dacia


In [None]:
make_counts = filtered_df["Make"].value_counts()
sum_counts = make_counts.sum()
sum_counts

## Split the dataset into smaller parts

In [None]:
import pandas as pd
import numpy as np

# Load the filtered DataFrame
filtered_df = pd.read_csv("./CSVs/filtered_car_posts.csv")

# Drop the first column
# NOTE(review): filtered_car_posts.csv is written with index=False, so the
# first column here is a data column rather than a saved index — confirm that
# dropping it is intended.
filtered_df.drop(columns=filtered_df.columns[0], inplace=True)

# Split the DataFrame into 5 parts of (almost) equal size.
# The row positions are split instead of the DataFrame itself, because
# np.array_split on a DataFrame relies on the deprecated DataFrame.swapaxes;
# the resulting part sizes are the same.
n_parts = 5
row_chunks = np.array_split(np.arange(len(filtered_df)), n_parts)
filtered_df_parts = [filtered_df.iloc[rows] for rows in row_chunks]

# Iterate over each part and save it as a separate CSV file (numbered from 1)
for i, part in enumerate(filtered_df_parts):
    part.to_csv(f"./CSVs/filtered_car_posts_{i + 1}.csv", index=False)

## Scrape all the Images

In [None]:
# Create the "schadeautos" output directory if it doesn't exist yet.
# exist_ok=True makes the cell safe to re-run; the directory name now matches
# the one that was being checked (the old code created "shcadeautos").
os.makedirs("schadeautos", exist_ok=True)

In [None]:
df = pd.read_csv('./CSVs/car_data_new.csv')

In [None]:
df

In [None]:
driver = webdriver.Chrome()

In [None]:
img_links = []

In [None]:
import threading

# Create a lock
lock = threading.Lock()

def scrape_imgs_links(link):
    """Collect the ``src`` URLs of the thumbnail images on one car post page.

    Args:
        link: URL of the car post to open.

    Side effects:
        Extends the module-level ``img_links`` list with the ``src`` attribute
        of every element matching ``.thumbs img`` and prints how many were found.
    """
    # The module-level ``driver`` is one browser session shared by all worker
    # threads. Navigation, the pause and the element lookup are kept under the
    # lock so that no other thread can navigate away before this page is read.
    # Truly parallel scraping would need one driver per worker.
    with lock:
        driver.get(link)
        # Random 1-5 s pause between opening the page and reading it
        time.sleep(random.uniform(1, 5))

        # select all the thumbnail images and read their source URLs
        imgs = driver.find_elements(By.CSS_SELECTOR, '.thumbs img')
        img_links_batch = [img.get_attribute('src') for img in imgs]

        # Still under the lock: publish this post's links to the shared list
        img_links.extend(img_links_batch)

    # Only links are gathered here; nothing is downloaded at this stage
    print(f"Collected {len(img_links_batch)} image links")

In [None]:
def scrape_images_threaded(links):
    """Run ``scrape_imgs_links`` for every link on a thread pool.

    Args:
        links: Iterable of car post URLs.

    A failure on one link is printed and does not stop the remaining links.
    Each future's result is checked explicitly because ``executor.map`` only
    re-raises worker exceptions when its results are iterated.
    """
    with concurrent.futures.ThreadPoolExecutor() as executor:
        link_futures = {
            executor.submit(scrape_imgs_links, link): link for link in links
        }
        for future in concurrent.futures.as_completed(link_futures):
            try:
                future.result()
            except Exception as exc:
                print(f"Failed to scrape {link_futures[future]}: {exc!r}")

# Scrape the images for the first 10 links
# One link per call: each call waits for its pool to finish, so the links are
# processed one after another.
for link in df["link"][:10]:  
    scrape_images_threaded([link])

In [None]:
img_links

In [None]:
len(img_links)

In [None]:
img_links_lock = pd.DataFrame(img_links, columns=['img_links'])
img_links_lock.to_csv('./CSVs/img_links_lock.csv', index=False)

In [None]:
img_links_lock

In [None]:
img_links_lock = img_links_lock.drop_duplicates()
img_links_lock

In [None]:
img_links_df = pd.DataFrame(img_links, columns=['img_links'])

In [None]:
img_links_df.to_csv("./CSVs/img_links.csv", index=False)

In [None]:
# Collect the image links of every post in df.
# Each link is submitted as its own future and its result is checked, because
# executor.map only re-raises a worker's exception when its results are
# iterated; discarding them would hide every post that failed.
# NOTE(review): img_links was already written to CSV in the cells above, so the
# links gathered here are not persisted unless those cells are re-run.
with concurrent.futures.ThreadPoolExecutor() as executor:
    link_futures = {
        executor.submit(scrape_imgs_links, link): link for link in df['link']
    }
    for future in concurrent.futures.as_completed(link_futures):
        try:
            future.result()
        except Exception as exc:
            # Report the failed post and keep going with the remaining ones
            print(f"Failed to scrape {link_futures[future]}: {exc!r}")