# Data Scraping from Cars.com using Selenium

In [1]:
# Install Selenium and Chromium
!pip install selenium
!apt update
!apt install -y chromium-chromedriver

# Import necessary libraries
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By

# Build the Chrome configuration used for every browser session in Google Colab
options = webdriver.ChromeOptions()
for chrome_flag in (
    '--headless',               # Run in headless mode (no GUI)
    '--no-sandbox',             # Required for Colab
    '--disable-dev-shm-usage',  # Prevents memory issues
):
    options.add_argument(chrome_flag)

# Search filters to combine. Each mapping goes from the human-readable label
# (stored in the output) to the slug placed in the query string.
transmission_types = {"CVT": "cvt"}

drivetrain_types = {"All-Wheel Drive": "all_wheel_drive"}

# Each fuel type also carries the maximum number of result pages to request
fuel_types = {
    "Hybrid": dict(slug="hybrid", page_limit=35),
}

# One accumulator list per output column; every scraped listing contributes
# one entry to each of them.
car_names = []
prices = []
dealer_names = []
transmission_list = []
drivetrain_list = []
fuel_type_list = []
ratings = []
reviews = []
locations = []
mileages = []

# Base search URL (used cars, dealership sellers, 75-mile radius around ZIP
# 21227, 20 results per page). Written one query parameter per line; adjacent
# string literals are concatenated into a single URL.
base_url = (
    "https://www.cars.com/shopping/results/"
    "?dealer_id="
    "&include_shippable=true"
    "&keyword="
    "&list_price_max="
    "&list_price_min="
    "&makes[]="
    "&maximum_distance=75"
    "&mileage_max="
    "&monthly_payment="
    "&page_size=20"
    "&seller_type[]=dealership"
    "&sort=best_match_desc"
    "&stock_type=used"
    "&year_max="
    "&year_min="
    "&zip=21227"
)

def _read_field(card, by, selector, attribute=None):
    """Read one field from a listing card.

    Args:
        card: Object exposing ``find_element`` (a listing's web element).
        by: Selenium locator strategy, e.g. ``By.CLASS_NAME``.
        selector: Locator value passed to ``find_element``.
        attribute: When given, return this attribute of the matched element
            instead of its visible text.

    Returns:
        The element's text (or the requested attribute value), or ``'N/A'``
        when Selenium cannot locate or read the element.
    """
    try:
        element = card.find_element(by, selector)
        if attribute is not None:
            return element.get_attribute(attribute)
        return element.text
    except WebDriverException:
        # Only Selenium failures mean "field not available"; anything else
        # (including KeyboardInterrupt) must not be silently swallowed.
        return 'N/A'


# Loop through each combination of transmission, drivetrain, and fuel type
for transmission, trans_slug in transmission_types.items():
    for drivetrain, drive_slug in drivetrain_types.items():
        for fuel, fuel_data in fuel_types.items():
            fuel_slug = fuel_data["slug"]
            page_limit = fuel_data["page_limit"]

            # Loop through the pages based on the fuel type's page limit
            for page_num in range(1, page_limit + 1):
                # Construct the URL dynamically
                url = f"{base_url}&transmission_slugs[]={trans_slug}&drivetrain_slugs[]={drive_slug}&fuel_slugs[]={fuel_slug}&page={page_num}"

                # A fresh WebDriver is started for each page; the finally
                # clause guarantees the browser is shut down even when
                # loading or parsing the page raises.
                driver = webdriver.Chrome(options=options)
                try:
                    driver.get(url)

                    # Wait for the page to load completely
                    time.sleep(5)

                    vehicle_elements = driver.find_elements(By.CLASS_NAME, 'vehicle-card-main')

                    # No cards: likely past the last page, or no data for
                    # this filter combination, so stop paging.
                    if not vehicle_elements:
                        print(f"No results found on page {page_num} for Transmission: {transmission}, Drivetrain: {drivetrain}, Fuel: {fuel}")
                        break

                    for vehicle in vehicle_elements:
                        # Read every field BEFORE touching the result lists so
                        # a failure cannot leave the columns with different
                        # lengths (which would break DataFrame construction).
                        try:
                            name = _read_field(vehicle, By.CLASS_NAME, 'title')
                            price = _read_field(vehicle, By.CLASS_NAME, 'primary-price')
                            dealer_name = _read_field(vehicle, By.CLASS_NAME, 'dealer-name')
                            rating = _read_field(vehicle, By.CSS_SELECTOR, 'spark-rating', attribute='rating')
                            # The count is rendered in parentheses; 'N/A' is
                            # unaffected by the strip.
                            reviews_count = _read_field(vehicle, By.CLASS_NAME, 'sds-rating__link').strip('()')
                            location = _read_field(vehicle, By.CLASS_NAME, 'miles-from')
                            mileage = _read_field(vehicle, By.CLASS_NAME, 'mileage')
                        except Exception as e:
                            print(f"Error extracting data: {e}")
                            continue

                        car_names.append(name)
                        prices.append(price)
                        dealer_names.append(dealer_name)
                        # Filter values come from the loop, not from the page
                        transmission_list.append(transmission)
                        drivetrain_list.append(drivetrain)
                        fuel_type_list.append(fuel)
                        ratings.append(rating)
                        reviews.append(reviews_count)
                        locations.append(location)
                        mileages.append(mileage)

                    # Print progress
                    print(f"Completed page {page_num}/{page_limit} for Fuel: {fuel}, Transmission: {transmission}, Drivetrain: {drivetrain}")
                finally:
                    # Close the browser after processing the current page
                    driver.quit()

# Assemble the scraped columns into a single table, keyed by output header
listing_columns = {
    'Car Name': car_names,
    'Price': prices,
    'Dealer Name': dealer_names,
    'Transmission Type': transmission_list,
    'Drivetrain': drivetrain_list,
    'Fuel Type': fuel_type_list,
    'Rating': ratings,
    'Reviews': reviews,
    'Location': locations,
    'Mileage': mileages,
}
df = pd.DataFrame(data=listing_columns)

# Write the table to a CSV file in Google Colab, without the index column
csv_filename = "/content/cars_listings_cvt_al_hybrid.csv"
df.to_csv(csv_filename, index=False)

# Show the table in the cell output
print(df)

# Offer the CSV for download through Colab's file helper
from google.colab import files
files.download(csv_filename)


Collecting selenium
  Downloading selenium-4.28.1-py3-none-any.whl.metadata (7.1 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.29.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.12.1-py3-none-any.whl.metadata (5.1 kB)
Collecting sortedcontainers (from trio~=0.17->selenium)
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.28.1-py3-none-any.whl (9.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m89.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio-0.29.0-py3-none-any.whl (492 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m492.9/492.9 kB[0m [31m38.5 MB/s

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>