In [9]:
# Import dependencies
from splinter import Browser
from bs4 import BeautifulSoup as soup
import pandas as pd
import time

In [10]:
# Set up Splinter
# Make sure chromedriver.exe is in PATH
browser= Browser('chrome')
# Store url and visit
base_url='https://www.cars.com/shopping/results/?makes[]=honda&maximum_distance=50&models[]=honda-pilot&page=1&page_size=100&stock_type=all&zip=61606'
browser.visit(base_url)

# Create lists to store details
models = []
mileages= []
statuses = [] 
prices = []
dealers = []
cities = []

# Create a function to handle scraping current page
# Handle errors with ifs
def scrape_page():
    html = browser.html
    car_soup = soup(html, 'html.parser')
    car_list = car_soup.find_all('div', class_='vehicle-card')

    for car in car_list:
        # Extract car model
        car_model = car.find('h2', class_='title')
        model = str(car_model.get_text(strip=True)) if car_model else 'N/A'
        
        # Extract car mileage
        car_mileage = car.find('div', class_='mileage')
        mileage = str(car_mileage.get_text(strip=True)) if car_mileage else 'N/A'

        # Extract car status
        car_status = car.find('p', class_='stock-type')
        status = str(car_status.get_text(strip=True)) if car_status else 'N/A'

        # Extract car price
        car_price = car.find('span', class_='primary-price')
        price = str(car_price.get_text(strip=True)) if car_price else 'N/A'

        # Extract dealer name
        car_dealer = car.find('div', class_='dealer-name').find('strong')
        dealer = str(car_dealer.get_text(strip=True)) if car_dealer else 'N/A'

        # Extract dealer city
        dealer_city = car.find('div', class_='miles-from')
        city = str(dealer_city.get_text(strip=True)) if dealer_city else 'N/A'

        # Append details to lists
        models.append(model)
        statuses.append(status)
        prices.append(price)
        dealers.append(dealer)
        cities.append(city)
        mileages.append(mileage)

# Loop through all pages
while True:
    # Scrape the current page
    scrape_page()

    # Check if there is a "Next" button to go to the next page
    next_button = browser.find_by_css('a[aria-label="Next page"]')

    if next_button:
        try:
            next_button.click()  # Click on the next button
            print("Navigating to the next page...")
            time.sleep(30)  # Wait for the page to load, adjust as necessary
        except Exception as e:
            print(f"Exception occurred while navigating: {str(e)}")
            break  # Exit the loop if there's an issue navigating to the next page
    else:
        print("No more next button found. Exiting.")
        break  # Exit the loop if there's no next button

No more next button found. Exiting.


In [11]:
# Create df
data={
    'Model': models,
    'Mileage (mi)': mileages,
    'Status': statuses,
    'Price (USD)': prices,
    'Dealer': dealers,
    'City': cities 
    }
car_df= pd.DataFrame(data)

In [12]:
# Close the browser
browser.quit()

In [13]:
# Display DF
print("Original DataFrame:")
print(car_df)
print()

Original DataFrame:
                                   Model Mileage (mi)           Status  \
0                    2019 Honda Pilot EX   70,137 mi.             Used   
1       2021 Honda Pilot Special Edition   74,967 mi.             Used   
2                  2022 Honda Pilot EX-L   23,868 mi.             Used   
3                  2013 Honda Pilot EX-L   87,615 mi.             Used   
4                  2020 Honda Pilot EX-L   50,994 mi.             Used   
5                  2016 Honda Pilot EX-L  149,646 mi.             Used   
6   2025 Honda Pilot Touring 8-Passenger          N/A              New   
7              2025 Honda Pilot AWD EX-L          N/A              New   
8                    2015 Honda Pilot SE  101,325 mi.             Used   
9                  2013 Honda Pilot EX-L   95,382 mi.             Used   
10                      2023 Honda Pilot   27,755 mi.  Honda Certified   
11    2020 Honda Pilot AWD Black Edition   79,655 mi.             Used   
12              20

In [14]:
# Check for missing values:
missing_values= car_df.isnull().sum()
print("Missing values:")
print(missing_values)
print()

Missing values:
Model           0
Mileage (mi)    0
Status          0
Price (USD)     0
Dealer          0
City            0
dtype: int64



In [15]:
# Split 'Model' column into 'Year', 'Manufacturer', 'Model', 'Trim'
car_df[['Year', 'Manufacturer', 'Model', 'Trim']] = car_df['Model'].str.split(' ', 3, expand=True)

# Replace 'Not Priced' with NaN and convert 'Price' column to numeric
car_df['Price (USD)'] = pd.to_numeric(car_df['Price (USD)'].replace('Not Priced', pd.NA).str.replace('[\$,]', '', regex=True), errors='coerce')

# Clean Mileage column
car_df['Mileage (mi)'] = pd.to_numeric(car_df['Mileage (mi)'].replace('N/A', pd.NA).str.replace('mi.', '').str.replace(',', '').str.strip(), errors='coerce')

# Reorder columns as required
car_df = car_df[['Year', 'Manufacturer', 'Model', 'Trim', 'Mileage (mi)', 'Status', 'Price (USD)', 'Dealer', 'City']]

# Display the updated DataFrame
print("Final DataFrame:")
print(car_df)

Final DataFrame:
    Year Manufacturer  Model                 Trim  Mileage (mi)  \
0   2019        Honda  Pilot                   EX       70137.0   
1   2021        Honda  Pilot      Special Edition       74967.0   
2   2022        Honda  Pilot                 EX-L       23868.0   
3   2013        Honda  Pilot                 EX-L       87615.0   
4   2020        Honda  Pilot                 EX-L       50994.0   
5   2016        Honda  Pilot                 EX-L      149646.0   
6   2025        Honda  Pilot  Touring 8-Passenger           NaN   
7   2025        Honda  Pilot             AWD EX-L           NaN   
8   2015        Honda  Pilot                   SE      101325.0   
9   2013        Honda  Pilot                 EX-L       95382.0   
10  2023        Honda  Pilot                 None       27755.0   
11  2020        Honda  Pilot    AWD Black Edition       79655.0   
12  2016        Honda  Pilot              Touring       93331.0   
13  2015        Honda  Pilot              Tou

  car_df['Mileage (mi)'] = pd.to_numeric(car_df['Mileage (mi)'].replace('N/A', pd.NA).str.replace('mi.', '').str.replace(',', '').str.strip(), errors='coerce')


In [16]:
car_df.to_excel("Pilot.xlsx")