In [8]:
# Import dependencies
from splinter import Browser
from bs4 import BeautifulSoup as soup
import pandas as pd
import time

In [9]:
# Set up Splinter
browser = Browser('chrome')

# Base URL for Honda CR-V and CR-V Hybrid
base_url = 'https://www.cars.com/shopping/results/?dealer_id=&include_shippable=true&keyword=&list_price_max=&list_price_min=&makes[]=honda&maximum_distance=150&mileage_max=&models[]=honda-cr_v&models[]=honda-cr_v_hybrid&monthly_payment=&page={}&page_size=100&sort=best_match_desc&stock_type=all&year_max=&year_min=&zip=61606'

# Create lists to store details
models = []
statuses = []
mileages = []
prices = []
dealers = []
cities = []
urls = []  # List to store URLs

# Set to store unique identifiers of scraped cars
scraped_car_ids = set()

# Function to scrape current page
def scrape_page(url):
    browser.visit(url)
    time.sleep(5)  # Adjust as necessary
    
    html = browser.html
    car_soup = soup(html, 'html.parser')
    car_list = car_soup.find_all('div', class_='vehicle-card')

    if not car_list:
        print(f"No car listings found for URL: {url}. Exiting.")
        return False

    for car in car_list:
        # Extract car model
        car_model = car.find('h2', class_='title')
        model = str(car_model.get_text(strip=True)) if car_model else 'N/A'

        # Extract car mileage
        car_mileage = car.find('div', class_='mileage')
        mileage = str(car_mileage.get_text(strip=True)) if car_mileage else 'N/A'

        # Extract car status
        car_status = car.find('p', class_='stock-type')
        status = str(car_status.get_text(strip=True)) if car_status else 'N/A'

        # Extract car price
        car_price = car.find('span', class_='primary-price')
        price = str(car_price.get_text(strip=True)) if car_price else 'N/A'

        # Extract dealer name
        car_dealer_div = car.find('div', class_='dealer-name')
        if car_dealer_div:
            car_dealer = car_dealer_div.find('strong')
            dealer = str(car_dealer.get_text(strip=True)) if car_dealer else 'N/A'
        else:
            dealer = 'N/A'

        # Extract dealer city
        dealer_city = car.find('div', class_='miles-from')
        city = str(dealer_city.get_text(strip=True)) if dealer_city else 'N/A'

        # Extract car URL
        car_url = car.find('a', class_='vehicle-card-link')['href'] if car.find('a', class_='vehicle-card-link') else 'N/A'

        # Check if this car has already been scraped
        if car_url in scraped_car_ids:
            continue  # Skip if already scraped
        
        # Add car URL to set of scraped car IDs
        scraped_car_ids.add(car_url)

        # Append details to lists
        models.append(model)
        statuses.append(status)
        prices.append(price)
        dealers.append(dealer)
        cities.append(city)
        mileages.append(mileage)
        urls.append(car_url)  # Append URL to list

    return True

# Counter to track consecutive pages with no new data
no_new_data_count = 0
max_no_new_data_allowed = 3  # Adjust as necessary

# Timeout settings
timeout_seconds = 300  # 5 minutes timeout
start_time = time.time()

# Initial page number
page_number = 1
url = base_url.format(page_number)

while True:
    if time.time() - start_time > timeout_seconds:
        print(f"Timeout ({timeout_seconds} seconds) exceeded for URL: {url}. Exiting.")
        break
    
    if not scrape_page(url):
        # Increment no_new_data_count if no new data found
        no_new_data_count += 1
        if no_new_data_count >= max_no_new_data_allowed:
            print(f"No new data found for {max_no_new_data_allowed} consecutive pages for URL: {url}. Exiting.")
            break
    else:
        # Reset no_new_data_count if new data found
        no_new_data_count = 0
    
    # Increment page number and update URL
    page_number += 1
    url = base_url.format(page_number)
    print(f"Scraping page {page_number}...")

# Print number of cars scraped
print(f"Number of unique cars scraped: {len(scraped_car_ids)}")

# Print or use the scraped data as needed


Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
Scraping page 11...
Scraping page 12...
Scraping page 13...
Scraping page 14...
Scraping page 15...
Scraping page 16...
Scraping page 17...
Scraping page 18...
Scraping page 19...
Scraping page 20...
Scraping page 21...
Scraping page 22...
Scraping page 23...
Scraping page 24...
Scraping page 25...
Scraping page 26...
Scraping page 27...
Scraping page 28...
Scraping page 29...
Scraping page 30...
Scraping page 31...
Scraping page 32...
Scraping page 33...
Scraping page 34...
Scraping page 35...
Scraping page 36...
Scraping page 37...
Scraping page 38...
Scraping page 39...
Scraping page 40...
Scraping page 41...
Scraping page 42...
Timeout (300 seconds) exceeded for URL: https://www.cars.com/shopping/results/?dealer_id=&include_shippable=true&keyword=&list_price_max=&list_price_min=&makes[]=honda&maximum_distance=150

In [10]:
# Create df
data={
    'Model': models,
    'Mileage (mi)': mileages,
    'Status': statuses,
    'Price (USD)': prices,
    'Dealer': dealers,
    'City': cities ,
    'URL': urls
    }
car_df= pd.DataFrame(data)

In [11]:
# Close the browser
browser.quit()

In [5]:
# Clear lists after scraping
# clear_lists()

In [12]:
# Display DF
print("Original DataFrame:")
print(car_df)
print()

Original DataFrame:
                                     Model Mileage (mi)           Status  \
0                       2021 Honda CR-V EX   59,806 mi.             Used   
1                       2014 Honda CR-V LX   99,396 mi.             Used   
2                       2008 Honda CR-V EX  163,345 mi.             Used   
3                     2017 Honda CR-V EX-L  173,049 mi.             Used   
4                     2017 Honda CR-V EX-L   31,665 mi.             Used   
...                                    ...          ...              ...   
1884        2024 Honda CR-V Hybrid Sport-L    3,644 mi.  Honda Certified   
1885          2023 Honda CR-V Hybrid Sport   10,382 mi.  Honda Certified   
1886  2024 Honda CR-V Hybrid Sport Touring    3,187 mi.             Used   
1887        2021 Honda CR-V Hybrid Touring   45,186 mi.             Used   
1888           2022 Honda CR-V Hybrid EX-L   17,134 mi.             Used   

     Price (USD)                                             Dealer

In [13]:
# Check for missing values:
missing_values= car_df.isnull().sum()
print("Missing values:")
print(missing_values)
print()

Missing values:
Model           0
Mileage (mi)    0
Status          0
Price (USD)     0
Dealer          0
City            0
URL             0
dtype: int64



In [14]:

# Split 'Model' column into 'Year', 'Manufacturer', 'Model', and 'Trim'
# Handling 'CR-V' and 'CR-V Hybrid' correctly
split_df = car_df['Model'].str.extract(r'(?P<Year>\d{4})\s+(?P<Manufacturer>[\w-]+)\s+(?P<Model>CR-V(?:\s+Hybrid)?)\s+(?P<Trim>.*)')
#  Captures group called 'Year' (?P<Year> ; \d{4} matches to 4 digits
# Matches 1+ whitespace characters to account for space between Year and Manufacturer \s+
# Captures another group called Manufacturer, matches by 1+ words (?P<Manufacturer>[\w-]+)
# Captures group called Model (?P<Model> ; CR-V(?:\s+Hybrid)?) Matches to string "CR-V" 
# or by a white space (?:\s) plus the string "Hybrid" +Hybrid)?)
# Captures group called Trim (?P<Trim>) ; Matches any character 0+ times 
# (i.e. captures everything after Model as the Trim) using .*

# Replace 'Not Priced' with NaN and convert 'Price' column to numeric
car_df['Price (USD)'] = pd.to_numeric(car_df['Price (USD)'].replace('Not Priced', pd.NA).str.replace('[\$,]', '', regex=True), errors='coerce')

# Clean the 'Mileage' column
car_df['Mileage (mi)'] = pd.to_numeric(car_df['Mileage (mi)'].str.replace('mi.', '').str.replace(',', '').str.strip(), errors='coerce')

# Concatenate split_df with the remaining columns from car_df
car_df = pd.concat([split_df, car_df[['Status', 'Price (USD)', 'Dealer', 'City', 'Mileage (mi)', 'URL']]], axis=1)

# Reorder columns as required
car_df = car_df[['Year', 'Manufacturer', 'Model', 'Trim', 'Mileage (mi)', 'Status', 'Price (USD)', 'Dealer', 'City', 'URL']]

# Display the updated DataFrame
print("Final DataFrame:")
print(car_df)

Final DataFrame:
      Year Manufacturer        Model           Trim  Mileage (mi)  \
0     2021        Honda         CR-V             EX       59806.0   
1     2014        Honda         CR-V             LX       99396.0   
2     2008        Honda         CR-V             EX      163345.0   
3     2017        Honda         CR-V           EX-L      173049.0   
4     2017        Honda         CR-V           EX-L       31665.0   
...    ...          ...          ...            ...           ...   
1884  2024        Honda  CR-V Hybrid        Sport-L        3644.0   
1885  2023        Honda  CR-V Hybrid          Sport       10382.0   
1886  2024        Honda  CR-V Hybrid  Sport Touring        3187.0   
1887  2021        Honda  CR-V Hybrid        Touring       45186.0   
1888  2022        Honda  CR-V Hybrid           EX-L       17134.0   

               Status  Price (USD)  \
0                Used      27602.0   
1                Used      13295.0   
2                Used       6995.0   
3 

  car_df['Mileage (mi)'] = pd.to_numeric(car_df['Mileage (mi)'].str.replace('mi.', '').str.replace(',', '').str.strip(), errors='coerce')


In [15]:
car_df

Unnamed: 0,Year,Manufacturer,Model,Trim,Mileage (mi),Status,Price (USD),Dealer,City,URL
0,2021,Honda,CR-V,EX,59806.0,Used,27602.0,Zeigler Nissan of Orland Park,"Orland Park, IL",/vehicledetail/8acfc917-c94c-45e0-a513-9981ba0...
1,2014,Honda,CR-V,LX,99396.0,Used,13295.0,Auto House of Bloomington,"Bloomington, IL (38 mi.)",/vehicledetail/bfb30a67-a596-4de9-b359-6813c39...
2,2008,Honda,CR-V,EX,163345.0,Used,6995.0,Auto House of Bloomington,"Bloomington, IL (38 mi.)",/vehicledetail/b01ef623-50ee-4a81-8968-9047ba1...
3,2017,Honda,CR-V,EX-L,173049.0,Used,16066.0,Yemm Chevrolet Buick GMC,"Galesburg, IL (45 mi.)",/vehicledetail/721879f7-f75a-41de-8243-f3245cf...
4,2017,Honda,CR-V,EX-L,31665.0,Used,23998.0,CarMax Austin North - Offering Express Pickup ...,,/vehicledetail/8a0d3898-e67d-4bfd-b310-09bbdef...
...,...,...,...,...,...,...,...,...,...,...
1884,2024,Honda,CR-V Hybrid,Sport-L,3644.0,Honda Certified,38992.0,Germain Honda of Beavercreek,"Dayton, OH (303 mi.)",/vehicledetail/162230b7-e47b-4e0b-acb9-bfffb77...
1885,2023,Honda,CR-V Hybrid,Sport,10382.0,Honda Certified,31987.0,AutoNation Honda West Knoxville,"Knoxville, TN (445 mi.)",/vehicledetail/bfa24e91-bc55-4247-9eb0-f93c34b...
1886,2024,Honda,CR-V Hybrid,Sport Touring,3187.0,Used,41292.0,Chevrolet Buick GMC of Millersburg,"Millersburg, OH (406 mi.)",/vehicledetail/09709257-d790-450c-bf1b-2a42315...
1887,2021,Honda,CR-V Hybrid,Touring,45186.0,Used,30133.0,Roper Honda,"Joplin, MO (361 mi.)",/vehicledetail/4738a2df-6a09-4b26-836e-211cc35...


In [11]:
print(car_df.to_string())

    Year Manufacturer        Model           Trim  Mileage (mi)           Status  Price (USD)                                                           Dealer                           City
0   2017        Honda         CR-V           EX-L      173049.0             Used        16066                                         Yemm Chevrolet Buick GMC         Galesburg, IL (45 mi.)
1   2013        Honda         CR-V           EX-L       93804.0             Used        13995                                        Auto House of Bloomington       Bloomington, IL (38 mi.)
2   2025        Honda         CR-V           EX-L           NaN              New        37850                                     O'Brien Honda of Bloomington       Bloomington, IL (38 mi.)
3   2023        Honda         CR-V           EX-L       21648.0             Used        31998  CarMax Austin North - Offering Express Pickup and Home Delivery                            N/A
4   2024        Honda         CR-V           EX-L 

In [16]:
car_df.to_excel("CR-V.xlsx")