In [1]:
#Import needed libraries components
import csv
import json
import os
from time import sleep

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver import ChromeOptions
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait





In [2]:
# assignment of required html tags xpaths and classnames to use with selenium

WEBSITE_URL = "https://www.cars.com/research/"
XPATH_MAKE = '//*[@id="make-select"]'
XPATH_MODEL = '//*[@id="model-select"]'
XPATH_YEAR = '//*[@id="year-select"]'
XPATH_PERCENTAGE_RECOMMENDED = './/div[2]/div[2]'
XPATH_NEXT_BUTTON = "//*[@id='vehicle-reviews-section']/div/nav/div/*[contains(text(),'Next')]"
CLASS_SUMMARY = 'summary-container '
CLASS_RATING_VALUE = 'sds-rating__count'
CLASS_BREAKDOWN_TABLE = 'review-breakdown'

CLASS_REVIEW = 'consumer-review-container'





In [3]:
#Driver(browser path)
driver_service = Service(r'C:\Users\PC\Documents\chromedriver\chromedriver.exe')
#Headless -> True : Use with GUI | False : without GUI
headless = True
chrome_options = ChromeOptions()

if headless:
    chrome_options.add_argument("--headless")
#Launch browser
driver = webdriver.Chrome(service=driver_service, options = chrome_options)
#Open website link in launched browser
driver.get(WEBSITE_URL)



NameError: name 'CHROMEDRIVER_PATH' is not defined

In [None]:
# Function that writes data to given csv file
# Used to write a savefile that contains the makes, models and years of already scraped cars in order to skip them when you re launch the script
def write_to_csv(data : tuple, filename_without_ext : str):
    with open(f"{filename_without_ext}.csv", 'a+') as file_pointer:
        csv_output = csv.writer(file_pointer)
        csv_output.writerow(data)
        print(f"Data written to {filename_without_ext}.csv")
    

In [None]:
#Function that loads data from a csv file
def load_from_csv(filename_without_ext : str):
    data = []
    try:
        with open(f"{filename_without_ext}.csv", "r") as file_pointer:
            reader = csv.reader(file_pointer)
            data = [tuple(row) for row in reader if row]
    except FileNotFoundError as e:
        print(f"File {filename_without_ext}.csv not found")
    return data
    

In [None]:
#Function that returns the options of a HTML Select element
def get_options(select_element):
    return [option.get_attribute('value') for option in select_element.find_elements_by_tag_name("option") if option.get_attribute('value') != '']

In [None]:
#Function that generates the link to a cars review page with following cars.com format
def create_car_reviews_link(make, model, year):
    return f"{'_'.join(make.split(' '))}-{'_'.join(model.split(' '))}-{year}/consumer-reviews"

In [None]:
# Function that opens a link of a page in a new tab
def open_new_window(driver, url):
    driver.execute_script(f"""window.open("{url}","_blank");""")
    driver.switch_to.window(driver.window_handles[-1])
    # sleep(2)
    


In [None]:
#Function that returns processed reviews contained in a page given a list of reviews as paramater
def get_car_reviews(reviews): 
    reviews_list = []
    for review in reviews:
        review_info = review.find_element_by_class_name('review-byline').find_elements_by_tag_name('div')
        description = review.find_element_by_class_name('review-body')
        review_date = review_info[0].text
        user_data = review_info[1].text.split('By')[1].split('from')
        
        reviews_list.append( {'title' : review.find_element_by_tag_name('h3').text,
                'date' : review_date,
                'user_name' : user_data[0].lstrip(),
                'user_location': user_data[1].lstrip() if len(user_data) == 2 else 'N/A',
                'review_text': description.text
                }
                )
    
    return reviews_list

In [None]:
#Function that returns all cars information 'Rating, recommended, summary(breakdown) and reviews' of a car given the tab that has the car's 
# main page as parameter, this function also browses all available reviews pages
def get_car_information(driver):
    # Initialization of dictionary which will contain car information, if reviews are not available this will be returned as a default value
    car_info_dict = {'rating' : '0.0',
                    'recommended_pct': 'N/A',
                    'breakdown': {},
                    'reviews': []
                }
    #Initialize next page button 
    next_button = None
    try:      
        # Get HTML page elements using XPATHs and classnames defined before
        summary = driver.find_element_by_class_name(CLASS_SUMMARY)
        rating = summary.find_element_by_class_name(CLASS_RATING_VALUE)
        recommended_percentage = summary.find_element_by_xpath(XPATH_PERCENTAGE_RECOMMENDED)
        breakdown_table = summary.find_element_by_class_name(CLASS_BREAKDOWN_TABLE)
        table_elements = breakdown_table.find_element_by_tag_name("ul").find_elements_by_tag_name("li")
        #Get next page button
        next_button = driver.find_element_by_xpath(XPATH_NEXT_BUTTON)
        # Store rating and recommended percentage
        car_info_dict['rating'] = rating.text
        car_info_dict['recommended_pct'] = recommended_percentage.text
        # Iterate on the breakdown table element and store each key with its corresponding value
        for element in table_elements:
            breakdown_elements = element.text.split('\n')
            breakdown_type, breakdown_value = breakdown_elements[0], breakdown_elements[1]
            car_info_dict['breakdown'][breakdown_type] = breakdown_value
        # Check if there is a next page button
        if next_button is not None:
            # Variable to check if the browser is on the first page of the reviews
            is_first_page = True
            # Variable that says if there are still review pages
            there_is_next = next_button.is_enabled()
            
            while(there_is_next or is_first_page):

                sleep(2)
                is_first_page = False
                reviews = driver.find_elements_by_class_name(CLASS_REVIEW)
                car_reviews = get_car_reviews(reviews)
                there_is_next = next_button.is_enabled()
                if car_reviews:
                    #If page contains reviews -> store them
                    car_info_dict['reviews'] += car_reviews
                if there_is_next:
                    #Click on the next button to go to the next page if the button is enable 'there are more pages'
                    next_button.click()   
                    next_button = driver.find_element_by_xpath(XPATH_NEXT_BUTTON)
    #Exception thrown if selenium can't find an element in the page 'Used for page that do not contain reviews e.g : 2022 cars'
    except NoSuchElementException as e:
        print("Error occured Continuing")

    return car_info_dict
    

In [None]:
#Function that writes data to a given json file
# Used for storing scraped cars data
def write_to_json(data, file_name):
    with open(f'{file_name}.json', 'w+') as output_file:
        json.dump(data, output_file)
        print(f"Data written to {file_name}.json" )

In [None]:
#Function that loads data from a json file
def load_from_json(file_name):
    try:
        with open(f'{file_name}.json', 'r') as input_file:
            data = json.load(input_file)
    except FileNotFoundError:
        return None
    return data

In [None]:
#reset
# driver.close()
# driver.switch_to.window(driver.window_handles[-1])

In [None]:
# Save file name
SAVED_DATA_FILE = "save"
# Load save file
previous_data = load_from_csv(SAVED_DATA_FILE) 
if previous_data is None:
    print("No save found")
else:
    print("Save found")
    print(f"Saved data {previous_data}")
    


Save found
Saved data [('acura', 'cl', '2003'), ('acura', 'cl', '2002'), ('acura', 'cl', '2001'), ('acura', 'cl', '1999'), ('acura', 'cl', '1998'), ('acura', 'cl', '1997'), ('acura', 'ilx', '2022'), ('acura', 'ilx', '2021'), ('acura', 'ilx', '2020'), ('acura', 'ilx', '2019'), ('acura', 'ilx', '2018'), ('acura', 'ilx', '2017'), ('acura', 'ilx', '2016'), ('acura', 'ilx', '2015'), ('acura', 'ilx', '2014'), ('acura', 'ilx', '2013'), ('acura', 'ilx_hybrid', '2014'), ('acura', 'ilx_hybrid', '2013'), ('acura', 'integra', '2001'), ('acura', 'integra', '2000'), ('acura', 'integra', '1999'), ('acura', 'integra', '1998'), ('acura', 'integra', '1997'), ('acura', 'integra', '1996'), ('acura', 'integra', '1995'), ('acura', 'integra', '1994'), ('acura', 'integra', '1993'), ('acura', 'integra', '1992'), ('acura', 'legend', '1995'), ('acura', 'legend', '1994'), ('acura', 'legend', '1993'), ('acura', 'legend', '1992'), ('acura', 'mdx', '2022'), ('acura', 'mdx', '2020'), ('acura', 'mdx', '2019'), ('acura

In [None]:
# Initialize a default dictionary for storing cars data
DEFAULT_CARS_OBJECT = {'cars':[]}
# Get HTML select elements in the first page using the XPATHS defined above
select_make = driver.find_element_by_xpath(XPATH_MAKE)
select_model = driver.find_element_by_xpath(XPATH_MODEL)
select_year = driver.find_element_by_xpath(XPATH_YEAR)
# Initialize a Selenium Select object for each select element
dropdown_make = Select(select_make)
dropdown_model = Select(select_model)
dropdown_year = Select(select_year)
# Get all options of the make select and iterate on them
makes = get_options(select_make)

for make in makes:
    # Put the current make value in the select element in the page
    dropdown_make.select_by_value(make)
    # Get all models for the current make and iterate on the models
    models = get_options(select_model)
    for model in models:
        # Put the current model value in the model's select element
        dropdown_model.select_by_value(model)
        # Get all different years options for the model and iterate on them
        years = get_options(select_year)

        for year in years:
            # Check if there's a save 'already scraped cars'
            if previous_data:
                print("Looking in saved data")
                found_saved = False
                #Iterate on the saved cars and skip if the current combination (make, model, year) already exists
                for saved_car_index, saved_car in enumerate(previous_data):
                    if saved_car == (make, model, year):
                        found_saved = True
                        print(f"Skipping {(make, model, year)}")
                        #Remove the found element
                        previous_data.pop(saved_car_index)
                if found_saved:
                    #Reiterate if current car already exists
                    continue
            # Put the current year value in the page's year select         
            dropdown_year.select_by_value(year)
            print(f"""Currently searching for make : {make}, 
            model : {model}, 
            year : {year}"""
            )
            # Generate page link using cars.com format
            car_link = create_car_reviews_link(make, model, year)
            # Open a new window/tab using the generated link
            open_new_window(driver, car_link)
            # Load previously stored cars data
            cars = load_from_json('cars2')
            # Assign the default value if no car data was previously stored or the data contained in the file if the file exists
            cars = DEFAULT_CARS_OBJECT if cars is None else cars
            # Add car data to previous data
            cars['cars'].append({
                'make' : make,
                'model': model,
                'year': year,  
                'info': get_car_information(driver)
            })
            # Write all the data into the file
            write_to_json(cars,'cars_data')
            # Close car page
            driver.close()
            # Switch to the first page
            driver.switch_to.window(driver.window_handles[0])
            # Write combination to save file
            write_to_csv((make, model, year), "save")
        
    


  select_make = driver.find_element_by_xpath(XPATH_MAKE)
  select_model = driver.find_element_by_xpath(XPATH_MODEL)
  select_year = driver.find_element_by_xpath(XPATH_YEAR)


Looking in saved data
Skipping ('acura', 'cl', '2003')
Looking in saved data
Skipping ('acura', 'cl', '2002')
Looking in saved data
Skipping ('acura', 'cl', '2001')
Looking in saved data
Skipping ('acura', 'cl', '1999')
Looking in saved data
Skipping ('acura', 'cl', '1998')
Looking in saved data
Skipping ('acura', 'cl', '1997')
Looking in saved data
Skipping ('acura', 'ilx', '2022')
Looking in saved data
Skipping ('acura', 'ilx', '2021')
Looking in saved data
Skipping ('acura', 'ilx', '2020')
Looking in saved data
Skipping ('acura', 'ilx', '2019')
Looking in saved data
Skipping ('acura', 'ilx', '2018')
Looking in saved data
Skipping ('acura', 'ilx', '2017')
Looking in saved data
Skipping ('acura', 'ilx', '2016')
Looking in saved data
Skipping ('acura', 'ilx', '2015')
Looking in saved data
Skipping ('acura', 'ilx', '2014')
Looking in saved data
Skipping ('acura', 'ilx', '2013')
Looking in saved data
Skipping ('acura', 'ilx_hybrid', '2014')
Looking in saved data
Skipping ('acura', 'ilx_h

  summary = driver.find_element_by_class_name(CLASS_SUMMARY)
  next_button = driver.find_element_by_xpath(XPATH_NEXT_BUTTON)
  reviews = driver.find_elements_by_class_name(CLASS_REVIEW)
  next_button = driver.find_element_by_xpath(XPATH_NEXT_BUTTON)


KeyboardInterrupt: 

In [None]:
#Close chrome session
# try:
#     driver.quit()
# except Exception as e:
#     print(e.__str__)