# Used car prices - Autotrader

## Data Collection

#### Data source: https://www.autotrader.com.au/
#### Scope: Perth WA, 25km radius from postcode 6000
#### Pages scraped from the site: 251

In [None]:
# Import libraries
from splinter import Browser
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import os
import json
import time
import random
import traceback

import pandas as pd

##### Note: The code below can take up to 12 hours to run on a regular computer with a regular internet connection!

In [None]:
# Scrape used-car listings from Autotrader, one results page at a time.
#
# For every results page a fresh incognito Chrome session is opened, each
# listing is clicked through to its details page, the spec table is read, and
# the cumulative data set is written to a CSV file. A page that raises is
# retried from its start, up to `max_attempts_per_page` times.

# Empty list to store car data (only cars from fully scraped pages end up here)
car_data_set = []

# Set the number of pages you want to scrape
starting_page = 0
num_pages = 200

# Upper bound on attempts for a single page, so a persistent failure
# (site layout change, blocked IP, ...) cannot keep the loop spinning forever
max_attempts_per_page = 5

# 1-based numbers of the pages that were given up on
skipped_pages = []

# Make sure the CSV target folder exists before hours of scraping start
output_dir = 'data_collection_output'
os.makedirs(output_dir, exist_ok=True)

# Location of the chromedriver binary (does not change between pages)
driver_path = os.path.join(os.path.expanduser("~"),"Documents", "DATA ANALYTICS BOOTCAMP", "Apps", "chromedriver_win32", "chromedriver.exe")


def _read_detail_rows(details_soup):
    """Return a {label: value} dict for the spec-table rows of a details page.

    Rows that lack either the label cell or the value cell are skipped.
    """
    rows = {}
    for row in details_soup.find_all('tr', class_='table--row'):
        label = row.find('td', class_='table--label')
        value = row.find('td', class_='table--value')
        if label is None or value is None:
            continue
        rows[label.get_text(strip=True)] = value.get_text(strip=True)
    return rows


def _scrape_listing_page(page):
    """Scrape every car on results page `page` (0-based) and return a list of dicts.

    The browser opened for the page is always closed again, whether the
    scrape succeeds or raises.
    """
    # Set up Splinter
    chrome_options = Options()
    chrome_options.add_argument("--incognito")
    browser = Browser('chrome', service=Service(executable_path=driver_path), options=chrome_options)

    try:
        wait = WebDriverWait(browser.driver, 10)

        # Visit the Autotrader site
        url = f'https://www.autotrader.com.au/for-sale/wa/perth?page={(page+1)}&distance=25'
        browser.visit(url)

        # Parsed snapshot of the results page; used for the listing titles
        soup = BeautifulSoup(browser.html, 'html.parser')
        car_listing = soup.find_all('a', class_='carListing')

        page_cars = []

        for i, listing in enumerate(car_listing):

            # Dictionary holding the data of one car
            car_data = {}

            # Year model comes from the listing title on the results page
            h3 = listing.find('h3', class_='carListing--title')
            if h3 is not None and h3.contents:
                car_data['Year model'] = h3.contents[0].strip()
            else:
                car_data['Year model'] = ""

            # The live elements have to be looked up again on every iteration:
            # the browser navigated away and back since the previous lookup
            live_listings = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'a.carListing')))
            live_listings[i].click()

            # Wait for the details page to load
            time.sleep(3)

            soup_details = BeautifulSoup(browser.html, 'html.parser')

            # Scrape car spec
            h1 = soup_details.find('h1', class_='title')
            car_data['Car Spec'] = h1.get_text(strip=True) if h1 is not None else ""

            # Spec rows already present before the details section is clicked
            car_data.update(_read_detail_rows(soup_details))

            # Click the details section header (see more details)
            next_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'div.vehicleDetails--sectionHeader > div.vehicleDetails--title')))
            next_button.click()

            # Re-read the page after the click so rows that only show up once
            # the section is opened are captured too; merging keeps the rows
            # found before the click in case the click hides them instead
            car_data.update(_read_detail_rows(BeautifulSoup(browser.html, 'html.parser')))

            page_cars.append(car_data)

            # Go back to the car listings
            back_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'a.detailsBackButton')))
            back_button.click()

            # Wait for the car listings page to load
            time.sleep(3)

        return page_cars

    finally:
        # Always release the Chrome / chromedriver processes, also on errors
        browser.quit()


# Scrape data from multiple pages
print('Starting web scraping')
print('-------------------------------------------------------------')

current_page = starting_page
failed_attempts = 0

while current_page < starting_page + num_pages:

    page = current_page

    try:
        page_cars = _scrape_listing_page(page)

        # Create DataFrame with everything scraped so far and export it to csv
        car_df = pd.DataFrame(car_data_set + page_cars)
        car_df.to_csv(f'{output_dir}/output_from_{starting_page+1}_to_{page+1}.csv', index=False)

    except Exception:
        failed_attempts += 1
        print(f'Error occurred on page {page+1} (attempt {failed_attempts} of {max_attempts_per_page}).')
        traceback.print_exc()

        if failed_attempts >= max_attempts_per_page:
            # Give up on this page and move on instead of retrying forever
            print(f'Page {page+1} - Status: skipped after {failed_attempts} failed attempts')
            skipped_pages.append(page + 1)
            current_page += 1
            failed_attempts = 0
            continue

        print('Starting web scraping')
        print('-------------------------------------------------------------')

        # Repeat the loop on the same page
        continue

    # Commit the page only after it was scraped and saved completely, so a
    # retried page never leaves duplicate cars in the data set
    car_data_set.extend(page_cars)

    # Print progress
    print(f'Page {page+1} - Status: completed and saved')

    # Move to the next page
    current_page += 1
    failed_attempts = 0