# Predicting used car prices - Autotrader

### 1. Data Collection

#### Data source: https://www.autotrader.com.au/
#### Scope: Perth WA, 25km radius from postcode 6000

In [5]:
# Import libraries
from splinter import Browser
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import os
import json
import time
import random

import pandas as pd

In [6]:
#### Web scraping set up and access the Perth listings page of the Autotrader website

# Set up Splinter
# The chromedriver path is built under the current user's home directory
driver_path = os.path.join(os.path.expanduser("~"),"Documents", "DATA ANALYTICS BOOTCAMP", "Apps", "chromedriver_win32", "chromedriver.exe")
# Launch Chrome through Splinter using that driver executable
browser = Browser('chrome',service=Service(executable_path=driver_path))

# Visit the Autotrader used-car listings for Perth WA (distance=25 in the query string)
url = 'https://www.autotrader.com.au/for-sale/wa/perth?distance=25'
browser.visit(url)
# Capture the HTML of the listings page as currently loaded in the browser
html = browser.html

# Create a Beautiful Soup object from that HTML snapshot
soup = BeautifulSoup(html, 'html.parser')

In [7]:
# Empty list to store car data (one dictionary per listing)
car_data_set = []

# Set the number of pages you want to scrape
num_pages = 1

# Seconds to pause after each navigation so the new page can render
page_load_pause = 3

# A single explicit-wait helper (10 s timeout) is reused for every element lookup
wait = WebDriverWait(browser.driver, 10)

# Scrape data from multiple pages
for page in range(num_pages):

    # Parse the results page the browser is showing right now. Without this,
    # every page after the first would be read from the stale first-page soup.
    soup = BeautifulSoup(browser.html, 'html.parser')

    # Get all car listings (BeautifulSoup tags, used only for reading text)
    car_listing = soup.find_all('a', class_='carListing')

    # Scrape car data from car listings
    for i in range(len(car_listing)):

        # Create an empty dictionary to store individual car data
        car_data = {}

        # Scrape year model (the first text node of the title heading);
        # fall back to "" when the heading is missing or does not start with text
        h3 = car_listing[i].find('h3', class_='carListing--title')
        first_node = h3.contents[0] if h3 is not None and h3.contents else ""
        car_data['Year model'] = first_node.strip() if isinstance(first_node, str) else ""

        # Click on the car listing. The live Selenium elements are looked up
        # again on every iteration and kept in their own variable so they are
        # never confused with the BeautifulSoup tags in car_listing.
        listing_links = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'a.carListing')))
        if i >= len(listing_links):
            # The live page shows fewer listings than the parsed snapshot,
            # so there is nothing left to click on this page
            break
        listing_links[i].click()

        # Wait for the details page to load
        time.sleep(page_load_pause)

        # Create a Beautiful Soup object from the details page HTML
        soup_details = BeautifulSoup(browser.html, 'html.parser')

        # Scrape car spec and append to car_data ("" when the title is absent)
        h1 = soup_details.find('h1', class_='title')
        car_data['Car Spec'] = h1.get_text(strip=True) if h1 is not None else ""

        # Find the button element for the next click (see more details)
        # NOTE(review): soup_details was captured before this click, so the rows
        # parsed below come from the pre-click HTML - confirm whether the page
        # should be re-read after expanding the section.
        next_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'div.vehicleDetails--sectionHeader > div.vehicleDetails--title')))
        next_button.click()

        # Scrape car details and append to car_data
        tr = soup_details.find_all('tr', class_='table--row')

        for row in tr:
            label = row.find('td', class_='table--label')
            value = row.find('td', class_='table--value')

            # Skip rows that lack a label or value cell instead of raising
            # AttributeError on None
            if label is None or value is None:
                continue

            car_data[label.get_text(strip=True)] = value.get_text(strip=True)

        # Append the car data dictionary to the list
        car_data_set.append(car_data)

        # Find the button element for the back click (back to the car listings)
        back_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'a.detailsBackButton')))
        back_button.click()

        # Wait for the car listings page to load
        time.sleep(page_load_pause)

    # Move to the next results page only when another page is still to be
    # scraped; clicking after the last requested page would navigate for nothing
    if page < num_pages - 1:
        next_page_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'p[data-cy="pagination--arrow-right"]')))
        next_page_button.click()

        # Wait for the next page to load
        time.sleep(page_load_pause)

car_data_set

[{'Year model': '2007',
  'Car Spec': '2007 Audi TT 8J Coupe 2dr S tronic 6sp 2.0T',
  'Kilometres': '104,108km',
  'Seller type': 'Dealer: Used',
  'Price': '$17,990',
  'Transmission': 'Sports Automatic Dual Clutch',
  'Body type': '-',
  'Drive type': '-',
  'Engine': '-',
  'Fuel type': '',
  'Fuel consumption': '-',
  'Colour ext / int': 'Blue / -',
  'Registration': '1CWG869',
  'Rego expiry': '-',
  'VIN': 'TRUZZZ8J981023299',
  'Stock No': '3620',
  'ANCAP Safety rating': '',
  'Green overall rating': 'LeafCreated with Sketch.LeafCreated with Sketch.LeafCreated with Sketch.LeafCreated with Sketch.LeafCreated with Sketch.',
  'Dealer': 'Madman motors',
  'Address': '14 Buckingham Drive, Wangara, WA',
  'Seating capacity': '',
  'Doors': '',
  'Front tyre size': '',
  'Front rim size': '',
  'Rear tyre size': '',
  'Rear rim size': '',
  'Injection / Carburation': '',
  'CC': '',
  'Number of cylinders': '',
  'Front suspension': '',
  'Rear suspension': '',
  'Front brakes': '',

In [8]:
# Create DataFrame with the scraped data (car_data_set is a list of per-listing dictionaries)
car_df = pd.DataFrame(car_data_set)
# Preview the first rows of the DataFrame
car_df.head()

Unnamed: 0,Year model,Car Spec,Kilometres,Seller type,Price,Transmission,Body type,Drive type,Engine,Fuel type,...,Make,Model,Variant,Series,Warranty when new (months),Warranty when new (kms),Service interval (months),Service interval (kms),Country of origin,Vehicle segment
0,2007,2007 Audi TT 8J Coupe 2dr S tronic 6sp 2.0T,"104,108km",Dealer: Used,"$17,990",Sports Automatic Dual Clutch,-,-,-,,...,Audi,TT,8J Coupe 2dr S tronic 6sp 2.0T,,,,,,,
1,2019,2019 Hyundai I30 Active PD2 MY19,"74,070km",Dealer: Used,"$20,998",Automatic,"HATCH, 4 Doors, 5 Seats",Front Wheel Drive,"4 cyl, 2 L",Unleaded Petrol,...,Hyundai,I30,Active,PD2 MY19,60.0,999000.0,12.0,15000.0,,
2,2019,2019 Hyundai I30 Active PD2 MY19,"66,975km",Dealer: Used,"$21,998",Automatic,"HATCH, 4 Doors, 5 Seats",Front Wheel Drive,"4 cyl, 2 L",Unleaded Petrol,...,Hyundai,I30,Active,PD2 MY19,60.0,999000.0,12.0,15000.0,,
3,2019,2019 Hyundai I30 Active PD2 MY19,"68,845km",Dealer: Used,"$21,998",Automatic,"HATCH, 4 Doors, 5 Seats",Front Wheel Drive,"4 cyl, 2 L",Unleaded Petrol,...,Hyundai,I30,Active,PD2 MY19,60.0,999000.0,12.0,15000.0,,
4,2017,2017 Mitsubishi Triton GLX (4X4) MQ MY18,"188,978km",Dealer: Used,"$24,750",Automatic,"Ute Tray, 4 Doors, 5 Seats",4x4,"4 cyl, 2.4 L",Diesel,...,Mitsubishi,Triton,GLX (4X4),MQ MY18,60.0,100000.0,12.0,15000.0,,


In [9]:
# Export data to csv file ('output.csv', a path relative to the working directory)
# index=False leaves the DataFrame index out of the file
car_df.to_csv('output.csv', index=False)

In [10]:
# Quit the browser now that scraping and export are done
browser.quit()