In [1]:
import requests
import numpy as np
from bs4 import BeautifulSoup
import pandas as pd
import requests_cache
import lxml.html as lxl
import re

In [2]:
MAX_PAGE = 333

# 1. Functions

## Scraping the webpages to get all the car urls

In [10]:
## Get all the urls for all the listed used vehicles on truecar.com
def urls_scraping(base_url = 'https://www.truecar.com/used-cars-for-sale/listings/'):
    urls = []
    pages = []
    for i in range(1, MAX_PAGE+1):
        pages.append(base_url + '?page=' + str(i))
    for page in pages:
        try:
            response = requests.get(page)
            response.raise_for_status()
        except:
            break
        root = lxl.fromstring(response.content)
        url = ['https://www.truecar.com' + link for link in root.xpath('//div[@data-qa="Listings"]/a/@href')]
        urls += url
    
    return urls 

## Parse one url to get information and return a dataframe

In [5]:
# function to scrape one single url of a single used car listing.
def page_scraping(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
    except:
        return
    root = lxl.fromstring(response.content)
    
    # extract vehicle year, make and model information
    year = root.xpath('//div[@class="text-truncate heading-3 margin-right-2 margin-right-sm-3"]/text()')[0].split(' ')[0]    
    make = root.xpath('//div[@class="text-truncate heading-3 margin-right-2 margin-right-sm-3"]/text()')[0].split(' ')[1]    
    model = root.xpath('//div[@class="text-truncate heading-3 margin-right-2 margin-right-sm-3"]/text()')[0].split(' ')[2:]    
    # need to extract vehicle type (suv or sedan) and sub-model info
    sub_model = root.xpath('//div[@class="text-truncate heading-4 text-muted"]/text()')[0] 

    # city and state, geospatial information.
    city = root.xpath('//span[@data-qa="used-vdp-header-location"]/text()[1]')[0]
    state = root.xpath('//span[@data-qa="used-vdp-header-location"]/text()[3]')[0]
    
    # vehicle mileage
    mileage = root.xpath('//span[@data-qa="used-vdp-header-miles"]/text()[1]')[0]
    
    # vehicle price information
    price = root.xpath('//html/body/div[2]/div[3]/div/div[4]/div/div[2]/div[1]/div/div/div[1]/div[3]/span/text()')
    price = str(price)
    
    # vehicle characteristics
    exterior_color = root.xpath('//div[@data-qa="vehicle-overview-item-Exterior Color"]/div[2]/ul/li/text()')[0]
    interior_color = root.xpath('//div[@data-qa="vehicle-overview-item-Interior Color"]/div[2]/ul/li/text()')[0]
    mpg_city = root.xpath('//div[@data-qa="vehicle-overview-item-MPG"]/div[2]/ul/li/text()')[0].split('/')[0].split(' ')[0]
    mpg_hwy = root.xpath('//div[@data-qa="vehicle-overview-item-MPG"]/div[2]/ul/li/text()')[0].split('/')[1].split(' ')[1]
    engine = root.xpath('//div[@data-qa="vehicle-overview-item-Engine"]/div[2]/ul/li/text()')[0]
    transmission = root.xpath('//div[@data-qa="vehicle-overview-item-Transmission"]/div[2]/ul/li/text()')[0]
    drive_type = root.xpath('//div[@data-qa="vehicle-overview-item-Drive Type"]/div[2]/ul/li/text()')[0]
    fuel_type = root.xpath('//div[@data-qa="vehicle-overview-item-Fuel Type"]/div[2]/ul/li/text()')[0]
    popular_feature = root.xpath('//div[@data-test="popularFeatures"]//li[@class="_19zze7p"]/p/text()')
    
    # vehicle history information, will extract four variables from here.
    vehicle_history = root.xpath('//li[@class="_h9wfdq"]/text()')
    
    # whether the car is a certified preowned car.
    if "used-vdp-header-cpo" in response.text:
        cpo = True
    else:
        cpo = False
    
    
    return pd.Series({'year':year, 'make': make, 'model': model, 'sub_model': sub_model, 'city':city, 'state': state,
           'mileage': mileage, 'price': price, 'exterior_color': exterior_color,
                     'interior_color': interior_color, 'mpg_city': mpg_city, 'mpg_hwy': mpg_hwy, 'engine': engine,
                     'transmission': transmission, 'drive_type': drive_type, 'fuel_type': fuel_type, 
                     'popular_feature': popular_feature, 'vehicle_history': vehicle_history, 'cpo': cpo})

## Use multi-processing to scrape all urls and merge into one dataframe

In [6]:
# Use multi-processing to speed up the web-scraping
# fully make use of 8 cores of my macbook pro.
from multiprocessing import Pool
num_partitions = 32
num_cores = 8
def parallelize(urls, func):
    url_set = np.array_split(urls, num_partitions)
    pool = Pool(num_cores)
    df = pd.concat(pool.map(func, url_set))
    pool.close()
    pool.join()
    return df
# main function to scrape all the urls and merge all the data into one dataframe
def scraping(urls):
    scraping_data = [page_scraping(url) for url in urls]
    return pd.concat(scraping_data, axis=1).T

# 2. Main function

## Getting all urls

In [4]:
%%time
urls=urls_scraping() # extract all vehicle urls from allowed 333 pages.
urls[:10]

CPU times: user 18.3 s, sys: 976 ms, total: 19.3 s
Wall time: 5min 20s


['https://www.truecar.com/used-cars-for-sale/listing/5UXWX9C5XH0D98042/2017-bmw-x3/?sponsoredVehiclePosition=0',
 'https://www.truecar.com/used-cars-for-sale/listing/1FTEW1EP8JFD62429/2018-ford-f-150/?sponsoredVehiclePosition=0',
 'https://www.truecar.com/used-cars-for-sale/listing/1GTW7AFG1J1905780/2018-gmc-savana-cargo-van/?sponsoredVehiclePosition=1',
 'https://www.truecar.com/used-cars-for-sale/listing/KNMAT2MV7HP502570/2017-nissan-rogue/',
 'https://www.truecar.com/used-cars-for-sale/listing/JTMRFREV1HD198638/2017-toyota-rav4/',
 'https://www.truecar.com/used-cars-for-sale/listing/3FA6P0H77HR222432/2017-ford-fusion/',
 'https://www.truecar.com/used-cars-for-sale/listing/1FTEW1EG3JKE45550/2018-ford-f-150/',
 'https://www.truecar.com/used-cars-for-sale/listing/2GNAXSEVXJ6278985/2018-chevrolet-equinox/',
 'https://www.truecar.com/used-cars-for-sale/listing/4T1BF1FK5HU328455/2017-toyota-camry/',
 'https://www.truecar.com/used-cars-for-sale/listing/SHHFK7H26HU223996/2017-honda-civic/']

## Returning the raw dataframe

In [7]:
%%time
data = parallelize(urls, scraping)

CPU times: user 997 ms, sys: 295 ms, total: 1.29 s
Wall time: 15min 25s


In [8]:
print(data.shape)
data.head(20)

(9990, 19)


Unnamed: 0,year,make,model,sub_model,city,state,mileage,price,exterior_color,interior_color,mpg_city,mpg_hwy,engine,transmission,drive_type,fuel_type,popular_feature,vehicle_history,cpo
0,2017,BMW,[X3],xDrive28i AWD,Marietta,GA,77549,"['$16,709']",Mineral White Metallic,Mocha,21,28,2.0L Inline-4 Gas Turbocharged,Automatic,AWD,Gas,[],"[0, reported accidents, Personal or Rental Us...",False
1,2018,Ford,[F-150],XLT SuperCrew 5.5' Box 4WD,Deland,FL,11624,"['$26,985']",Shadow Black,Medium Earth Gray,16,22,2.7L V-6 Gas Turbocharged,Automatic,4WD,Gas,"[Cruise Control, Bluetooth, Backup Camera]","[0, reported accidents, Personal or Rental Us...",False
2,2018,GMC,"[Savana, Cargo, Van]",2500 Short Wheelbase,Pelham,AL,23997,"['$19,990']",Summit White,Unknown,N,cty,6.0L V-8 Gas,Automatic,RWD,Gas,"[Cruise Control, Backup Camera]","[0, reported accidents, Personal or Rental Us...",False
3,2017,Nissan,[Rogue],SV AWD,Palmetto Bay,FL,78116,"['$9,997']",Magnetic Black,Charcoal,25,32,2.5L Inline-4 Gas,Automatic,AWD,Gas,"[Front Heated Seats, Cruise Control, Backup Ca...","[2, reported accidents, Fleet or Mixed Use, C...",False
4,2017,Toyota,[RAV4],XLE AWD,Great Falls,MT,189335,"['$13,495']",Super White,Black,22,28,2.5L Inline-4 Gas,Automatic,AWD,Gas,[],"[0, reported accidents, Personal or Rental Us...",False
5,2017,Ford,[Fusion],SE FWD,Denver,NC,99515,"['$8,799']",Shadow Black,Black,21,32,2.5L Inline-4 Gas,Automatic,FWD,Gas,"[Cruise Control, Bluetooth, Backup Camera]","[1, reported accidents, Fleet or Mixed Use, C...",False
6,2018,Ford,[F-150],XLT SuperCrew 5.5' Box 4WD,Boulder,CO,43503,"['$26,800']",Oxford White,Dark Earth Gray,16,22,3.5L V-6 Gas Turbocharged,Automatic,4WD,Gas,[],"[0, reported accidents, Personal or Rental Us...",False
7,2018,Chevrolet,[Equinox],LT with 1LT AWD,Belton,MO,148518,"['$9,888']",Silver Ice Metallic,Jet Black,24,30,1.5L Inline-4 Gas Turbocharged,Automatic,AWD,Gas,"[Power Trunk/Liftgate, Remote Engine Start, Bl...","[0, reported accidents, Personal or Rental Us...",False
8,2017,Toyota,[Camry],XLE I4 Automatic,Shelby,NC,34896,"['$12,924']",Midnight Black,Almond,24,33,2.5L Inline-4 Gas,Automatic,FWD,Gas,[],"[0, reported accidents, Personal or Rental Us...",False
9,2017,Honda,[Civic],LX Hatchback CVT,Duluth,GA,102068,"['$9,985']",Crystal Black Pearl,Select,31,40,1.5L Inline-4 Gas Turbocharged,Automatic,FWD,Gas,"[Cruise Control, Bluetooth, Backup Camera]","[2, reported accidents, Fleet or Mixed Use, C...",False


In [9]:
data.to_csv('../data/usedCarListing.csv', encoding = 'utf-8')