This is a workbook that I have prepared to scrape car price data from cars.com. This is for a project that I am completing for my WGU masters program. 

This is my first experience using the beatiful soup library.

After watching a few tutorials. I was able to break the scraping process for this site down into a set of discrete steps and apply those steps to search results for a few different cars that I searched on the site.

In [4]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import os

In [5]:
import re

regexalpha = re.compile('[^a-zA-Z]') #get letters only

In [7]:
#this cell can used for testing if data elements need to be added to the worksheet
page = requests.get('https://www.cars.com/for-sale/searchresults.action/?dealerType=localOnly&mdId=22101&mkId=20088&mlgId=28860&page=1&perPage=100&rd=99999&searchSource=GN_REFINEMENT&sort=relevance&yrId=56007%2C58487%2C30031936&zc=55901')
soup = BeautifulSoup(page.content,'html.parser')
car_data = soup.find_all(class_='listing-row__details')
car_meta = soup.find_all(class_='listing-row__meta')

In [50]:
def get_row_details(item):
    #Listing of details for each car listing on the page
    return item.find_all(class_='listing-row__details')

In [29]:
def get_meta_details(item):
    #get the meta details for each row in the page
    return item.find_all(class_='listing-row__meta')

In [35]:
def get_ext_col(item):
    #get external color from listing row meta
    return item.find_all("li")[0].get_text().strip().split('\n')[-1].strip()

In [37]:
def get_int_col(item):
    #get internal color from listing row meta
    return item.find_all("li")[1].get_text().strip().split('\n')[-1].strip()

In [42]:
def get_trans(item):
    #get transmission type from listing row meta
    return item.find_all("li")[2].get_text().strip().split('\n')[-1].strip()

In [44]:
def get_drv_trn(item):
    #get drive train details from listing row meta
    return item.find_all("li")[3].get_text().strip().split('\n')[-1].strip()

In [53]:
def get_stock_type(item):
    
    if item.find(class_='listing-row__stocktype'):
        return item.find(class_='listing-row__stocktype').get_text().strip()
    else:
        return None

In [54]:
def get_price(item):
    #prices for each care listing
    if item.find(class_='listing-row__price'):
        if item.find(class_='listing-row__price').get_text().strip().strip('$').replace(',', '') == 'Not Priced':
            return 'Not Priced'
        else:
            data = item.find(class_='listing-row__price').get_text().strip().strip('mi.').replace(',', '')
            data = re.sub('[^0-9]','', data)
            return int(data)
    else:
        return 0

In [55]:
def get_miles(item):
    #mileage for each car listing
    if item.find(class_='listing-row__mileage'):
        if item.find(class_='listing-row__mileage').get_text().strip().strip('mi.').replace(',', '') == '-- ':
            return '-- '
        else:
            data = item.find(class_='listing-row__mileage').get_text().strip().strip('mi.').replace(',', '')
            data = re.sub('[^0-9]','', data)
            return int(data)
    else:
        return 0

In [56]:
def get_model(item):
    #model description for each car listing
    return item.find(class_='listing-row__title').get_text().strip()

In [57]:
def get_dealer(item):
    #get the name of dealer
    return regexalpha.sub('',item.find(class_='dealer-name').get_text())

In [59]:
def get_dealer_phone(item):
    #get phone number for dealer
    return re.sub('[^0-9]','', item.find(class_='dealer-name').get_text())

In [48]:
def get_data(page_in):
    
    #list of target classes from each page
    car_data = get_row_details(page_in)
    car_meta = get_meta_details(page_in)

    #extract items from each car ad on the page
    prices = [get_price(item) for item in car_data]
    mileage = [get_miles(item) for item in car_data]
    model = [get_model(item) for item in car_data]
    dealer = [get_dealer(item) for item in car_data]
    dealer_phone = [get_dealer_phone(item) for item in car_data]
    stock_type = [get_stock_type(item) for item in car_data]
    ext_Color = [get_ext_col(item) for item in car_meta]
    int_color = [get_int_col(item) for item in car_meta]
    trans_type = [get_trans(item) for item in car_meta]
    drive_train = [get_drv_trn(item) for item in car_meta]
    
    car_prices = pd.DataFrame({
                        'model': model,
                        'stock_type': stock_type,
                        'mileage': mileage,
                        'price': prices,
                        'dealer': dealer,
                        'dealer_phone': dealer_phone,
                        'ext_color': ext_Color,
                        'int_color': int_color,
                        'trans_type': trans_type,
                        'drive_train': trans_type,
                        })
    return car_prices


In [61]:
def ins_format(site):
    #insert brackets into url string for string parsing using .format
    pattern = re.compile(r'&page=\d[&]')
    return re.sub(pattern, r'&page={}&', site)

In [62]:
def get_car_prices(pages,site):
    '''
    Author: Henry Greeley
    Load search results from cars.com for a designated model.
    tell formula number of pages and url of one of the pages.
    int, str -> DataFrame
    '''
    #empty dataframe
    car_data_out = pd.DataFrame(columns =['model','stock_type','mileage', 'price','dealer','dealer_phone'\
                                         ,'ext_color','int_color','trans_type','drive_train'])
    #add brackets to site url
    site = ins_format(site)
    
    #loop through each page extracting all elements
    for i in range(1,pages+1):
        #request page
        page = requests.get(site.format(str(i)))
        #parse into soup object
        soup = BeautifulSoup(page.content,'html.parser')
        #apply parsing functions for individual elements
        #load to dataframe
        car_data_out = car_data_out.append(get_data(soup))
    
    #output dataframe with rows indexed and na for miles as 0
    return car_data_out.reset_index()[['model','stock_type','mileage', 'price','dealer','dealer_phone'\
                                         ,'ext_color','int_color','trans_type','drive_train']]
    

In [64]:
#get data for 2015-2017 toyota siena
siena = 'https://www.cars.com/for-sale/searchresults.action/?dealerType=localOnly&mdId=22101&mkId=20088&mlgId=28860&page=1&perPage=100&rd=99999&searchSource=GN_REFINEMENT&sort=relevance&yrId=56007%2C58487%2C30031936&zc=55901'

siena_2015_17 = get_car_prices(30,siena)

siena_2015_17.to_csv('siena_2015_17.csv')

siena_2015_17

Unnamed: 0,model,stock_type,mileage,price,dealer,dealer_phone,ext_color,int_color,trans_type,drive_train
0,2016 Toyota Sienna SE,Used,37113,18998,FirstAveMotors,6122848699,Black,Black,Automatic,Automatic
1,2017 Toyota Sienna SE Premium,Used,28550,27000,DanDeeryToyota,3197745839,Black,Black,Automatic,Automatic
2,2017 Toyota Sienna XLE,Used,25956,28107,DanDeeryToyota,3197745839,Silver,Gray,Automatic,Automatic
3,2017 Toyota Sienna Limited Premium,Toyota Certified,63822,28000,DanDeeryToyota,3197745818,White,Beige,Automatic,Automatic
4,2017 Toyota Sienna Limited Premium,Used,49674,28712,DanDeeryToyota,3197745839,White,Gray,Automatic,Automatic
5,2015 Toyota Sienna L,Used,44597,15752,DiamondBuickGMCofAlexandria,3204210542,Silver,Gray,Automatic,Automatic
6,2017 Toyota Sienna XLE Premium,Used,27050,26950,QCAutoplexcom,3097514094,Silver,Gray,Automatic,Automatic
7,2016 Toyota Sienna XLE,Used,45808,23488,WildeHonda,2625102679,Brown,Other,Automatic,Automatic
8,2017 Toyota Sienna Limited,Used,29704,26950,ZeiglerChryslerJeepDodgeRAMofSchaumburg,8474534520,White,Gray,Automatic,Automatic
9,2017 Toyota Sienna XLE,Used,39986,19999,ASMCARS,4024017820,Gray,Gray,Automatic,Automatic


In [68]:
#get data for 2012-2014 toyota siena
siena_2012_14 = 'https://www.cars.com/for-sale/searchresults.action/?dealerType=localOnly&mdId=22101&mkId=20088&mlgId=28860&page=1&perPage=100&rd=99999&searchSource=GN_REFINEMENT&sort=relevance&yrId=39723%2C47272%2C51683&zc=55901'

siena_2012_14 = get_car_prices(10,siena_2012_14)

siena_2012_14.to_csv('siena_2012_14.csv')

siena_2012_14

Unnamed: 0,model,stock_type,mileage,price,dealer,dealer_phone,ext_color,int_color,trans_type,drive_train
0,2013 Toyota Sienna Limited,Used,80341,15470,RAVMOTORS,9522361164,Green,Gray,Automatic,Automatic
1,2014 Toyota Sienna XLE,Used,78946,13000,SergPrivateSeller,,Black,Gray,Automatic,Automatic
2,2014 Toyota Sienna LE,Used,106989,14490,VanHornNissanofStevensPoint,9204494427,Beige,Beige,Automatic,Automatic
3,2013 Toyota Sienna Limited,Used,97425,17975,BillionToyota,6052315896,White,Gray,Automatic,Automatic
4,2014 Toyota Sienna LE,Used,39390,14995,FineAutoSales,4142191020,Silver,Gray,Automatic,Automatic
5,2014 Toyota Sienna L,Used,42255,13995,TheInternetCarLot,4028818051,White,Beige,Automatic,Automatic
6,2013 Toyota Sienna XLE,Used,71720,13995,TheInternetCarLot,4028818051,Beige,Beige,Automatic,Automatic
7,2012 Toyota Sienna Limited,Used,47473,17995,TheInternetCarLot,4028818051,Green,Gray,Automatic,Automatic
8,2014 Toyota Sienna XLE,Used,69138,18980,LexusOfHighlandPark,8885151410,Black,Gray,Automatic,Automatic
9,2013 Toyota Sienna Limited,Used,111469,15991,McGrathAcurainMortonGrove,8479299438,Gray,Gray,Automatic,Automatic


In [63]:
#get data for 2019 toyota rav 4
page_in = 'https://www.cars.com/for-sale/searchresults.action/?dealerType=localOnly&mdId=21780&mkId=20088&mlgId=28860&page=1&perPage=100&rd=99999&searchSource=PAGINATION&sort=relevance&yrId=36362520&zc=55901'

rav4_2019 = get_car_prices(47,page_in)

rav4_2019.to_csv('rav4_2019.csv')
rav4_2019

Unnamed: 0,model,stock_type,mileage,price,dealer,dealer_phone,ext_color,int_color,trans_type,drive_train
0,2019 Toyota RAV4 XLE Premium,Toyota Certified,5053,30000,RochesterToyota,8888760843,White,Gray,Automatic,Automatic
1,2019 Toyota RAV4 XLE,Used,16189,25597,KenVanceMotors,8775112907,Gray,Other,Automatic,Automatic
2,2019 Toyota RAV4 XLE,Toyota Certified,26340,23455,DanDeeryToyota,3197745818,Blue,Black,Automatic,Automatic
3,2019 Toyota RAV4 XLE,Used,26542,23277,ThysMotorCompany,8669451538,Silver,Other,Automatic,Automatic
4,2019 Toyota RAV4 XLE,Used,20493,23490,LeMieuxSonToyota,9204714921,White,Black,Automatic,Automatic
5,2019 Toyota RAV4 XLE,Toyota Certified,18325,24963,AutoNationToyotaLibertyville,8882544312,Blue,Black,Automatic,Automatic
6,2019 Toyota RAV4 XLE,Used,19121,23990,ElginToyota,2245237149,White,Gray,Automatic,Automatic
7,2019 Toyota RAV4 XLE,Used,18149,25200,BillWalshToyota,8157683485,Black,Black,Automatic,Automatic
8,2019 Toyota RAV4 Adventure,New,0,32498,CorwinToyotaofBellevue,4025570470,Blue,Brown,Automatic,Automatic
9,2019 Toyota RAV4 XLE,Toyota Certified,13347,25432,CrownToyotaVW,6169315692,White,Black,Automatic,Automatic


In [72]:
siena_2012_2017 = siena_2012_14.append(siena_2015_17).reset_index()[['model','stock_type','mileage', 'price','dealer','dealer_phone'\
                                         ,'ext_color','int_color','trans_type','drive_train']]
siena_2012_2017.to_csv('siena_2012_2017.csv')

In [65]:
#get data for 2019 Hyundai Santa Fe
santafe = 'https://www.cars.com/for-sale/searchresults.action/?dealerType=localOnly&mdId=21899&mkId=20064&mlgId=28860&page=1&perPage=100&rd=99999&searchSource=GN_REFINEMENT&sort=relevance&yrId=36362520&zc=55901'

santafe = get_car_prices(16,santafe)

santafe.to_csv('santafe.csv')

santafe

Unnamed: 0,model,stock_type,mileage,price,dealer,dealer_phone,ext_color,int_color,trans_type,drive_train
0,2019 Hyundai Santa Fe SE 2.4,Used,21710,22395,KenVanceMotors,8775112907,Silver,Black,Automatic,Automatic
1,2019 Hyundai Santa Fe SE 2.4,Used,21958,22295,KenVanceMotors,8775112907,Gray,Black,Automatic,Automatic
2,2019 Hyundai Santa Fe Limited 2.4,Used,4020,29653,BuerkleAutomotive,6513215017,Gray,Black,Automatic,Automatic
3,2019 Hyundai Santa Fe SE 2.4,Used,14457,20998,WalserHyundai,6513215083,Gray,Other,Automatic,Automatic
4,2019 Hyundai Santa Fe SE 2.4,Used,51041,19970,BillionAutoHyundaiofIowaCity,8884399850,Blue,Black,Automatic,Automatic
5,2019 Hyundai Santa Fe SE 2.4,Used,15524,19961,WillisAutomotive,8667768977,Brown,Black,Automatic,Automatic
6,2019 Hyundai Santa Fe SE 2.4,Used,18944,21467,BergstromImportsonVictoryLane,8882467614,Gray,Black,Automatic,Automatic
7,2019 Hyundai Santa Fe Ultimate 2.0T,New,0,31870,GreenFamilyHyundai,3099482148,Gray,Black,Automatic,Automatic
8,2019 Hyundai Santa Fe SE 2.4,Used,35511,19995,WildeToyota,4143126720,White,Black,Automatic,Automatic
9,2019 Hyundai Santa Fe SEL Plus 2.4,Used,4775,25371,VanHornHyundaiofSheboygan,9203951871,Gray,Black,Automatic,Automatic


In [66]:
#get data for 2019 Honda CRV
crv2019 = 'https://www.cars.com/for-sale/searchresults.action/?dealerType=localOnly&mdId=20762&mkId=20017&mlgId=28860&page=1&perPage=100&rd=99999&searchSource=GN_REFINEMENT&sort=relevance&yrId=36362520&zc=55901'

crv2019 = get_car_prices(50,crv2019)

crv2019.to_csv('crv2019.csv')

crv2019

Unnamed: 0,model,stock_type,mileage,price,dealer,dealer_phone,ext_color,int_color,trans_type,drive_train
0,2019 Honda CR-V EX,Used,1009,27000,LutherBrookdaleHonda,7635152679,White,White,CVT,CVT
1,2019 Honda CR-V LX,Used,240,23777,LutherBrookdaleHonda,7635152679,Black,Black,CVT,CVT
2,2019 Honda CR-V LX,Used,1032,23495,LutherBrookdaleHonda,7635152679,White,White,CVT,CVT
3,2019 Honda CR-V LX,New,0,25469,CommunityHonda,3192843151,White,White,CVT,CVT
4,2019 Honda CR-V EX,Used,2019,26998,RichardsonMotors,5632392190,Silver,Black,CVT,CVT
5,2019 Honda CR-V LX,New,0,25381,GustmanHonda,9202437345,Black,Black,CVT,CVT
6,2019 Honda CR-V LX,New,0,26945,ZimmermanHonda,3095175556,Silver,Gray,CVT,CVT
7,2019 Honda CR-V EX,Used,4551,27422,WildeChryslerJeepDodgeRamSubaru,2622902668,White,White,Automatic,Automatic
8,2019 Honda CR-V EX-L,New,0,33440,InternationalHonda,9205874138,Gray,Gray,CVT,CVT
9,2019 Honda CR-V LX,Honda Certified,9743,22594,RacineHonda,2623241120,Gray,Other,CVT,CVT


In [67]:
#get data for 2019 Ford Escape
escape2019 = 'https://www.cars.com/for-sale/searchresults.action/?dealerType=localOnly&mdId=21088&mkId=20015&mlgId=28860&page=1&perPage=100&rd=99999&searchSource=GN_REFINEMENT&sort=relevance&yrId=36362520&zc=55901'

escape2019 = get_car_prices(37,escape2019)

escape2019.to_csv('escape2019.csv')

escape2019

Unnamed: 0,model,stock_type,mileage,price,dealer,dealer_phone,ext_color,int_color,trans_type,drive_train
0,2019 Ford Escape SE,Used,26237,17891,TomKadlecKia,8883542179,Blue,Gray,Automatic,Automatic
1,2019 Ford Escape SE,Used,3500,15995,KostersCarKorner,5072424104,Orange,Black,Automatic,Automatic
2,2019 Ford Escape SEL,Used,9964,20900,NorthtownFordInc,7156193039,White,Other,Automatic,Automatic
3,2019 Ford Escape SE,Used,22310,18299,KenVanceMotors,8775112907,Black,Other,Automatic,Automatic
4,2019 Ford Escape SEL,Used,6659,18995,AceAutoCars,9522320189,Red,Black,Automatic,Automatic
5,2019 Ford Escape Titanium,Used,12253,22155,MorriesBuffaloFord,6122133948,Black,Black,Automatic,Automatic
6,2019 Ford Escape Titanium,New,0,29960,MiddletonFord,6084782666,White,Black,Automatic,Automatic
7,2019 Ford Escape Titanium,New,0,29978,MiddletonFord,6084782666,White,Black,Automatic,Automatic
8,2019 Ford Escape Titanium,New,0,30769,KayserFordLincoln,6083381566,White,Black,Automatic,Automatic
9,2019 Ford Escape SE,Used,25191,20981,BoucherNissanWaukesha,8666906248,Black,Black,Automatic,Automatic


In [75]:
#append all car data gathered above together

#list of car dataframes
car_objs = [escape2019,crv2019,santafe,rav4_2019]

car_data_all = pd.DataFrame(columns =['model','stock_type','mileage', 'price','dealer','dealer_phone'\
                                         ,'ext_color','int_color','trans_type','drive_train'])

for obj in car_objs:
    car_data_all = car_data_all.append(obj)
    
car_data_all = car_data_all.reset_index()[['model','stock_type','mileage', 'price','dealer','dealer_phone'\
                                         ,'ext_color','int_color','trans_type','drive_train']]
    
car_data_all.to_csv('car_data_all.csv')

car_data_all

Unnamed: 0,model,stock_type,mileage,price,dealer,dealer_phone,ext_color,int_color,trans_type,drive_train
0,2019 Ford Escape SE,Used,26237,17891,TomKadlecKia,8883542179,Blue,Gray,Automatic,Automatic
1,2019 Ford Escape SE,Used,3500,15995,KostersCarKorner,5072424104,Orange,Black,Automatic,Automatic
2,2019 Ford Escape SEL,Used,9964,20900,NorthtownFordInc,7156193039,White,Other,Automatic,Automatic
3,2019 Ford Escape SE,Used,22310,18299,KenVanceMotors,8775112907,Black,Other,Automatic,Automatic
4,2019 Ford Escape SEL,Used,6659,18995,AceAutoCars,9522320189,Red,Black,Automatic,Automatic
5,2019 Ford Escape Titanium,Used,12253,22155,MorriesBuffaloFord,6122133948,Black,Black,Automatic,Automatic
6,2019 Ford Escape Titanium,New,0,29960,MiddletonFord,6084782666,White,Black,Automatic,Automatic
7,2019 Ford Escape Titanium,New,0,29978,MiddletonFord,6084782666,White,Black,Automatic,Automatic
8,2019 Ford Escape Titanium,New,0,30769,KayserFordLincoln,6083381566,White,Black,Automatic,Automatic
9,2019 Ford Escape SE,Used,25191,20981,BoucherNissanWaukesha,8666906248,Black,Black,Automatic,Automatic
