Scraping the Auto Trader website to extract relevant information about cars using the BeautifulSoup library.

In [2]:
#import Libraries
from urllib.request import urlopen
from bs4 import BeautifulSoup

Auto Trader only allows scraping the top 1000 listings per search. To extract a large amount of data, I created a list of postcodes around the UK and used a for loop to scrape car data across those postcodes and result pages.

# Web scraper

1. I decided to pick 10 postcodes around the UK to extract a large amount of data, as each search is limited to 1000 listings and each page has roughly 10 car listings

2. I'm extracting information such as car title, price, mileage, engine size, gearbox, etc.

3. I have also created empty lists and am appending the extracted data to them

4. Converting list to DataFrame

5. Exporting data into csv file.

In [3]:
# Postcodes used as search centres; spread around the UK so that the
# 1000-listing cap per search still yields a large combined data set.
postcode = ['BH11AR','SL14DX','SY231AB','TR196PJ','BN207AE','BS247EY','NR293PS','PE251FD','LL536ED','HS29DX']

# Each results page holds roughly 10 listings and a search is capped at
# 1000 listings, so 100 pages are requested per postcode.
MAX_PAGES = 100
# Number of key-spec slots stored per listing (columns att0 .. att9).
N_SPEC_SLOTS = 10

# Accumulators: one entry per scraped listing, kept in lock-step so they
# can later be combined column-wise into a single DataFrame.
car_title = []
car_price = []
veh_dict_2 = {'att{0}'.format(x): [] for x in range(N_SPEC_SLOTS)}

# range() excludes its stop value, hence MAX_PAGES + 1 to include the last page.
for page in range(1, MAX_PAGES + 1):
    for code in postcode:   # each postcode

        # url for the web scraper
        req_url = (
            f"https://www.autotrader.co.uk/car-search?sort=distance&"
            f"postcode={code}&radius=200&onesearchad=Used&onesearchad=Nearly%20New&"
            f"onesearchad=New&page={page}"
        )

        # The context manager closes the connection even if read() raises.
        with urlopen(req_url) as req:
            page_html = req.read()
        page_soup = BeautifulSoup(page_html, "html.parser")

        # BeautifulSoup.findAll is deprecated, use find_all instead
        listings = page_soup.find_all("li", {"class": "search-page__result"})
        for listing in listings:
            # find() returns None when the element is absent, so every lookup
            # is checked before it is dereferenced.
            information_container = listing.find("div", {"class": "information-container"})
            title_container = None
            if information_container is not None:
                title_container = information_container.find("a", {
                    "class": "js-click-handler listing-fpa-link tracking-standard-link"})
            price_container = listing.find("div", {"class": "vehicle-price"})

            if title_container is None or price_container is None:
                # Result tile without the expected title/price markup: skip the
                # whole row so that all accumulator lists stay the same length.
                continue

            car_title.append(title_container.text.replace('\n', ' '))
            car_price.append(price_container.text)

            # Vehicle information: a listing may expose fewer than
            # N_SPEC_SLOTS specs (or none at all); missing slots become None.
            Veh_information = listing.find('ul', class_='listing-key-specs')
            specs = Veh_information.find_all('li') if Veh_information is not None else []
            for i, att in enumerate(veh_dict_2):
                if i < len(specs):
                    veh_dict_2[att].append(specs[i].text.strip())
                else:
                    veh_dict_2[att].append(None)


# Lists to DataFrame

In [4]:
#using pandas library
import pandas as pd

In [5]:
# Title and price columns, one row per scraped listing.
vec_1 = pd.DataFrame(dict(Car_Title=car_title, Car_Price=car_price))

In [6]:
# Key-spec columns: build the frame from the att0..att9 lists, then replace
# the placeholder keys with descriptive column names (positional, 10 names).
vec_2 = pd.DataFrame(veh_dict_2)
vec_2.columns = [
    'Reg_Year',
    'Car_Type_Door',
    'Car_Mileage',
    'Car_engine_size',
    'Car_Horse_PW',
    'Car_Type_Vehicle',
    'Car_Filling_in',
    'Type_of_owner',
    'Car_Service_History',
    'Car_Other',
]

In [7]:
# Place the title/price frame and the key-spec frame side by side,
# aligned on their row index.
data = pd.concat(objs=[vec_1, vec_2], axis="columns")

Data scraped from the different postcodes can overlap, so I'm going to drop any duplicates.

In [8]:
# Remove rows that are identical across every column, keeping the first
# occurrence of each.
data = data.drop_duplicates(subset=None, keep="first")

In [10]:
# Preview the first five rows of the de-duplicated data.
data.head(n=5)

Unnamed: 0,Car_Title,Car_Price,Reg_Year,Car_Type_Door,Car_Mileage,Car_engine_size,Car_Horse_PW,Car_Type_Vehicle,Car_Filling_in,Type_of_owner,Car_Service_History,Car_Other
0,Nissan Qashqai 1.5L Acenta dCi 5dr,"£11,100",2017 (66 reg),SUV,"22,165 miles",1.5L,114BHP,Manual,Diesel,ULEZ,,
1,Peugeot 206 1.6 16v Sport 3dr,£655,2005 (55 reg),Hatchback,"100,500 miles",1.6L,110BHP,Manual,Petrol,4 owners,Part service history,
2,Vauxhall Astra 1.6 i 16v Elite 5dr,"£1,650",2008 (08 reg),Hatchback,"119,051 miles",1.6L,113BHP,Manual,Petrol,4 owners,ULEZ,
3,Audi A1 2.0 TDI Amplified Edition Sportback 5...,"£35,000",2012 (62 reg),Hatchback,"95,000 miles",2.0L,141BHP,Manual,Diesel,1 owner,Full service history,
4,Mercedes-Benz CLS 2.1 CLS250 BlueEFFICIENCY 7...,"£13,750",2013 (13 reg),Coupe,"47,000 miles",2.1L,201BHP,Automatic,Diesel,2 owners,,


# export to csv

In [11]:
# The export is currently disabled: both calls below are commented out, so
# running this cell writes nothing. Uncomment one of them to save `data`
# to a CSV file in the working directory.
#data.to_csv('auto_scraper.csv')
#data.to_csv('web_scraper.csv')