# Scraping used car data from Cazoo.

Run the first two cells to use the scraper. cazoo_scraper methods will obtain all required data, please read the instructions.

The rest of the code is a breakdown that shows the justificaton for each line.

In [5]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import re

In [37]:
class cazoo_scraper:
    def __init__(self):
        """
        This package will extract data from cazoo.co.uk. 
        
        Note: import: BeautifulSoup, requests,pandas, numpy and re to use this module.
        
        Steps.
        
        >>>1. Use .parse(pages) method to extract information
             pages: number of pages on cazoo showing cars, do not exceed number of pages on website.
        
        >>>2. After parsing, use `.clean_html()` to extract all data 
        
        >>>3. After extraction use `.to_df()` to create pandas dataframe
        
        
        """
        self.baseurl = 'https://www.cazoo.co.uk/cars/?page='
        self.headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
        
        self.results_spec = []
        self.results_price = []
        
        self.price = []
        self.name = []
        self.engine = []
        self.transmission = []
        self.mileage = []
        self.year = []
    def parse(self,pages):
        '''
        For parsing cazoo webpage, enter number of pages to be requested and parsed. Do not exceed number of result pages on the website
        
        Output is a nested list, first list containing car specifications and second containing prices.
        
        '''
        
        page_numbers = range(1,pages + 1)
        print( f'Data from {pages} in: https://www.cazoo.co.uk')
        
        for i in page_numbers:
            
            response = requests.get(self.baseurl + str(i), headers = self.headers)
            
            soup = BeautifulSoup(response.content, 'html.parser')
            
            spec = soup.find_all('div', {'class':"vehicle-cardstyles__DetailWrap-sc-1bxv5iu-5 LKLJD"})
            price = soup.find_all('div', {'class':"pricingstyles__Wrap-rs9839-0 WBvKv"})
    
            self.results_spec.append(spec)
            self.results_price.append(price)
            
        if len(results_spec) == pages:
            print( 'Data retrieved succesfully')
            
        return self.results_spec, self.results_price
        
        
    def clean_html(self):
        '''  
        This method will clean html data and store lists that can be then turned to a pandas dataframe using `.to_df()`.
        
        '''
        spec_one = []
        spec_two = []
        
        for l,page in enumerate(self.results_price):
            for k,i in enumerate(page):
                p = i.find('p', {'class': 'full-pricestyles__Price-sc-12l0fhp-0 etmdWv'})
                if p != None:
                    price.append(p.get_text())
        
                elif p == None:
                    clean = self.results_price[l][k].get_text()
                    value = re.findall('(£\d+(\,)?\d+\d){2}',clean)[0][0]# will return the second pattern match, the current price
                    self.price.append(value)
        
        
        
        for page in self.results_spec:
            s_1 = []
            s_2 = []
            
            for i in page:
                n = i.find('p', {'class' :"vehicle-cardstyles__CustomTitle-sc-1bxv5iu-6 gCwRSy"})
                s = i.find('p', {'class' :"vehicle-cardstyles__DisplayVariant-sc-1bxv5iu-7 cErJHu"})
                m = i.find('ul', {'class':'vehicle-cardstyles__TagWrap-sc-1bxv5iu-8 bUEvWV'})
   
                self.name.append(n.get_text())
                s_1.append(s.get_text())
                s_2.append(m.get_text())
            spec_one.append(s_1)
            spec_two.append(s_2)
        
        for spec in spec_one:
            for car in spec:
                p = re.search('\d(\.\d)?L', car)
                
                if p !=None:
                    self.engine.append(p.group())
                else:
                    self.engine.append(np.NaN)
        
        

        def match(pattern):
            output =[]
            for spec in spec_two:
                for car in spec:
                    m = re.search(pattern, car)
                    if m != None:
                        output.append(m.group())
                        
                    elif m == None and len(mi)==0  and spec.index(s) == len(spec)-1:
                        output.append(np.NaN)
           
    
            return output
                    
        self.transmission.extend(match('(Manual)|(Automatic)'))
        self.fuel.extend(match('[A-Z][a-z]+(\W)?[a-z]+(\s)?[a-z]+$'))
        self.year.extend(match('\d{3}\d'))
        self.mileage.extend(match('\d{0,2}(\,)?\d+'))
        
    def to_df(self):
        '''
        Make a pandas dataframe out of extracted data.
        The output is a clean pandas dataframe ready for data analysis, model fitting or export.
        
    
        '''
        
        df = pd.DataFrame({'Name': self.name,'Year': self.year, 'Mileage': self.mileage, 'Engine_Size': self.engine, 
                           'Transmission': self.transmission, 'Fuel_Type': self.fuel ,'Price(£)': self.price})
        
        
        brand = []
        make = []
        names = df["Name"]
        
        for i in names:
            s = i.split(" ")
            if len(s) == 2:
                b = s[0]
                m = s[1]
            elif len(s) ==3:
                b = s[0]
                m = ' '. join(s[1:])
            
            elif len(s) == 4:
                if "Land Rover" in ' '.join(s):
                    b = ' '.join(s[:2])
                    m = ' '.join(s[1:])
                else:
                    b = s[0]
                    m = ' '.join(s[1:])
            elif len(s) == 5:
                if "Land Rover" in ' '.join(s):
                    b = ' '.join(s[:2])
                    m = ' '.join(s[1:])
                else:
                    b = s[0]
                    m = ' '.join(s[1:])
            brand.append(b)
            make.append(m)
        b_df = pd.DataFrame({'Brand':brand})
        m_df = pd.DataFrame({'Make':make})

        b_m = pd.concat([b_df,m_df], axis = 1)
        df = pd.concat([b_m, df], axis = 1)
        df = df.drop(columns = "Name")
        
        def cleaner(df,pattern, col_name, t):
            for i in range(0,len(df)):
                univ_key = {ord('£'): None, ord(','):None} # the'universal key' symbols common to all columns that should be removed
                if pd.notna(df.loc[i,col_name]): # A 'NaN' value will throw an error on re.search()so exclude
                    key = re.search( pattern, df.loc[i, col_name]).group() #search for pattern and extract useful information
                    new = key.translate(univ_key) # remove symbols with univ_key
                    df.loc[i,col_name] = t(new) #replace old with new
            
            
                else: 
                    continue # skips to next when a NaN value is reached
                    
        cleaner(df,'\d*([\,]|\d)*\d','Price(£)',int)
        cleaner(df,'\d{0,2}(\,)?\d+', 'Mileage',int)
        cleaner(df,'\d(\.\d)?', 'Engine_Size',float)
        cleaner(df,'\d{3}\d', 'Year',int)
        
        return df    

In [6]:
results_spec = []
results_price = []

for i in range(1,155):
    url = 'https://www.cazoo.co.uk/cars/?page='+ str(i)
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    
    response = requests.get(url, headers = headers)
    
    soup = BeautifulSoup(response.content, 'html.parser')
    
    spec = soup.find_all('div', {'class':"vehicle-cardstyles__DetailWrap-sc-1bxv5iu-5 LKLJD"})
    price = soup.find_all('div', {'class':"pricingstyles__Wrap-rs9839-0 WBvKv"})
    
    results_spec.append(spec)
    results_price.append(price)

In [7]:
for l,page in enumerate(results_price):
    for k,i in enumerate(page):
        
        p = i.find('p', {'class': 'full-pricestyles__Price-sc-12l0fhp-0 etmdWv'})
    
        if p == None:
        
            print(l,k)

1 4
1 13
1 39
2 8
3 15
3 44
4 2
4 9
4 42
4 43
6 7
6 39
7 19
8 7
8 39
9 11
9 39
10 40
13 14
16 41
17 17
17 19
18 6
18 43
19 32
19 37
20 22
20 32
20 38
22 8
22 32
22 34
23 8
23 31
23 46
24 5
24 43
25 15
25 25
26 25
26 37
27 16
29 6
30 28
31 2
31 22
31 46
33 16
35 37
38 40
39 18
39 19
41 23
41 44
42 11
42 42
43 12
43 45
44 40
47 9
47 11
50 24
50 40
51 37
52 24
52 36
55 19
58 22
59 1
59 3
60 16
61 42
71 4
71 26
72 20
74 37
75 3
75 25
78 20
78 39
79 21
79 35
80 42
81 12
81 13
81 39
82 5
82 24
83 27
84 20
85 5
85 8
87 35
87 41
87 42
88 22
88 46
89 5
89 22
90 24
90 25
92 7
93 35
94 6
94 8
94 17
94 23
96 3
99 45
102 16
103 20
103 35
103 41
105 1
105 17
105 18
105 20
105 38
106 4
106 5
107 4
107 45
108 2
109 9
109 30
109 32
109 37
110 27
112 17
113 22
113 39
114 27
116 25
116 46
117 9
117 10
117 11
119 7
119 8
120 14
120 44
121 14
125 6
126 21
129 12
130 13
130 34
131 35
131 43
132 20
136 18
136 38
136 45
136 46
137 17
138 5
139 5
140 41
140 46
142 36
143 13
143 37
144 28
145 40
148 20
149 44
1

In [35]:
len(results_price)

154

In [104]:
results_price[7][24].get_text() 

'was £9,550£9,300£183/month HPPlus £49 admin fee'

Using `p = i.find('p', {'class': 'full-pricestyles__Price-sc-12l0fhp-0 etmdWv'})` to extract price data threw out some None values investigating these cases reveled that some cards did not store price data in the class needed, instead it was stored in a new variable, to show that the price has been reduced it will be necessary to extract all the text from these cases and clean data.

the error prone results take specfic for with the old price coming before the new one, the new one will be extracted for the dataframe using regex

In [8]:
price = []

for l,page in enumerate(results_price):
    for k,i in enumerate(page):
        
        p = i.find('p', {'class': 'full-pricestyles__Price-sc-12l0fhp-0 etmdWv'})
    
        if p != None:
            price.append(p.get_text())
        
        elif p == None:
            clean = results_price[l][k].get_text()
            value = re.findall('(£\d+(\,)?\d+\d){2}',clean)[0][0]# will return the second pattern match, the current price
            price.append(value)

In [9]:
len(price) # correct length indicating succesful extraction of all data

7212

In [10]:
name = []
spec_one = []
spec_two = []

for page in results_spec:
    s_1 = []
    s_2 = []
    for i in page:
        n = i.find('p', {'class' :"vehicle-cardstyles__CustomTitle-sc-1bxv5iu-6 gCwRSy"})
        s = i.find('p', {'class' :"vehicle-cardstyles__DisplayVariant-sc-1bxv5iu-7 cErJHu"})
        m = i.find('ul', {'class':'vehicle-cardstyles__TagWrap-sc-1bxv5iu-8 bUEvWV'})
   
        name.append(n.get_text())
        s_1.append(s.get_text())
        s_2.append(m.get_text())
    spec_one.append(s_1)
    spec_two.append(s_2)

In [11]:
spec_two[79][7]

'13,762 miles2018 regAutomaticPetrol'

In [647]:
for i,spec in enumerate(spec_one):
    for k,car in enumerate(spec):
        p = re.search('\d(\.\d)?L', car)
        
        if p ==None:
            print(i,k,p)

0 1 None
0 2 None
0 5 None
54 17 None
58 25 None
61 0 None
62 3 None
75 14 None
79 7 None
85 17 None
92 32 None
92 33 None
96 36 None
97 8 None
98 19 None
99 16 None
101 44 None
102 1 None
103 5 None
104 44 None
105 28 None
106 33 None
106 43 None
107 19 None
107 23 None
108 21 None
108 40 None
109 20 None
110 10 None
110 11 None
110 12 None
111 43 None
112 10 None
112 30 None
113 11 None
113 33 None
113 42 None
114 9 None
114 41 None
115 35 None
116 29 None
117 21 None
118 4 None
118 20 None
119 6 None
119 17 None
119 24 None
119 31 None
120 37 None
124 6 None
125 11 None
125 14 None
125 15 None
125 22 None
127 10 None
127 19 None
127 38 None
128 3 None
128 40 None
128 42 None
131 28 None
132 27 None
133 10 None
134 0 None
134 4 None
134 27 None
135 37 None
135 40 None
136 25 None
136 27 None
137 4 None
137 9 None
137 33 None
137 41 None
137 44 None
137 46 None
138 6 None
138 40 None
138 42 None
138 43 None
139 16 None
139 22 None
139 36 None
140 3 None
140 16 None
140 18 None
140 26 

In [646]:
results_spec[0][1].get_text()

'Tesla Model X75D6,344 miles2017 regAutomaticElectric'

cases of missing values for engine  indicate either special cases such as audi S-Line, or electric vehicles, these none results will treated as NaN. 

In [12]:
engine = []
for spec in spec_one:
    for car in spec:
        p = re.search('\d(\.\d)?L', car)
        
        if p !=None:
            engine.append(p.group())
        else:
            engine.append(np.NaN)

In [13]:
len(engine)

7212

In [14]:
spec_two[74][37]

'57,519 miles2019 regManualPetrol'

In [15]:
for i,page in enumerate(spec_two):
    for l,car in enumerate(page):
        c = re.search('[A-Z][a-z]+(\W)?[a-z]+(\s)?[a-z]+$',car)
        #d=re.search('[A-Z]+[a-z]+',c)
        if c != None:
            print(c,i,l)

<re.Match object; span=(26, 32), match='Diesel'> 0 0
<re.Match object; span=(26, 32), match='Petrol'> 0 1
<re.Match object; span=(26, 32), match='Petrol'> 0 2
<re.Match object; span=(26, 32), match='Petrol'> 0 3
<re.Match object; span=(26, 32), match='Petrol'> 0 4
<re.Match object; span=(26, 32), match='Petrol'> 0 5
<re.Match object; span=(26, 32), match='Petrol'> 0 6
<re.Match object; span=(26, 32), match='Petrol'> 0 7
<re.Match object; span=(26, 32), match='Petrol'> 0 8
<re.Match object; span=(26, 32), match='Petrol'> 0 9
<re.Match object; span=(26, 32), match='Petrol'> 0 10
<re.Match object; span=(26, 32), match='Petrol'> 0 11
<re.Match object; span=(26, 32), match='Petrol'> 0 12
<re.Match object; span=(26, 32), match='Petrol'> 0 13
<re.Match object; span=(26, 32), match='Petrol'> 0 14
<re.Match object; span=(26, 32), match='Petrol'> 0 15
<re.Match object; span=(26, 32), match='Petrol'> 0 16
<re.Match object; span=(26, 32), match='Petrol'> 0 17
<re.Match object; span=(26, 32), match

In [16]:


def match(pattern):
    output =[]
    for spec in spec_two:
        for car in spec:
            
            m = re.search(pattern, car)
            if m != None:
                output.append(m.group())
            
            elif m == None and len(mi)==0  and spec.index(s) == len(spec)-1:
                output.append(np.NaN)
           
    
    return output

In [None]:
match('(Manual)|(Automatic)')
match('[A-Z][a-z]+(\W)?[a-z]+(\s)?[a-z]+$') 
match('\d{3}\d')
match('\d{0,2}(\,)?\d+')

In [17]:
transmission = match('(Manual)|(Automatic)')

In [18]:
fuel = match('[A-Z][a-z]+(\W)?[a-z]+(\s)?[a-z]+$') 
#intended to match the last string pattern, 
#from the final Capital letter followed by simple, with optional non alphanumeric and white space inbetween


In [19]:
year = match('\d{3}\d')

In [20]:
mileage = match('\d{0,2}(\,)?\d+')

In [21]:
cazoo_cars = pd.DataFrame({'Name': name,'Year': year, 'Mileage': mileage, 'Engine_Size': engine, 
                           'Transmission': transmission, 'Fuel_Type': fuel ,'Price(£)': price})

In [22]:
cazoo_cars

Unnamed: 0,Name,Year,Mileage,Engine_Size,Transmission,Fuel_Type,Price(£)
0,Dacia Sandero,2015,58600,1.5L,Manual,Diesel,"£5,300"
1,MG MG3,2015,42916,1.5L,Manual,Petrol,"£5,500"
2,Citroen C1,2016,38217,1L,Manual,Petrol,"£6,200"
3,Skoda Citigo,2015,11626,1L,Manual,Petrol,"£6,350"
4,Fiat Panda,2015,14954,1.2L,Manual,Petrol,"£6,400"
...,...,...,...,...,...,...,...
7207,Mini Countryman,2021,4327,2L,Automatic,Petrol,"£39,400"
7208,Tesla Model X,2017,6344,,Automatic,Electric,"£59,700"
7209,Audi e-tron,2020,12494,,Automatic,Electric,"£56,250"
7210,Land Rover Range Rover,2017,19614,3L,Automatic,Diesel,"£50,600"


In [23]:
cazoo_cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7212 entries, 0 to 7211
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Name          7212 non-null   object
 1   Year          7212 non-null   object
 2   Mileage       7212 non-null   object
 3   Engine_Size   7114 non-null   object
 4   Transmission  7212 non-null   object
 5   Fuel_Type     7212 non-null   object
 6   Price(£)      7212 non-null   object
dtypes: object(7)
memory usage: 394.5+ KB


In [24]:
names = cazoo_cars["Name"]


In [25]:
lengths = []
for i in names:
    s = i.split(" ")
    len(s) > 2
        
    lengths.append(len(s))
np.unique(lengths) 

array([2, 3, 4, 5])

There are rows with upto 5 different words making up name, any row with only 2 words is most likey to contain the brand in position 0 and model in position 1.

the `row_finder(number)` function will display all the rows with the number of words needed to investigate.

In [26]:
def row_finder(number):
    rows = []
    for i in names:
        s = i.split(" ")
        if len(s) == number:
            j = ' '.join(s)
            rows.append(j)
    return rows

looking at rows with more than 2 words, the

In [541]:
np.unique(row_finder(3)) # These rows indicate that the first word denotes the brand and the rest the model.

array(['Abarth 124 Spider', 'Alfa Romeo Giulia', 'Alfa Romeo Giulietta',
       'Alfa Romeo MiTo', 'Audi A3 Cabriolet', 'Audi A4 Avant',
       'Audi A5 Cabriolet', 'Audi A6 Allroad', 'Audi A6 Avant',
       'Audi A6 Saloon', 'Audi Q4 e-tron', 'Audi S4 Avant',
       'BMW 1 Series', 'BMW 2 Series', 'BMW 3 Series', 'BMW 4 Series',
       'BMW 5 Series', 'BMW 6 Series', 'BMW X6 M', 'Citroen C3 Aircross',
       'Citroen C3 Picasso', 'Citroen C4 Cactus', 'Citroen C4 Picasso',
       'Citroen C5 Aircross', 'DS DS 3', 'DS DS 4', 'Dacia Logan MCV',
       'Dacia Sandero Stepway', 'Fiat 124 Spider', 'Fiat 500 Electric',
       'Fiat 500X Dolcevita', 'Ford Grand C-Max', 'Hyundai Santa Fe',
       'Jeep Grand Cherokee', 'Land Rover Discovery', 'Lexus CT 200h',
       'Lexus ES 300h', 'Lexus GS 300', 'Lexus IS 300', 'Lexus NX 200t',
       'Lexus NX 300h', 'Lexus RC 300h', 'Lexus UX 250h', 'MG MG HS',
       'MG MG ZS', 'Mazda MX-5 RF', 'Mercedes-Benz A Class',
       'Mercedes-Benz B Class', 'M

In [542]:
np.unique(row_finder(4)) # except for land rover all rows indicate that the first word denotes make, and the rest model.

array(['Citroen Grand C4 Picasso', 'Citroen Grand C4 SpaceTourer',
       'DS DS 3 CABRIO', 'DS DS 3 CROSSBACK', 'DS DS 4 CROSSBACK',
       'DS DS 7 CROSSBACK', 'Land Rover Discovery Sport',
       'Land Rover Range Rover', 'Volvo V40 Cross Country'], dtype='<U28')

In [543]:
np.unique(row_finder(5)) # except for land rover all rows follow the same pattern as previous, land rover is the only maker with two words.

array(['BMW 2 Series Active Tourer', 'BMW 2 Series Gran Coupe',
       'BMW 2 Series Gran Tourer', 'BMW 3 Series Gran Turismo',
       'BMW 4 Series Gran Coupe', 'BMW 5 Series Gran Turismo',
       'BMW 6 Series Gran Coupe', 'BMW 6 Series Gran Turismo',
       'Land Rover Range Rover Evoque', 'Land Rover Range Rover Sport',
       'Land Rover Range Rover Velar'], dtype='<U29')

In [27]:
brand = []
make = []
for i in names:
    s = i.split(" ")
    if len(s) == 2:
        b = s[0]
        m = s[1]
    elif len(s) ==3:
        b = s[0]
        m = ' '. join(s[1:])
    elif len(s) == 4:
        if "Land Rover" in ' '.join(s):
            b = ' '.join(s[:2])
            m = ' '.join(s[1:])
        else:
            b = s[0]
            m = ' '.join(s[1:])
    elif len(s) == 5:
        if "Land Rover" in ' '.join(s):
            b = ' '.join(s[:2])
            m = ' '.join(s[1:])
        else:
            b = s[0]
            m = ' '.join(s[1:])
    brand.append(b)
    make.append(m)

b_df = pd.DataFrame({'Brand':brand})
m_df = pd.DataFrame({'Make':make})

b_m = pd.concat([b_df,m_df], axis = 1)
cazoo_cars = pd.concat([b_m, cazoo_cars], axis = 1)
cazoo_cars = cazoo_cars.drop(columns = "Name")

In [28]:
def cleaner(df,pattern, col_name, t):
    
    for i in range(0,len(df)):
        univ_key = {ord('£'): None, ord(','):None} # the'universal key' symbols common to all columns that should be removed
        if pd.notna(df.loc[i,col_name]): # A 'NaN' value will throw an error on re.search()so exclude
            
            key = re.search( pattern, df.loc[i, col_name]).group() #search for pattern and extract useful information
            new = key.translate(univ_key) # remove symbols with univ_key
            df.loc[i,col_name] = t(new) #replace old with new
            
            
        else: 
            continue # skips to next when a NaN value is reached
    
    

In [29]:
cleaner(cazoo_cars,'\d*([\,]|\d)*\d','Price(£)',int)
cleaner(cazoo_cars,'\d{0,2}(\,)?\d+', 'Mileage',int)
cleaner(cazoo_cars,'\d(\.\d)?', 'Engine_Size',float)
cleaner(cazoo_cars,'\d{3}\d', 'Year',int)

In [30]:
cazoo_cars

Unnamed: 0,Brand,Make,Year,Mileage,Engine_Size,Transmission,Fuel_Type,Price(£)
0,Dacia,Sandero,2015,58600,1.5,Manual,Diesel,5300
1,MG,MG3,2015,42916,1.5,Manual,Petrol,5500
2,Citroen,C1,2016,38217,1.0,Manual,Petrol,6200
3,Skoda,Citigo,2015,11626,1.0,Manual,Petrol,6350
4,Fiat,Panda,2015,14954,1.2,Manual,Petrol,6400
...,...,...,...,...,...,...,...,...
7207,Mini,Countryman,2021,4327,2.0,Automatic,Petrol,39400
7208,Tesla,Model X,2017,6344,,Automatic,Electric,59700
7209,Audi,e-tron,2020,12494,,Automatic,Electric,56250
7210,Land Rover,Rover Range Rover,2017,19614,3.0,Automatic,Diesel,50600


In [31]:
cazoo_cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7212 entries, 0 to 7211
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Brand         7212 non-null   object
 1   Make          7212 non-null   object
 2   Year          7212 non-null   object
 3   Mileage       7212 non-null   object
 4   Engine_Size   7114 non-null   object
 5   Transmission  7212 non-null   object
 6   Fuel_Type     7212 non-null   object
 7   Price(£)      7212 non-null   object
dtypes: object(8)
memory usage: 450.9+ KB


In [33]:
cazoo_cars.to_csv("cazoo_cars.csv", index=False)