## Scraping Houses Data from Zoopla.com

Zoopla.com is a great website that lets you, the prospective buyer, search for properties to either buy on rent in different parts of the United Kingdom.

Below, I scrape the prices, number of bedrooms, baths, living rooms, date of listing, and the agent contact for all the houses in England on the website.

In [1]:
# Importing the necessary packages

from bs4 import BeautifulSoup
import requests
import pandas as pd

In [2]:
page = requests.get('https://www.zoopla.co.uk/for-sale/houses/england/?page_size=25&q=England&radius=0&results_sort=newest_listings&pn=1')
soup = BeautifulSoup(page.content, 'html.parser')
#soup.prettify()

In [3]:
# Creating the data for holding all the data once I am done scraping.
columns = ['Description', 'Location', 'Price', 'Bedrooms', 'Reception Rooms', 'Bathrooms', 'Date Listed', 'Agent Contact']

df = pd.DataFrame(columns = columns)
df

Unnamed: 0,Description,Location,Price,Bedrooms,Reception Rooms,Bathrooms,Date Listed,Agent Contact


In [4]:
for div in soup.find_all('div', class_ = "earci3d1 css-tk5q7b-Wrapper-ListingCard-StyledListingCard e2uk8e10"):
    price = div.find('p', class_ ="css-18tfumg-Text eczcs4p0").get_text().replace('£', '')
    try:
        bds = div.find_all("p", class_ = "css-ulgz99-Text eczcs4p0")[0].get_text()
    except:
        bds = ''
    try:
        bths = div.find_all("p", class_ = "css-ulgz99-Text eczcs4p0")[1].get_text()
    except:
        bths = ''
    try:
        lrms = div.find_all("p", class_ = "css-ulgz99-Text eczcs4p0")[2].get_text()
    except:
        lrms = ''
    description = div.find("h2", class_ = "css-c7hd0c-Heading2-StyledAddress e2uk8e14").get_text().split(' for sale')[0]
    location = div.find('p', class_ = "css-wfe1rf-Text eczcs4p0").get_text()
    try:
        new_home = div.find('span', class_ = "css-9inre7-Tag-StyledTag e2uk8e13").get_text()
    except:
        new_home = 'No'
    listed_on = div.find('span', class_ = "css-19cu4sz-Text eczcs4p0").get_text().split('Listed on ')[1]
    agent_no = div.find('a', class_ = "css-m5wyy8-ButtonLink-Button-ButtonLinkWithIcon-ContactLink e2uk8e2").get_text()
    
    #print(description)
    #print(listed_on, price, bds, bths, lrms, description, location, new_home, agent_no)
    # ['Description', 'Location', 'Price', 'Bedrooms', 'Reception Rooms', 'Bathrooms', 'Date Listed', 'Agent Contact']
    data = {
        'Description': [description],
        'Location': [location],
        'Price': [price],
        'Bedrooms': [bds],
        'Reception Rooms': [lrms],
        'Bathrooms': [bths],
        'Date Listed': [listed_on],
        'New Home': new_home,
        'Agent Contact': [agent_no]
    }
    
    temp_df = pd.DataFrame(data)
    df = pd.concat([df, temp_df], ignore_index = True)
df.head(10)

Unnamed: 0,Description,Location,Price,Bedrooms,Reception Rooms,Bathrooms,Date Listed,Agent Contact,New Home
0,4 bed terraced house,"Molyneux Square, Hampton Vale, Peterborough PE7",230000,4,2.0,2.0,5th Mar 2021,01733 850634,No
1,3 bed terraced house,"Clarke Road, Northampton NN1",230000,3,1.0,1.0,5th Mar 2021,01604 726337,No
2,2 bed detached bungalow,"Hyacinth Close, Rossendale BB4",177500,2,2.0,1.0,5th Mar 2021,01706 408586,No
3,4 bed semi-detached house,"Northway Road, Croydon, Croydon, Surrey CR0",475000,4,2.0,2.0,5th Mar 2021,020 8033 0399,No
4,2 bed end terrace house,"Kemble Street, Woodrow North, Redditch B98",190000,2,2.0,2.0,5th Mar 2021,01527 329804,No
5,3 bed detached house,"Market Street Clay Cross, Derbyshire S45",269995,3,,2.0,5th Mar 2021,01332 494185,New home
6,2 bed terraced house,"Cambridge Street, Mansfield NG18",99950,2,1.0,1.0,5th Mar 2021,01322 584475,Investment
7,3 bed link-detached house,"The Rank, North Bradley, Trowbridge BA14",325000,3,2.0,1.0,5th Mar 2021,01225 839232,No
8,2 bed bungalow,"Mount Close, New Milton, Hampshire BH25",345000,2,,,5th Mar 2021,01425 292853,No
9,3 bed terraced house,"Avenue Road Extension, Leicester LE2",200000,3,,1.0,5th Mar 2021,0116 448 0380,Investment


We can edit the code so that it grabs data from the first 11 pages!

In [5]:
for i in range(1, 11):
    url = 'https://www.zoopla.co.uk/for-sale/houses/england/?page_size=25&q=England&radius=0&results_sort=newest_listings&pn={}'.format(i)
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    
    for div in soup.find_all('div', class_ = "earci3d1 css-tk5q7b-Wrapper-ListingCard-StyledListingCard e2uk8e10"):
        price = div.find('p', class_ ="css-18tfumg-Text eczcs4p0").get_text().replace('£', '')
        try:
            bds = div.find_all("p", class_ = "css-ulgz99-Text eczcs4p0")[0].get_text()
        except:
            bds = ''
        try:
            bths = div.find_all("p", class_ = "css-ulgz99-Text eczcs4p0")[1].get_text()
        except:
            bths = ''
        try:
            lrms = div.find_all("p", class_ = "css-ulgz99-Text eczcs4p0")[2].get_text()
        except:
            lrms = ''
        description = div.find("h2", class_ = "css-c7hd0c-Heading2-StyledAddress e2uk8e14").get_text().split(' for sale')[0]
        location = div.find('p', class_ = "css-wfe1rf-Text eczcs4p0").get_text()
        try:
            new_home = div.find('span', class_ = "css-9inre7-Tag-StyledTag e2uk8e13").get_text()
        except:
            new_home = 'No'
        listed_on = div.find('span', class_ = "css-19cu4sz-Text eczcs4p0").get_text().split('Listed on ')[1]
        agent_no = div.find('a', class_ = "css-m5wyy8-ButtonLink-Button-ButtonLinkWithIcon-ContactLink e2uk8e2").get_text()

        #print(description)
        #print(listed_on, price, bds, bths, lrms, description, location, new_home, agent_no)
        # ['Description', 'Location', 'Price', 'Bedrooms', 'Reception Rooms', 'Bathrooms', 'Date Listed', 'Agent Contact']
        data = {
            'Description': [description],
            'Location': [location],
            'Price': [price],
            'Bedrooms': [bds],
            'Reception Rooms': [lrms],
            'Bathrooms': [bths],
            'Date Listed': [listed_on],
            'New Home': new_home,
            'Agent Contact': [agent_no]
        }

        temp_df = pd.DataFrame(data)
        df = pd.concat([df, temp_df], ignore_index = True)
df

Unnamed: 0,Description,Location,Price,Bedrooms,Reception Rooms,Bathrooms,Date Listed,Agent Contact,New Home
0,4 bed terraced house,"Molyneux Square, Hampton Vale, Peterborough PE7",230000,4,2,2,5th Mar 2021,01733 850634,No
1,3 bed terraced house,"Clarke Road, Northampton NN1",230000,3,1,1,5th Mar 2021,01604 726337,No
2,2 bed detached bungalow,"Hyacinth Close, Rossendale BB4",177500,2,2,1,5th Mar 2021,01706 408586,No
3,4 bed semi-detached house,"Northway Road, Croydon, Croydon, Surrey CR0",475000,4,2,2,5th Mar 2021,020 8033 0399,No
4,2 bed end terrace house,"Kemble Street, Woodrow North, Redditch B98",190000,2,2,2,5th Mar 2021,01527 329804,No
...,...,...,...,...,...,...,...,...,...
270,3 bed semi-detached house,"""The Danbury"" at Burlow Road, Harpur Hill, Bux...",185995,3,,1,5th Mar 2021,01298 437902,New home
271,3 bed semi-detached house,"""The Stafford"" at Burlow Road, Harpur Hill, Bu...",216995,3,,1,5th Mar 2021,01298 437902,New home
272,3 bed semi-detached house,"""The Stafford"" at Burlow Road, Harpur Hill, Bu...",216995,3,,1,5th Mar 2021,01298 437902,New home
273,3 bed detached house,"Wyson, Brimfield, Ludlow SY8",262995,3,1,1,5th Mar 2021,01584 539767,No
