In [None]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import time
import random

In [None]:
# Every code in BOROUGHS starts with "5E". main() writes the value straight
# after "REGION%" in the search URL, so the query string ends up containing
# "REGION%5E<number>" ("%5E" is the percent-encoding of "^").
_CARET_HEX = "5E"

# (borough name, region number) pairs; the order here is the scrape order.
_REGION_NUMBERS = (
    ("City of London", "61224"),
    ("Barking and Dagenham", "61400"),
    ("Barnet", "93929"),
    ("Bexley", "93932"),
    ("Brent", "93935"),
    ("Bromley", "93938"),
    ("Camden", "93941"),
    ("Croydon", "93944"),
    ("Ealing", "93947"),
    ("Enfield", "93950"),
    ("Greenwich", "61226"),
    ("Hackney", "93953"),
    ("Hammersmith and Fulham", "61407"),
    ("Haringey", "61227"),
    ("Harrow", "93956"),
    ("Havering", "61228"),
    ("Hillingdon", "93959"),
    ("Hounslow", "93962"),
    ("Islington", "93965"),
    ("Kensington and Chelsea", "61229"),
    ("Kingston upon Thames", "93968"),
    ("Lambeth", "93971"),
    ("Lewisham", "61413"),
    ("Merton", "61414"),
    ("Newham", "61231"),
    ("Redbridge", "61537"),
    ("Richmond upon Thames", "61415"),
    ("Southwark", "61518"),
    ("Sutton", "93974"),
    ("Tower Hamlets", "61417"),
    ("Waltham Forest", "61232"),
    ("Wandsworth", "93977"),
    ("Westminster", "93980"),
)

# Borough name -> code inserted into the "locationIdentifier" query parameter.
BOROUGHS = {name: _CARET_HEX + number for name, number in _REGION_NUMBERS}

In [None]:
# Module-level accumulators filled by main(). The first four are parallel
# lists: position i in each one describes the same listing.
#   all_apartment_links - absolute URL of the listing page
#   all_description     - title text of the listing card (h2.propertyCard-title)
#   all_address         - address text of the listing card
#   all_price           - price text as displayed on the card (not parsed to a number)
all_apartment_links, all_description, all_address, all_price = [], [], [], []

# Not written to by main(); presumably reserved for per-listing GeoJSON /
# coordinates ("geoson" looks like a typo of "geojson" - confirm before renaming).
all_geoson = []

In [None]:
def _build_search_url(region_code, index):
    """Return the Rightmove for-sale search URL for one results page.

    Args:
        region_code: A value from ``BOROUGHS``; it is written directly after
            ``REGION%`` in the ``locationIdentifier`` parameter.
        index: Offset of the first listing on the page. The first page
            (``index == 0``) omits the ``index`` parameter entirely, later
            pages carry ``&index=<offset>``.
    """
    index_param = f"&index={index}" if index else ""
    return (
        "https://www.rightmove.co.uk/property-for-sale/find.html"
        f"?locationIdentifier=REGION%{region_code}&sortType=6{index_param}"
        "&propertyTypes=&includeSSTC=false&mustHave=&dontShow=&furnishTypes=&keywords="
    )


def _text_or_none(parent, tag, css_class):
    """Return the stripped text of the first matching descendant, or None if absent."""
    if parent is None:
        return None
    node = parent.find(tag, class_=css_class)
    return node.get_text().strip() if node is not None else None


def _parse_result_count(soup):
    """Return the total listing count shown in the search header, or None if unreadable."""
    node = soup.find("span", {"class": "searchHeader-resultCount"})
    if node is None:
        return None
    try:
        return int(node.get_text().replace(",", ""))
    except ValueError:
        return None


def _parse_listing(card):
    """Return ``(link, address, description, price)`` for one result card.

    Returns None when the card has no property link, so the caller can skip it.
    Any other missing field is reported as None instead of raising, which keeps
    the four result lists the same length.
    """
    anchor = card.find("a", class_="propertyCard-link")
    href = anchor.get("href") if anchor is not None else None
    if href is None:
        return None
    link = "https://www.rightmove.co.uk" + href
    address = _text_or_none(anchor, "address", "propertyCard-address")
    description = _text_or_none(anchor, "h2", "propertyCard-title")
    price = _text_or_none(card, "div", "propertyCard-priceValue")
    return link, address, description, price


def main(max_pages=41, page_size=24, output_path=r"rightmove_data.csv", request_timeout=30):
    """Scrape Rightmove for-sale listings for every borough in ``BOROUGHS``.

    For each borough the search result pages are fetched one by one; link,
    address, title and price of every result card are appended to the
    module-level lists ``all_apartment_links``, ``all_address``,
    ``all_description`` and ``all_price``. After each borough the cumulative
    results are written to ``output_path`` as CSV, so a crash part-way through
    still leaves the data of the boroughs already finished on disk.

    The result lists are emptied at the start, so re-running this cell does not
    duplicate rows in the CSV.

    Args:
        max_pages: Maximum number of result pages requested per borough
            (41 is the original value; presumably chosen to stay inside the
            site's pagination limit - confirm).
        page_size: Amount added to the ``index`` URL parameter per page (24).
        output_path: Destination of the CSV file.
        request_timeout: Seconds to wait for each HTTP response before giving up.

    Raises:
        requests.HTTPError: If a page request returns an error status.
        requests.Timeout: If a page does not answer within ``request_timeout``.
    """
    # Browser-like user agent; identical for every request, so built once.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36"
    }

    # Start from a clean slate (in place, so outside references stay valid).
    for results in (all_apartment_links, all_address, all_description, all_price):
        results.clear()

    for borough_name, region_code in BOROUGHS.items():
        print(f"We are scraping the borough named: {borough_name}")

        # Offset of the first listing on the page being requested.
        index = 0

        for page in range(max_pages):
            res = requests.get(
                _build_search_url(region_code, index),
                headers=headers,
                timeout=request_timeout,
            )
            res.raise_for_status()

            soup = BeautifulSoup(res.text, "html.parser")
            apartments = soup.find_all("div", class_="l-searchResult is-list")
            number_of_listings = _parse_result_count(soup)

            for card in apartments:
                record = _parse_listing(card)
                if record is None:
                    continue
                link, address, description, price = record
                # Append all four together so the lists never drift apart.
                all_apartment_links.append(link)
                all_address.append(address)
                all_description.append(description)
                all_price.append(price)

            # Pause between requests so we do not overwhelm the website.
            time.sleep(random.randint(1, 3))

            if number_of_listings is None:
                print(f"Could not read the listing count for {borough_name}; moving on.")
                break

            # Advance first, so the "left to go" figure accounts for this page.
            index += page_size

            print(f"You have scrapped {page + 1} pages of apartment listings.")
            print(f"You have {max(number_of_listings - index, 0)} listings left to go")
            print("\n")

            if index >= number_of_listings:
                break

        # Checkpoint: rewrite the CSV with everything collected so far.
        data = {
            "Links": all_apartment_links,
            "Address": all_address,
            "Description": all_description,
            "Price": all_price,
        }
        df = pd.DataFrame.from_dict(data)
        df.to_csv(output_path, encoding="utf-8", header=True, index=False)


# Start the scrape when this cell / script is executed directly.
if __name__ == "__main__":
    main()


Streaming output truncated to the last 5000 lines.


You have scrapped 28 pages of apartment listings.
You have 2164 listings left to go


You have scrapped 29 pages of apartment listings.
You have 2138 listings left to go


You have scrapped 30 pages of apartment listings.
You have 2116 listings left to go


You have scrapped 31 pages of apartment listings.
You have 2093 listings left to go


You have scrapped 32 pages of apartment listings.
You have 2068 listings left to go


You have scrapped 33 pages of apartment listings.
You have 2044 listings left to go


You have scrapped 34 pages of apartment listings.
You have 2018 listings left to go


You have scrapped 35 pages of apartment listings.
You have 1994 listings left to go


You have scrapped 36 pages of apartment listings.
You have 1972 listings left to go


You have scrapped 37 pages of apartment listings.
You have 1946 listings left to go


You have scrapped 38 pages of apartment listings.
You have 1922 listings 