In [68]:
import time
import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, ElementNotInteractableException
from bs4 import BeautifulSoup
from urllib.parse import urlparse, parse_qs

# Web scraping
## Selenium + BeautifulSoup

In [2]:
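# #### access URL in browser (disabled prototype; the live version is the cell that creates `browser` after the SpareRoomScraper class)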
# url = 'https://www.spareroom.co.uk/flatshare/?search_id=1323266289&mode=list'

# browser = webdriver.Chrome()
# browser.get(url)

In [35]:
# #### accept cookies (disabled prototype of the logic now in SpareRoomScraper; kept for reference only)
# try:
#     button = browser.find_element(By.ID, "onetrust-accept-btn-handler")
#     try:
#         button.click()
#     except ElementNotInteractableException:
#         pass
# except NameError:
#     pass

# #### get page source
# html_source = browser.page_source

# #### parse w/ Soup
# soup = BeautifulSoup(html_source, 'html.parser')
# articles = soup.find_all('article', class_='panel-listing-result listing-bold')
# featured_article = soup.find_all('article', class_ = 'panel-listing-result listing-featured')
# print(len(featured_article))

# articles = featured_article + articles 

# for article in articles:
#     title = article.find('h2').text.strip()
#     price = article.find('strong', class_='listingPrice').text.strip()
#     location = article.find('span', class_='listingLocation').text.strip()
#     description = article.find('p', class_='description').text.strip()
#     availability = article.find('strong').text.strip()
#     href = article.find('a')['href']

#     parsed_url = urlparse(href)
#     query_params = parse_qs(parsed_url.query)

#     flatshare_id = query_params.get('flatshare_id', [None])[0]
#     fad_id = query_params.get('fad_id', [None])[0]
#     result_id = flatshare_id if flatshare_id else fad_id

# # Print or store the extracted information
#     print(f"Title: {title}")
#     print(f"Price: {price}")
#     print(f"Location: {location}")
#     print(f"Description: {description}")
#     print(f"Availability: {availability}")
#     print(f"ID: {result_id}")
#     print("---")

In [80]:
class SpareRoomScraper:
    """Scrape SpareRoom search-result listings through a live Selenium session.

    The browser handed to the constructor is expected to already show a
    results page. The scraper reads the page currently displayed, dismisses
    the cookie banner / registration popup when they are present, and follows
    the "next page" link to walk through further result pages.
    """

    def __init__(self, browser):
        """Keep a reference to the Selenium WebDriver used for all page access."""
        self.browser = browser

    def scrape_pages(self, n_pages, delay=5):
        """Scrape up to ``n_pages`` consecutive result pages.

        Args:
            n_pages: Number of result pages to read, starting with the page
                currently shown in the browser.
            delay: Seconds to pause before following the "next page" link.

        Returns:
            pandas.DataFrame with one row per listing from *every* page that
            was read. Empty when ``n_pages`` is 0 or no listing was found.

        Note:
            The browser is advanced after each scraped page (including the
            last requested one) whenever a next-page link can be clicked.
        """
        records = []
        for _ in range(n_pages):
            self.close_register_popup()
            records.extend(self.scrape_listings())
            time.sleep(delay)
            # Look the link up only after the pause so the element reference
            # is fresh when it is clicked.
            if not self._click_if_present("paginationNextPageLink"):
                # No usable next-page link: stop and keep what was collected
                # instead of raising and losing every page scraped so far.
                break
        return pd.DataFrame(records)

    def scrape_listings(self):
        """Parse the page currently loaded in the browser.

        Returns:
            list of dicts (see ``extract_listing_info``), featured listings
            first, then the regular ones.
        """
        self.accept_cookies()
        soup = BeautifulSoup(self.browser.page_source, 'html.parser')

        regular = soup.find_all('article', class_='panel-listing-result listing-bold')
        featured = soup.find_all('article', class_='panel-listing-result listing-featured')

        return [self.extract_listing_info(article) for article in featured + regular]

    def accept_cookies(self):
        """Click the cookie-consent button if it is present and clickable."""
        self._click_if_present("onetrust-accept-btn-handler")

    def close_register_popup(self):
        """Close the registration popup if it is present and clickable."""
        self._click_if_present("reg_close")

    def _click_if_present(self, element_id):
        """Best-effort click on the element with the given DOM id.

        Returns:
            True when the element was found and clicked, False when it is
            missing or not interactable.
        """
        try:
            self.browser.find_element(By.ID, element_id).click()
        except (NoSuchElementException, ElementNotInteractableException):
            return False
        return True

    @staticmethod
    def _text_or_none(tag):
        """Return the stripped text of ``tag``, or None when the tag is missing."""
        return tag.text.strip() if tag is not None else None

    def extract_listing_info(self, article):
        """Turn one listing ``<article>`` element into a flat record.

        Args:
            article: Parsed ``<article>`` element of a single listing.

        Returns:
            dict with keys ``title``, ``price``, ``location``,
            ``description``, ``availability``, ``href`` and ``id``. A field
            whose element is absent from the markup is None, so one
            incomplete advert does not abort the whole scrape.
        """
        # Price is rendered in <strong class="listingPrice"> (possibly more
        # than once); availability is the first <strong> without that class.
        # Selecting by class avoids relying on a fixed position in the list.
        non_price = [
            tag for tag in article.find_all('strong')
            if 'listingPrice' not in (tag.get('class') or [])
        ]
        availability = self._text_or_none(non_price[0] if non_price else None)

        link = article.find('a')
        href = link.get('href') if link is not None else None

        return {
            'title': self._text_or_none(article.find('h2')),
            'price': self._text_or_none(article.find('strong', class_='listingPrice')),
            'location': self._text_or_none(article.find('span', class_='listingLocation')),
            'description': self._text_or_none(article.find('p', class_='description')),
            'availability': availability,
            'href': href,
            'id': self.extract_id_from_url(href),
        }

    def extract_id_from_url(self, url):
        """Return the listing id carried in a listing URL's query string.

        ``flatshare_id`` takes precedence; ``fad_id`` is the fallback.
        Returns None when ``url`` is empty or carries neither parameter.
        """
        if not url:
            return None
        query_params = parse_qs(urlparse(url).query)

        flatshare_id = query_params.get('flatshare_id', [None])[0]
        fad_id = query_params.get('fad_id', [None])[0]

        return flatshare_id or fad_id


In [81]:
# Open the saved SpareRoom search (list view) in a new Chrome session.
# `browser` stays open so later cells can keep driving it.
url = (
    'https://www.spareroom.co.uk/flatshare/'
    '?search_id=1323266289&mode=list'
)
browser = webdriver.Chrome()
browser.get(url)

In [82]:
# Wrap the open browser session and scrape a single results page.
scraper = SpareRoomScraper(browser)

# Current-page-only alternative: scraper.scrape_listings()
listings_npgs = scraper.scrape_pages(n_pages=1)
listings_npgs

[<strong class="listingPrice">£720-£950<abbr title=""> pcm</abbr><!-- 950 --></strong>, <strong class="listingPrice">
£720-£950<abbr title=""> pcm</abbr><!-- 950 --> </strong>, <strong>Available 22nd Sep</strong>]
[<strong class="listingPrice">£1,066<abbr title=""> pcm</abbr><!-- 1066 --></strong>, <strong class="listingPrice">
£1,066<abbr title=""> pcm</abbr><!-- 1066 --> </strong>, <strong>Available now</strong>]
[<strong class="listingPrice">£1,250<abbr title=""> pcm</abbr><!-- 1250 --></strong>, <strong class="listingPrice">
£1,250<abbr title=""> pcm</abbr><!-- 1250 --> </strong>, <strong>Available now</strong>]
[<strong class="listingPrice">£1,000<abbr title=""> pcm</abbr><!-- 1000 --></strong>, <strong class="listingPrice">
£1,000<abbr title=""> pcm</abbr><!-- 1000 --> </strong>, <strong>Available 1st Oct</strong>]
[<strong class="listingPrice">£1,200<abbr title=""> pcm</abbr><!-- 1200 --></strong>, <strong class="listingPrice">
£1,200<abbr title=""> pcm</abbr><!-- 1200 --> </str

Unnamed: 0,title,price,location,description,availability,href,id
0,🌼Amazing flats WhiteChapel/Aldgate East,£720-£950 pcm,Whitechapel (E1),Amazing flats WhiteChapel/Aldgate East\n\n****...,Available 22nd Sep,/flatshare/fad_click.pl?fad_id=17480469&search...,17480469
1,Beautiful double room in Brixton + balcony - O...,"£1,066 pcm",Brixton (SW9),AUTUMN DISCOUNT !!!!! BEAUTIFUL PROPERTY IN BR...,Available now,/flatshare/flatshare_detail.pl?flatshare_id=16...,16127643
2,Massive Double 1250 Luxury Hous 1 Stop Tower Hill,"£1,250 pcm",Tower Bridge (E1),HELLO EVERYONE !!\n\nAMAZING OPPORTUNITY !!\n\...,Available now,/flatshare/flatshare_detail.pl?flatshare_id=17...,17205604
3,Ensuite near Hammersmith hospital and Imperial...,"£1,000 pcm",White City (W12),"Hello,\n\nI have one good size ensuite room av...",Available 1st Oct,/flatshare/flatshare_detail.pl?flatshare_id=17...,17488320
4,"Furnished double room, modern flat, rooftop☀️✨","£1,200 pcm",Streatham Hill (SW16),"My name is Ana Maria, and I am originally from...",Available 27th Oct,/flatshare/flatshare_detail.pl?flatshare_id=69...,6996409
5,"Spacious Studio room, en suit, All bills included","£1,100 pcm",Leyton (E10),"SPACIOUS ROOM: STUDIO \nwith En-suit, \n\nThi...",Available 20th Sep,/flatshare/flatshare_detail.pl?flatshare_id=15...,15868077
6,"Lovely 3 bed flat with balcony, Elephant &castle","£2,800 pcm",Elephant and Castle (SE17),Please quote ref 14 when you contact.\n\nPostc...,Available 22nd Sep,/flatshare/flatshare_detail.pl?flatshare_id=15...,15885247
7,Large 3 Bed Flat in Finchley,"£2,700 pcm",Finchley (N3),3 Bed Flat in Finchley benefitting from a newl...,Available now,/flatshare/flatshare_detail.pl?flatshare_id=17...,17488280
8,Bow-Big Double Room Excellent Price,£850 pcm,Bow (E3),You will love this excellent large doubl...,Available 10th Oct,/flatshare/flatshare_detail.pl?flatshare_id=84...,8458402
9,4 Bed Flat Student Friendly Ref: 12Ahc,"£2,720 pcm",Hendon (NW4),PROPERTY NEEDS TO BE RENTED ANDER ONE TENANCY ...,Available 27th Sep,/flatshare/flatshare_detail.pl?flatshare_id=17...,17339338


# Test lookups

In [85]:
# Postcode lookup keyed by postcode (pcd7 / pcd8 / pcds) with 2021 OA, LSOA,
# MSOA and local-authority codes and names.
lookup = pd.read_csv(
    "NSP21CL_NOV23_UK_LU.csv",
    engine='python',
    encoding='ISO-8859-1',
)

In [86]:
# (rows, columns) of the postcode lookup.
lookup.shape

(2697530, 25)

In [88]:
# Peek at the first two rows to see the available columns.
lookup.head(2)

Unnamed: 0,pcd7,pcd8,pcds,dointr,doterm,usertype,oseast1m,osnrth1m,oa21cd,oac11cd,...,lsoa21nm,msoa21cd,msoa21nm,soac11cd,soac11nm,ladcd,ladnm,ladnmw,laccd,lacnm
0,AB1 0AA,AB1 0AA,AB1 0AA,198001,199606.0,0,385386.0,801193.0,S00090303,1C3,...,,S02001237,,8A,Affluent communities,S12000033,Aberdeen City,,2A1,Larger Towns and Cities
1,AB1 0AB,AB1 0AB,AB1 0AB,198001,199606.0,0,385177.0,801314.0,S00090303,1C3,...,,S02001237,,8A,Affluent communities,S12000033,Aberdeen City,,2A1,Larger Towns and Cities


In [138]:
# Every lookup row whose pcd7 postcode begins with "SW4".
lookup[lookup['pcd7'].str.startswith('SW4')]

Unnamed: 0,pcd7,pcd8,pcds,dointr,doterm,usertype,oseast1m,osnrth1m,oa21cd,oac11cd,...,lsoa21nm,msoa21cd,msoa21nm,soac11cd,soac11nm,ladcd,ladnm,ladnmw,laccd,lacnm
2326661,SW4 0AA,SW4 0AA,SW4 0AA,198001,,0,528732.0,175272.0,E00015329,2D3,...,Lambeth 013E,E02000630,Lambeth 013,6A,Inner city cosmopolitan,E09000022,Lambeth,,5A1,London Cosmopolitan
2326662,SW4 0AB,SW4 0AB,SW4 0AB,200905,,0,528652.0,175251.0,E00015329,2D3,...,Lambeth 013E,E02000630,Lambeth 013,6A,Inner city cosmopolitan,E09000022,Lambeth,,5A1,London Cosmopolitan
2326663,SW4 0AD,SW4 0AD,SW4 0AD,200112,,0,529496.0,175394.0,E00015327,2D3,...,Lambeth 017C,E02000634,Lambeth 017,6A,Inner city cosmopolitan,E09000022,Lambeth,,5A1,London Cosmopolitan
2326664,SW4 0AE,SW4 0AE,SW4 0AE,198001,,0,528589.0,175351.0,E00015323,3D3,...,Lambeth 013E,E02000630,Lambeth 013,6A,Inner city cosmopolitan,E09000022,Lambeth,,5A1,London Cosmopolitan
2326665,SW4 0AF,SW4 0AF,SW4 0AF,200902,,0,529283.0,175489.0,E00171550,3D3,...,Lambeth 013C,E02000630,Lambeth 013,6A,Inner city cosmopolitan,E09000022,Lambeth,,5A1,London Cosmopolitan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2328208,SW4 9ZQ,SW4 9ZQ,SW4 9ZQ,199712,200101.0,1,529241.0,174677.0,E00015256,2D2,...,Lambeth 019E,E02000636,Lambeth 019,6A,Inner city cosmopolitan,E09000022,Lambeth,,5A1,London Cosmopolitan
2328209,SW4 9ZR,SW4 9ZR,SW4 9ZR,199712,200602.0,1,529353.0,175449.0,E00015315,2D2,...,Lambeth 013C,E02000630,Lambeth 013,6A,Inner city cosmopolitan,E09000022,Lambeth,,5A1,London Cosmopolitan
2328210,SW4 9ZS,SW4 9ZS,SW4 9ZS,199806,200106.0,1,529241.0,174677.0,E00015256,2D2,...,Lambeth 019E,E02000636,Lambeth 019,6A,Inner city cosmopolitan,E09000022,Lambeth,,5A1,London Cosmopolitan
2328211,SW4 9ZT,SW4 9ZT,SW4 9ZT,199806,199906.0,1,529241.0,174677.0,E00015256,2D2,...,Lambeth 019E,E02000636,Lambeth 019,6A,Inner city cosmopolitan,E09000022,Lambeth,,5A1,London Cosmopolitan


In [93]:
# Household-deprivation counts by MSOA (TS011 extract, long format:
# one row per area and deprivation category).
msoa = pd.read_csv("TS011-2021-6-filtered-2024-09-19T20_52_48Z.csv")

In [95]:
# (rows, columns) of the MSOA deprivation table.
msoa.shape

(6012, 5)

In [99]:
# Peek at the first two rows to see the column layout.
msoa.head(2)

Unnamed: 0,Middle layer Super Output Areas Code,Middle layer Super Output Areas,Household deprivation (6 categories) Code,Household deprivation (6 categories),Observation
0,E02000001,City of London 001,-8,Does not apply,0
1,E02000001,City of London 001,1,Household is not deprived in any dimension,2937


In [135]:
# Look up a single MSOA by its code. Equality is used instead of
# str.contains, which performs a regex substring search and is both slower
# and looser than an exact key match.
msoa.loc[msoa['Middle layer Super Output Areas Code'] == 'E02000636', :]

Unnamed: 0,Middle layer Super Output Areas Code,Middle layer Super Output Areas,Household deprivation (6 categories) Code,Household deprivation (6 categories),Observation,area
3618,E02000636,Lambeth 019,-8,Does not apply,0,Lambeth
3619,E02000636,Lambeth 019,1,Household is not deprived in any dimension,2020,Lambeth
3620,E02000636,Lambeth 019,2,Household is deprived in one dimension,783,Lambeth
3621,E02000636,Lambeth 019,3,Household is deprived in two dimensions,231,Lambeth
3622,E02000636,Lambeth 019,4,Household is deprived in three dimensions,44,Lambeth
3623,E02000636,Lambeth 019,5,Household is deprived in four dimensions,1,Lambeth


In [None]:
# NOTE(review): MSOA names in this extract look like "<local authority> NNN"
# (e.g. "Lambeth 019"), so a neighbourhood name such as "Clapham" may match
# nothing here — TODO confirm.
msoa[msoa['Middle layer Super Output Areas'].str.contains('Clapham')]

In [112]:
# Derive the area part of the MSOA name ("Lambeth 019" -> "Lambeth").
# Removing the digits alone leaves a trailing space ("Lambeth "), which
# breaks equality checks and joins against other tables, so strip it.
msoa['area'] = (
    msoa['Middle layer Super Output Areas']
    .str.replace(r'\d+', '', regex=True)
    .str.strip()
)
msoa.groupby('area').count()

Unnamed: 0_level_0,Middle layer Super Output Areas Code,Middle layer Super Output Areas,Household deprivation (6 categories) Code,Household deprivation (6 categories),Observation
area,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Barking and Dagenham,132,132,132,132,132
Barnet,252,252,252,252,252
Bexley,168,168,168,168,168
Brent,210,210,210,210,210
Bromley,234,234,234,234,234
Camden,162,162,162,162,162
City of London,6,6,6,6,6
Croydon,270,270,270,270,270
Ealing,246,246,246,246,246
Enfield,216,216,216,216,216


In [113]:
# Household-deprivation counts by electoral ward/division (TS011 extract).
wards = pd.read_csv("TS011-2021-6-filtered-2024-09-19T21_00_54Z.csv")

In [114]:
# (rows, columns) of the ward deprivation table.
wards.shape

(4080, 5)

In [115]:
# Peek at the first rows to see the column layout.
wards.head()

Unnamed: 0,Electoral wards and divisions Code,Electoral wards and divisions,Household deprivation (6 categories) Code,Household deprivation (6 categories),Observation
0,E05009317,Bethnal Green,-8,Does not apply,0
1,E05009317,Bethnal Green,1,Household is not deprived in any dimension,2882
2,E05009317,Bethnal Green,2,Household is deprived in one dimension,2483
3,E05009317,Bethnal Green,3,Household is deprived in two dimensions,1343
4,E05009317,Bethnal Green,4,Household is deprived in three dimensions,501


In [130]:
# Row counts per ward for every ward whose name contains "Clapham".
(
    wards[wards['Electoral wards and divisions'].str.contains('Clapham')]
    .groupby('Electoral wards and divisions')
    .count()
)

Unnamed: 0_level_0,Electoral wards and divisions Code,Household deprivation (6 categories) Code,Household deprivation (6 categories),Observation
Electoral wards and divisions,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Clapham Common & Abbeville,6,6,6,6
Clapham East,6,6,6,6
Clapham Park,6,6,6,6
Clapham Town,6,6,6,6


In [120]:
# Household-deprivation counts by LSOA (TS011 extract).
lsoa = pd.read_csv("TS011-2021-6-filtered-2024-09-19T20_31_35Z.csv")

In [121]:
# (rows, columns) of the LSOA deprivation table.
lsoa.shape

(29964, 5)

In [122]:
# Peek at the first three rows to see the column layout.
lsoa.head(3)

Unnamed: 0,Lower layer Super Output Areas Code,Lower layer Super Output Areas,Household deprivation (6 categories) Code,Household deprivation (6 categories),Observation
0,E01000001,City of London 001A,-8,Does not apply,0
1,E01000001,City of London 001A,1,Household is not deprived in any dimension,548
2,E01000001,City of London 001A,2,Household is deprived in one dimension,253


In [123]:
# Row counts per LSOA whose name starts with "Clapham". The recorded run
# returned no rows: LSOA names here look like "City of London 001A", i.e.
# they are not prefixed with a neighbourhood name.
(
    lsoa[lsoa['Lower layer Super Output Areas'].str.startswith('Clapham')]
    .groupby('Lower layer Super Output Areas')
    .count()
)

Unnamed: 0_level_0,Lower layer Super Output Areas Code,Household deprivation (6 categories) Code,Household deprivation (6 categories),Observation
Lower layer Super Output Areas,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1


In [136]:
# MSOA names table: maps each MSOA code (msoa21cd) to its official name
# (msoa21nm) and a place-style name (msoa21hclnm, e.g. "Hillside").
msoa_names = pd.read_csv('MSOA-Names-2.2.csv')
msoa_names.head(3)

Unnamed: 0,msoa21cd,msoa21nm,msoa21nmw,msoa21hclnm,msoa21hclnmw,localauthorityname,type
0,E02006534,Adur 001,Adur 001,Hillside,,Adur,Present in 2011
1,E02006535,Adur 002,Adur 002,Buckingham,,Adur,Present in 2011
2,E02006536,Adur 003,Adur 003,North Lancing,,Adur,Present in 2011


In [137]:
# Resolve one MSOA code to its names. Equality is used instead of a
# startswith prefix match because this is a lookup by full code.
msoa_names.loc[msoa_names['msoa21cd'] == 'E02000636', :]

Unnamed: 0,msoa21cd,msoa21nm,msoa21nmw,msoa21hclnm,msoa21hclnmw,localauthorityname,type
3419,E02000636,Lambeth 019,Lambeth 019,Clapham South,,Lambeth,Present in 2011
