In [23]:
# Libraries

import sys
import math
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup as bs
import requests

In [24]:
# Inital information

url = 'https://propertylink.estatesgazette.com/commercial-property-for-rent/bath/page-1?filters%5Bnot_property_states%5D%5B%5D=Under+Offer&filters%5Bto_let_price%5D%5Bcurrency%5D=gbp&filters%5Bto_let_price%5D%5Bfrom%5D=&filters%5Bto_let_price%5D%5Bsize_modifier%5D=sqft&filters%5Bto_let_price%5D%5Bto%5D=&search=Bath&sort_direction=desc&sort_field=created_at&view=grid'
# change page numbers for these!
html = requests.get(url)
soup = bs(html.text, 'html.parser')

body = soup.find('body')
header = body.find('div', class_ = 'results-header mt-5')
property_count = int(header.find('div', class_ = 'results-message').contents[0].strip())
pages_needed = math.ceil(property_count / 24) + 1
# extra pages only include featured properties - so doesn't go past max. page index.

print(f"There are {property_count} properties on the website currently.\nSo {pages_needed} pages are needed.")

There are 69 properties on the website currently.
So 4 pages are needed.


In [25]:
# Function for getting links

def get_links(soup):
    links = []

    i = 1
    max_iterations = 30
    while len(links) < 24 and i <= max_iterations:
        elements = soup.select(f"body > div.container.mt-3 > div.results.row > div:nth-child({i}) > div.card.h-100.card--results > a")
        # Featured properties still appear elsewhere - so aren't needed. It also causes a break when an empty page is reached.

        # if not elements:
        #     elements = soup.select(f"body > div.container.mt-3 > div.results.row > div:nth-child({i}) > div.card.h-100.card--featured > a")

        if not elements:
            i += 1
            continue

        for element in elements:
            href = element['href']
            link = 'https://propertylink.estatesgazette.com'+ href
            links.append(link)
        i += 1

    return links


all_links = get_links(soup)


In [26]:
# Looping through pages

base_url = 'https://propertylink.estatesgazette.com/commercial-property-for-rent/bath/page-{}?filters%5Bnot_property_states%5D%5B%5D=Under+Offer&filters%5Bto_let_price%5D%5Bcurrency%5D=gbp&filters%5Bto_let_price%5D%5Bfrom%5D=&filters%5Bto_let_price%5D%5Bsize_modifier%5D=sqft&filters%5Bto_let_price%5D%5Bto%5D=&search=Bath&sort_direction=desc&sort_field=created_at&view=grid'

all_links = []  # Store all links from all pages

for page_num in range(1, pages_needed+1):  # Loop over the page numbers (1 to 3 in this example)
    url = base_url.format(page_num)
    html = requests.get(url)
    soup = bs(html.text, 'html.parser')
    
    page_links = get_links(soup)
    all_links.extend(page_links)  # Add the links from this page to the all_links list

    print(f"Page {page_num} has {len(page_links)} links.")

property_links = list(set(all_links)) # removing duplicates
print(f"Total links collected: {len(property_links)}")


Page 1 has 23 links.
Page 2 has 23 links.
Page 3 has 23 links.
Page 4 has 0 links.
Total links collected: 69


In [27]:
# Single property test

single_link = property_links[2]
print(single_link)

html = requests.get(single_link)
soup = bs(html.text, 'html.parser')

# soup.select('body > div.details-section.details-agent-contacts > div > div > div > div.col-md-4.col-12 > div.agent-name')
agent_name = soup.find('div', class_='agent-name').find_all_next(string=True)[1].strip()


agency_name = soup.select_one('body > div.details-section.details-agent-contacts > div > div > div > div.col-md-4.col-12 > div.agent-logo > a > img')['alt']
agency_name = soup.find("meta", {"name": "gwa_advertiserName"})
agency_name = agency_name.get("content")

id_tag = soup.find("meta", {"name": "gwa_contentID"})
id_tag = id_tag.get("content")

# address = soup.select('body > div.wrapper > div.container-wrapper.bg-white > div > div:nth-child(5) > div.details-section-body.row > div:nth-child(3)') # Full div
# address = address[0].get_text(strip=True) # gives address with below stuck to the front
# address = address.replace("ADDRESS", "").strip() #tidy

address = soup.select('body > div.wrapper > div.container-wrapper.bg-white > div > h1')
address = address[0].get_text(strip=True)
address = address.replace("sharebookmark", "").strip() #tidy

size = soup.select('body > div.wrapper > div.container-wrapper.bg-white > div > div:nth-child(5) > div.details-section-body.row > div:nth-child(2)')
size = size[0].get_text(strip=True) # gives size with below stuck to the front
size = size.replace("SIZE", "").strip() #tidy

type = soup.select('body > div.wrapper > div.container-wrapper.bg-white > div > div:nth-child(5) > div.details-section-body.row > div:nth-child(4)')
type = type[0].get_text(strip=True)
type = type.replace("TYPE", "").strip()

price = soup.select('body > div.wrapper > div.container-wrapper.bg-white > div > div:nth-child(5) > div.details-section-body.row > div:nth-child(1)')
price = price[0].get_text(strip=True)
price = price.replace("TO RENT", "").strip()

date_posted = soup.select('body > div.wrapper > div.container-wrapper.bg-white > div > div:nth-child(5) > div.details-section-body.row > div:nth-child(6)')
date_posted = date_posted[0].get_text(strip=True)
date_posted = date_posted.replace("POSTED ON", "").strip()

desc = soup.select_one('body > div:nth-of-type(4) > div:nth-of-type(2) > div > div:nth-of-type(5) > div > div:nth-of-type(1)')
desc = desc.get_text(strip=True)

latitude = soup.find('meta', itemprop='latitude')['content']
longitude = soup.find('meta', itemprop='longitude')['content']

print(agent_name)
print(agency_name)
print(id_tag)
print(address)
print(size)
print(type)
print(price)
print(date_posted)
print(desc,'\n')
print(latitude, ',', longitude)



https://propertylink.estatesgazette.com/property-details/6975849-second-floor-14-queen-square-bath-bath-and-north-east-somerset
Philip Marshall
Carter Jonas LLP
6975849
Second Floor, 14 Queen Square, Bath, Bath and North East Somerset
683 Sq Ft
Office, Offices
£12,500.00  Per  Annum
19th August 2024
The property comprises a number of interconnecting offices all at second floor level. The office space benefits from shared WCs, showers and kitchen facilities.The property is situated on the western side of Queen Square in a prominent position on the corner within Bath city centre., next to Chapel Row. The offices are within a 15 minute walk of Bath Spa railway station, the Southgate Shopping Centre and a variety of restaurants, bars and amenities within the city centre.Bath is a UNESCO World Heritage City of international repute with an approximate population of 99,277 (ONS mid-year population estimates 2018). The city is a regional hub for retail and leisure and is one of the United King

In [28]:
# Collecting all property information

def assign_value(elements, keyword):
    for element in elements:
        text = element.get_text(strip=True)
        if keyword in text:
            return text.replace(keyword, "").strip()
    return None

property_data = []

for property in property_links:
    # print(property)
    html = requests.get(property)
    soup = bs(html.text, 'html.parser')

    agent_name = soup.find('div', class_='agent-name').find_all_next(string=True)[1].strip()
    
    agency_name = soup.find("meta", {"name": "gwa_advertiserName"})
    agency_name = agency_name.get("content")

    id_tag = soup.find("meta", {"name": "gwa_contentID"})
    id_tag = id_tag.get("content")

    desc = soup.select_one('body > div:nth-of-type(4) > div:nth-of-type(2) > div > div:nth-of-type(5) > div > div:nth-of-type(1)')
    desc = desc.get_text(strip=True)

    # address = soup.select('body > div.wrapper > div.container-wrapper.bg-white > div > h1')
    # address = address[0].get_text(strip=True)
    # address = address.replace("sharebookmark", "").strip() #tidy

    elements = soup.select('body > div.wrapper > div.container-wrapper.bg-white > div > div:nth-child(5) > div.details-section-body.row > div')

    # Assign values based on keywords - thank you ChatGPT for this one...
    price = assign_value(elements, "TO RENT")
    size = assign_value(elements, "SIZE")
    address = assign_value(elements, "ADDRESS")
    type = assign_value(elements, "TYPE")
    # tenure = assign_value(elements, "TENURE")
    date_posted = assign_value(elements, "POSTED ON")

    latitude = soup.find('meta', itemprop='latitude')['content']
    longitude = soup.find('meta', itemprop='longitude')['content']
    

    property_data.append({
        'Id': id_tag,
        'Address': address,
        'Price': price,
        'Size': size,
        'Property Type': type,
        'Agency Name': agency_name,
        'Agent Name': agent_name,
        'Description': desc,
        'Property Link': property,
        'Date Posted': date_posted,
        'Latitude': latitude,
        'Longitude': longitude
    })

    

df = pd.DataFrame(property_data)



df.head(2)

Unnamed: 0,Id,Address,Price,Size,Property Type,Agency Name,Agent Name,Description,Property Link,Date Posted,Latitude,Longitude
0,6881244,"27 Upper Borough Walls, Bath, Bath And North East Somerset","£12,500.00 - £18,000.00 Per Annum","729 - 1,566 Sq Ft","General Retail, Restaurants/Cafes, Retail, Licensed & Leisure",Carter Jonas LLP,Stuart Williams,First and Basement Floors - Available to Let* Incentives Available *The premises front Upper B...,https://propertylink.estatesgazette.com/property-details/6881244-27-upper-borough-walls-bath-bat...,1st October 2024,51.3825028,-2.360375
1,6962602,"6 Terrace Walk, Bath, Bath And North East Somerset, BA1 1LN","£25,000.00 Per Annum","1,144 Sq Ft","Office, Retail - High Street, Other, Offices, Retail, Other Property Types & Opportunities",CSquared (Previously Colston & Colston Chartered Surveyors),Nathan Clark,"Situated on the first and second floors, the property overlooks Parade Gardens to the front and ...",https://propertylink.estatesgazette.com/property-details/6962602-6-terrace-walk-bath-bath-and-no...,31st May 2024,51.381243,-2.357851


In [29]:
# Writing to csv

df.to_csv('spreadsheets/EG_scraped_18.11.24.csv',index=False)

In [30]:
# Email draft


# pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_colwidth', 100)

print(f"""Hi {df.iloc[2]['Agent Name']},\n 
I see that {df.iloc[2]['Address']} has been taken off the wesbites. Please could you let me know the tenant, rent and lease length?\n
Thanks,\n
[Name]""")

print(f"\n\n\n {df.iloc[2]}")

Hi Philip Marshall,
 
I see that Second Floor, 14 Queen Square, Bath, Bath and North East Somerset has been taken off the wesbites. Please could you let me know the tenant, rent and lease length?

Thanks,

[Name]



 Id                                                                                                           6975849
Address                                            Second Floor, 14 Queen Square, Bath, Bath and North East Somerset
Price                                                                                         £12,500.00  Per  Annum
Size                                                                                                       683 Sq Ft
Property Type                                                                                        Office, Offices
Agency Name                                                                                         Carter Jonas LLP
Agent Name                                                                       

In [31]:
# Making the map

import folium
map = folium.Map(location=(51.38, -2.36), zoom_start=10)#location - the center of the map, zoom_start - the resolution
for index, row in df.iterrows():
    folium.Marker(
        location=(row['Latitude'], row['Longitude']),
        popup=row['Address']
    ).add_to(map)
map