In [41]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.action_chains import ActionChains

In [68]:
# Start a Chrome session with default options; the driver executable path
# comes from ChromeDriverManager().install().
chrome_options = webdriver.ChromeOptions()
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=chrome_options)

# Open the Rightmove landing page.
driver.get("https://www.rightmove.co.uk")

# A single explicit wait (10 second limit) is shared by every lookup below.
wait = WebDriverWait(driver, 10)

# Dismiss the cookie banner as soon as its 'Accept all' button is clickable.
wait.until(
    EC.element_to_be_clickable((By.XPATH, '//button[text()="Accept all"]'))
).click()

# Locate the visible search box and empty it before typing.
search_input = wait.until(
    EC.visibility_of_element_located(
        (By.CSS_SELECTOR, 'input.ksc_inputText.ksc_typeAheadInputField')
    )
)
search_input.clear()

# Type the location, then DOWN + ENTER to pick the top autocomplete entry.
# NOTE: the three inputs are sent back-to-back; nothing here waits for the
# suggestion list to be rendered first.
for keystrokes in ("London", Keys.DOWN, Keys.ENTER):
    search_input.send_keys(keystrokes)

# Switch to the rental search once the "To Rent" button is clickable.
wait.until(
    EC.element_to_be_clickable((By.XPATH, '//button[contains(text(), "To Rent")]'))
).click()

# Price and bedroom filters are all <select> elements chosen by option value,
# applied in the order: min price, max price, min bedrooms, max bedrooms.
for dropdown_id, option_value in (
    ("minPrice", "2500"),
    ("maxPrice", "3000"),
    ("minBedrooms", "2"),
    ("maxBedrooms", "3"),
):
    dropdown = wait.until(EC.presence_of_element_located((By.ID, dropdown_id)))
    Select(dropdown).select_by_value(option_value)

# The property type is chosen by its visible label rather than its value.
property_type_dropdown = wait.until(
    EC.presence_of_element_located((By.ID, "displayPropertyType"))
)
Select(property_type_dropdown).select_by_visible_text("Flats / Apartments")

# Submit the search form.
wait.until(
    EC.element_to_be_clickable(
        (By.XPATH, '//button[contains(text(), "Find properties")]')
    )
).click()

In [69]:
# URLs of listings that show at least 2 bathrooms and a floorplan link.
# Re-created on every run of this cell, so re-running starts from scratch.
property_urls = []

# Walk the result pages until the "Next" button is missing or disabled.
while True:
    # Wait (up to 10 s) for the property cards of the current page.
    properties = WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located(
            (By.CSS_SELECTOR, 'div.l-searchResult[data-test*="propertyCard"]')
        )
    )

    # `property_card` rather than `property`, to avoid shadowing the builtin.
    for property_card in properties:
        try:
            # The bathroom count is the text of the span next to the icon.
            bathroom_text = property_card.find_element(
                By.CSS_SELECTOR, 'span.no-svg-bathroom-icon + span.text'
            ).get_attribute('textContent')
            num_bathrooms = int(bathroom_text)

            # Only the existence of the floorplan link matters:
            # find_element raises NoSuchElementException when it is absent.
            property_card.find_element(
                By.CSS_SELECTOR, 'a[data-test="property-floorplan-icon"]'
            )

            if num_bathrooms >= 2:
                property_url = property_card.find_element(
                    By.CSS_SELECTOR, 'a.propertyCard-link'
                ).get_attribute('href')
                # Skip cards whose link carries no href.
                if property_url:
                    property_urls.append(property_url)
        except (NoSuchElementException, TypeError, ValueError):
            # Missing bathroom/floorplan/link element, or a bathroom label
            # that is absent or not a plain integer: skip this card instead
            # of aborting the whole crawl.
            continue

    # A missing pagination control is treated the same as a disabled one:
    # there is no further page to visit.
    try:
        next_button = driver.find_element(
            By.CSS_SELECTOR, 'button.pagination-direction--next'
        )
    except NoSuchElementException:
        break
    if next_button.get_attribute('disabled'):
        break

    ActionChains(driver).move_to_element(next_button).click().perform()

    # Block until a card from the page just processed has been detached from
    # the DOM; otherwise the next iteration's presence check can succeed
    # immediately on the old cards and re-read (or go stale on) them.
    WebDriverWait(driver, 10).until(EC.staleness_of(properties[0]))


In [81]:
# Quit the WebDriver session opened earlier, closing the browser.
# The cells below only use `requests` and the already-collected
# `property_urls`; they do not touch `driver`.
driver.quit()

In [100]:
import requests
import json
from bs4 import BeautifulSoup


def download_image(image_url, filename, timeout=30, headers=None):
    """Download ``image_url`` and write the response body to ``filename``.

    This is a best-effort helper: every failure (HTTP error status, network
    problem, timeout, file-system error) is reported with ``print`` and
    swallowed, so the function always returns ``None``.

    Args:
        image_url: URL of the image to fetch.
        filename: Path of the file to create or overwrite, in binary mode.
        timeout: Seconds passed to ``requests.get`` so a stalled server
            cannot hang the call indefinitely.
        headers: Optional mapping of extra HTTP headers for the request;
            ``None`` keeps the ``requests`` defaults.
    """
    try:
        # Without an explicit timeout, requests may wait forever.
        response = requests.get(image_url, headers=headers, timeout=timeout)

        # Turn 4xx/5xx responses into requests.HTTPError.
        response.raise_for_status()

        # Write the raw bytes; nothing is written when the request failed.
        with open(filename, "wb") as file:
            file.write(response.content)

        print(f"Image downloaded and saved as {filename}")
    except requests.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")
    except Exception as err:
        print(f"An error occurred: {err}")


def get_floorplan_url(property_url):
    """Return ``property_url`` rewritten to point at its ``#/floorplan`` view.

    Everything before the first ``#`` is kept as the base. The query string
    carried in the fragment (for example ``channel=RES_LET`` in
    ``...#/?channel=RES_LET``) is re-attached after ``#/floorplan``.

    Args:
        property_url: Listing URL, with or without a ``#`` fragment.

    Returns:
        ``<base>#/floorplan`` followed by ``?<query>`` when the fragment
        carried a query, otherwise just ``<base>#/floorplan``.
    """
    base, _, fragment = property_url.partition("#")

    if "?" in fragment:
        # Keep only what follows the '?'; re-using the whole fragment would
        # produce '#/floorplan?/?channel=...' for '#/?channel=...'.
        query = fragment.split("?", 1)[1]
    else:
        # A fragment without '?' is treated as the query itself, minus any
        # leading route slash.
        query = fragment.lstrip("/")

    return base + "#/floorplan" + ("?" + query if query else "")


def extract_floorplan_url_from_html(html_content):
    """Extract the first floorplan image URL from a property page's HTML.

    The page is expected to contain a ``<script>`` whose text assigns a JSON
    object to ``window.PAGE_MODEL``; the URL is read from
    ``propertyData.floorplans[0].url`` of that object.

    Args:
        html_content: HTML text of the property page.

    Returns:
        The ``url`` value of the first entry in ``propertyData.floorplans``.

    Raises:
        ValueError: If no script mentions ``window.PAGE_MODEL``, if it is
            never assigned, or if the assigned value is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError``).
        KeyError: If ``propertyData``, ``floorplans`` or ``url`` is missing.
        IndexError: If the floorplans list is empty.
    """
    marker = "window.PAGE_MODEL"

    soup = BeautifulSoup(html_content, "html.parser")

    # Find the script tag whose text mentions the PAGE_MODEL.
    script_tag = soup.find("script", string=lambda t: t and marker in t)
    if script_tag is None:
        raise ValueError("No <script> containing window.PAGE_MODEL found in the HTML")

    script_text = script_tag.string

    # Look for the '=' that follows the marker, not the first '=' anywhere
    # in the script.
    assignment_at = script_text.find("=", script_text.index(marker) + len(marker))
    if assignment_at == -1:
        raise ValueError("window.PAGE_MODEL is referenced but never assigned")

    # raw_decode parses exactly one JSON value and ignores what follows, so a
    # trailing ';' or further statements in the same script are harmless.
    payload = script_text[assignment_at + 1:].lstrip()
    data, _ = json.JSONDecoder().raw_decode(payload)

    floorplans = data["propertyData"]["floorplans"]
    if not floorplans:
        raise IndexError("PAGE_MODEL lists no floorplans for this property")

    return floorplans[0]["url"]


# Build the floorplan-view URL for the first listing collected by the
# scraping cell (raises IndexError when `property_urls` is empty).
url = get_floorplan_url(property_urls[0])

# Start from the default requests headers and override only the User-Agent.
headers = requests.utils.default_headers()
headers.update({"User-Agent": "My User Agent 1.0"})

# Fetch the property page. The timeout stops a stalled connection from
# hanging the cell, and raise_for_status() stops an HTTP error page from
# being parsed as if it were a listing.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()

# Pull the floorplan image URL out of the embedded page model.
floorplan_image_url = extract_floorplan_url_from_html(response.text)

# Save the image next to the notebook for the OCR step.
download_image(floorplan_image_url, "floorplan.jpeg")

Image downloaded and saved as floorplan.jpeg


In [101]:
import pytesseract
from PIL import Image
from transformers import pipeline

# Path to the floorplan image written by the download step.
image_path = 'floorplan.jpeg'

# Open the image with Pillow.
image = Image.open(image_path)

# Use Tesseract to OCR the image into plain text.
text = pytesseract.image_to_string(image)

if text.strip():
    # Load a pre-trained question-answering model from Hugging Face.
    qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")

    # The question we want answered, i.e. the value we want to extract.
    question = "What is the total gross internal area in square feet?"

    # The OCR text is the context the model answers from.
    result = qa_pipeline(question=question, context=text)

    # Print the answer.
    print("Answer:", result["answer"])
else:
    # Say so explicitly rather than finishing silently with no output.
    print(f"No text could be extracted from {image_path}; skipping question answering.")


Answer: 794


In [99]:
# Inspect the raw OCR output held in `text`. An empty string means the
# `if text.strip()` branch of the OCR cell is skipped.
text

''