In [None]:
from pathlib import Path
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from unidecode import unidecode

# User-Agent header value that get_link sends by default (a desktop Firefox string).
AGENT = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1"
# Page that the cells below fetch and scrape.
URL = "https://www.celiahammond.org/canning-town/index.php/animals-needing-homes/canning-town-london"

In [None]:
def get_link(
    url: str,
    agent: str = AGENT,
    allow_redirects: bool = True,
    verify: bool = True,
    timeout: float = 30.0,
):
    """Send a GET request to ``url`` and return the ``requests`` response.

    Args:
        url: Address to fetch.
        agent: Value sent in the ``User-Agent`` header.
        allow_redirects: Forwarded to ``requests.get``.
        verify: Forwarded to ``requests.get`` (TLS certificate verification).
        timeout: Seconds to wait on the server before giving up. Pass ``None``
            to wait indefinitely, which was the previous hard-coded behaviour.

    Returns:
        The ``requests.Response`` object. The status code is not checked here.

    Raises:
        requests.exceptions.Timeout: if the server does not answer in time.
    """
    return requests.get(
        url,
        headers={"User-Agent": agent},
        allow_redirects=allow_redirects,
        # A finite timeout keeps a stalled server from hanging the notebook.
        timeout=timeout,
        verify=verify,
    )

def make_soup(url: str, verify: bool = True):
    """Download ``url`` via ``get_link`` and parse the body with the lxml parser.

    ``verify`` is passed straight through to ``get_link``.
    """
    html = get_link(url, verify=verify).text
    return BeautifulSoup(html, "lxml")


In [None]:
# Fetch the page with plain requests (no browser). The cells shown below read
# `content` from get_page rather than this `soup`.
soup = make_soup(URL)

# Chrome options read by get_page each time it launches a driver.
options = webdriver.ChromeOptions()
options.add_argument("--ignore-certificate-errors")
options.add_argument("--incognito")
options.add_argument("--headless")  # run without a visible browser window

def get_page(url):
    """Render ``url`` in Chrome via Selenium and return the parsed page.

    The page is loaded with the module-level ``options`` and the function
    waits up to 10 seconds for an element with class ``slides_control`` to
    appear before reading the page source.

    Args:
        url: Address of the page to render.

    Returns:
        A ``BeautifulSoup`` tree (lxml parser) of the rendered page, or an
        empty list if the awaited element did not appear in time.
    """
    # The chromedriver binary is expected at ../code/chromedriver relative to
    # the current working directory.
    # NOTE(review): passing the driver path positionally is only accepted by
    # older Selenium releases - confirm against the installed version.
    path = Path.cwd().parent / "code" / "chromedriver"
    print(path)
    driver = webdriver.Chrome(path, options=options)
    try:
        driver.get(url)
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "slides_control"))
            )
        except TimeoutException:
            return []
        return BeautifulSoup(driver.page_source, "lxml")
    finally:
        # Always shut the browser down; otherwise every call leaves a Chrome
        # process running, including the timeout and error paths.
        driver.quit()

In [None]:
content = get_page(URL)
# get_page returns an empty list when the awaited element never appeared; stop
# here with a clear message rather than failing later on `content.find_all`.
if not content:
    raise RuntimeError(f"Timed out waiting for the page to render: {URL}")

In [None]:
# Flatten each <div class="bt-introtext"> blurb to a single line of text, then
# show only the blurbs that mention "indoor" (case-insensitive).
intros = content.find_all("div", {"class": "bt-introtext"})
intro_texts = [intro.get_text().strip().replace("\n", " ") for intro in intros]
list(filter(lambda blurb: "indoor" in blurb.lower(), intro_texts))

In [None]:
# Pair the title of every <a class="bt-title"> anchor that mentions "indoor"
# (case-insensitive) with an absolute link to it.
headings = content.find_all("a", {"class": "bt-title"})
heading_texts = [
    # urljoin leaves absolute hrefs untouched and resolves relative ones
    # against the site root, where plain concatenation would build a bad URL.
    (heading["title"], urljoin("https://www.celiahammond.org", heading["href"]))
    for heading in headings
    # Anchors without a title or href are skipped instead of raising KeyError.
    if "indoor" in heading.get("title", "").lower() and heading.get("href")
]
heading_texts