In [1]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import time
import pickle
import re

In [29]:
url = 'https://www.shapeyourcity.ca/development'
PATH = "C:\Program Files (x86)\chromedriver-win64\chromedriver.exe"

In [2]:
def get_list_of_urls(webpage,path):
    """
    Scrapes all the URLs of the development permits within the webpage

    Parameters:
        webpage (str): the url of the webpage
        path (str): the location of the user's chromedriver

    Returns:
        A list of strings which contains all of the development permits url 
    """
    
    chrome_options = Options()
    chrome_options.add_argument("--headless=new")
    driver = webdriver.Chrome(service = Service(path), options=chrome_options)
    driver.get(webpage)
    # The element is located within an iframe, required to locate the iframe and switch frames
    iframe = driver.find_element(By.TAG_NAME, 'iframe')
    url_list = []
    driver.switch_to.frame(iframe)
    page_num = WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, ".chakra-button.ehq-paginationButton.css-i1louw")))
    last_page_num = int([item.text for item in page_num][-2])
    # Scrape all of the urls on each page
    for num in range(last_page_num+1):
        # Ensures that all the CSS elements are loaded before scraping
        urls = WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, '.chakra-link.ehq-projectCoverImg.css-1eh7kaa')))
        for url in urls:
            url_list.append(url.get_attribute('href'))
        # After scraping all of the elements, click to the next page if not on the final page
        if num < last_page_num+1:
            click = driver.find_element(By.CSS_SELECTOR, ".chakra-button.ehq-paginationButton.ehq-paginationNextButton.css-i1louw")
            click.click()
        else:
            break
        time.sleep(5)
    driver.switch_to.default_content()
    driver.quit()
    return url_list

In [78]:
all_permit_urls = get_list_of_urls(url, PATH)
# save the urls to a pickle file that can be opened at any time without re-running the code. 
with open('permit_urls', 'wb') as f:
    pickle.dump(all_permit_urls, f)

In [3]:
# Load the saved pickle with all the URLs.
all_permit_urls_saved = pd.read_pickle('permit_urls')

In [4]:
def get_list_of_applicant(list_of_description):
    """
    Utilizes regex to obtain the applicants name from the list of description

    Parameters:
        list_of_description (list of str): the description of the webpage derived from the get_description 

    Returns:
        A list of the applicant names 
    """
    
    regex = r'^[\s\S]*?(?=\s+has applied)'
    list_of_applicant = []
    for applicant in list_of_description:
        try:
            applicant_name = applicant[:re.search(regex, applicant).span()[1]]
        except:
            applicant_name = 'Unknown'
        list_of_applicant.append(applicant_name)
    return list_of_applicant

In [5]:
def application_status(driver):
    """
    Scrape webpage to obtain the Director of Planning decision

    Parameters:
        driver: the driver that opened the webpage

    Returns:
        The application status
    """  

    try:
        text_info = WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.TAG_NAME,'strong'))).text.split()
        if 'Director' in text_info:
            if 'approved' in text_info:
                return 'Approved'
            elif 'cancelled' in text_info:
                return 'Cancelled'
            elif 'withdrawn' in text_info:
                return 'Withdrawn'
            else:
                return 'Rejected'
        else:
            return 'In progress'
    except:
        return 'In progress'

In [6]:
def permit_status(driver, regex):
    """
    Scrapes webpage to obtain the development permit id

    Parameters:
        driver: the driver that opened the webpage
        regex: the regular expression required to isolate the permit id

    Returns:
        The development permit id 
    """
    
    try:
        head_text = driver.find_element(By.TAG_NAME, 'h1').text
        permit_id = re.search(regex, head_text)[1] 
    except:
        permit_id = 'Unknown'
    return permit_id

In [7]:
def scrape_description(driver):
    """
    Scrapes each webpage to obtain the description of each development permit application

    Parameters:
        driver: the driver that opened the webpage

    Returns:
        The description of the development permit application 
    """

    text = WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.CLASS_NAME, 'truncated-description'))).text
    return text

In [8]:
def scrape_info(list_of_urls, path):
    """
    Scrapes each webpage in the list of urls

    Parameters:
        list_of_urls (list of str): all the urls to be scraped
        path (str): the location of the user's chromedriver

    Returns:
        A tuple where the 0th element is the list status, the 1st element is the description list, and the 2nd element is the permit list
    """

    chrome_options = Options()
    chrome_options.add_argument("--headless=new")
    list_status = []
    description_list = []
    permit_list = []
    error_list = []
    regex_permit = r'\((DP.*?)\)'
    for url in list_of_urls:
        driver = webdriver.Chrome(service = Service(path), options = chrome_options)
        driver.get(url)
        try:
            list_status.append(application_status(driver))
            permit_list.append(permit_status(driver, regex_permit))
            description_list.append(scrape_description(driver))
            driver.quit()
            time.sleep(3)
        except:
            error_list.append(url)
            driver.quit()
            time.sleep(3)
    return list_status, description_list, permit_list, error_list


In [151]:
all_information = scrape_info(all_permit_urls_saved, PATH)

In [152]:
# Save all the scraped information
with open('scraped_data', 'wb') as f:
    pickle.dump(all_information, f)

In [13]:
scraped_information = pd.read_pickle('scraped_data')

In [153]:
# Get Applicant name from the description
applicant = get_list_of_applicant(all_information[1])

In [154]:
# Create the dataframe to hold all the necessary information
df = pd.DataFrame({"Applicant_Name": applicant})
df['permit_id'] = all_information[2]
df['applicant_status'] = all_information[0]
df['description'] = all_information[1]

In [155]:
# save the database
with open('database', 'wb') as f:
    pickle.dump(df, f)

In [10]:
df = pd.read_pickle('database')

In [None]:
# Grab first item of current url list, scrape first page of website, check each url of the first page of website until it matches the first item, stop loop, collect urls in a list and extend this list with the old one. 

In [20]:
most_recent_url = all_permit_urls_saved[0]

In [22]:
def get_recent_urls(webpage, path, recent_url):
    """
    Scrapes all the recent URLs of the development permits within the webpage

    Parameters:
        webpage (str): the url of the webpage
        path (str): the location of the user's chromedriver
        recent_url (str): the most recent url from the saved list

    Returns:
        A list of strings which contains all of the development permits url 
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless=new")
    driver = webdriver.Chrome(service = Service(path), options=chrome_options)
    driver.get(webpage)
    # The element is located within an iframe, required to locate the iframe and switch frames
    iframe = driver.find_element(By.TAG_NAME, 'iframe')
    main_url_list = []
    sub_url_list = []
    loop_check = True
    driver.switch_to.frame(iframe)
    while loop_check:
        urls = WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, '.chakra-link.ehq-projectCoverImg.css-1eh7kaa')))
        for url in urls:
            if url.get_attribute('href') == recent_url:
                loop_check = False
                break
            sub_url_list.append(url.get_attribute('href'))
        click = driver.find_element(By.CSS_SELECTOR, ".chakra-button.ehq-paginationButton.ehq-paginationNextButton.css-i1louw")
        click.click()
        main_url_list.extend(sub_url_list)
    driver.switch_to.default_content()
    driver.quit()
    return main_url_list

In [28]:
driver = webdriver.Chrome(service = Service(PATH))
driver.get(url)
url_list = []
loop_check = True
iframe = driver.find_element(By.TAG_NAME, 'iframe')
while loop_check:
    urls = WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, '.chakra-link.ehq-projectCoverImg.css-1eh7kaa')))
    for url in urls:
        if url.get_attribute('href') == most_recent_url:
            loop_check = False
            break
        else:
            url_list.append(url.get_attribute('href'))
    click = driver.find_element(By.CSS_SELECTOR, ".chakra-button.ehq-paginationButton.ehq-paginationNextButton.css-i1louw")
    click.click()
driver.switch_to.default_content()
driver.quit()

InvalidArgumentException: Message: invalid argument: 'url' must be a string
  (Session info: chrome=122.0.6261.113)
Stacktrace:
	GetHandleVerifier [0x00007FF742FCAD32+56930]
	(No symbol) [0x00007FF742F3F632]
	(No symbol) [0x00007FF742DF42E5]
	(No symbol) [0x00007FF742E78C1A]
	(No symbol) [0x00007FF742E5BC9A]
	(No symbol) [0x00007FF742E781E2]
	(No symbol) [0x00007FF742E5BA43]
	(No symbol) [0x00007FF742E2D438]
	(No symbol) [0x00007FF742E2E4D1]
	GetHandleVerifier [0x00007FF743346ABD+3709933]
	GetHandleVerifier [0x00007FF74339FFFD+4075821]
	GetHandleVerifier [0x00007FF74339818F+4043455]
	GetHandleVerifier [0x00007FF743069766+706710]
	(No symbol) [0x00007FF742F4B90F]
	(No symbol) [0x00007FF742F46AF4]
	(No symbol) [0x00007FF742F46C4C]
	(No symbol) [0x00007FF742F36904]
	BaseThreadInitThunk [0x00007FFBF8D6257D+29]
	RtlUserThreadStart [0x00007FFBF906AA58+40]
