# Requires
Permit Status:
- Approved
- Refused
- Withdrawn
- Cancelled

Permit Description \
Permit Applicant Name \
Permit Number

In [72]:
import pandas as pd
import numpy as np
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import time
import pickle

In [69]:
url = 'https://www.shapeyourcity.ca/development'
PATH = "C:\Program Files (x86)\chromedriver-win64\chromedriver.exe"

In [77]:
def get_list_of_urls(webpage,path):
    """
    Scrapes all the URLs of the development permits within the webpage

    Parameters:
        webpage (str): the url of the webpage
        path (str): the location of the user's chromedriver

    Returns:
        A list of strings which contains all of the development permits url 
    """
    
    chrome_options = Options()
    chrome_options.add_argument("--headless=new")
    driver = webdriver.Chrome(service = Service(path), options=chrome_options)
    driver.get(webpage)
    # The element is located within an iframe, required to locate the iframe and switch frames
    iframe = driver.find_element(By.TAG_NAME, 'iframe')
    url_list = []
    driver.switch_to.frame(iframe)
    page_num = WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, ".chakra-button.ehq-paginationButton.css-i1louw")))
    last_page_num = int([item.text for item in page_num][-2])
    # Scrape all of the urls on each page
    for num in range(last_page_num+1):
        # Ensures that all the CSS elements are loaded before scraping
        urls = WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, '.chakra-link.ehq-projectCoverImg.css-1eh7kaa')))
        for url in urls:
            url_list.append(url.get_attribute('href'))
        # After scraping all of the elements, click to the next page if not on the final page
        if num < last_page_num+1:
            click = driver.find_element(By.CSS_SELECTOR, ".chakra-button.ehq-paginationButton.ehq-paginationNextButton.css-i1louw")
            click.click()
        else:
            break
        time.sleep(5)
    driver.switch_to.default_content()
    driver.quit()
    return url_list

In [78]:
all_permit_urls = get_list_of_urls(url, PATH)
# save the urls to a pickle file that can be opened at any time without re-running the code. 
with open('permit_urls', 'wb') as f:
    pickle.dump(all_permit_urls, f)

606