In [1]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import time
import pickle
import re

In [2]:
url = 'https://www.shapeyourcity.ca/development'
PATH = "C:\Program Files (x86)\chromedriver-win64\chromedriver.exe"

In [3]:
def get_list_of_urls(webpage,path):
    """
    Scrapes all the URLs of the development permits within the webpage

    Parameters:
        webpage (str): the url of the webpage
        path (str): the location of the user's chromedriver

    Returns:
        A list of strings which contains all of the development permits url 
    """
    
    chrome_options = Options()
    chrome_options.add_argument("--headless=new")
    driver = webdriver.Chrome(service = Service(path), options=chrome_options)
    driver.get(webpage)
    # The element is located within an iframe, required to locate the iframe and switch frames
    iframe = driver.find_element(By.TAG_NAME, 'iframe')
    url_list = []
    driver.switch_to.frame(iframe)
    page_num = WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, ".chakra-button.ehq-paginationButton.css-i1louw")))
    last_page_num = int([item.text for item in page_num][-2])
    # Scrape all of the urls on each page
    for num in range(last_page_num+1):
        # Ensures that all the CSS elements are loaded before scraping
        urls = WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, '.chakra-link.ehq-projectCoverImg.css-1eh7kaa')))
        for url in urls:
            url_list.append(url.get_attribute('href'))
        # After scraping all of the elements, click to the next page if not on the final page
        if num < last_page_num+1:
            click = driver.find_element(By.CSS_SELECTOR, ".chakra-button.ehq-paginationButton.ehq-paginationNextButton.css-i1louw")
            click.click()
        else:
            break
        time.sleep(5)
    driver.switch_to.default_content()
    driver.quit()
    return url_list

In [78]:
all_permit_urls = get_list_of_urls(url, PATH)
# save the urls to a pickle file that can be opened at any time without re-running the code. 
with open('permit_urls', 'wb') as f:
    pickle.dump(all_permit_urls, f)

In [4]:
# Load the saved pickle with all the URLs.
all_permit_urls_saved = pd.read_pickle('permit_urls')

In [21]:
def get_permit_ids(list_of_urls, path):
    """
    Scrapes each webpage to obtain the development permit id

    Parameters:
        list_of_urls (list of str): the urls of the webpage derived from the get_list_of_urls function
        path (str): the location of the user's chromedriver

    Returns:
        A list of the development permit ids 
    """
    
    header_list = []
    regex = r'\((.*?)\)'
    for url in list_of_urls:
        chrome_options = Options()
        chrome_options.add_argument("--headless=new")
        driver = webdriver.Chrome(service = Service(path), options=chrome_options)
        driver.get(url)
        header_list.append(driver.find_element(By.TAG_NAME, 'h1').text)
        driver.quit()
        time.sleep(3)
    permit_list = [re.search(regex, permit)[1] for permit in header_list]
    return permit_list

In [20]:
def get_description(list_of_urls, path):
    """
    Scrapes each webpage to obtain the description of each development permit application

    Parameters:
        list_of_urls (list of str): the urls of the webpage derived from the get_list_of_urls function
        path (str): the location of the user's chromedriver

    Returns:
        A list of the description of each development permit application 
    """

    regex = r'(.*?)Under the'
    description_list = []
    chrome_options = Options()
    chrome_options.add_argument("--headless=new")
    for url in list_of_urls:
        driver = webdriver.Chrome(service = Service(path), options=chrome_options)
        driver.get(url)
        text = WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.CLASS_NAME, 'truncated-description'))).text
        end_point = re.search(regex, text).span()[0]
        description = text[:end_point]
        description_list.append(description)
        driver.quit()
        time.sleep(3)
    return description_list

In [19]:
def get_list_of_applicant(list_of_description):
    """
    Utilizes regex to obtain the applicants name from the list of description

    Parameters:
        list_of_description (list of str): the description of the webpage derived from the get_description 

    Returns:
        A list of the applicant names 
    """
    
    regex = r'^.*?(?=\s+has applied)'
    list_of_applicant = []
    for applicant in list_of_description:
        try:
            applicant_name = applicant[:re.search(regex, applicant).span()[1]]
        except:
            applicant_name = 'Unknown'
        list_of_applicant.append(applicant_name)
    return list_of_applicant

In [18]:
def application_status(list_of_urls, path):
    """
    Scrapes each webpage to obtain the Director of Planning decision

    Parameters:
        list_of_urls (list of str): the urls of the webpage derived from the get_list_of_urls function
        path (str): the location of the user's chromedriver

    Returns:
        A list of application status
    """  

    chrome_options = Options()
    chrome_options.add_argument("--headless=new")
    list_status = []
    for url in list_of_urls:
        driver = webdriver.Chrome(service = Service(path), options = chrome_options)
        driver.get(url)
        text_info = WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.TAG_NAME,'strong'))).text.split()
        if 'Director' in text_info:
            if 'approved' in text_info:
                list_status.append('Approved')
            elif 'cancelled' in text_info:
                list_status.append('Cancelled')
            elif 'withdrawn' in text_info:
                list_status.append('Withdrawn')
            else:
                list_status.append('Rejected')
        else:
            list_status.append('In progress')
        driver.quit()
        time.sleep(3)
    return list_status

In [385]:
permit_id = {'permit_id': get_permit_ids(all_permit_urls_saved, PATH)}
description = get_description(all_permit_urls_saved, PATH)
list_of_applicants = get_list_of_applicant(description)
applicant_status = application_status(all_permit_urls_saved, PATH)

TypeError: 'NoneType' object is not subscriptable

In [None]:
df = pd.DataFrame(permit_id)
df['description'] = description
df['applicant'] = list_of_applicants
df['applicant_status'] = applicant_status

In [35]:
def application_status(driver):
    """
    Scrape webpage to obtain the Director of Planning decision

    Parameters:
        driver: the driver that opened the webpage

    Returns:
        The application status
    """  

    try:
        text_info = WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.TAG_NAME,'strong'))).text.split()
        if 'Director' in text_info:
            if 'approved' in text_info:
                return 'Approved'
            elif 'cancelled' in text_info:
                return 'Cancelled'
            elif 'withdrawn' in text_info:
                return 'Withdrawn'
            else:
                'Rejected'
        else:
            return 'In progress'
    except:
        return 'In progress'

In [22]:
def permit_status(driver, regex):
    """
    Scrapes webpage to obtain the development permit id

    Parameters:
        driver: the driver that opened the webpage
        regex: the regular expression required to isolate the permit id

    Returns:
        The development permit id 
    """
    
    try:
        head_text = driver.find_element(By.TAG_NAME, 'h1').text
        permit_id = re.search(regex, head_text)[1] 
    except:
        permit_id = 'Unknown'
    return permit_id

In [7]:
def scrape_description(driver):
    """
    Scrapes each webpage to obtain the description of each development permit application

    Parameters:
        driver: the driver that opened the webpage

    Returns:
        The description of the development permit application 
    """

    text = WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.CLASS_NAME, 'truncated-description'))).text
    return text

In [12]:
def scrape_info(list_of_urls, path):
    """
    Scrapes each webpage in the list of urls

    Parameters:
        list_of_urls (list of str): all the urls to be scraped
        path (str): the location of the user's chromedriver

    Returns:
        A tuple where the 0th element is the list status, the 1st element is the description list, and the 2nd element is the permit list
    """

    chrome_options = Options()
    chrome_options.add_argument("--headless=new")
    list_status = []
    description_list = []
    permit_list = []
    error_list = []
    regex_permit = r'\((.*?)\)'
    for url in list_of_urls:
        driver = webdriver.Chrome(service = Service(path), options = chrome_options)
        driver.get(url)
        try:
            list_status.append(application_status(driver))
            permit_list.append(permit_status(driver, regex_permit))
            description_list.append(scrape_description(driver))
            driver.quit()
            time.sleep(3)
        except:
            error_list.append(url)
            driver.quit()
            time.sleep(3)
    return list_status, description_list, permit_list, error_list


In [10]:
len(all_permit_urls_saved)

606

In [15]:
scraped_information_100 = scrape_info(all_permit_urls_saved[100:200], PATH)

In [16]:
scraped_information_100[3]

[]

In [None]:
scraped_information_200 = scrape_info(all_permit_urls_saved[200:300], PATH)
scraped_information_200[3]

['https://www.shapeyourcity.ca/3596-w-28-ave',
 'https://www.shapeyourcity.ca/false-creek-north-non-market-housing']

In [39]:
scraped_information_300 = scrape_info(all_permit_urls_saved[300:400], PATH)
scraped_information_300[3]

[]

In [40]:
scraped_information_400 = scrape_info(all_permit_urls_saved[400:500], PATH)
scraped_information_400[3]

[]

In [41]:
scraped_information_500 = scrape_info(all_permit_urls_saved[500:], PATH)
scraped_information_500[3] 

[]

In [42]:
scraped_information = scrape_info(all_permit_urls_saved[:100], PATH)