In [1]:
# load environment variables
from dotenv import load_dotenv
import os

load_dotenv()

# selenium to parse dynamic web pages
from webdriver_manager.chrome import ChromeDriverManager
from selenium import webdriver
from selenium.webdriver import ChromeOptions
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

# import utilities
import pandas as pd
import time

# Authentication & Filters

In [2]:
# library card number and entry URL come from the environment
card_num = os.getenv('MISS_LIB_CARD')
base_url = os.getenv('BASE_URL')

# keyword typed into the lookup field, and the data-value codes selected from its results
query = "banquet"
data_values = [
    581212,  # Banquet Facilities
    791102,  # Ballrooms
    799904,  # Auditoriums
    738931,  # Convention & Meeting Facilities
]

# Set Browser Options & Install Drivers

In [3]:
# browser configuration: ignore certificate errors and use an incognito window
options = ChromeOptions()
for flag in ('--ignore-certificate-errors', '--incognito'):
    options.add_argument(flag)
# add '--headless' to the flags above to run without a visible window

# install the Chrome driver through webdriver_manager and start the browser session
driver = webdriver.Chrome(
    service=ChromeService(ChromeDriverManager().install()),
    options=options,
)

# Helper Functions

In [4]:
def document_initialised(driver: "webdriver.Chrome") -> bool:
    '''
    document_initialised reports whether the current page has finished loading.

    It is meant to be used as the condition of WebDriverWait.until, which keeps calling it until it returns a truthy value.

    Args:
        driver (WebDriver): Chrome browser driver.

    Returns:
        bool: True when document.readyState is 'complete', otherwise False.
    '''

    # report the state directly: a value returned from inside a JS 'load' listener never reaches
    # execute_script, and registering a listener on every poll only piles up handlers
    return bool(driver.execute_script("return document.readyState === 'complete';"))

def groupbox_info(groupbox: "webdriver.remote.webelement.WebElement") -> tuple[list, int]:
    '''
    groupbox_info scrapes the text of every table cell inside one groupbox.

    Args:
        groupbox (WebElement): Element of a groupbox div on a company page.

    Returns:
        temp_list (list): Text of each <td> found inside the groupbox, in document order.
        count (int): Number of items in temp_list.
    '''

    # each <td> inside the groupbox contributes one value
    cells = groupbox.find_elements(By.TAG_NAME, "td")
    temp_list = [cell.text for cell in cells]

    # the count is returned so the caller can check for optional cells (website info check)
    return temp_list, len(temp_list)

def headings() -> list:
    '''
    headings supplies the column names used for the detailed company table.

    Returns:
        list: Column names in table order, including two empty-string names.
    '''

    # name, address and contact columns
    contact_columns = [
        'Company Name', '', 'Address Line 1', 'Address Line 2', '',
        'Phone', 'District', 'Fax', 'CMA', 'Website',
    ]

    # profile, size, rating and directory columns
    company_columns = [
        'Industry Profile',
        'Location Employees', 'Location Sales Volume',
        'Corporate Employees', 'Corporate Sales Volume',
        'Location Type', 'Credit Rating', 'Stock Ticker Table',
        'Latitude/Longitude', 'IUSA Number', 'Federal Contractor',
        'Management Directory', 'Disclosure',
    ]

    # spending category columns
    spending_columns = [
        'Accounting', 'Advertising', 'Contract Labor', 'Insurance', 'Legal',
        'Management/Administration', 'Office Equipment & Supplies',
        'Package/Container', 'Payroll & Benefits', 'Purchased Print',
        'Rent & Leasing', 'Technology', 'Telecommunications', 'Utilities',
    ]

    return contact_columns + company_columns + spending_columns

def wait_for_page(n: int, timeout: float = 10) -> None:
    '''
    wait_for_page blocks until the page has loaded, then pauses for a fixed delay.

    It acts on the module-level driver and uses document_initialised as the wait condition.

    Args:
        n (int): Extra pause in seconds once the page reports that it has loaded.
        timeout (float): Maximum number of seconds to wait for the page to load. Defaults to 10.

    Raises:
        TimeoutException: Raised by WebDriverWait when the condition is not met within timeout.
    '''

    # poll the load condition first, then give dynamic content time to render
    WebDriverWait(driver, timeout=timeout).until(document_initialised)
    time.sleep(n)

# Scraper Functions

In [5]:
def scrape_table_data(driver: "webdriver.Chrome") -> tuple[list, list]:
    '''
    scrape_table_data will scrape all data from the tabulated, high-level results.
    NOTE: This function only works if it is called on the first page of the tabulated results.

    Args:
        driver (WebDriver): Chrome browser driver.

    Returns:
        data (list): Scraped entries from the tables, one list of cell texts per row.
        heading_names (list): Scraped fields from the table header.
    '''

    # find the resulting number of pages
    max_pages = int(driver.find_element(By.CLASS_NAME, "data-page-max").text)

    # scrape table headings
    header_cells = driver.find_elements(By.XPATH, "//thead[@id = 'searchResultsHeader']/tr/th")
    heading_names = [cell.text for cell in header_cells]

    data = list()
    for page in range(1, max_pages + 1):
        # scrape entries from the table on the current page
        rows = driver.find_elements(By.XPATH, "//tbody[@id = 'searchResultsPage']/tr")
        for row in rows:
            data.append([col.text for col in row.find_elements(By.TAG_NAME, "td")])

        # only move on while another page remains; there is nothing to click after the last one
        if page < max_pages:
            driver.find_elements(By.CLASS_NAME, "next")[0].click()
            wait_for_page(3)

    return data, heading_names

def scrape_company_page(groupboxes: list, groupbox_titles: list) -> list:

    company_details = list()

    # Isolate site for each group box based on title
    for i, groupbox_title in enumerate(groupbox_titles):
        # Switch case for each groupbox based on their titles
        match groupbox_title.text:
            case "Location Information":
                temp_list, count = groupbox_info(groupboxes[i])
                company_details.extend(temp_list)
                # check if address is two lines and add blank item if not
                if temp_list[3] == "":
                    company_details.insert(2, "")
                    count += 1 # adds another line for website info check below
                # check if website is included in the information and add blank if not
                if count < 10:
                    company_details.append("")

            case "Industry Profile":
                temp_list, _ = groupbox_info(groupboxes[i])
                company_details.append(temp_list)

            case "Photo, Map, & Directions":
                temp_list, _ = groupbox_info(groupboxes[i])
                company_details.extend(temp_list)

            case "Business Demographics":
                temp_list, _ = groupbox_info(groupboxes[i])
                company_details.extend(temp_list)

            case "Management Directory":
                temp_list, _ = groupbox_info(groupboxes[i])
                company_details.append(temp_list)

            case "Business Expenditures":
                temp_list, _ = groupbox_info(groupboxes[i])
                company_details.extend(temp_list)

            case "Nearby Businesses":
                temp_list, _ = groupbox_info(groupboxes[i])
                company_details.extend(temp_list)

            case "Competitors Report":
                temp_list, _ = groupbox_info(groupboxes[i])
                company_details.extend(temp_list)

            case _:
                print("")

    return company_details

# Navigate from the Base URL to the Results Table

In [6]:
# Drive the browser from the base URL to the filtered results table.
# Every step starts with wait_for_page so the elements of the newly loaded view exist before they are clicked.

# open the entry page taken from the BASE_URL environment variable
driver.get(base_url)

# follow the 'Canadian Businesses' link
wait_for_page(1)
driver.find_element(By.XPATH, ("//a[@title = 'Canadian Businesses']")).click()

# tick the agreement checkbox and confirm with the agree button
wait_for_page(1)
driver.find_element(By.ID, "chkAgree").click()
driver.find_element(By.CLASS_NAME, "action-agree").click()

# enter the library card number and submit the log-on form
wait_for_page(1)
driver.find_element(By.ID, "matchcode").send_keys(card_num)
driver.find_element(By.CLASS_NAME, "action-submit-form").click()

# open the Canadian Business search
wait_for_page(1)
driver.find_element(By.CLASS_NAME, ("action-do-search")).click()

# switch to Advanced Search and enable the filters used below
wait_for_page(1)
driver.find_element(By.CLASS_NAME, "advancedSearch").click()
driver.find_element(By.ID, "cs-YellowPageHeadingOrSic").click() # filter: Business Type (Keyword/SIC/NAICS)
driver.find_element(By.ID, "cs-Province").click() # filter: Geography (Province)

# wait for the advanced filter options to populate, then run the keyword lookup
wait_for_page(1)
driver.find_element(By.CSS_SELECTOR, "input#VerifiedOnly").click() # toggle the VerifiedOnly input
driver.find_element(By.CSS_SELECTOR, "input#sicPrimaryOptionId").click() # radio: search primary SIC only
driver.find_element(By.ID, "sicLookupKeyword").send_keys(query) # lookup field: type the query keyword
driver.find_element(By.ID, "searchSic").click() # lookup button: run the keyword lookup

# wait for the lookup results to populate, then select the wanted entries
wait_for_page(3)
for value in data_values:
    driver.find_element(By.XPATH, f"//li[@data-value = {value}]").click() # select the entry for each code in data_values
driver.find_element(By.XPATH, f"//li[@data-value = 'ON']").click() # select the 'ON' (Ontario) entry
driver.find_element(By.CLASS_NAME, "action-view-results").click() # view the results with the filters applied

# results page: open the 'Optional Column' control and pick its 'Title' entry
# NOTE(review): presumably this adds a Title column to the results table; confirm in the UI
wait_for_page(3)
driver.find_element(By.XPATH, "//*[@alt = 'Optional Column']").click()
driver.find_element(By.XPATH, "//*[@data-key = 'Title']").click()

In [7]:
# one list of scraped details per company, collected across all result pages
data = []
# the "data-page-max" element holds the number of result pages
max_pages = int(driver.find_element(By.CLASS_NAME, "data-page-max").text)

# walk through the result pages in order
for i in range(max_pages):

    info_list = []

    # every page after the first is reached through the "next" control
    if i > 0:
        driver.find_elements(By.CLASS_NAME, "next")[0].click()
        print(f"End of results. Onwards to page {i+1}...")

    # collect the company links shown on this page
    wait_for_page(3)
    company_links = driver.find_elements(By.XPATH, "//tbody[@id ='searchResultsPage']/tr/td/a")
    print(f"Page {i+1} contains {len(company_links)} company links.")

    # visit each company page by position and scrape it groupbox by groupbox
    for j in range(len(company_links)):

        print(f"Gathering info on page {i+1}, row {j+1} of {len(company_links)}...")

        # look the links up again: the earlier elements go stale after leaving the page
        wait_for_page(3)
        company_links = driver.find_elements(By.XPATH, "//tbody[@id ='searchResultsPage']/tr/td/a")
        company_links[j].click()

        # scrape the company page and keep its details
        wait_for_page(3)
        groupboxes = driver.find_elements(By.CLASS_NAME, "groupbox")
        groupbox_titles = driver.find_elements(By.CLASS_NAME, "groupboxTitle")
        info_list.append(scrape_company_page(groupboxes, groupbox_titles))

        # return to the results table
        driver.back()

    # add this page's companies to the overall results
    wait_for_page(3)
    data += info_list

Page 1 contains 25 company links.
Gathering info on page 1, row 1 of 25...
Gathering info on page 1, row 2 of 25...
Gathering info on page 1, row 3 of 25...
Gathering info on page 1, row 4 of 25...
Gathering info on page 1, row 5 of 25...
Gathering info on page 1, row 6 of 25...
Gathering info on page 1, row 7 of 25...
Gathering info on page 1, row 8 of 25...
Gathering info on page 1, row 9 of 25...
Gathering info on page 1, row 10 of 25...
Gathering info on page 1, row 11 of 25...
Gathering info on page 1, row 12 of 25...
Gathering info on page 1, row 13 of 25...
Gathering info on page 1, row 14 of 25...
Gathering info on page 1, row 15 of 25...
Gathering info on page 1, row 16 of 25...
Gathering info on page 1, row 17 of 25...
Gathering info on page 1, row 18 of 25...
Gathering info on page 1, row 19 of 25...
Gathering info on page 1, row 20 of 25...
Gathering info on page 1, row 21 of 25...
Gathering info on page 1, row 22 of 25...
Gathering info on page 1, row 23 of 25...
Gathering

In [10]:
# build the detailed table from the scraped rows and save it as an Excel workbook
df = pd.DataFrame(data=data, columns=headings())
df.to_excel(excel_writer='lead-list-detailed.xlsx')

In [11]:
# show the last 50 rows of the table
df.iloc[-50:]

Unnamed: 0,Company Name,Unnamed: 2,Address Line 1,Address Line 2,Unnamed: 5,Phone,District,Fax,CMA,Website,...,Legal,Management/Administration,Office Equipment & Supplies,Package/Container,Payroll & Benefits,Purchased Print,Rent & Leasing,Technology,Telecommunications,Utilities
1060,Venetian Banquet & Hospitality,,219 Romina Dr,"Concord, ON L4K4V3",,(905) 660-1110,York Regional,(905) 660-1113,Toronto,thevenetian.ca,...,"$10,000 to $25,000","$50,000 to $100,000","$100,000 to $250,000","Over $50,000",$2.5 Million to $10 Million,"$10,000 to $25,000","Over $500,000","$10,000 to $50,000","$20,000 to $50,000","Over $100,000"
1061,Veneto Centre,,7465 Kipling Ave,"Woodbridge, ON L4L1Y5",,(905) 851-5551,York Regional,(905) 851-9878,Toronto,venetocentre.com,...,"$1,000 to $2,500","$5,000 to $10,000","$20,000 to $50,000","$10,000 to $25,000","$250,000 to $500,000","$1,000 to $2,500","$50,000 to $100,000","$1,000 to $2,500","$2,000 to $5,000","$25,000 to $50,000"
1062,Verdi Convention Ctr,,3550 Derry Rd E,"Mississauga, ON L4T3V7",,(416) 675-6756,Peel Regional,(416) 675-9859,Toronto,verdi.ca,...,Less than $500,"Less than $2,500","Less than $5,000","$1,000 to $5,000","Less than $100,000",Less than $500,"Less than $10,000",Less than $500,"Less than $2,000","$2,000 to $5,000"
1063,Verdi Hospitality Ctr,,3550 Derry Rd E,"Mississauga, ON L4T3V7",,(416) 675-6756,Peel Regional,Not Available,Toronto,verdi.ca,...,"$500 to $1,000","$2,500 to $5,000","$10,000 to $20,000","$1,000 to $5,000","$100,000 to $250,000","$500 to $1,000","$25,000 to $50,000","$500 to $1,000","Less than $2,000","$10,000 to $25,000"
1064,Verney Conference Management,,9 Bullman St,"Ottawa, ON K1Y2S2",,(613) 226-8317,Ottawa-Carleton Regional,(613) 722-7725,Ottawa-Gatineau,verney.ca,...,"$2,500 to $5,000","$10,000 to $25,000","$20,000 to $50,000",Less than $500,"$100,000 to $250,000","Over $25,000","$10,000 to $25,000","$2,500 to $5,000","$2,000 to $5,000","$2,000 to $5,000"
1065,Versailles Convention Ctr,,6721 Edwards Blvd,"Mississauga, ON L5T2V9",,(905) 565-7400,Peel Regional,(905) 565-9897,Toronto,versaillesconventioncentre.com,...,"$10,000 to $25,000","$50,000 to $100,000","$100,000 to $250,000","$1,000 to $5,000",$1 to $2.5 Million,"Over $25,000","$50,000 to $100,000","$10,000 to $50,000","$20,000 to $50,000","$10,000 to $25,000"
1066,Vert Catering,,963 Dovercourt Rd,"Toronto, ON M6H2X6",,(416) 535-2412,Toronto Metropolitan,Not Available,Toronto,vertcc.com,...,Less than $500,"Less than $2,500","Less than $5,000","$1,000 to $5,000","Less than $100,000",Less than $500,"Less than $10,000",Less than $500,"Less than $2,000","$2,000 to $5,000"
1067,Victorian Garden Banquet Hall,,570 Westney Rd S,"Ajax, ON L1S6V4",,(905) 619-9858,Durham Regional,Not Available,Toronto,,...,Less than $500,"Less than $2,500","Less than $5,000","$1,000 to $5,000","Less than $100,000",Less than $500,"Less than $10,000",Less than $500,"Less than $2,000","$2,000 to $5,000"
1068,Vila Verde Restaurant,,869 Dundas St W,"Toronto, ON M6J1V6",,(416) 603-2515,Toronto Metropolitan,(416) 538-7315,Toronto,vilaverdechurrasqueira.com,...,Less than $500,"Less than $2,500","Less than $5,000","$1,000 to $5,000","Less than $100,000",Less than $500,"Less than $10,000",Less than $500,"Less than $2,000","$2,000 to $5,000"
1069,Villa Colombo,,10443 27 Hwy,"Vaughan, ON",,(905) 893-9474,York Regional,Not Available,Toronto,villacharities.com,...,"$10,000 to $25,000","$50,000 to $100,000","$100,000 to $250,000","Over $50,000",$2.5 Million to $10 Million,"$10,000 to $25,000","Over $500,000","$10,000 to $50,000","$20,000 to $50,000","Over $100,000"
