In [106]:
# load environment variables
from dotenv import load_dotenv
from pathlib import Path
import os

load_dotenv()

# selenium to parse dynamic web pages
from webdriver_manager.chrome import ChromeDriverManager
from selenium import webdriver
from selenium.webdriver import ChromeOptions
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

# import utilities
import pandas as pd
import time

# Authentication & Filters

In [107]:
# library card number and entry URL come from the environment (populated by load_dotenv)
card_num = os.getenv('MISS_LIB_CARD')
base_url = os.getenv('BASE_URL')

# keyword typed into the industry-code lookup field
query = "banquet"

# data-value codes of the lookup results to select
data_values = [
    581212,  # Banquet Facilities
    791102,  # Ballrooms
    799904,  # Auditoriums
    738931,  # Convention & Meeting Facilities
]

In [108]:
# sanity check: shows None if BASE_URL was not found in the environment
print(base_url)

https://www.mississauga.ca/library/research-and-learn/business/


# Helper Functions

In [109]:
def document_initialised(driver):
    '''
    document_initialised reports whether the current page has finished loading.

    It is meant to be used as the condition of a WebDriverWait, which calls it
    repeatedly until it returns a truthy value or the wait times out.

    Args:
        driver (WebDriver): Browser driver; only its execute_script method is used.

    Returns:
        bool: True when document.readyState is 'complete', otherwise False.
    '''
    # The earlier script registered a 'load' listener when the page was not ready,
    # but a value returned from inside that listener never reaches Python, so the
    # call simply yielded nothing. Reporting readyState directly gives the polling
    # wait an explicit True/False on every call.
    return bool(driver.execute_script("return document.readyState === 'complete';"))

def wait_for_page(n, timeout=10, browser=None):
    '''
    wait_for_page blocks until the page reports it has loaded, then pauses a further n seconds.

    Args:
        n (int): Extra time to sleep, in seconds, after the page has loaded.
        timeout (int): Maximum number of seconds to wait for the page to load. Defaults to 10.
        browser (WebDriver): Driver to wait on. Defaults to the notebook-level `driver`.

    Raises:
        TimeoutException: Raised by WebDriverWait if the page has not loaded within `timeout`.
    '''
    # fall back to the notebook-level driver so existing wait_for_page(n) calls keep working
    active_driver = driver if browser is None else browser
    WebDriverWait(active_driver, timeout=timeout).until(document_initialised)
    time.sleep(n)



# Scraper Functions

In [110]:
def scrape_table_data(driver):
    '''
    scrape_table_data will scrape all data from the tabulated, high-level results.
    NOTE: This function only works if it is called on the first page of the tabulated results.
    When it returns, the browser is left on the last page of results.

    Args:
        driver (WebDriver): Chrome browser driver.

    Returns:
        data (list): Scraped entries from the tables, one list of cell texts per row.
        heading_names (list): Scraped fields from the tables.
    '''

    # find the resulting number of pages
    max_pages = int(driver.find_element(By.CLASS_NAME, "data-page-max").text)

    # scrape table headings
    headings = driver.find_elements(By.XPATH, "//thead[@id = 'searchResultsHeader']/tr/th")
    heading_names = [heading.text for heading in headings]

    data = list()
    for page in range(1, max_pages + 1):
        # scrape entries from table
        rows = driver.find_elements(By.XPATH, "//tbody[@id = 'searchResultsPage']/tr")
        for row in rows:
            data.append([col.text for col in row.find_elements(By.TAG_NAME, "td")])

        # only advance while another page remains: clicking "next" after the last page
        # scrapes nothing more and fails outright if the control is not present there
        if page < max_pages:
            driver.find_elements(By.CLASS_NAME, "next")[0].click()
            wait_for_page(3)

    return data, heading_names

def scrape_groupbox_info(groupbox, groupbox_title):
    '''
    scrape_groupbox_info collects the text of every table cell in one groupbox of a company page.

    Args:
        groupbox (element): Groupbox element whose "td" descendants are read.
        groupbox_title (element): Title element of that groupbox; only its text is printed.

    Returns:
        cell_texts (list): Text of each "td" element, in the order find_elements returns them.
        cell_total (int): Number of "td" elements found.
    '''
    # echo the section title so the cell output shows which groupbox is being handled
    print(groupbox_title.text)

    cells = groupbox.find_elements(By.TAG_NAME, "td")
    cell_total = len(cells)
    cell_texts = [cell.text for cell in cells]

    # NOTE: capturing the website href from an "a" tag inside the groupbox (blank when
    # absent) was tried here and is currently disabled.
    return cell_texts, cell_total

def scrape_company_data_one_page(groupboxes, groupbox_titles):
    # instantiate storage list variable
    info_list = list()

    # Isolate site for each group box based on title
    for i, groupbox_title in enumerate(groupbox_titles):
        # Switch case for each groupbox based on their titles
        match groupbox_title.text:
            case "Location Information":
                temp_list, count = scrape_groupbox_info(groupboxes[i], groupbox_title)
                info_list.extend(temp_list)
                # check if address is two lines and add blank item if not
                if temp_list[3] == "":
                    info_list.insert(2, "")
                # check if website is included in the information and add blank if not
                if count < 10:
                    info_list.append("")

            case "Industry Profile":
                temp_list, _ = scrape_groupbox_info(groupboxes[i], groupbox_title)
                info_list.append(temp_list)

            case "Photo, Map, & Directions":
                temp_list, _ = scrape_groupbox_info(groupboxes[i], groupbox_title)
                info_list.extend(temp_list)

            case "Business Demographics":
                temp_list, _ = scrape_groupbox_info(groupboxes[i], groupbox_title)
                info_list.extend(temp_list)

            case "Management Directory":
                temp_list, _ = scrape_groupbox_info(groupboxes[i], groupbox_title)
                info_list.append(temp_list)

            case "Business Expenditures":
                temp_list, _ = scrape_groupbox_info(groupboxes[i], groupbox_title)
                info_list.extend(temp_list)

            case "Nearby Businesses":
                temp_list, _ = scrape_groupbox_info(groupboxes[i], groupbox_title)
                info_list.extend(temp_list)

            case "Competitors Report":
                temp_list, _ = scrape_groupbox_info(groupboxes[i], groupbox_title)
                info_list.extend(temp_list)

            case _:
                print("")

    return info_list

# Set Browser Options & Install Drivers

In [111]:
# browser options: tolerate certificate errors and browse in an incognito window
options = ChromeOptions()
for flag in ('--ignore-certificate-errors', '--incognito'):
    options.add_argument(flag)
# options.add_argument('--headless')

# install the Chrome driver via webdriver_manager and start the browser session
driver_path = ChromeDriverManager().install()
driver = webdriver.Chrome(service=ChromeService(driver_path), options=options)

# Navigate from the Base URL to the Results Table

In [112]:
# Drives the browser from base_url to the filtered results table.
# Every step depends on the page reached by the previous one, so the order matters.

# open the starting page
driver.get(base_url)

# follow the 'Canadian Businesses' link
wait_for_page(1)
driver.find_element(By.XPATH, ("//a[@title = 'Canadian Businesses']")).click()

# tick the agreement checkbox and press Agree
wait_for_page(1)
driver.find_element(By.ID, "chkAgree").click()
driver.find_element(By.CLASS_NAME, "action-agree").click()

# enter card number and log on
# NOTE(review): card_num is None when MISS_LIB_CARD is unset - confirm the .env file is loaded
wait_for_page(1)
driver.find_element(By.ID, "matchcode").send_keys(card_num)
driver.find_element(By.CLASS_NAME, "action-submit-form").click()

# open Canadian Business search
wait_for_page(1)
driver.find_element(By.CLASS_NAME, ("action-do-search")).click()

# switch to Advanced Search and enable the filters used below
wait_for_page(1)
driver.find_element(By.CLASS_NAME, "advancedSearch").click()
driver.find_element(By.ID, "cs-YellowPageHeadingOrSic").click() # Business Type: Keyword/SIC/NAICS
driver.find_element(By.ID, "cs-Province").click() # Geography: Province

# wait for advanced filter options to populate, then run the keyword lookup
wait_for_page(1)
driver.find_element(By.CSS_SELECTOR, "input#VerifiedOnly").click()
driver.find_element(By.CSS_SELECTOR, "input#sicPrimaryOptionId").click() # radio: Search primary SIC Only
driver.find_element(By.ID, "sicLookupKeyword").send_keys(query) # search field: Query
driver.find_element(By.ID, "searchSic").click() # search button: Query

# wait for lookup results to populate, then select the wanted codes and the province
wait_for_page(3)
for value in data_values:
    driver.find_element(By.XPATH, f"//li[@data-value = {value}]").click() # results: select each code listed in data_values
driver.find_element(By.XPATH, f"//li[@data-value = 'ON']").click() # results: Highlight Ontario results
driver.find_element(By.CLASS_NAME, "action-view-results").click() # View Results with filters

# results page: open the 'Optional Column' control and pick the 'Title' entry
wait_for_page(3)
driver.find_element(By.XPATH, "//*[@alt = 'Optional Column']").click()
driver.find_element(By.XPATH, "//*[@data-key = 'Title']").click()

In [113]:
# find the resulting number of pages (kept as the element's text, not converted to int)
# NOTE(review): not referenced by the later visible cells; scrape_table_data reads this element itself
max_pages = driver.find_element(By.CLASS_NAME, "data-page-max").text

In [114]:
# one record per company listed on the current results page
info_list = list()

# links to the company pages on the current results page; used only for their count
company_links = driver.find_elements(By.XPATH, "//tbody[@id ='searchResultsPage']/tr/td/a")

# iterate through each company page and scrape each page by groupbox
for i in range(len(company_links)):
    # wait for page to load
    wait_for_page(3)
    print(i)
    # look the links up again on every pass (presumably because references found
    # before leaving the results page cannot be reused) and enter the i-th company page
    company = driver.find_elements(By.XPATH, "//tbody[@id ='searchResultsPage']/tr/td/a")
    company[i].click()
    # scrape company page and append to existing list
    wait_for_page(3)
    groupboxes = driver.find_elements(By.CLASS_NAME, "groupbox")
    groupbox_titles = driver.find_elements(By.CLASS_NAME, "groupboxTitle")
    info_list.append(scrape_company_data_one_page(groupboxes, groupbox_titles))
    # navigate back to results table page
    # The earlier guard `i != len(company_links)` could never be false (i stops at
    # len - 1), so going back always happened; that behaviour is kept, which leaves
    # the browser on the results table once the loop finishes.
    wait_for_page(3)
    driver.back()

0
Location Information
Industry Profile
Photo, Map, & Directions
Business Demographics
Management Directory
Business Expenditures
Nearby Businesses
Competitors Report
1
Location Information
Industry Profile
Photo, Map, & Directions
Business Demographics
Management Directory
Business Expenditures
Nearby Businesses
Competitors Report
2
Location Information
Industry Profile
Photo, Map, & Directions
Business Demographics
Management Directory
Business Expenditures
Nearby Businesses
Competitors Report
3
Location Information
Industry Profile
Photo, Map, & Directions
Business Demographics
Management Directory
Business Expenditures
Nearby Businesses
Competitors Report
4
Location Information
Industry Profile
Photo, Map, & Directions
Business Demographics
Management Directory
Business Expenditures
Nearby Businesses
Competitors Report
5
Location Information
Industry Profile
Photo, Map, & Directions
Business Demographics
Management Directory
Business Expenditures
Nearby Businesses
Competitors Repor

In [122]:
# number of company records scraped, followed by the raw records themselves
print(len(info_list))
info_list

25


[['10 Tation Event Catering',
  '',
  '232 Norseman St',
  'Etobicoke, ON M8Z2R4',
  '',
  '(416) 243-5144',
  'Toronto Metropolitan',
  '(416) 243-2662',
  'Toronto',
  '10tation.com',
  ['5812-12',
   'Caterers',
   '',
   'Regular',
   '2019',
   '72232001',
   'Caterers',
   '',
   'Regular',
   '2019'],
  '24',
  '$1,296,000',
  'Not Available',
  'Not Available',
  'Single Loc',
  'Excellent',
  '',
  '43.632250 / -79.529510',
  '03-839-5380',
  'No',
  ["David D'Aprile", 'Owner', 'Male'],
  'These expenditures are an estimated annual expense.',
  '$2,500 to $5,000',
  '$20,000 to $50,000',
  '$1,000 to $10,000',
  '$10,000 to $25,000',
  '$1,000 to $2,500',
  '$5,000 to $10,000',
  '$20,000 to $50,000',
  '$10,000 to $25,000',
  '$250,000 to $500,000',
  '$1,000 to $2,500',
  '$50,000 to $100,000',
  '$1,000 to $2,500',
  '$2,000 to $5,000',
  '$25,000 to $50,000'],
 ['1595645 Ontario Inc',
  '',
  '2155a Steeles Ave E',
  'Brampton, ON L6T5A1',
  '',
  '(905) 450-9996',
  'Peel

In [123]:
# Column names for the scraped records, grouped by the company-page section they
# appear to come from and flattened into one list. The two '' entries label cells
# that carry no field name.
headings = [
    column
    for section in (
        # Location Information
        ('Company Name', '', 'Address Line 1', 'Address Line 2', '',
         'Phone', 'District', 'Fax', 'CMA', 'Website'),
        # Industry Profile (stored as one nested list)
        ('Industry Profile',),
        # Business Demographics
        ('Location Employees', 'Location Sales Volume',
         'Corporate Employees', 'Corporate Sales Volume',
         'Location Type', 'Credit Rating', 'Stock Ticker Table',
         'Latitude/Longitude', 'IUSA Number', 'Federal Contractor'),
        # Management Directory (stored as one nested list)
        ('Management Directory',),
        # Business Expenditures
        ('Disclosure', 'Accounting', 'Advertising', 'Contract Labor', 'Insurance',
         'Legal', 'Management/Administration', 'Office Equipment & Supplies',
         'Package/Container', 'Payroll & Benefits', 'Purchased Print',
         'Rent & Leasing', 'Technology', 'Telecommunications', 'Utilities'),
    )
    for column in section
]

In [124]:
# number of column names; each scraped record needs this many items
len(headings)

37

In [125]:
# one row per scraped company; the nested Industry Profile / Management Directory
# lists each occupy a single cell
df = pd.DataFrame(info_list, columns=headings)

In [126]:
# display the assembled table
df

Unnamed: 0,Company Name,Unnamed: 2,Address Line 1,Address Line 2,Unnamed: 5,Phone,District,Fax,CMA,Website,...,Legal,Management/Administration,Office Equipment & Supplies,Package/Container,Payroll & Benefits,Purchased Print,Rent & Leasing,Technology,Telecommunications,Utilities
0,10 Tation Event Catering,,232 Norseman St,"Etobicoke, ON M8Z2R4",,(416) 243-5144,Toronto Metropolitan,(416) 243-2662,Toronto,10tation.com,...,"$1,000 to $2,500","$5,000 to $10,000","$20,000 to $50,000","$10,000 to $25,000","$250,000 to $500,000","$1,000 to $2,500","$50,000 to $100,000","$1,000 to $2,500","$2,000 to $5,000","$25,000 to $50,000"
1,1595645 Ontario Inc,,2155a Steeles Ave E,"Brampton, ON L6T5A1",,(905) 450-9996,Peel Regional,Not Available,Toronto,,...,Less than $500,"Less than $2,500","$5,000 to $10,000","$1,000 to $5,000","Less than $100,000",Less than $500,"$10,000 to $25,000",Less than $500,"Less than $2,000","$5,000 to $10,000"
2,3 Sisters Catering,,"Aurora, ON L4G3A2",,,(905) 727-2660,York Regional,Not Available,Toronto,,...,Less than $500,"Less than $2,500","$5,000 to $10,000","$1,000 to $5,000","Less than $100,000",Less than $500,"$10,000 to $25,000",Less than $500,"Less than $2,000","$5,000 to $10,000"
3,38 Gourmet Catering,,1355 Regent St,"Sudbury, ON P3E3Z1",,(705) 626-3717,Sudbury Regional,Not Available,Greater Sudbury,,...,Less than $500,"Less than $2,500","$5,000 to $10,000","$1,000 to $5,000","Less than $100,000",Less than $500,"$10,000 to $25,000",Less than $500,"Less than $2,000","$5,000 to $10,000"
4,A & V Catering Svc,,30 Horseshoe Cres,"Scarborough, ON M1B4S4",,(416) 414-9609,Toronto Metropolitan,Not Available,Toronto,,...,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available
5,A Fine Fit Catering & Consltng,,193 Riverside Dr,"Thunder Bay, ON P7B7B3",,(807) 708-3509,Thunder Bay District,Not Available,Thunder Bay,afinefitcatering.ca,...,Less than $500,"Less than $2,500","Less than $5,000","$500 to $1,000","Less than $100,000",Less than $500,"Less than $10,000",Less than $500,"Less than $2,000","$2,000 to $5,000"
6,A Great Mix Bartending Svc,,15 Wessenger Dr,"Barrie, ON L4N8R8",,(705) 791-9200,Simcoe County,Not Available,Barrie,,...,Less than $500,"Less than $2,500","Less than $5,000","$1,000 to $5,000","Less than $100,000",Less than $500,"Less than $10,000",Less than $500,"Less than $2,000","$2,000 to $5,000"
7,A J's Banquet & Catering Fclts,,1751 8th Line Rd,"Metcalfe, ON K0A2P0",,(613) 821-1445,Ottawa-Carleton Regional,(613) 821-4851,Ottawa-Gatineau,,...,Less than $500,"Less than $2,500","Less than $5,000","$1,000 to $5,000","Less than $100,000",Less than $500,"$10,000 to $25,000",Less than $500,"Less than $2,000","$5,000 to $10,000"
8,A LA King-Culinary Creations,,"Beamsville, ON L0RL0R",,,(289) 440-0010,Niagara Regional,Not Available,St Catharines-Niagara,,...,Less than $500,"Less than $2,500","Less than $5,000","$1,000 to $5,000","Less than $100,000",Less than $500,"$10,000 to $25,000",Less than $500,"Less than $2,000","$5,000 to $10,000"
9,A One Catering,,22-7875 Tranmere Dr,"Mississauga, ON L5S1T8",,(905) 677-9121,Peel Regional,Not Available,Toronto,aonecatering.ca,...,"$500 to $1,000","$2,500 to $5,000","$10,000 to $20,000","$1,000 to $5,000","$100,000 to $250,000","$500 to $1,000","$25,000 to $50,000","$500 to $1,000","Less than $2,000","$10,000 to $25,000"


In [120]:
# df.to_excel('lead-list-scraped.xlsx')

In [121]:
#Location Information
## Company
## Address
## Phone
## Fax
## District
## CMA

#Industry Profile
## SIC Code
## Description
## Ad Size
## Year First Appeared
## NAICS Code
## Description
## Ad Size
## Year First Appeared

# Business Demographics
## Location Employees
## Corporate Employees
## Location Type
## Stock Ticker Symbol
## IUSA Number
## Location Sales Volume
## Corporate Sales Volume
## Credit Rating
## Latitude / Longitude
## Federal Contractor

# Management Directory <-- Use this to populate your rows
## Name
## Executive Title
## Gender

# Business Expenditures
## Accounting
## Contract Labor
## Legal
## Office Equipment & Supplies
## Payroll & Benefits
## Rent & Leasing
## Telecommunications
## Advertising
## Insurance
## Management/Administration
## Package/Container
## Purchased Print
## Technology
## Utilities