In [8]:
# load environment variables
from dotenv import load_dotenv
from pathlib import Path
import os

dotenv_path = Path('/.env').parent
print(dotenv_path)
load_dotenv(Path('../.env'))

# selenium to parse dynamic web pages
from webdriver_manager.chrome import ChromeDriverManager
from selenium import webdriver
from selenium.webdriver import ChromeOptions
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

# import utilities
import pandas as pd
import time

/


# Authentication & Filters

In [9]:
# get environment variables
card_num = os.environ.get('MISS-LIB-CARD')
base_url = os.getenv('BASE-URL')

# search terms
query = "banquet"
data_values = [
    581212, # Banquet Facilities
    791102, # Ballrooms
    799904, # Auditoriums
    738931] # Convention & Meeting Facilities

In [10]:
print(card_num)

None


# Set Browser Options & Install Drivers

In [11]:
# define options for browser
options = ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito')
# options.add_argument('--headless')

# install driver and open URL
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)

[WDM] - Downloading: 100%|██████████| 8.00M/8.00M [00:00<00:00, 10.9MB/s]


# Define Helper Functions

In [12]:
def document_initialised(driver):
    '''
    document_initialised will command the driver to wait for the page to load completely.

    Args:
        driver (ChromeService object): Chrome browser driver.

    Returns:
        js script: Driver executes a script to wait for the page to load or, if it has already loaded before the listener can catch it, to proceed.
    '''
    return driver.execute_script("if (document.readyState === 'complete') {return true;} else {window.addEventListener('load', () => {return true;});}")

def wait_for_page(n):
    '''
    wait_for_page will stall the selenium driver to listen for conditions.

    Args:
        n (int): Set wait time in seconds.

    '''
    WebDriverWait(driver, timeout=10).until(document_initialised)
    time.sleep(n)
    return

def scrape_table_data(driver):
    '''
    scrape_page_data will scrape all data from the tabulated, high-level results.
    NOTE: This function only works if it is called on the first page of the tabulated results.

    Args:
        driver (ChromeService object): Chrome browser driver.

    Returns:
        data (list): Scraped entries from the tables.
        heading_names (list): Scraped fields from the tables.
    '''

    # find the resulting number of pages
    max_pages = driver.find_element(By.CLASS_NAME, "data-page-max").text

    # define containers
    data = list()
    heading_names = list()

    # scrape table headings
    headings = driver.find_elements(By.XPATH, "//thead[@id = 'searchResultsHeader']/tr/th")
    for heading in headings:
        heading_names.append(heading.text)

    for _ in range(1, int(max_pages)+1):
        # scrape entries from table
        rows = driver.find_elements(By.XPATH, "//tbody[@id = 'searchResultsPage']/tr")

        for row in rows:
            row_data = list()
            for col in row.find_elements(By.TAG_NAME, "td"):
                row_data.append(col.text)
            data.append(row_data)

        # go to the next page then wait
        driver.find_elements(By.CLASS_NAME, "next")[0].click()
        wait_for_page(3)

    return data, heading_names

def scrape_detailed_data(driver):
    # find the resulting number of pages
    max_pages = driver.find_element(By.CLASS_NAME, "data-page-max").text

    # define containers
    data = list()
    heading_names = list()

    # scrape table headings
    headings = driver.find_elements(By.XPATH, "//thead[@id = 'searchResultsHeader']/tr/th")
    for heading in headings:
        heading_names.append(heading.text)

    for _ in range(1, int(max_pages)+1):
        # scrape entries from table
        rows = driver.find_elements(By.XPATH, "//tbody[@id = 'searchResultsPage']/tr")

        for row in rows:
            row_data = list()
            for col in row.find_elements(By.TAG_NAME, "td"):
                row_data.append(col.text)
            data.append(row_data)

        # go to the next page then wait
        driver.find_elements(By.CLASS_NAME, "next")[0].click()
        wait_for_page(3)

    return data, heading_names

# Navigate to URL to Results Table

In [27]:
# navigate to URL
driver.get("https://www.mississauga.ca/library/research-and-learn/business/")

# navigate to Canadian Businesses
wait_for_page(1)
driver.find_element(By.XPATH, ("//a[@title = 'Canadian Businesses']")).click()

# check off on the ToC and Agree
wait_for_page(1)
driver.find_element(By.ID, "chkAgree").click()
driver.find_element(By.CLASS_NAME, "action-agree").click()

# enter card number and log on
wait_for_page(1)
driver.find_element(By.ID, "matchcode").send_keys(card_num)
driver.find_element(By.CLASS_NAME, "action-submit-form").click()

# open Canadian Business search
wait_for_page(1)
driver.find_element(By.CLASS_NAME, ("action-do-search")).click()

# switch to Advanced Search and interact with filters
wait_for_page(1)
driver.find_element(By.CLASS_NAME, "advancedSearch").click()
driver.find_element(By.ID, "cs-YellowPageHeadingOrSic").click() # Business Type: Keyword/SIC/NAICS
driver.find_element(By.ID, "cs-Province").click() # Geography: Province

# wait for advanced filter options to populate
wait_for_page(1)
driver.find_element(By.CSS_SELECTOR, "input#VerifiedOnly").click()
driver.find_element(By.CSS_SELECTOR, "input#sicPrimaryOptionId").click() # radio: Search primary SIC Only
driver.find_element(By.ID, "sicLookupKeyword").send_keys(query) # search field: Query
driver.find_element(By.ID, "searchSic").click() # search button: Query

# wait for results to populate and highlight chosen query fields
wait_for_page(1)
for value in data_values:
    driver.find_element(By.XPATH, f"//li[@data-value = {value}]").click() # results: Highlight NAICS results
driver.find_element(By.XPATH, f"//li[@data-value = 'ON']").click() # results: Highlight Ontario results
driver.find_element(By.CLASS_NAME, "action-view-results").click() # View Results with filters

# results page
wait_for_page(2)
driver.find_element(By.XPATH, "//*[@alt = 'Optional Column']").click()
driver.find_element(By.XPATH, "//*[@data-key = 'Title']").click()

### Make function to repeat try/except blocks for each check below.

In [None]:
# high-level
bodies = driver.find_elements(By.TAG_NAME, "tbody")
info_list = list()

# low-level
management = bodies[5].find_elements(By.TAG_NAME, "tr")
for people in management:
    person = people.find_elements(By.TAG_NAME, "td")

    # Management Directory
    for detail in person:
        info_list.append(detail.text)

    # Company details below will be copied for each person in management of that company
    for i,_ in enumerate(bodies):
        # Location Information (1st)
        if i == 0:
            cols = bodies[i].find_elements(By.TAG_NAME, "tr")
            for col in cols:
                info_list.append(col.text)

        # Location Information (2nd)
        if i == 1:
            info_list.append(bodies[i].find_element(By.XPATH, "//td[@headers = 'phone']").text)
            info_list.append(bodies[i].find_element(By.XPATH, "//td[@headers = 'fax']").text)
            info_list.append(bodies[i].find_element(By.XPATH, "//td[@headers = 'district']").text)
            info_list.append(bodies[i].find_element(By.XPATH, "//td[@headers = 'cma']").text) # Census Metropolitan Area

        # Location Information (3rd)
        if i == 2:
            info_list.append(bodies[i].find_element(By.TAG_NAME, "a").get_attribute("href"))

        # Industry Profile
        if i == 3:
            cols = bodies[i].find_elements(By.TAG_NAME, "td")
            for col in cols:
                info_list.append(col.text)

        # Business Demographics
        if i == 4:
            cols = bodies[i].find_elements(By.TAG_NAME, "td")
            for col in cols:
                info_list.append(col.text)

        # Business Expenditures
        if i == 6:
            cols = bodies[i].find_elements(By.TAG_NAME, "td")
            for col in cols:
                info_list.append(col.text)

In [None]:
# navigate to the first company
## LOOP THROUGH RESULTS
company_links = driver.find_elements(By.XPATH, "//tbody[@id ='searchResultsPage']/tr/td/a")
company_links[0].click()
wait_for_page(3)

In [20]:
print(len(info_list))
info_list

8


['These expenditures are an estimated annual expense.',
 '1595645 Ontario Inc',
 '2155a Steeles Ave E',
 'Brampton, ON L6T5A1',
 '(905) 450-9996',
 'Not Available',
 'Peel Regional',
 'Toronto']

In [67]:
for i in range(len(driver.find_elements(By.TAG_NAME, "tbody"))):
    print(driver.find_elements(By.TAG_NAME, "tbody")[i].text, "\n")

10 Tation Event Catering
232 Norseman St
Etobicoke, ON M8Z2R4 

Phone (416) 243-5144 District Toronto Metropolitan
Fax (416) 243-2662 CMA Toronto 

10tation.com 

SIC Code Descriptions Primary Ad Size Year First Appeared
5812-12 Caterers Regular 2019
NAICS Code Description Primary Ad Size Year First Appeared
72232001 Caterers Regular 2019 

Location Employees 24 Location Sales Volume $1,296,000
Corporate Employees Not Available Corporate Sales Volume Not Available
Location Type Single Loc Credit Rating Excellent
Stock Ticker Symbol Latitude / Longitude 43.632250 / -79.529510
IUSA Number 03-839-5380 Federal Contractor No 

David D'Aprile Owner Male 

These expenditures are an estimated annual expense.
Accounting $2,500 to $5,000 Advertising $20,000 to $50,000
Contract Labor $1,000 to $10,000 Insurance $10,000 to $25,000
Legal $1,000 to $2,500 Management/Administration $5,000 to $10,000
Office Equipment & Supplies $20,000 to $50,000 Package/Container $10,000 to $25,000
Payroll & Benefits

In [70]:
for i in range(len(driver.find_elements(By.TAG_NAME, "thead"))):
    print(driver.find_elements(By.TAG_NAME, "thead")[i].text)

Name Executive Title Gender


In [71]:
for i in range(len(driver.find_elements(By.TAG_NAME, "th"))):
    print(driver.find_elements(By.TAG_NAME, "th")[i].text)

Phone
District
Fax
CMA
SIC Code
Descriptions
Primary
Ad Size
Year First Appeared
NAICS Code
Description
Primary
Ad Size
Year First Appeared
Location Employees
Location Sales Volume
Corporate Employees
Corporate Sales Volume
Location Type
Credit Rating
Stock Ticker Symbol
Latitude / Longitude
IUSA Number
Federal Contractor
Name
Executive Title
Gender
Accounting
Advertising
Contract Labor
Insurance
Legal
Management/Administration
Office Equipment & Supplies
Package/Container
Payroll & Benefits
Purchased Print
Rent & Leasing
Technology
Telecommunications
Utilities


In [72]:
for i in range(len(driver.find_elements(By.TAG_NAME, "td"))):
    print(driver.find_elements(By.TAG_NAME, "td")[i].text)

10 Tation Event Catering

232 Norseman St
Etobicoke, ON M8Z2R4

(416) 243-5144
Toronto Metropolitan
(416) 243-2662
Toronto
10tation.com
5812-12
Caterers

Regular
2019
72232001
Caterers

Regular
2019
24
$1,296,000
Not Available
Not Available
Single Loc
Excellent

43.632250 / -79.529510
03-839-5380
No
David D'Aprile
Owner
Male
These expenditures are an estimated annual expense.
$2,500 to $5,000
$20,000 to $50,000
$1,000 to $10,000
$10,000 to $25,000
$1,000 to $2,500
$5,000 to $10,000
$20,000 to $50,000
$10,000 to $25,000
$250,000 to $500,000
$1,000 to $2,500
$50,000 to $100,000
$1,000 to $2,500
$2,000 to $5,000
$25,000 to $50,000


In [79]:
driver.find_elements(By.TAG_NAME, "td")[26].text

''

In [None]:
# gather location information
driver.find_element(By.ID, "businessName").text # Company
driver.find_element(By.XPATH, "//td[@id = 'businessName']/tr[@class = 'firstTd']")

In [58]:
driver.back()

In [59]:
driver.forward()

In [31]:
# df = pd.DataFrame(data, columns=headings)

In [32]:
# df.to_excel('lead-list-scraped.xlsx')

In [None]:
#Location Information
## Company
## Address
## Phone
## Fax
## District
## CMA

#Industry Profile
## SIC Code
## Descriptions
## Ad Size
## Year First Appeared
## NAICS Code
## Description
## Ad Size
## Year First Appeared

# Business Demographics
## Location Employees
## Corporate Employees
## Location Type
## Stock Ticker Symbol
## IUSA Number
## Location Sales Volume
## Corporate Sales Volume
## Credit Rating
## Latitude / Longitude
## Federal Contractor

# Management Directory <-- Use this to populate your rows
## Name
## Executive Title
## Gender

# Business Expenditures
## Accounting
## Contract Labor
## Legal
## Office Equipment & Supplies
## Payroll & Benefits
## Rent & Leasing
## Telecommunications
## Advertising
## Insurance
## Management/Administration
## Package/Container
## Purchased Print
## Technology
## Utilities