# Imports

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

import re
import warnings

import pandas as pd
import numpy as np

In [2]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

warnings.filterwarnings(action="ignore")

# Helper Functions

In [3]:

def get_naics(
    url= "https://www.dnb.com/business-directory/company-information.outpatient_care_centers.ca.html?page=2", 
    driver_path="/home/ada/Downloads/geckodriver"
    ):

    cap = DesiredCapabilities().FIREFOX
    cap["marionette"] = False
    driver = webdriver.Firefox(capabilities=cap, executable_path=driver_path)
    driver.get(url)

    vals = driver.find_element_by_xpath(
        "//div[@class='codes-title naics']/span").get_attribute("innerHTML")

    vals = vals.split("&")[0]
    print(vals)

    driver.close()

    return vals



def get_dnd_links(
    url = "https://www.dnb.com/business-directory/company-information.health_care_and_social_assistance.ca.html?page=2",
    driver_path = "/home/ada/Downloads/geckodriver"
):


    # company dnd links
    cap = DesiredCapabilities().FIREFOX
    cap["marionette"] = False
    driver = webdriver.Firefox(capabilities=cap, executable_path=driver_path)

    driver.get(url)

    elems = driver.find_elements_by_xpath("//a[@href]")
    links = []
    for elem in elems:
        links.append(elem.get_attribute("href"))

    driver.close()

    links = [i for i in links if "company-profiles" in i]

    return links

    
def get_company_details(
    dnd_url = "https://www.dnb.com/business-directory/company-profiles.alberta_health_services.129b1f80ef8deb9b06ed739f80223ccb.html",
    driver_path = "/home/ada/Downloads/geckodriver"
):


    cap = DesiredCapabilities().FIREFOX
    cap["marionette"] = False
    driver = webdriver.Firefox(capabilities=cap, executable_path=driver_path)
    driver.get(dnd_url)

    # remove html tags
    def remove_tags(text):
        return TAG_RE.sub('', text)
    
    # address
    address = driver.find_element_by_xpath(
        "//div[@class='col-md-11']/span[@class='company_data_point' and @name='company_address']/span").get_attribute("innerHTML")
    address = address.split("<")[0]
#     address = remove_tags(address)
    

    # telephone
    telephone = driver.find_element_by_xpath("//div[@class='col-md-11']/span[@class='company_data_point' and @name='company_phone']/span").get_attribute("innerHTML")

    # website
    website = driver.find_element_by_xpath(
        "//div[@class='col-md-11']/span[@class='company_data_point' and @name='company_website']/span").get_attribute("innerHTML")
    website = website.split(">")[1].split("<")[0]
    
    # employees
    employees = driver.find_element_by_xpath(
        "//div[@class='col-md-11']/span[@class='company_data_point' and @name='employees_all_site']/span").get_attribute("innerHTML")

    # revenue
    revenue = driver.find_element_by_xpath(
        "//div[@class='col-md-11']/span[@class='company_data_point' and @name='revenue_in_us_dollar']/span").get_attribute("innerHTML")

    # name
    name = driver.find_element_by_xpath(
        "//div[@class='margin-between-rows']/span[@class='company_data_point' and @name='company_name']/span").get_attribute("innerHTML")

    # description
    description = driver.find_element_by_xpath(
        "//div[@class='margin-between-rows']/span[@class='company_data_point' and @name='company_description']/span").get_attribute("innerHTML")

    # contact person
    contact_person = driver.find_element_by_xpath(
        "//div[@class='margin-between-rows']/span[@class='company_data_point' and @name='key_principal']/span").get_attribute("innerHTML")
    contact_person = contact_person.split("&")[0]
#     contact_person = remove_tags(contact_person)

    
    # services
    services = driver.find_element_by_xpath(
        "//div[@class='margin-between-rows']/span[@class='company_data_point' and @name='industry_links']/span").get_attribute("innerHTML")

    TAG_RE = re.compile(r'<[^>]+>')

    services = remove_tags(services).replace("\n", " ").replace("&nbsp;", "").replace(",", "")

    driver.close()

    df =  pd.DataFrame([name, description, services, website, employees, contact_person, address, telephone, revenue]).T
    df.columns = ["name", "description", "services", "website", "employees", "contact_person", "address", "telephone", "revenue"]

    return df

# Outpatient Care Centers Companies in Canada


In [4]:
# get naics
naics = get_naics()

6214


In [20]:
urls = []
for i in range(1, 3172, 1):
    urls.append(f"https://www.dnb.com/business-directory/company-information.health_care_and_social_assistance.ca.html?page={i}")

In [22]:
dnd_links = []
for url in urls:
    dnd_links.append(get_dnd_links(url))
    
len(dnd_links)

In [6]:
dfs = []
for company in dnd_links:
    try:
        dfs.append(get_company_details(company))
        print(company, "---pass")
    except Exception:
        pass

https://www.dnb.com/business-directory/company-profiles.windsor_regional_hospital.fc678005878ec949cd484e7b042e94b4.html ---pass
https://www.dnb.com/business-directory/company-profiles.soci%C3%A9t%C3%A9_de_gestion_du_centre_dexcellence_en_th%C3%A9rapie_cellulaire.3453523eca1d04e22a35b5210714923f.html ---pass
https://www.dnb.com/business-directory/company-profiles.the_canadian_red_cross_society.0e0090048e870428462fb2cada180b2f.html ---pass


In [7]:
mega = pd.concat(dfs)
mega.to_csv("outpatient_cares.csv")
# dfs

Unnamed: 0,name,description,services,website,employees,contact_person,address,telephone,revenue
0,Windsor Regional Hospital,Windsor Regional Hospital is located in Windso...,...,\n ...,2200,\n \n ...,\n ...,(519) 254-5577,$466.95 million
0,Société de Gestion du Centre d'Excellence en T...,Société de Gestion du Centre d'Excellence en T...,...,\n ...,4200,\n \n ...,\n ...,(514) 374-7940,$466.54 million
0,The Canadian Red Cross Society,The Canadian Red Cross Society is located in O...,...,\n ...,6400,\n \n ...,\n ...,(613) 740-1900,$464.79 million


In [8]:
mega.services.values[0]

'                                                                                                                                       General Medical and Surgical Hospitals                                                                                                                                                                                                                                                                                                                                                            Hospitals                                                                                                                                                                                                                                                                                                                                                            Health Care and Social Assistance                                                                                        

In [10]:
mega.website.values[0]

'\n                                                    www.wrh.on.ca\n                                            '

In [9]:
# ## dnd
# # names, physical location, sales revenue($M)
# cap = DesiredCapabilities().FIREFOX
# cap["marionette"] = False
# driver = webdriver.Firefox(capabilities=cap, executable_path="/home/ada/Downloads/geckodriver")

# driver.get('https://www.dnb.com/business-directory/company-information.health_care_and_social_assistance.ca.html?page=2')

# # links = driver.find_elements_by_class_name("col-md-12 data")
# vals = driver.find_element(By.TAG_NAME, 'div')
# vals = vals.find_elements(By.ID, value="companyResults")
# print('length of links are: ', len(vals))
# names = []
# for i in range(len(vals)):
#     names.append(vals[i].text)


# driver.close()