# Imports

In [18]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

import re
import warnings

import pandas as pd
import numpy as np

In [19]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

warnings.filterwarnings(action="ignore")

# Helper Functions

In [76]:

def get_naics(
    url= "https://www.dnb.com/business-directory/company-information.outpatient_care_centers.ca.html", 
    driver_path="/home/ada/Downloads/geckodriver"
    ):

    cap = DesiredCapabilities().FIREFOX
    cap["marionette"] = False
    driver = webdriver.Firefox(capabilities=cap, executable_path=driver_path)
    driver.get(url)

    vals = driver.find_element_by_xpath(
        "//div[@class='codes-title naics']/span").get_attribute("innerHTML")

    vals = vals.split("&")[0]
    print(vals)

    driver.close()

    return vals



def get_dnd_links(
    url = "https://www.dnb.com/business-directory/company-information.health_care_and_social_assistance.ca.html?page=2",
    driver_path = "/home/ada/Downloads/geckodriver"
):


    # company dnd links
    cap = DesiredCapabilities().FIREFOX
    cap["marionette"] = False
    driver = webdriver.Firefox(capabilities=cap, executable_path=driver_path)

    driver.get(url)

    elems = driver.find_elements_by_xpath("//a[@href]")
    links = []
    for elem in elems:
        links.append(elem.get_attribute("href"))

    driver.close()

    links = [i for i in links if "company-profiles" in i]

    return links

    
def get_company_details(
    dnd_url = "https://www.dnb.com/business-directory/company-profiles.alberta_health_services.129b1f80ef8deb9b06ed739f80223ccb.html",
    driver_path = "/home/ada/Downloads/geckodriver"
):


    cap = DesiredCapabilities().FIREFOX
    cap["marionette"] = False
    driver = webdriver.Firefox(capabilities=cap, executable_path=driver_path)
    driver.get(dnd_url)

    # remove html tags
    def remove_tags(text):
        return TAG_RE.sub('', text)
    
    # address
    try:
        address = driver.find_element_by_xpath(
            "//div[@class='col-md-11']/span[@class='company_data_point' and @name='company_address']/span").get_attribute("innerHTML")
        address = address.split("<")[0]
    except Exception:
        address = "None"
#     address = remove_tags(address)
    

    # telephone
    try:
        telephone = driver.find_element_by_xpath("//div[@class='col-md-11']/span[@class='company_data_point' and @name='company_phone']/span").get_attribute("innerHTML")
    except Exception:
        telephone = "None"
        
    # website
    try:
        website = driver.find_element_by_xpath(
            "//div[@class='col-md-11']/span[@class='company_data_point' and @name='company_website']/span").get_attribute("innerHTML")
        website = website.split(">")[1].split("<")[0]
    except Exception:
        website = "None"
        
    # employees
    try:
        employees = driver.find_element_by_xpath(
            "//div[@class='col-md-11']/span[@class='company_data_point' and @name='employees_all_site']/span").get_attribute("innerHTML")
    except Exception:
        employees = "None"
        
    
    # revenue
    try:
        revenue = driver.find_element_by_xpath(
            "//div[@class='col-md-11']/span[@class='company_data_point' and @name='revenue_in_us_dollar']/span").get_attribute("innerHTML")
    except Exception:
        revenue = "None"
        
    # name
    try:
        name = driver.find_element_by_xpath(
            "//div[@class='margin-between-rows']/span[@class='company_data_point' and @name='company_name']/span").get_attribute("innerHTML")
    except Exception:
        name = "None"
        
    # description
    try:
        description = driver.find_element_by_xpath(
            "//div[@class='margin-between-rows']/span[@class='company_data_point' and @name='company_description']/span").get_attribute("innerHTML")
    except Exception:
        description = "None"
        
    # contact person
    try:
        contact_person = driver.find_element_by_xpath(
            "//div[@class='margin-between-rows']/span[@class='company_data_point' and @name='key_principal']/span").get_attribute("innerHTML")
        contact_person = contact_person.split("&")[0]
    #     contact_person = remove_tags(contact_person)
    except Exception:
        contact_person = "None"
    
    # services
    try:
        services = driver.find_element_by_xpath(
            "//div[@class='margin-between-rows']/span[@class='company_data_point' and @name='industry_links']/span").get_attribute("innerHTML")
        TAG_RE = re.compile(r'<[^>]+>')
        services = remove_tags(services).replace("\n", " ").replace("&nbsp;", "").replace(",", "")
    except Exception:
        services = "None"
        
    driver.close()

    df =  pd.DataFrame([name, description, services, website, employees, contact_person, address, telephone, revenue]).T
    df.columns = ["name", "description", "services", "website", "employees", "contact_person", "address", "telephone", "revenue"]

    return df

# Outpatient Care Centers Companies in Canada


In [21]:
# get naics
naics = get_naics()

6214


In [22]:
urls = []
for i in range(1, 3172):
    urls.append(f"https://www.dnb.com/business-directory/company-information.health_care_and_social_assistance.ca.html?page={i}")
    
len(urls)

3171

In [23]:
dnd_links = []
for url in urls[:20]:
    try:
        dnd_links.append(get_dnd_links(url))              
    except Exception:
        pass
    
len(dnd_links)

20

In [34]:
dnd_links = [item for sublist in dnd_links for item in sublist]
len(dnd_links)

1000

### 1st 320

In [None]:
dfs = []
for company in dnd_links:
    try:
        dfs.append(get_company_details(company))
        print(company.split(".")[3].split(".")[0], "---pass")
    except Exception:
        pass
    

In [None]:
mega = pd.concat(dfs)
mega.shape
mega.to_csv("output/outpatient_cares_0-320.csv")
# dfs

alberta_health_services ---pass
omers_private_equity_inc ---pass
the_winnipeg_regional_health_authority ---pass
provincial_health_services_authority ---pass
young_mens_christian_association_of_edmonton ---pass
centre_de_sant%C3%A9_et_de_services_sociaux_de_laval ---pass
vancouver_island_health_authority ---pass
university_health_network ---pass
interior_health_authority ---pass
centre_int%C3%A9gr%C3%A9_de_sant%C3%A9_et_de_services_sociaux_de_la_mont%C3%A9r%C3%A9gie-centre ---pass
prairie_mountain_health ---pass
eastern_regional_integrated_health_authority ---pass
mcgill_university_health_centre ---pass
regional_health_authority_nb ---pass
nova_scotia_health_authority ---pass
the_ottawa_hospital ---pass
extendicare_(canada)_inc ---pass
sunnybrook_health_sciences_centre ---pass
covenant_health ---pass
hamilton_health_sciences_corporation ---pass
gestion_global_excel_inc ---pass
chartwell_master_care_lp ---pass
centre_hospitalier_de_luniversit%C3%A9_de_montr%C3%A9al ---pass
trillium_healt

middlesex_london_health_unit ---pass
georgian_bay_general_hospital ---pass
association_des_b%C3%A9n%C3%A9voles_du_centre_de_sant%C3%A9_paul-gilbert ---pass


In [46]:
print(mega.shape)
mega.tail()

(320, 9)


Unnamed: 0,name,description,services,website,employees,contact_person,address,telephone,revenue
0,"Bethany Nursing Home of Camrose, Alberta","Bethany Nursing Home of Camrose, Alberta is lo...",...,\n ...,600,\n \n ...,\n ...,(780) 679-2000,$41.00 million
0,Progress Homes Inc,"Progress Homes Inc is located in St. John's, N...",...,\n ...,150,\n \n ...,\n ...,(709) 754-1165,$40.64 million
0,Sunbeam Community &amp; Developmental Services,Sunbeam Community &amp; Developmental Services...,...,\n ...,250,\n \n ...,\n ...,(519) 893-6200,$40.45 million
0,Stars Aviation Canada Inc,Stars Aviation Canada Inc is located in Calgar...,...,\n ...,150,\n \n ...,\n ...,(403) 295-1811,$40.05 million
0,Forensic Psychiatric Services Commission,Forensic Psychiatric Services Commission is lo...,...,\n ...,500,\n \n ...,\n ...,(604) 524-7700,$39.94 million


In [57]:
374 + 400

774

### 374 - 774

In [68]:
dfs = []
for company in dnd_links[374:774]:
    try:
        dfs.append(get_company_details(company))
        print(company.split(".")[3].split(".")[0], "---pass")
    except Exception:
        pass

the_childrens_aid_society_of_the_region_of_peel ---pass
the_canadian_national_institute_for_the_blind ---pass
coll%C3%A8ge_lasalle ---pass
centre_de_sante_et_de_services_sociaux_du_rocher-perce ---pass
hillsdale_estates_inc ---pass
9104-8306_qu%C3%A9bec_inc ---pass
childrens_aid_society_of_the_regional_municipality_of_waterloo_the ---pass
hay_river_health__social_services_authority ---pass
centre_de_sant%C3%A9_et_de_services_sociaux_du_granit ---pass
canadian_mental_health_association_toronto_branch_the ---pass
insight_medical_imaging ---pass
central_community_care_access_centre ---pass
world_vision_canada_- ---pass
st_josephs_villa_foundation_dundas ---pass
arnprior_regional_health ---pass
centre_de_recherche_interdisciplinaire_en_readaptation_du_montreal_metropolitain_crir ---pass
cerebral_palsy_parent_council_of_toronto ---pass
catholic_health_corporation_of_manitoba ---pass
toronto_french_school ---pass
mount_carmel_clinic ---pass
clsc_montr%C3%A9al-nord ---pass
lutheran_sunset_hom

the_beverly_centre_inc ---pass
sherbrooke_community_society_inc ---pass
jewish_home_for_the_aged_of_british_columbia ---pass
auxiliary_to_the_overlander_extended_care_hospital ---pass
st_patricks_mercy_home ---pass
northwood_homecare_ltd ---pass
classic_lifecare_ltd ---pass
tofield_health_center ---pass
hopital_du_st-sacrement_du_centre_hospitalier_affilie_uq ---pass
west_nipissing_general_hospital_the ---pass
northern_ontario_school_of_medicine ---pass
turning_point_youth_services ---pass
district_of_kenora_home_for_the_aged ---pass
westminster_house_society ---pass
kettle_friendship_society ---pass
acad%C3%A9mie_lafontaine_inc ---pass
kardel_consulting_services_inc ---pass
providence_place_for_holistic_health_inc ---pass
london_x-ray_associates ---pass
cosmopolitan_industries_ltd ---pass
penetanguishene_general_hospital_inc_the ---pass
thames_valley_childrens_centre ---pass
access_home_care_inc ---pass
north_shore_connexions_society ---pass
copernicus_lodge ---pass
the_religious_hosp

In [69]:
mega = pd.concat(dfs)
mega.shape
mega.to_csv("output/outpatient_cares_374_774.csv")
# dfs

In [70]:
len(dfs)

309

### 775 - 1000

In [None]:
dfs = []
for company in dnd_links[775:]:
    try:
        dfs.append(get_company_details(company))
        print(company.split(".")[3].split(".")[0], "---pass")
    except Exception:
        pass

the_calgary_fetal_alcohol_network_an_alberta_story ---pass
canadian_society_of_addiction_medicine ---pass
mental_illness_caregivers_association_of_canada ---pass
district_19_a_diabetes_foundation ---pass
ezer_mizion ---pass
kates_kause ---pass
groupe_en_toute_amitie_de_senneterre ---pass
societe_franco-ontarienne_de_lautisme ---pass
korean_canadian_alzheimers_society ---pass
delta_stroke_recovery_society ---pass
service_populaire_daccompagnement_psychosocial ---pass
fondation_claude-durocher ---pass
dil_walk_foundation ---pass
la_maison_des_trois_colombes_2014 ---pass
nepal_house_society ---pass
aidants_unis_pour_recreer_ensemble_des_soutiens ---pass
oceanside_stroke_recovery_society ---pass
alzheimer_society_of_nova_scotia ---pass
planned_parenthood_ottawa-carleton_planning_des_naissances_dottawa-carleton ---pass
peace_by_piece-autism_development_and_sensory_centre ---pass
sovereign_order_of_st_john_of_jerusalem_knights_hospitaller ---pass
john_rudy_health_resource_centre_association 

In [None]:
mega = pd.concat(dfs)
mega.shape
mega.to_csv("output/outpatient_cares_775_1000.csv")
# dfs

### 50k

In [None]:
dnd_links = []
for url in urls[20:]:
    try:
        dnd_links.append(get_dnd_links(url))              
    except Exception:
        pass
    
len(dnd_links)

In [None]:
dnd_links = [item for sublist in dnd_links for item in sublist]
len(dnd_links)

In [None]:
dfs = []
for company in dnd_links:
    try:
        dfs.append(get_company_details(company))
        print(company.split(".")[3].split(".")[0], "---pass")
    except Exception:
        pass

In [None]:
mega = pd.concat(dfs)
mega.shape
mega.to_csv("output/outpatient_cares_1000....csv")
# dfs

In [None]:
# dnd_links = []
# for url in urls[20:100]:
#     try:
#         dnd_links.append(get_dnd_links(url))              
#     except Exception:
#         pass
    
# len(dnd_links)

In [None]:
# dfs_ = []
# for company in dnd_links:
#     try:
#         dfs_.append(get_company_details(company))
#         print(company.split(".")[3].split(".")[0], "---pass")
#     except Exception:
#         pass

In [None]:
# m = pd.concat(dfs_)
# m.to_csv("outpatient_cares_1.csv")
# # dfs

In [None]:
# ## dnd
# # names, physical location, sales revenue($M)
# cap = DesiredCapabilities().FIREFOX
# cap["marionette"] = False
# driver = webdriver.Firefox(capabilities=cap, executable_path="/home/ada/Downloads/geckodriver")

# driver.get('https://www.dnb.com/business-directory/company-information.health_care_and_social_assistance.ca.html?page=2')

# # links = driver.find_elements_by_class_name("col-md-12 data")
# vals = driver.find_element(By.TAG_NAME, 'div')
# vals = vals.find_elements(By.ID, value="companyResults")
# print('length of links are: ', len(vals))
# names = []
# for i in range(len(vals)):
#     names.append(vals[i].text)


# driver.close()