# Imports

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.firefox.options import Options


import re
import warnings

import pandas as pd
import numpy as np

In [2]:
# Raise pandas' display limits so wide/long scraped frames print in full.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

# Suppress every warning for the rest of the session.
warnings.filterwarnings(action="ignore")

# Helper Functions

In [4]:
# Default webdriver options, shared by every helper below: each helper passes
# this module-level `options` object to `webdriver.Firefox(...)`.
options = Options()
# Pass the '--headless' command-line argument to Firefox so the scraping
# helpers do not open a visible browser window.
options.add_argument('--headless')

def get_naics(
    url="https://www.dnb.com/business-directory/company-information.outpatient_care_centers.ca.html",
    driver_path="/home/ada/Downloads/geckodriver"
    ):
    """Return the NAICS text shown on a dnb.com industry listing page.

    Opens `url` in a fresh Firefox session (using the module-level
    `options`), reads the innerHTML of the span under
    ``div.codes-title naics`` and keeps only the part before the first
    ``&``.  The value is also printed.

    Args:
        url: Listing page to open.
        driver_path: Filesystem path of the geckodriver executable.

    Returns:
        The extracted text as a string.

    Raises:
        Any Selenium error raised while loading the page or locating the
        element is propagated; the browser session is shut down first.
    """
    # Work on a copy so the shared DesiredCapabilities.FIREFOX dict is not
    # mutated for every other user of the class.
    cap = DesiredCapabilities.FIREFOX.copy()
    cap["marionette"] = False
    driver = webdriver.Firefox(capabilities=cap, executable_path=driver_path, options=options)
    try:
        driver.get(url)
        vals = driver.find_element(
            By.XPATH, "//div[@class='codes-title naics']/span"
        ).get_attribute("innerHTML")
    finally:
        # quit() (not close()) ends the session and the driver process, even
        # when the page load or lookup above fails.
        driver.quit()

    vals = vals.split("&")[0]
    print(vals)

    return vals




def get_dnd_links(
    url = "https://www.dnb.com/business-directory/company-information.health_care_and_social_assistance.ca.html?page=2",
    driver_path = "/home/ada/Downloads/geckodriver"
):
    """Collect the company-profile links found on one listing page.

    Opens `url` in a fresh Firefox session (using the module-level
    `options`), gathers the ``href`` of every ``<a href=...>`` element and
    keeps those containing ``"company-profiles"``.

    Args:
        url: Listing page to open.
        driver_path: Filesystem path of the geckodriver executable.

    Returns:
        A list of href strings, in page order.

    Raises:
        Any Selenium error raised while loading or reading the page is
        propagated; the browser session is shut down first.
    """
    # Work on a copy so the shared DesiredCapabilities.FIREFOX dict is not
    # mutated for every other user of the class.
    cap = DesiredCapabilities.FIREFOX.copy()
    cap["marionette"] = False
    driver = webdriver.Firefox(capabilities=cap, executable_path=driver_path, options=options)

    try:
        driver.get(url)
        anchors = driver.find_elements(By.XPATH, "//a[@href]")
        # Read the attributes while the session is still alive.
        hrefs = [anchor.get_attribute("href") for anchor in anchors]
    finally:
        # quit() (not close()) ends the session and the driver process, even
        # when the page load or lookup above fails.
        driver.quit()

    # `href and ...` skips a missing attribute value instead of raising.
    return [href for href in hrefs if href and "company-profiles" in href]

    
# Matches anything that looks like an HTML tag; compiled once for all calls.
_TAG_RE = re.compile(r'<[^>]+>')


def _remove_tags(text):
    """Return `text` with every HTML-tag-like substring removed."""
    return _TAG_RE.sub('', text)


def _read_data_point(driver, container_class, point_name, transform=None):
    """Return one company data point's innerHTML, or "None" on any failure.

    The element looked up is
    ``//div[@class=container_class]/span[@class='company_data_point' and
    @name=point_name]/span``.  When `transform` is given it is applied to the
    innerHTML; an error raised by the lookup or by `transform` yields the
    string "None" (best-effort scraping: a missing field must not abort the
    whole profile).
    """
    xpath = (
        f"//div[@class='{container_class}']"
        f"/span[@class='company_data_point' and @name='{point_name}']/span"
    )
    try:
        raw = driver.find_element(By.XPATH, xpath).get_attribute("innerHTML")
        return raw if transform is None else transform(raw)
    except Exception:
        return "None"


def get_company_details(
    dnd_url = "https://www.dnb.com/business-directory/company-profiles.alberta_health_services.129b1f80ef8deb9b06ed739f80223ccb.html",
    driver_path = "/home/ada/Downloads/geckodriver"
):
    """Scrape one company profile page into a single-row DataFrame.

    Opens `dnd_url` in a fresh Firefox session (using the module-level
    `options`) and reads nine data points from the page.  Any data point
    that cannot be read is stored as the string "None".

    Args:
        dnd_url: Company profile page to open.
        driver_path: Filesystem path of the geckodriver executable.

    Returns:
        A one-row pandas DataFrame with the columns ``name``,
        ``description``, ``services``, ``website``, ``employees``,
        ``contact_person``, ``address``, ``telephone`` and ``revenue``.

    Raises:
        Errors from starting the browser or loading the page are propagated;
        the browser session is shut down first.
    """
    # Work on a copy so the shared DesiredCapabilities.FIREFOX dict is not
    # mutated for every other user of the class.
    cap = DesiredCapabilities.FIREFOX.copy()
    cap["marionette"] = False
    driver = webdriver.Firefox(capabilities=cap, executable_path=driver_path, options=options)

    contact_block = "col-md-11"
    summary_block = "margin-between-rows"

    try:
        driver.get(dnd_url)

        # Keep only the text before the first nested tag.
        address = _read_data_point(
            driver, contact_block, "company_address",
            lambda html: html.split("<")[0])
        telephone = _read_data_point(driver, contact_block, "company_phone")
        # The value is wrapped in one tag: take the text between its
        # opening ">" and the following "<".
        website = _read_data_point(
            driver, contact_block, "company_website",
            lambda html: html.split(">")[1].split("<")[0])
        employees = _read_data_point(driver, contact_block, "employees_all_site")
        revenue = _read_data_point(driver, contact_block, "revenue_in_us_dollar")

        name = _read_data_point(driver, summary_block, "company_name")
        description = _read_data_point(driver, summary_block, "company_description")
        # Drop everything from the first "&" (start of an HTML entity) onwards.
        contact_person = _read_data_point(
            driver, summary_block, "key_principal",
            lambda html: html.split("&")[0])
        # Strip markup, then flatten newlines and remove "&nbsp;" and commas.
        services = _read_data_point(
            driver, summary_block, "industry_links",
            lambda html: _remove_tags(html)
            .replace("\n", " ").replace("&nbsp;", "").replace(",", ""))
    finally:
        # quit() (not close()) ends the session and the driver process, even
        # when the page load above fails.
        driver.quit()

    columns = ["name", "description", "services", "website", "employees",
               "contact_person", "address", "telephone", "revenue"]
    row = [name, description, services, website, employees,
           contact_person, address, telephone, revenue]
    df = pd.DataFrame([row], columns=columns)

    return df

# Professional, Scientific, And Technical Services Industry

## NAICS

In [5]:
# Read the NAICS text from page 1 of the professional/scientific/technical
# services listing; get_naics also prints the value it returns.
naics = get_naics("https://www.dnb.com/business-directory/company-information.professional_scientific_and_technical_services.ca.html?page=1")

54


## DND Urls

In [6]:
# diff page urls
# One listing-page URL per page number, pages 1 through 5499.
# NOTE(review): this section is headed "Professional, Scientific, And
# Technical Services" but the URL targets health_care_and_social_assistance
# -- confirm which industry is intended.
urls = [
    f"https://www.dnb.com/business-directory/company-information.health_care_and_social_assistance.ca.html?page={page}"
    for page in range(1, 5500)
]

len(urls)

5499

In [None]:
# Crawl the first 1000 listing pages; each entry appended to dnd_links is the
# list of company-profile links returned for one page.
dnd_links = []
for url in urls[:1000]:
    try:
        dnd_links.append(get_dnd_links(url))
    except Exception as exc:
        # Best-effort crawl: keep going, but report the page that failed
        # instead of dropping it silently.
        print(f"{url} ---fail: {exc!r}")

len(dnd_links)

In [None]:
# Save the collected links, one URL per line.  At this point dnd_links holds
# one list of links per crawled page, so each inner list is expanded rather
# than written as a Python list repr.
with open('1st50k_coy_urls.txt', 'w', encoding='utf-8') as f:
    for item in dnd_links:
        # Tolerate an already-flattened list (items are plain strings).
        page_links = [item] if isinstance(item, str) else item
        f.writelines(f"{link}\n" for link in page_links)

In [None]:
# Flatten the per-page lists into a single list of links, keeping page order.
flat_links = []
for page_links in dnd_links:
    flat_links.extend(page_links)
dnd_links = flat_links
len(dnd_links)

# 1st 50k companies

In [None]:
# Scrape every company profile; dfs collects one single-row DataFrame each.
dfs = []
for company in dnd_links:
    try:
        details = get_company_details(company)
    except Exception as exc:
        # Best-effort crawl: keep going, but report the profile that failed
        # instead of dropping it silently.
        print(f"{company} ---fail: {exc!r}")
        continue

    dfs.append(details)
    # The fourth dot-separated piece of a profile URL is the company slug;
    # fall back to the whole URL when the link has an unexpected shape.
    pieces = company.split(".")
    print(pieces[3] if len(pieces) > 3 else company, "---pass")