# Imports

In [1]:
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup, SoupStrainer
from urllib.parse import urlsplit

import pandas as pd
import numpy as np

import os, sys, httplib2, json, fire, re, string, requests
from collections import OrderedDict, deque
import re, requests, requests.exceptions

import warnings
warnings.filterwarnings("ignore")


pd.set_option("display.max.columns", None)
pd.set_option("display.max.rows", None)

# Scraping Helper Functions

In [2]:
def simple_get(url):
    """
    Attempts to get the content at `url` by making an HTTP GET request.
    If the content-type of response is some kind of HTML/XML, return the
    text content, otherwise return None.
    """
    try:
        with closing(get(url, stream=True)) as resp:
            if is_good_response(resp):
                return resp.content
            else:
                return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None


def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.
    """
    content_type = resp.headers['Content-Type'].lower()
    return (resp.status_code == 200 
            and content_type is not None 
            and content_type.find('html') > -1)


def log_error(e):
    """
    It is always a good idea to log errors. 
    This function just prints them, but you can
    make it do anything.
    """
    print(e)
    
def get_elements(url, tag='',search={}, fname=None):
    """
    Downloads a page specified by the url parameter
    and returns a list of strings, one per tag element
    """
    
    if isinstance(url,str):
        response = simple_get(url)
    else:
        #if already it is a loaded html page
        response = url

    if response is not None:
        html = BeautifulSoup(response, 'html.parser')
        
        res = []
        if tag:    
            for li in html.select(tag):
                for name in li.text.split('\n'):
                    if len(name) > 0:
                        res.append(name.strip())
                       
                
        if search:
            soup = html            
            
            
            r = ''
            if 'find' in search.keys():
                print('findaing',search['find'])
                soup = soup.find(**search['find'])
                r = soup

                
            if 'find_all' in search.keys():
                print('findaing all of',search['find_all'])
                r = soup.find_all(**search['find_all'])
   
            if r:
                for x in list(r):
                    if len(x) > 0:
                        res.extend(x)
            
        return res
    
def get_tag_elements(url, tag='h2'):
    """
    Downloads a page specified by the url parameter
    and returns a list of strings, one per tag element
    """
    
    response = simple_get(url)

    if response is not None:
        html = BeautifulSoup(response, 'html.parser')
        names = set()
        for li in html.select(tag):
            for name in li.text.split('\n'):
                if len(name) > 0:
                    names.add(name.strip())
        return list(names)

    # Raise an exception if we failed to get any data from the url
    raise Exception('Error retrieving contents at {}'.format(url)) 
    
    
if get_ipython().__class__.__name__ == '__main__':
    fire(get_tag_elements)

In [3]:
# links
def get_links(url):
    
    http = httplib2.Http()
    status, response = http.request(url)

    links = []
    for link in BeautifulSoup(response, parse_only=SoupStrainer('a')):
        links.append(str(link))
        
    return links



def get_info(info_):
    
    df_ = {}
    df_["Name"] = info_[1]
    df_["industry"] = info_[0]
    df_["phyical_address"] = info_[2]
    df_["tel"] = [info for info in info_ if "Tel" in info]
    df_["fax"] = [info for info in info_ if "Fax" in info]
    df_["site"] = info_[info_.index("Website and Email") + 1]
    df_["contact_person"] = info_[info_.index("Website and Email") + 3]
    df_["est"] = [info for info in info_ if "Est" in info]
    df_["rev"] = [info for info in info_ if "Rev" in info]
    df_["staff"] = [info for info in info_ if "Staff" in info]
    df_["summary"] = info_[info_.index("Summary") + 1]
    df_["products"] = info_[info_.index("Summary") - 1] + ", " + info_[info_.index("Product / Service Areas") + 1]
    
    return df_

# contactCanada

[https://www.contactcanada.com/database/companies.php]()

In [4]:
def contactCanada(urls, naics_code):

    # names
    names = []
    for site in urls:
        stuff = get_elements(site, tag='ul',search={}, fname=None)
        names.append(stuff[2:-2])


    names_ = [item for sublist in names for item in sublist]
    print(len(names_))

    links = []

    for site in urls:
        linkss = get_links(site)
        linkss = [link for link in linkss if "freesearch" in link][1:]
        links.append(linkss)

    links_ = [item for sublist in links for item in sublist]

    links_ = [url.split('"')[1] for url in links_]
    print(len(links_))

    # name df
    df_ = pd.DataFrame(columns=["name", "url"])
    df_["name"] = names_
    df_["url"] = links_

    df_["url"] = "https://www.contactcanada.com/database/" + df_.url
    df_.url = df_.url.str.replace("amp;", "")
    df_.head()

    # further details
    infos = []
    for url in df_.url.values:
        infos.append(get_elements(url, tag='div',search={"class": "profileWrapper layoutSixth"}, fname=None))

    updated =[]
    for info in infos:
        updated.append(list(OrderedDict.fromkeys(info))[2:])

    print(len(updated))

    all_ = []
    for info in updated:
        all_.append(get_info(info))

    len(all_)

    df_ = pd.DataFrame.from_dict(all_)
    df_.naics_code = naics_code
    display(df_.sample(3))
    
    return df_

In [5]:
# - https://www.contactcanada.com/database/companies.php?portal=3&s=0&l=90
    
__ = []

for i in range(0, 990, 90):
    url = f"https://www.contactcanada.com/database/companies.php?portal=3&s={i}&l=90"
    __.append(url)
    

nat_health = contactCanada(urls=__, naics_code=446191)

Error during requests to https://www.contactcanada.com/database/companies.php?portal=3&s=0&l=90 : HTTPSConnectionPool(host='www.contactcanada.com', port=443): Max retries exceeded with url: /database/companies.php?portal=3&s=0&l=90 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f8472fdb310>: Failed to establish a new connection: [Errno 101] Network is unreachable'))


TypeError: 'NoneType' object is not subscriptable

In [None]:
# - https://www.contactcanada.com/database/companies.php?portal=9&s=0&l=90
    
__ = []

for i in range(0, 990, 90):
    url = f"https://www.contactcanada.com/database/companies.php?portal=9&s={i}&l=90"
    __.append(url)
    

sea_food = contactCanada(urls=__, naics_code=3117)

In [None]:
# - https://www.contactcanada.com/database/companies.php?portal=10&s=0&l=90
    
__ = []

for i in range(0, 990, 90):
    url = f"https://www.contactcanada.com/database/companies.php?portal=10&s={i}&l=90"
    __.append(url)
    

food_groc = contactCanada(urls=__, naics_code=4451)

990
990
990


Unnamed: 0,Name,industry,phyical_address,tel,fax,site,contact_person,est,rev,staff,summary,products
466,Honest Dumplings Ltd.,A Canadian Food & Grocery Profile,10552 114 St.Edmonton AB T5H 3J7Canada,[Tel: 780-240-1378],[],www.honestdumplings.ca,CEO: Mr. Chris Lerohl,[Est: 2014 - Private],[Rev: $ M],[Staff:],"1, All-natural gourmet frozen dumplings with a...","Vegan, Frozen Foods"
89,Bernardin Ltd.,A Canadian Food & Grocery Profile,"845 Intermodal Dr., Unit 1Brampton ON L6T 0C6...",[Tel: 888-430-4231],[Fax: 905-793-9798],www.bernardin.ca,Mgmt:,[Est:],[Rev: $ M],[Staff:],"For over 100 years, home canning has been an e...","Canned Food/Meat, Canned Food/Meat"
820,PreGel Canada,A Canadian Food & Grocery Profile,221 Don Hillock Dr.Aurora ON L4G 0K2Canada,[Tel: 905-727-3068],[],www.pregelcanada.com,Mgmt: Luca Costella,[Est: 2008],[Rev: $ M],[Staff: 1–10],PreGel Canada is the Canadian division of PreG...,"Kosher/Parve, Extracts"


In [None]:
nat_health["naics"] = 446191
sea_food["naics"] = 3117
food_groc["naics"] = 4451

df = pd.concat([nat_health, sea_food, food_groc])
df.shape, nat_health.shape, sea_food.shape, food_groc.shape

((2854, 13), (953, 13), (911, 13), (990, 13))

In [None]:
df.sample(3)

Unnamed: 0,Name,industry,phyical_address,tel,fax,site,contact_person,est,rev,staff,summary,products,naics
686,Atlantic Mariculture Ltd.,A Canadian Seafood Profile,PO Box 924Grand Manan NB E5G 4M2 Canada,[Tel: 506-662-8150],[Fax: 506-662-8850],www.organicdulse.com,Jayne Turner,[Est: 1974],[],[Staff: 10–50],"Markets: Canada, United States, Europe, Asia.","Staff: 10–50, Dulse",3117
28,Allseas Fisheries Corp.,A Canadian Seafood Profile,55 Vansco RoadToronto ON M8Z 5Z8 Canada,[Tel: 416-255-3474],[Fax: 416-255-0181],Key Contacts,Corporate Data,[Est:],[],[Staff: 100–500],Markets: Worldwide.www.allseas.net,"Staff: 100–500, Seafood-Various",3117
77,Brown Line LLC,A Canadian Seafood Profile,3814 Old Highway 99 SouthMount Vernon WA 9827...,[Tel: 800-426-2050],[Fax: 800-247-7339],www.shipbll.com,Steve McQueary,[Est: 1979],[],[Staff: 100–500],Providing temperature-control truckload & less...,"Staff: 100–500, Logistics/Transport",3117


## get emails

In [None]:
def get_email(url):

    unprocessed_urls = deque([url])

    processed_urls = set()

    emails = set()
    url = url

    print("Crawling URL %s" % url)
    response = requests.get(url)

    new_emails = set(re.findall(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+", response.text, re.I))
    new_emails = [mail for mail in new_emails if ".png" not in mail]
    new_emails = [mail for mail in new_emails if ".jpg" not in mail]
    
    emails.update(new_emails)
    
    return emails

In [None]:
mails = []

for site in df.site.values:
    try:
        mail = get_email("http://" + site)
        
    except Exception:
        try:
            mail = get_email("https://" + site)
        
        except Exception:
            mail = "null"
            pass
    
    mails.append(mail)

In [None]:
sub = df.iloc[:1867]
sub["mails"] = mails
print(sub.shape)

(1867, 14)


## clean df

In [None]:
df = pd.read_csv("backup_cc.csv")
sub = df

In [None]:
sub.head(2)

Unnamed: 0.1,Unnamed: 0,Name,industry,phyical_address,tel,fax,site,contact_person,est,rev,staff,summary,products,naics,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27,Unnamed: 28,Unnamed: 29,Unnamed: 30,Unnamed: 31,Unnamed: 32,Unnamed: 33
0,0,2598040 Ont. Inc dba Serenity2000,A Natural Health Products Canada Profile,11-190 Don Park RdMarkham ON L3R 2V8Canada,['Tel: 905-470-2270'],['Fax: 800-583-7477'],www.serenity2000.com,Pres: Mr. Gabor Zoltan,"['Est: 1991 - Private', 'Established in 1991, ...",['Rev: $ M'],['Staff: 1–10'],"Established in 1991, Serenity 2000 is proud to...","Therapy Products, Magnets",446191,,,,,,,,,,,,,,,,,,,,
1,1,52 North Beverages,A Natural Health Products Canada Profile,22 - 8980 Fraserwood Ct.Burnaby BC V5J 5H7Canada,['Tel: 604-218-8994'],[],www.52north.ca,Mgmt: Sarah Kendrick Wall,['Est:'],['Rev: $ M'],['Staff:'],52° North Beverages harvests Birch Sap and Spr...,"Teas-Herbal, Beverages",446191,,,,,,,,,,,,,,,,,,,,


In [None]:
# sub.tel = [','.join(map(str, l)) for l in sub['tel']]
# sub.fax = [','.join(map(str, l)) for l in sub['fax']]
# sub.est = [','.join(map(str, l)) for l in sub['est']]
# sub.rev = [','.join(map(str, l)) for l in sub['rev']]
# sub.staff = [','.join(map(str, l)) for l in sub['staff']]


print(sub.shape)
sub = sub[(sub['tel'].str.len() > 4) & (sub['fax'].str.len() > 4)]
sub.shape

(1932, 34)


(1351, 34)

In [None]:
# tel
sub.tel = sub.tel.str[6:-2]

# fax
sub.fax = sub.fax.str[6:-2]

# est
sub.est = sub.est.astype(str)
ests = []
for text in sub.est.values:
    ests.append([int(s) for s in text.split() if s.isdigit()])
sub.est = ests
sub.est = [','.join(map(str, l)) for l in sub['est']]
sub.est = sub.est.str[:5]

# rev -- drop the column cause most of the entries are null
sub = sub.drop(["rev"], axis=1)

# staff
sub.staff = sub.staff.astype(str)
sub.staff = sub.staff.str[8:-2]

# products
prods = [x.replace('Staff','') for x in sub.products.values]
prods = [x.replace(':–,','') for x in prods]

sub["products"] = prods
sub['products'] = sub['products'].str.replace('\d+', '')

# addresses
sub["physical_address"] = sub.phyical_address
sub["mailing_address"] = sub.phyical_address

In [None]:
sub_ = sub.loc[(sub['contact_person'].str.len() > 12)]
sub_ = sub_.iloc[:941]
sub_.shape, sub.shape

((941, 35), (1351, 35))

In [None]:
sub_["contact_title"] = sub_.contact_person.str.split(":").str[0]
sub_["contact_name"] = sub_.contact_person.str.split(":").str[-1]

In [None]:
sub_.rename(columns = {
    "Name":"business_name",
    "tel": "contact_phone",
    "site": "website",
    "description": "business_activity",
    "est": "year_founded",
    "products": "products_services",
}, inplace=True)


In [None]:
sub_ = sub_.loc[sub_.website != "Key Contacts"]
sub_.drop(["phyical_address", "contact_person", "Unnamed: 0", "Unnamed: 14", "Unnamed: 15",
          "Unnamed: 16", "Unnamed: 17", "Unnamed: 18", "Unnamed: 19",
          "Unnamed: 20", "Unnamed: 21", "Unnamed: 22", "Unnamed: 23",
          "Unnamed: 24", "Unnamed: 25", "Unnamed: 26", "Unnamed: 27",
          "Unnamed: 28", "Unnamed: 29", "Unnamed: 30", "Unnamed: 31",
          "Unnamed: 32", "Unnamed: 33"], axis=1, inplace=True)
sub_.shape

(932, 14)

In [None]:
sub_.head()

Unnamed: 0,business_name,industry,contact_phone,fax,website,year_founded,staff,summary,products_services,naics,physical_address,mailing_address,contact_title,contact_name
0,2598040 Ont. Inc dba Serenity2000,A Natural Health Products Canada Profile,905-470-2270,800-583-7477,www.serenity2000.com,1991.0,1–10,"Established in 1991, Serenity 2000 is proud to...","Therapy Products, Magnets",446191,11-190 Don Park RdMarkham ON L3R 2V8Canada,11-190 Don Park RdMarkham ON L3R 2V8Canada,Pres,Mr. Gabor Zoltan
2,A & A Pharmachem Inc.,A Natural Health Products Canada Profile,613-224-1234,613-228-2840,www.aapharmachem.com,1992.0,10–50,A&A Pharmachem is a professionally managed imp...,"Consulting, Nutraceuticals",446191,4 - 77 Auriga DriveOttawa ON K2E 7Z7Canada,4 - 77 Auriga DriveOttawa ON K2E 7Z7Canada,Mgmt,Nav Aggarwal
3,A & E Fine Foods,A Natural Health Products Canada Profile,905-478-1500,905-478-1502,www.aefinefood.com,1978.0,1–10,Organic and Natural Food Product distributor a...,"Natural Products, Distributor",446191,19811 Woodbine AvenueQueensville ON L0G 1R0Ca...,19811 Woodbine AvenueQueensville ON L0G 1R0Ca...,Mgmt,Peter Radvanszky
4,A.D.S. Enterprises,A Natural Health Products Canada Profile,450-435-4161,888-435-6150,www.ADSInternet.ca,1986.0,10–50,"For over 30 years, A.D.S. is a Manufacturer, I...","% Natural, Importer",446191,17505 Val D'espoirMirabel QC J7J 1M3Canada,17505 Val D'espoirMirabel QC J7J 1M3Canada,Mgmt,Mr. Richard Provost
5,Absorb Science Inc.,A Natural Health Products Canada Profile,604-929-6789,604-929-6786,www.absorbscience.com,,,Absorb Science has an established reputation a...,"Digestive Aids, Herbal Remedies",446191,104-3728 North Fraser WayBurnaby BC V5J 5G1Ca...,104-3728 North Fraser WayBurnaby BC V5J 5G1Ca...,Mgmt,Dr. Massoud Eftekhari


In [None]:
sub_.to_csv("canadianCompanies.csv")

### NOTES: In progress (kernel still runnig) + To Do.

1. Emails are being scraped -- will add this once done.
2. split address to generate -- city, province, postal.
3. Scrap other sites e.g -- https://www.yellowpages.ca/business/
4. Wrap code in  a python script to source it directly on a notebook or on terminal.
5. Sanity checks on the resulting df + further cleaning that may arise.
