### Northwest Territories Child Care
Retrieved from: https://www.ece.gov.nt.ca/en/childcare?page=4

Robots.txt: https://www.ece.gov.nt.ca/robots.txt

User-agent: *

Crawl-delay: 10s

In [1]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from requests.auth import AuthBase
import time
import random
from logger import logging

In [2]:
headers = {
    'User-Agent': 'kaitlyn hobbs bot version 1.0',
    'From': 'kaitlyn.hobbs@statcan.gc.ca'
}

#### Execution log:

In [None]:
FORMAT = "%(asctime)-15s %(clientip)s %(user)-8s %(message)s"
logging.basicConfig(filename="logfilename.log", format=FORMAT)
logging.info('Execution of MB child care web scrape. \n{}.'.format(headers))

#### Execute scrape:

In [36]:
name = []
address = []
location = []
status = []
contact = []
phone = []
links = set()

r = requests.get('https://www.ece.gov.nt.ca/en/childcare')
s = BeautifulSoup(r.text)

tables = s.find_all('td')
for td in tables:
    try:
        links.add(td.findChild()['href'])
    except TypeError:
        pass

<mark> Slow script below </mark>

In [38]:
# crawl through each facility link and scrape information
crawl = 'https://www.ece.gov.nt.ca'

# First page - because the domain name does not change
counter = 1
for l in links:
    print("{} of {} links".format(counter, len(links)))
    r = requests.get(crawl+l)
    s = BeautifulSoup(r.text)
    
    # gather data
    name.append(s.find('h1', {'id':'page-title'}).text)
    
    if s.find('div', {'class':'field field-name-field-address field-type-text field-label-above'}):
        address.append(s.find('div', {'class':'field field-name-field-address field-type-text field-label-above'}).findChild().findNext().text)
    else:
        address.append("none")
        
    if s.find('div', {'class':"field field-name-field-status field-type-list-text field-label-inline clearfix"}):
        status.append(s.find('div', {'class':"field field-name-field-status field-type-list-text field-label-inline clearfix"}).findChild().findNext().text)
    else:
        status.append("none")
        
    if s.find_all('div', {'class':'field field-name-field-location field-type-taxonomy-term-reference field-label-hidden'}):
        location.append(s.find('div', {'class':'field field-name-field-location field-type-taxonomy-term-reference field-label-hidden'}).findChild().text)
    else:
        location.append("none")
        
    if s.find_all('div', {'class':'field field-name-field-contact-name field-type-text field-label-inline clearfix'}):
        contact.append(s.find('div', {'class':'field field-name-field-contact-name field-type-text field-label-inline clearfix'}).findChild().findNext().text)
    else:
        contact.append("none")

    if s.find_all('div', {'class':'field field-name-field-company-phones field-type-multifield field-label-above'}):
        phone.append(s.find('div', {'class':'field field-name-field-company-phones field-type-multifield field-label-above'}).findChild().findNext().text)
    else:
        phone.append("none")
        
    time.sleep(random.uniform(10,15))
    counter +=1

1 of 37 links
2 of 37 links
3 of 37 links
4 of 37 links
5 of 37 links
6 of 37 links
7 of 37 links
8 of 37 links
9 of 37 links
10 of 37 links
11 of 37 links
12 of 37 links
13 of 37 links
14 of 37 links
15 of 37 links
16 of 37 links
17 of 37 links
18 of 37 links
19 of 37 links
20 of 37 links
21 of 37 links
22 of 37 links
23 of 37 links
24 of 37 links
25 of 37 links
26 of 37 links
27 of 37 links
28 of 37 links
29 of 37 links
30 of 37 links
31 of 37 links
32 of 37 links
33 of 37 links
34 of 37 links
35 of 37 links
36 of 37 links
37 of 37 links


In [39]:
# # Remaining pages - iterate through each listed facility link
domain = "https://www.ece.gov.nt.ca/en/childcare?page="
pagelinks = set()

for i in range(1,5):
    print("Page {} of 5".format(i))
    r = requests.get(domain + str(i), headers = headers)
    s = BeautifulSoup(r.text)
    
    column = s.find_all('td', {'class':'views-field views-field-title active'})
    for c in column:
        pagelinks.add(c.find('a')['href'])

    counter = 1
    for p in pagelinks:
        print("{} of {} links".format(counter, len(pagelinks)))
        r = requests.get(crawl+p)
        s = BeautifulSoup(r.text)
        
        name.append(s.find('h1', {'id':'page-title'}).text)

        if s.find('div', {'class':'field field-name-field-address field-type-text field-label-above'}):
            address.append(s.find('div', {'class':'field field-name-field-address field-type-text field-label-above'}).findChild().findNext().text)
        else:
            address.append("none")

        if s.find('div', {'class':"field field-name-field-status field-type-list-text field-label-inline clearfix"}):
            status.append(s.find('div', {'class':"field field-name-field-status field-type-list-text field-label-inline clearfix"}).findChild().findNext().text)
        else:
            status.append("none")

        if s.find_all('div', {'class':'field field-name-field-location field-type-taxonomy-term-reference field-label-hidden'}):
            location.append(s.find('div', {'class':'field field-name-field-location field-type-taxonomy-term-reference field-label-hidden'}).findChild().text)
        else:
            location.append("none")

        if s.find_all('div', {'class':'field field-name-field-contact-name field-type-text field-label-inline clearfix'}):
            contact.append(s.find('div', {'class':'field field-name-field-contact-name field-type-text field-label-inline clearfix'}).findChild().findNext().text)
        else:
            contact.append("none")

        if s.find_all('div', {'class':'field field-name-field-company-phones field-type-multifield field-label-above'}):
            phone.append(s.find('div', {'class':'field field-name-field-company-phones field-type-multifield field-label-above'}).findChild().findNext().text)
        else:
            phone.append("none")
            
        counter +=1
            
        time.sleep(random.uniform(10,15))

Page 1 of 5
1 of 25 links
2 of 25 links
3 of 25 links
4 of 25 links
5 of 25 links
6 of 25 links
7 of 25 links
8 of 25 links
9 of 25 links
10 of 25 links
11 of 25 links
12 of 25 links
13 of 25 links
14 of 25 links
15 of 25 links
16 of 25 links
17 of 25 links
18 of 25 links
19 of 25 links
20 of 25 links
21 of 25 links
22 of 25 links
23 of 25 links
24 of 25 links
25 of 25 links
Page 2 of 5
1 of 50 links
2 of 50 links
3 of 50 links
4 of 50 links
5 of 50 links
6 of 50 links
7 of 50 links
8 of 50 links
9 of 50 links
10 of 50 links
11 of 50 links
12 of 50 links
13 of 50 links
14 of 50 links
15 of 50 links
16 of 50 links
17 of 50 links
18 of 50 links
19 of 50 links
20 of 50 links
21 of 50 links
22 of 50 links
23 of 50 links
24 of 50 links
25 of 50 links
26 of 50 links
27 of 50 links
28 of 50 links
29 of 50 links
30 of 50 links
31 of 50 links
32 of 50 links
33 of 50 links
34 of 50 links
35 of 50 links
36 of 50 links
37 of 50 links
38 of 50 links
39 of 50 links
40 of 50 links
41 of 50 links
42 o

#### Consolidate, Clean & Export:

In [40]:
final = pd.DataFrame({'facility_name':name, 'address':address, 'city':location, 'status':status, 'contact':contact, "phone":phone})

In [43]:
final.phone.replace("[Telephone]","", inplace = True, regex=True)
final.phone = final.phone.str.strip()
final.phone = final.phone.str.replace('Fax', ' Fax: ')
final.phone = final.phone.str.replace('WbsitYWCANW.ca', '')
final.phone = final.phone.str.replace('P', 'P: ')

cols = ['facility_name', 'address', 'city', 'status', 'contact', 'phone']
for c in cols:
    final[c] = final[c].str.encode('ascii', 'ignore').str.decode('ascii')

In [42]:
final.info()
final.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 282 entries, 0 to 281
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   facility_name  282 non-null    object
 1   address        282 non-null    object
 2   city           282 non-null    object
 3   status         282 non-null    object
 4   contact        282 non-null    object
 5   phone          282 non-null    object
dtypes: object(6)
memory usage: 13.3+ KB


Unnamed: 0,facility_name,address,city,status,contact,phone
0,Ndilǫ,none,none,none,none,
1,Dancing Lights Family Day Home,none,Yellowknife,Open,none,
2,Chickadee Childcare,none,Yellowknife,Open,none,
3,Délı̨ne Preschool,none,Délı̨ne,Open,none,(867) 589-3000
4,Dezona Ka Zue Child Care,none,Fort Liard,Open,none,


In [50]:
final.to_csv('data/childcare/NT-childcare.csv')