**IN:** raw data 

**OUT:** input data for opentab

In [2]:
import pandas as pd
import os
import glob
import re
import numpy as np

In [3]:
# Work from the raw child-care data directory; the relative file names used by the
# glob and read_csv calls later in the notebook are resolved against it.
# NOTE(review): hard-coded absolute path -- the notebook only runs on a machine with this layout.
os.chdir('/Users/kt/Documents/work/STATCAN/Projects/ODECF/Collection-ODECF/data/childcare')


In [4]:
glob.glob('*.csv')

['NT-childcare.csv',
 'QC-CPE-GARD-MF.csv',
 'NB-childcare.csv',
 'AB-childcare.csv',
 'NL-childcare.csv',
 'odec.csv',
 'MB-childcare.csv',
 'PE-childcare.csv',
 'NS-Child_Care_Directory.csv',
 'SK-childcare.csv',
 'ON-childcare.csv',
 'YT-childcare.csv',
 'odecf_godaycares_2020-10-19.csv',
 'ON-childcare-dictionary.csv',
 'NU-childcare.csv',
 'BC-childcare_locations.csv']

In [5]:
# Load one raw table per province/territory code, plus the GoDayCare.com extract (godc).
# All reads use pandas' default CSV settings.
NT = pd.read_csv('NT-childcare.csv')
QC = pd.read_csv('QC-CPE-GARD-MF.csv')
NB = pd.read_csv('NB-childcare.csv')
AB = pd.read_csv('AB-childcare.csv')
NL = pd.read_csv('NL-childcare.csv')
MB = pd.read_csv('MB-childcare.csv')
PE = pd.read_csv('PE-childcare.csv')
NS = pd.read_csv('NS-Child_Care_Directory.csv')
SK = pd.read_csv('SK-childcare.csv')
ON = pd.read_csv('ON-childcare.csv')
YT = pd.read_csv('YT-childcare.csv')
NU = pd.read_csv('NU-childcare.csv')
BC = pd.read_csv('BC-childcare_locations.csv')
godc = pd.read_csv('odecf_godaycares_2020-10-19.csv')

In [9]:
NS.groupby("AGE_RANGE").count().index

Index(['0 months - 6 years', '10 months - 5 years', '18 Months - 12 Years',
       '18 Months - 12 years', '18 Months - 5 Years', '18 months  - 12 Years',
       '18 months -  6 years', '18 months - 10 years', '18 months - 11 years',
       '18 months - 12 years', '18 months - 5 years', '18 months - 5years',
       '18 months - 6 years', '18 months - 8 years', '18 months - 9 years',
       '18 months-5 years', '18months -  11 years', '2 months - 5 years',
       '2 years - 12 years', '2 years - 5 years', '2 years - 8 years',
       '2.5 years - 12 years', '2.5 years - 5 years', '3 months  - 12 years',
       '3 months - 10 years', '3 months - 11 years', '3 months - 12 years',
       '3 months - 2 years', '3 months - 3 years', '3 months - 5 years',
       '3 months - 6 years', '3 months - 8 years', '3 months - 9 years',
       '3 years - 10 years', '3 years - 12 years', '3 years - 4 years',
       '3 years - 5 years', '3 years - 6 years', '3 years - 9 years',
       '30 Months - 5 Years

complete | prov | details | updated src
----------|------|----------|-----------
[-] | NT | No relevant columns | -
[x] | QC | `MF` - home care. `CPE/GARD` - day care. `CR` - `infant` & `toddler` | X
[x] | NB | - | X
[x] | AB | Facilities offering care in a private residence are treated as home care. | X
[-] | NL | No relevant columns | -
[x] | MB | - | X
[x] | PE | `toddler`: 2-5 Years, `infant`: Under 2, `school_age`: School Age | X
[x] | NS | <mark> Group `preschool` with `toddler` </mark> | X
[x] | SK | Src file updated to include details col | X
[-] | ON | No relevant columns | -
[x] | YT | If "and older" was included in text, all ages above were marked as "Y". `toddler`, `preschool`, `kindergarten` were grouped as `toddler`. | X
[x] | NU | Added age columns only (no home info). preschool = toddler age. | X
[x] | BC | Added `toddler` column only. See descriptions. | X
[x] | GoDayCare.com | `Toddler`, `Preschool` and `Kindergarten` age groups are grouped as `toddler`. |

---
### QC
**Type de service de garde:**

MF - home care
CPE/GARD - day care

**Place à contribution réduite**

CR - reduced contribution place: The Reduced Contribution Program was created for children between 0 and 5 years old
https://www.mfa.gouv.qc.ca/en/services-de-garde/parents/programme-contribution-reduite/Pages/index.aspx



In [5]:
def _qc_facility_type(service_code):
    """Return "Day Care" for the CPE and GARD service codes and "Home" for any other value."""
    return "Day Care" if service_code in ("CPE", "GARD") else "Home"


QC['facility_type'] = QC['Type de service de garde'].apply(_qc_facility_type)

In [6]:
# A "CR" (reduced-contribution) place sets both age flags to "Y"; any other value leaves them unset.
for _qc_flag in ('infant', 'toddler'):
    QC[_qc_flag] = QC['Place à contribution réduite'].apply(
        lambda place: None if place != "CR" else "Y"
    )

Check:

In [7]:
# QC.groupby(['Place à contribution réduite', 'infant', 'toddler']).describe()
# QC.groupby(['Type de service de garde', 'facility_type']).nunique()

### NB

In [8]:
# Turn the age column into text, mapping missing values to "" rather than the literal
# "nan" a plain astype(str) produces; otherwise a missing age would be flagged "N"
# in every column instead of being left unset.
NB['age'] = NB['age'].map(lambda value: "" if pd.isna(value) else str(value))


def _nb_age_flag(keyword):
    """Build a mapper for one age group.

    The returned function yields "Y" when `keyword` occurs in the age text,
    None when the age text is empty (unknown), and "N" otherwise.
    """
    def flag(age_text):
        if keyword in age_text:
            return "Y"
        return "N" if age_text != "" else None
    return flag


NB['infant'] = NB['age'].apply(_nb_age_flag("Infant"))
NB['toddler'] = NB['age'].apply(_nb_age_flag("Preschool"))
NB['school_age'] = NB['age'].apply(_nb_age_flag("School-age"))

In [9]:
# Only the exact type "Home" is home care; every other value (including a missing one) is a day care.
NB['facility_type'] = NB['type'].apply(lambda kind: "Day Care" if kind != "Home" else "Home")
# NB['day_care']=NB.type.apply(lambda x: "Y" if "Centre" in x else "N")

Check:

In [10]:
# NB.groupby(['type', 'facility_type']).describe()

In [11]:
# NB.groupby(['age','infant', 'toddler', 'school_age']).max()[['facility']]

### AB

There are many types of licensed child care options available:
* day care programs that serve infants, toddlers and pre-school-aged children
* family day homes offer care in a private residence for up to 6 children, including the caregiver’s own children, with one caregiver
* group family child care programs offer care in a private residence and have between 7 and 10 children with 2 caregivers
* out-of-school-care programs operate before and after school, during lunch hours and sometimes when school is closed
* pre-school programs offer child care for 4 or less hours per day for pre-school aged children

https://www.alberta.ca/finding-quality-child-care.aspx

<mark> Let those offering care in private residences be considered home cares. </mark>

In [12]:
def _ab_facility_type(program):
    """Classify an Alberta "Type of program" value.

    Returns "Home" when the text contains "FAMILY", "Day Care" for any other
    non-empty text, and None when the value is empty or missing. Missing values
    arrive as a non-string NaN, on which the substring test would otherwise
    raise TypeError.
    """
    if not isinstance(program, str) or program == "":
        return None
    return "Home" if "FAMILY" in program else "Day Care"


AB['facility_type'] = AB['Type of program'].apply(_ab_facility_type)

Check:

In [13]:
# AB.groupby(['Type of program', 'facility_type']).describe()

### MB

In [14]:
def _mb_facility_type(facility):
    """Classify a Manitoba "Type of Facility" value.

    Returns "Home" when the text contains "Home", "Day Care" for any other
    non-empty text, and None when the value is empty or missing. Missing values
    arrive as a non-string NaN, on which the substring test would otherwise
    raise TypeError.
    """
    if not isinstance(facility, str) or facility == "":
        return None
    return "Home" if "Home" in facility else "Day Care"


MB['facility_type'] = MB['Type of Facility'].apply(_mb_facility_type)

In [15]:
# Turn the age column into text, mapping missing values to "" rather than the literal
# "nan" a plain astype(str) produces; otherwise a missing age type would be flagged "N"
# in every column instead of being left unset.
MB['Age Type'] = MB['Age Type'].map(lambda value: "" if pd.isna(value) else str(value))


def _mb_age_flag(keyword):
    """Build a mapper for one age group.

    The returned function yields "Y" when `keyword` occurs in the age text,
    None when the age text is empty (unknown), and "N" otherwise.
    """
    def flag(age_text):
        if keyword in age_text:
            return "Y"
        return "N" if age_text != "" else None
    return flag


MB['infant'] = MB['Age Type'].apply(_mb_age_flag("Infant"))
MB['toddler'] = MB['Age Type'].apply(_mb_age_flag("Preschool"))
MB['school_age'] = MB['Age Type'].apply(_mb_age_flag("School Age"))

Check:

In [16]:
# MB.groupby(['Type of Facility', 'facility_type']).describe()
# MB.groupby(['Age Type', 'infant', 'toddler', 'school_age']).describe()

### PE

`toddler`: 2-5 Years

`infant`: Under 2

`school_age`: School Age

In [17]:
# Infant - Under 2 Years
def infant(x):
    """Flag whether a PE "Centre Accepts" entry covers infants.

    Returns None for "Not Specified"; "Y" when the text starts with a
    five-letter word, a single digit and another five-letter word
    (e.g. "Under 2 Years"); and "N" for everything else, including
    non-string values such as NaN.
    """
    if x == "Not Specified":
        return None
    starts_with_infant_label = (
        isinstance(x, str)
        and re.match(r'^[a-zA-Z]{5}\s[0-9]{1}\s[a-zA-Z]{5}', x) is not None
    )
    return "Y" if starts_with_infant_label else "N"
    
# Derive the infant flag from each row's "Centre Accepts" value.
PE['infant'] = PE['Centre Accepts'].apply(infant)

In [18]:
# Toddler - 2 Years, 3 Years, 4 Years, 5 Years
def toddler(x):
    """Flag whether a PE "Centre Accepts" entry covers toddlers (2-5 years).

    Returns None for "Not Specified"; "Y" when, after the infant label
    "Under 2 Years" is removed, the text still contains a "<digit> <5 letters>"
    age such as "3 Years"; and "N" otherwise, including non-string values
    such as NaN.

    The infant label is removed first because its own "2 Years" satisfies the
    age pattern: without that step an entry like "Under 2 Years, School Age"
    would be flagged as toddler care.
    """
    if not isinstance(x, str):
        return "N"
    if x == "Not Specified":
        return None
    without_infant_label = x.replace("Under 2 Years", "")
    if re.match(r'.*[0-9]{1}\s[a-zA-Z]{5}', without_infant_label):
        return "Y"
    return "N"
    
# Derive the toddler flag from each row's "Centre Accepts" value.
PE['toddler'] = PE['Centre Accepts'].apply(toddler)

In [19]:
# School Age
def _pe_school_age(accepts):
    """Return "Y" if the text mentions "School Age", None for "Not Specified", otherwise "N"."""
    if "School Age" in accepts:
        return "Y"
    if accepts == "Not Specified":
        return None
    return "N"


PE['school_age'] = PE['Centre Accepts'].apply(_pe_school_age)

Check:

In [20]:
# PE.groupby(['Centre Accepts', 'infant', 'toddler', 'school_age']).describe()

### NS
http://www.moving2novascotia.com/blog-posts/what-age-do-children-start-school-in-nova-scotia/

preschool = toddler



In [21]:
# NS.info()

In [22]:
# Only the "Family Home Day Care Agency" type is recorded as home care.
NS['facility_type'] = NS['FACILITY_TYPE'].apply(
    lambda kind: "Day Care" if kind != "Family Home Day Care Agency" else "Home"
)

# A source value of "Yes" becomes "Y"; anything else (including a missing value) becomes "N".
for _ns_flag, _ns_source in (('infant', 'AGE_INFANT'), ('school_age', 'AGE_SCHOOL_AGE')):
    NS[_ns_flag] = NS[_ns_source].apply(lambda answer: "N" if answer != "Yes" else "Y")

In [23]:
# Row labels of facilities marked "Yes" in either toddler-identifying column
# (AGE_PRESCHOOL or AGE_TODDLER); a row marked in both appears twice in the list.
tod1 = NS.loc[NS['AGE_PRESCHOOL'] == "Yes"].index.tolist()
tod2 = NS.loc[NS['AGE_TODDLER'] == "Yes"].index.tolist()
todns_ix = [*tod1, *tod2]

In [24]:
# Default every facility to "N", then set the rows listed in todns_ix to "Y".
NS['toddler'] = 'N'
NS.loc[todns_ix, 'toddler'] = "Y"

Check:

In [25]:
# NS.groupby(['FACILITY_TYPE', 'facility_type']).describe()
# NS.groupby(['AGE_INFANT', 'AGE_TODDLER', 'AGE_PRESCHOOL', 'AGE_SCHOOL_AGE', 'infant', 'toddler', 'school_age']).describe()

### SK

In [26]:
# Cast the details column to text so the .lower() calls applied to it later work on every row.
# NOTE(review): a missing value presumably becomes the text "nan" here -- confirm that is acceptable.
SK.details = SK.details.astype(str)

In [27]:
# Type
def _sk_facility_type(details):
    """Return "Home" if the text mentions "home", else "Day Care" if it mentions "centre", else None."""
    lowered = details.lower()
    if "home" in lowered:
        return "Home"
    if "centre" in lowered:
        return "Day Care"
    return None


SK['facility_type'] = SK.details.apply(_sk_facility_type)

In [28]:
# Age
def _sk_infant(details):
    """Return "Y" for an infant age range or the word "infants", "N" for kindergarten-only text, else None."""
    lowered = details.lower()
    infant_markers = ("6 weeks - 18 months", "6 weeks - 30 months", "infants")
    if any(marker in lowered for marker in infant_markers):
        return "Y"
    return "N" if "kindergarten" in lowered else None


def _sk_toddler(details):
    """Return "Y" for the 30-month range or kindergarten, "N" for the 18-month range, else None."""
    lowered = details.lower()
    if "6 weeks - 30 months" in lowered or "kindergarten" in lowered:
        return "Y"
    return "N" if "6 weeks - 18 months" in lowered else None


SK['infant'] = SK.details.apply(_sk_infant)
SK['toddler'] = SK.details.apply(_sk_toddler)
# school_age is never "Y" here: it is "N" wherever the infant flag is known and unset otherwise.
SK['school_age'] = SK.infant.apply(lambda known: "N" if known is not None else None)

Check:

In [29]:
# SK.groupby(['details', 'facility_type']).describe()
# SK.groupby(['details', 'infant', 'toddler', 'school_age']).describe()

### YT

**Given age group hierarchy:**

infant < toddler < preschool < kindergarten < school age

**Adjusted age group hierarchy:**

infant < toddler (+ preschool + kindergarten) < school age

<mark> if "and older" was included in text, all ages above were marked as "Y". </mark>

In [30]:
def toddler(x):
    """Flag whether a Yukon `age` description covers the toddler group.

    Returns "Y" when the text starts (after at most one whitespace character)
    with a 6-7 letter lowercase word followed by " and older", e.g.
    "infant and older" or "infants and older", or when it mentions
    "toddler", "pre-school" or "kindergarten" in any letter case.
    Returns "N" for any other text and None for a non-string value such as NaN.
    """
    if not isinstance(x, str):
        return None
    # Raw string: a bare '\s' in a normal string literal is an invalid escape sequence.
    if re.match(r'\s{0,1}[a-z]{6,7} and older', x):
        return 'Y'
    lowered = x.lower()
    if any(group in lowered for group in ('toddler', 'pre-school', 'kindergarten')):
        return 'Y'
    return "N"

In [31]:
# "infant" anywhere in the text marks infant care.
YT['infant'] = YT.age.apply(lambda text: "N" if "infant" not in text else "Y")
# School age is served when named explicitly or implied by "and older".
YT['school_age'] = YT.age.apply(
    lambda text: "Y" if any(marker in text for marker in ("school-age", "and older")) else "N"
)
YT['toddler'] = YT.age.apply(toddler)

Check:

In [32]:
YT.groupby(['age', 'infant', 'toddler', 'school_age']).describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,count,mean,std,min,25%,50%,75%,max
age,infant,toddler,school_age,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
infant and older,Y,Y,Y,16.0,15.5625,10.398518,1.0,7.25,14.5,23.5,33.0
"infant and older, 24 hours",Y,Y,Y,1.0,9.0,,9.0,9.0,9.0,9.0,9.0
"infant and older, extended hours",Y,Y,Y,1.0,4.0,,4.0,4.0,4.0,4.0,4.0
infant and school-age,Y,N,Y,1.0,10.0,,10.0,10.0,10.0,10.0,10.0
infant to pre-school,Y,Y,N,1.0,21.0,,21.0,21.0,21.0,21.0,21.0
"infant, toddler, pre-school",Y,Y,N,1.0,28.0,,28.0,28.0,28.0,28.0,28.0
infants and older,Y,Y,Y,10.0,43.1,5.526703,31.0,41.5,44.5,46.75,49.0
"infants and older, extended-hour care",Y,Y,Y,1.0,40.0,,40.0,40.0,40.0,40.0,40.0
kindergarten and school-age,N,Y,Y,1.0,24.0,,24.0,24.0,24.0,24.0,24.0
pre-school,N,Y,N,2.0,10.5,4.949747,7.0,8.75,10.5,12.25,14.0


### NU

In [33]:
# A source value other than 0 marks the age group as served.
# NOTE(review): a missing value (NaN) also compares unequal to 0 and is flagged "Y" -- confirm this is intended.
for _nu_flag, _nu_source in (('infant', 'Infants'), ('toddler', 'Preschooler'), ('school_age', 'School age')):
    NU[_nu_flag] = NU[_nu_source].apply(lambda count: "N" if count == 0 else "Y")

Check:

In [34]:
# NU.groupby(['toddler', 'Preschooler']).describe()

### BC

`SRVC_UNDER36_IND` indicates whether or not the facility offers child care services to children under 36 months of age, i.e., Y or N **infant** <mark> assigned in source file </mark>

`SRVC_30MOS_5YRS_IND` indicates whether or not the facility offers child care services to children between the ages of 30 months and 5 years, i.e., Y or N **toddler**

`SRVC_LICPRE_IND` indicates whether or not the facility is a licensed preschool, i.e., Y or N

`SRVC_OOS_KINDER_IND` indicates whether or not the facility offers care for out of school care for kindergarten aged children, i.e., Y or N **toddler**

`SRVC_OOS_GR1_AGE12_IND` indicates whether or not the facility offers care for out of school care for children in grade 1 up to the age of 12, i.e., Y or N **school_age** <mark> assigned in source file </mark>



In [50]:
# BC.info()

In [36]:
# Row labels of facilities marked "Y" in either toddler-identifying column
# (SRVC_30MOS_5YRS_YN or SRVC_OOS_KINDER_YN); a row marked in both appears twice in the list.
tod1 = BC.loc[BC['SRVC_30MOS_5YRS_YN'] == "Y"].index.tolist()
tod2 = BC.loc[BC['SRVC_OOS_KINDER_YN'] == "Y"].index.tolist()
toddler_ix = [*tod1, *tod2]

In [37]:
# Default every facility to "N", then set the rows listed in toddler_ix to "Y".
BC['toddler'] = 'N'
BC.loc[toddler_ix, 'toddler'] = "Y"

Check:

In [38]:
# BC.groupby(['toddler', 'SRVC_30MOS_5YRS_YN', 'SRVC_OOS_KINDER_YN']).describe()

#### GoDayCare.com


In [39]:
# Represent a missing age group by the literal text "None", which the flag columns test for.
# This is done per value rather than with replace(np.nan, None): pandas releases before 1.4
# ignored an explicit value=None and pad-filled from the previous row instead, which would
# copy a neighbouring facility's age groups into the gap.
godc['Age_Groups'] = godc['Age_Groups'].map(
    lambda groups: "None" if pd.isna(groups) else str(groups)
)

In [40]:
def _godc_flag(*keywords):
    """Build a mapper for one age flag.

    The returned function yields "Y" when any of `keywords` occurs in the
    age-group text, None when the text is the "None" sentinel, and "N" otherwise.
    """
    def flag(groups):
        if any(keyword in groups for keyword in keywords):
            return "Y"
        return None if groups == "None" else "N"
    return flag


godc['infant'] = godc.Age_Groups.map(_godc_flag("Infant"))
godc['toddler'] = godc.Age_Groups.map(_godc_flag("Toddler", "Preschool", "Kindergarten"))
godc['school_age'] = godc.Age_Groups.map(_godc_flag("Schoolage"))

Check:

In [41]:
# godc.groupby(['Age_Groups', 'infant', 'toddler', 'school_age']).describe()

---

### Export

##### To verify

In [42]:
# os.chdir('/Users/kt/Documents/work/STATCAN/ODECF/Wrangling-ODECF/output/childcare/standardized')

In [43]:
# SK.groupby(['details', 'home_day_care', 'day_care', 'infant', 'toddler', 'school_age']).describe().to_csv("SK-childcare-verify.csv")
# YT.groupby(['age', 'infant', 'toddler', 'school_age']).describe().to_csv("YT-childcare-verify.csv")
# PE.groupby(['Centre Accepts', 'infant', 'toddler', 'school_age']).describe().to_csv('PE-childcare-verify.csv')

**OK**

In [44]:
os.getcwd()

'/Users/kt/Documents/work/STATCAN/ODECF/Collection-ODECF/data/childcare'

In [45]:
# Switch to the export directory; the to_csv calls later in the notebook use bare file names.
# NOTE(review): hard-coded absolute path, and 'opentabulate.con' may be a typo for 'opentabulate.conf' -- confirm.
os.chdir('/Users/kt/.config/opentabulate.con/data/input/')

In [46]:
# Export the GoDayCare.com table; to_csv defaults apply, so the row index is written as the first column.
godc.to_csv('godaycare.csv')

In [47]:
# NOTE(review): PE is written again, to the same file name, by the batch export cell that follows,
# so this write is redundant.
PE.to_csv('PE-childcare.csv')

In [48]:
# Write each standardized table as <code>-childcare.csv, in the same order as before.
for _code, _frame in (
    ('YT', YT), ('SK', SK), ('PE', PE), ('NB', NB), ('AB', AB),
    ('MB', MB), ('NS', NS), ('NU', NU), ('BC', BC),
):
    _frame.to_csv(f'{_code}-childcare.csv')

# Quebec is the only table written with an explicit encoding.
QC.to_csv('QC-childcare.csv', encoding="utf-8-sig")

##### Unchanged

In [49]:
# No columns were derived for these three tables; they are exported as loaded.
for _code, _frame in (('ON', ON), ('NT', NT), ('NL', NL)):
    _frame.to_csv(f'{_code}-childcare.csv')