#### Address post-cleaning

**IN:** parsed file (`childcare-parsed.csv`)

**OUT:** final database file (`../final/childcare-facilities.csv`)

In [1]:
from postal.parser import parse_address
import pandas as pd
import os
import re

In [2]:
# Directory that holds the parsed input file. The default is the original
# hard-coded location; set the ODECF_PARSED_DIR environment variable to run the
# notebook from a different machine or checkout without editing this cell.
PARSED_DIR = os.environ.get(
    'ODECF_PARSED_DIR',
    '/Users/kt/Documents/work/STATCAN/ODECF/Wrangling-ODECF/output/childcare/parsed/',
)
# The input and output file paths used in this notebook are relative to this directory.
os.chdir(PARSED_DIR)

In [3]:
# Load the parsed childcare records from the current working directory.
parsed_csv = 'childcare-parsed.csv'
df = pd.read_csv(parsed_csv)

---
#### Part 3: Completing missing information
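* province: assign a province code based on the data provider, then convert any remaining full province names to two-letter abbreviations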

In [4]:
# Rows supplied by a provincial/territorial provider get that jurisdiction's
# two-letter code in the province column.
provider_to_province = {
    "Province of Manitoba": 'MB',
    "Province of Alberta": 'AB',
    "Province of Saskatchewan": 'SK',
    "Province of British Columbia": 'BC',
    # "Quèbec" (grave accent) is the spelling this cell originally matched. The
    # acute-accented and unaccented spellings are accepted too, so Quebec rows
    # are not silently skipped when the provider name is spelled correctly.
    "Province of Quèbec": 'QC',
    "Province of Québec": 'QC',
    "Province of Quebec": 'QC',
    "Province of New Brunswick": 'NB',
    "Province of Newfoundland and Labrador": 'NL',
    "Province of Nova Scotia": 'NS',
    "Province of Prince Edward Island": 'PE',
    "Nunavut": 'NU',
    "Yukon Territory": 'YT',
    "Northwest Territories": 'NT',
}

for provider_name, province_code in provider_to_province.items():
    # Exact match on the provider string; rows from other providers are untouched.
    df.loc[df.provider == provider_name, 'province'] = province_code

In [5]:
# Cast to str so the regex-based cleaning applied to this column never receives
# a float NaN. Missing values are blanked first: a bare astype(str) would turn
# them into the literal text 'nan', which would end up in the output file.
df.province = df.province.fillna('').astype(str)
# Full province/territory name -> two-letter abbreviation.
prov_abbrev = {
              "Alberta": "AB",
              "British Columbia": "BC",
              "Saskatchewan": "SK",
              "Manitoba": "MB",
              "Ontario": "ON",
              "Quebec": "QC",
              "Québec": "QC",
              "Newfoundland And Labrador": "NL",
              "New Brunswick": "NB",
              "Nova Scotia": "NS",
              "Northwest Territories": "NT",
              "Nunavut": "NU",
              "Prince Edward Island": "PE",
              "Yukon Territory": "YT"}

def complete_address(x):
    """Replace a full province/territory name in ``x`` with its abbreviation.

    The names in ``prov_abbrev`` are tried in dictionary order. For the first
    name found in ``x`` (letter case is ignored, so "Newfoundland and Labrador"
    and "Newfoundland And Labrador" both match), every occurrence of that name
    is replaced by its two-letter code and the result is returned.

    Parameters
    ----------
    x : str
        Text that may contain a full province name (a bare province value or a
        longer address string).

    Returns
    -------
    str
        ``x`` with the first matching province name abbreviated, or ``x``
        unchanged when no name matches. Non-string input (e.g. NaN or None) is
        returned as-is.
    """
    # Non-strings cannot contain a province name; hand them back untouched
    # instead of relying on a caught TypeError.
    if not isinstance(x, str):
        return x
    for full_name, abbrev in prov_abbrev.items():
        # re.escape keeps the name literal; IGNORECASE tolerates casing variants.
        pattern = re.compile(re.escape(full_name), re.IGNORECASE)
        if pattern.search(x):
            return pattern.sub(abbrev, x)
    return x

In [6]:
# Abbreviate full province names. The helper is mapped directly: wrapping it in
# re.sub(x, ..., x) used each cell value as a regex pattern, so values with
# metacharacters such as "(" or "+" raised re.error or were left unconverted.
df.province = df.province.map(complete_address)

* fill the city column with the city names that were removed from `full_address` (the cell below only selects and orders the final output columns)

In [7]:
# Column layout of the final table, in output order.
colsort = [
    'source_id',
    'name',
    'source_facility_type',
    'facility_type',
    'ages',
    'capacity',
    'infant',
    'toddler',
    'school_age',
    'source_full_address',
    'full_address',
    'street_number',
    'street_name',
    'unit',
    'postal_code',
    'city',
    'province',
    'provider',
    'licence_status',
    'longitude',
    'latitude',
]
# Conform the frame to exactly these columns, in this order.
df = df.reindex(colsort, axis='columns')

In [9]:
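# Left disabled: the reindex in the previous cell keeps only the columns listed
# in colsort, so a stray 'Unnamed: 0' column is already gone at this point.
# df.drop(columns = ['Unnamed: 0'], inplace = True)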

In [10]:
# Write the final table. index=False keeps the row index out of the file; it is
# not a data field and would otherwise be written as an unnamed leading column.
df.to_csv('../final/childcare-facilities.csv', encoding='utf-8', index=False)