### Address Parsing

**Full dataset is separated into two parts:**

1. All provinces/sources except NB and godaycare.com
2. NB, godaycare.com which have city and 2-letter province abbreviations in `full_address`

In [55]:
from postal.parser import parse_address
import pandas as pd
import os
import re

In [56]:
# Switch the working directory so the relative paths used in later cells
# ('childcare-1.csv', '../parsed/...') resolve against the merged-output folder.
# NOTE(review): this is an absolute, user-specific path; it will not exist on
# another machine and should come from a configurable base directory.
os.chdir('/Users/kt/Documents/work/STATCAN/ODECF/Wrangling-ODECF/output/childcare/merged/')

In [57]:
# Load the first of the two input files; the path is relative to the current
# working directory.
cc1 = pd.read_csv('childcare-1.csv')

---
#### Pre-processing
1. remove phone numbers from `full_address`
2. remove postal codes from `full_address` and fill nas in `postal_code`
3. convert 2-letter province abbreviations in `full_address` to full province names
4. general cleaning of `full_address` (remove unwanted char)

1. remove phone numbers from `full_address`

In [58]:
def phone_rmv(x):
    """Strip phone numbers formatted as ' (XXX) XXX-XXXX' from an address.

    Parameters
    ----------
    x : str or any
        One value of the full address column. Values that are not strings
        (for example NaN or None for a missing address) are returned as-is.

    Returns
    -------
    str or any
        The address with every ' (XXX) XXX-XXXX' occurrence removed,
        including the whitespace character in front of it. When `x` is not a
        string or contains no such phone number, `x` is returned unchanged.
    """
    if not isinstance(x, str):
        # Nothing to clean on missing / non-text values.
        return x
    # Raw string: '\s', '\(' and '\d' are invalid escapes in a plain literal.
    # re.sub removes every phone number, not only the first one found.
    return re.sub(r'\s\(\d{3}\)\s\d{3}\-\d{4}', '', x)

In [59]:
# Drop phone numbers from every address in the column.
cc1['full_address'] = cc1['full_address'].map(phone_rmv)

2. remove postal codes from `full_address` and fill nas in `postal_code`

In [60]:
# Postal codes recovered by pc_extract, one entry appended per call
# (the code that was found, or None).
pcs = []

# Postal code written with a whitespace separator, e.g. 'R3T 2N2'.
# Compiled once, from a raw string so '\d' and '\s' are not treated as
# (invalid) string escapes.
_POSTAL_CODE = re.compile(r'[A-Z]{1}\d{1}[A-Z]{1}\s\d{1}[A-Z]{1}\d{1}')

def pc_extract(x):
    """Pull the postal code out of one full-address value.

    Side effect: appends exactly one item to the module-level list `pcs` per
    call, either the first postal code found in `x` or None when `x` is not a
    string or holds no postal code. The list can then be used to fill NAs in
    the postal code column.

    Parameters
    ----------
    x : str or any
        One value of the full address column.

    Returns
    -------
    str or any
        `x` with every occurrence of the extracted postal code removed, or
        `x` unchanged when nothing was extracted.
    """
    found = _POSTAL_CODE.search(x) if isinstance(x, str) else None
    if found is None:
        pcs.append(None)
        return x
    code = found[0]
    pcs.append(code)
    return x.replace(code, '')
    
# Strip postal codes from every address. pc_extract also appends one entry
# per row to `pcs` (the extracted code, or None).
cc1['full_address'] = cc1['full_address'].map(pc_extract)

# fill missing postal codes from what was extracted from full address using pc_extract
# Only the entries appended by the map above are used (the tail of `pcs`), and
# they are indexed like cc1 so fillna lines them up row by row even if `pcs`
# holds leftovers from an earlier run or cc1 has a non-default index.
extracted_pcs = pd.Series(pcs[len(pcs) - len(cc1):], index=cc1.index)
# Assign the result back instead of calling fillna(inplace=True) on the
# attribute-accessed column, which is not guaranteed to update cc1.
cc1['postal_code'] = cc1['postal_code'].fillna(extracted_pcs)

In [61]:
# cc1.loc[cc1.provider == 'Province of Manitoba', 'full_address'] = cc1.loc[cc1.provider == 'Province of Manitoba', 'full_address'].map(lambda x: x + ", Canada" if x != "None" else x)

In [62]:
# cc1.loc[cc1.provider == 'Province of Saskatchewan', 'full_address'] = cc1.loc[cc1.provider == 'Province of Saskatchewan', 'full_address'].map(lambda x: x + ", Canada" if x != "None" else x)

3. Convert 2-letter province abbreviations to full. Complete address for better parsing.

In [63]:
# Full province / territory name -> 2-letter abbreviation.
prov_abbrev = dict([
    ("Alberta", "AB"),
    ("British Columbia", "BC"),
    ("Saskatchewan", "SK"),
    ("Manitoba", "MB"),
    ("Ontario", "ON"),
    ("Quebec", "QC"),
    ("Newfoundland And Labrador", "NL"),
    ("New Brunswick", "NB"),
    ("Nova Scotia", "NS"),
    ("Northwest Territories", "NT"),
    ("Nunavut", "NU"),
    ("Prince Edward Island", "PE"),
    ("Yukon Territory", "YT"),
])

# Reverse lookup: 2-letter abbreviation -> full name.
long_prov = {abbrev: name for name, abbrev in prov_abbrev.items()}

In [84]:
def complete_address(x):
    """Expand a 2-letter province abbreviation and append ", Canada".

    The abbreviations in `long_prov` are tried in dictionary order. The first
    one that occurs in `x` as a standalone token is replaced by
    "<full province name>, Canada". Matching is case-sensitive.

    Parameters
    ----------
    x : str or any
        One value of the full address column. Non-string values (e.g. NaN)
        are returned unchanged.

    Returns
    -------
    str or any
        The address with the abbreviation expanded, or `x` unchanged when no
        abbreviation is present.
    """
    if not isinstance(x, str):
        return x
    for abbrev, full_name in long_prov.items():
        # \b stops an abbreviation from matching inside a longer word,
        # e.g. 'MB' inside 'COLUMBIA' or 'ON' inside 'ONTARIO'.
        token = r'\b' + re.escape(abbrev) + r'\b'
        if re.search(token, x):
            # A callable replacement keeps the text literal (no backslash
            # or group-reference interpretation).
            return re.sub(token, lambda _match: full_name + ", Canada", x)
    return x

In [83]:
# Scratch check of the abbreviation expansion on one sample address.
t = '16 Elm Street Thompson, MB'
for k in long_prov.keys():
    # An assignment cannot appear inside an `if` condition, so search first
    # and test the result; re.search gives None when `k` is absent.
    found = re.search(k, t)
    if found:
        m = found[0]
        print(t.replace(m, long_prov[m] + ', Canada'))


In [86]:
# Sanity check: an address without a province abbreviation comes back unchanged.
complete_address('16 Elm Street Thompson')

'16 Elm Street Thompson'

In [91]:
# Expand province abbreviations (and append the country) on every address.
cc1['full_address'] = cc1['full_address'].map(complete_address)

In [92]:
# cc1[['full_address', 'temp_address', 'provider']].to_csv('../parsed/check.csv')

4. general clean up

In [93]:
# clean up YT
# Selecting rows with a boolean mask yields a copy, so an inplace replace on
# that selection never reached cc1. Write the cleaned values back via .loc.
# NOTE(review): ', ' is replaced by an empty string, so the text on either
# side is joined without a space; confirm this is intended for these rows.
is_yukon = cc1.provider == 'Yukon Territory'
cc1.loc[is_yukon, 'full_address'] = cc1.loc[is_yukon, 'full_address'].replace({', ': ''}, regex=True)

In [94]:
# remove commas from AB and QC to improve parser
# Boolean-mask selection returns a copy, so the previous inplace replace left
# cc1 untouched. Assign the comma-free addresses back through .loc.
# NOTE(review): the provider label is spelled 'Quèbec' (grave accent); it
# must match the data exactly; confirm against the provider column.
is_ab_or_qc = (cc1.provider == 'Province of Alberta') | (cc1.provider == 'Province of Quèbec')
cc1.loc[is_ab_or_qc, 'full_address'] = cc1.loc[is_ab_or_qc, 'full_address'].replace({',': ''}, regex=True)

---
### **Address Parse section 1**

In [95]:
# Cast to str, presumably so parse_address always receives text.
# NOTE(review): astype(str) should turn missing addresses into the literal
# text 'nan', which is then parsed like any other address; confirm acceptable.
cc1.full_address = cc1.full_address.astype(str)
# One parse result per row, in row order.
parsed = cc1.full_address.map(parse_address)

In [96]:
def _component(parts, label):
    """Return the first parsed value tagged `label`, or None when absent."""
    return next((value for value, tag in parts if tag == label), None)

# parse_address returns (value, label) pairs. Select components by label
# rather than by position and list length: an address that also carries a
# city, province or country has more than three components, and a two- or
# three-component parse is not guaranteed to be number/street(/unit).
units = [_component(x, 'unit') for x in parsed]
str_num = [_component(x, 'house_number') for x in parsed]
str_name = [_component(x, 'road') for x in parsed]

In [97]:
# Attach the parsed components. Each list has one entry per row of cc1, so it
# is indexed like cc1 to align row by row regardless of cc1's index.
cc1['unit'] = pd.Series(units, index=cc1.index)
# Only fill gaps: existing street_name / street_number values are kept.
# The result is assigned back because fillna(inplace=True) on an
# attribute-accessed column is not guaranteed to update cc1.
cc1['street_name'] = cc1['street_name'].fillna(pd.Series(str_name, index=cc1.index))
cc1['street_number'] = cc1['street_number'].fillna(pd.Series(str_num, index=cc1.index))

Export sample:

In [98]:
# Export a random sample of int(len(cc1)*0.02) rows, i.e. 2% of cc1, for
# manual review (the "0.02" in the file name is the fraction, not a percent).
# No random_state is passed, so each run exports a different sample.
cc1.sample(int(len(cc1)*0.02)).to_csv("../parsed/childcare-0.02-percent.csv", encoding = "utf-8-sig")

In [None]:
# cc.set_index('provider').filter(regex='GoDayCare\.com', axis =0)

---
Part 2:
---
NB + godaycare.com data

In [None]:
# Load the second input file (the NB + godaycare.com part).
cc2 = pd.read_csv('childcare-2.csv')
# Cast to str so the string-only steps below (x + "NB", x.lower()) can run on
# every row.
cc2.full_address=cc2.full_address.astype(str)

Remove postal code and phone number

In [None]:
# Same phone-number and postal-code stripping as applied to cc1.
cc2.full_address = cc2.full_address.map(phone_rmv)
# NOTE(review): pc_extract also appends one entry per row to the module-level
# `pcs` list; those entries are not used for cc2 in the cells shown here.
cc2.full_address = cc2.full_address.map(pc_extract)

Format full_address and remove city and 2-letter province abbreviations

In [None]:
## add prov to new brunswick to catch cities for removal
# NOTE(review): "NB" is appended with no separating space, while the
# city/province keys used for removal have the form "<city> <prov>". This only
# lines up if the address already ends in whitespace; confirm.
cc2.loc[cc2.provider == 'Province of New Brunswick', 'full_address'] = cc2.loc[cc2.provider == 'Province of New Brunswick', 'full_address'].map(lambda x: x + "NB")

In [None]:
# Remove every comma from the addresses. The result is assigned back because
# replace(inplace=True) on an attribute-accessed column is not guaranteed to
# update cc2.
cc2['full_address'] = cc2['full_address'].replace({',': ''}, regex=True)

In [18]:
# grab list of all cities and provinces used in GoDayCare.com facilities
# These two lists are consumed by the cells that follow, so they have to be
# defined when the notebook is run top to bottom on a fresh kernel.
is_godaycare = cc2.provider == 'GoDayCare.com'
godc_prov = cc2.loc[is_godaycare, 'province'].astype(str).to_list()
godc_cit = cc2.loc[is_godaycare, 'city'].astype(str).to_list()



In [None]:
def prov_convert(x):
    """Map a full province name to its 2-letter abbreviation.

    Looks `x` up in `prov_abbrev`; names that are not keys of that mapping
    yield None.
    """
    return prov_abbrev.get(x)

# Abbreviation for each GoDayCare.com province value, in the same order
# (None where the name is not a key of prov_abbrev).
godc_prov_abb = [prov_convert(province) for province in godc_prov]

In [None]:
# Unique "<city> <province abbreviation>" strings, used as the lookup keys
# when stripping city/province text out of full_address.
citprov = {
    "{} {}".format(city, abbrev)
    for city, abbrev in zip(godc_cit, godc_prov_abb)
}

In [None]:
def citprov_rmv2(x):
    """Remove a known "<city> <province>" string from an address.

    Matching against the entries of `citprov` is case-insensitive. When
    several entries occur in `x` (e.g. "York ON" and "North York ON"), the
    longest one is removed, with ties broken alphabetically, so the result
    does not depend on the iteration order of the set.

    Parameters
    ----------
    x : str
        One value of the full address column.

    Returns
    -------
    str or None
        The lower-cased address with every occurrence of the chosen entry
        removed, or None when no entry of `citprov` occurs in `x`.
    """
    lowered = x.lower()
    hits = [cp.lower() for cp in citprov if cp.lower() in lowered]
    if not hits:
        return None
    # Longest match first, so a short key cannot clip part of a longer city name.
    target = max(hits, key=lambda cp: (len(cp), cp))
    return lowered.replace(target, '')
            
            
# Address with the city/province text stripped. Rows where citprov_rmv2 finds
# no match receive a missing value here.
cc2['temp_address'] = cc2.full_address.map(citprov_rmv2)

In [None]:
# Remove the 'NB' marker from temp_address. The result is assigned back
# because replace(inplace=True) on an attribute-accessed column is not
# guaranteed to update cc2. The match is case-sensitive.
cc2['temp_address'] = cc2['temp_address'].replace({'NB': ''}, regex=True)

In [None]:
# Rows with no city/province match fall back to the unmodified full_address.
# Assigned back rather than filled in place, since fillna(inplace=True) on an
# attribute-accessed column is not guaranteed to update cc2.
# NOTE(review): the fallback text comes straight from full_address, so any
# "NB" suffix appended to New Brunswick rows earlier is still present in it.
cc2['temp_address'] = cc2['temp_address'].fillna(cc2['full_address'])

Check:

In [None]:
# Write the original and stripped addresses side by side for manual checking.
cc2[['full_address', 'temp_address', 'provider', 'province']].to_csv('../parsed/citprov_removal.csv', encoding = 'utf-8-sig')

---
#### Parsing