### British Columbia data
Retreived from: 
1. https://catalogue.data.gov.bc.ca/dataset/child-care-map-data
2. https://catalogue.data.gov.bc.ca/dataset/strongstart-bc-centre-locations

In [54]:
import numpy as np 
import pandas as pd
from postal.parser import parse_address

In [55]:
bc_childcare = pd.read_csv('../Collection-ODECF/data/childcare/BC-childcare_locations.csv')
bc_ss = pd.read_csv('../Collection-ODECF/data/childcare/BC-strongstarts20200828.csv')

1. **Concatenate childcare and smartstart dataframes**

In [56]:
bc_childcare.rename(columns = {'NAME' : 'FACILITY NAME',
                              'ADDRESS_1' : 'FACILITY ADDRESS',
                               'CITY' : 'FACILITY_CITY',
                              'POSTAL_CODE' : 'FACILITY_POSTAL_CODE'
                              }, inplace = True)

In [57]:
print(#"bc_ss length: ", len(bc_ss), 
      #"childcare length:", len(bc_childcare), 
      "expected concatenated length: ", len(bc_ss) + len(bc_childcare))

print(#"smartstart cols:", len(bc_ss.columns), 
      #"childcare cols:", len(bc_childcare.columns),
     "expected concatenated cols (total - 4 same):", len(bc_ss.columns) + len(bc_childcare.columns) - 4)

expected concatenated length:  5318
expected concatenated cols (total - 4 same): 55


In [58]:
BC = pd.concat([bc_childcare, bc_ss], ignore_index=True)
print("rows:", len(BC), "cols:", len(BC.columns))

rows: 5318 cols: 55


In [59]:
BC.to_csv('output/childcare/bc-childcare.csv')

1. Add missing mandatory columns

In [60]:
# bc_childcare['provider'] = 'Province of British Columbia'
# bc_ss['provider'] = 'Province of British Columbia'

# bc_childcare['province'] = 'bc'
# bc_ss['province'] = 'bc'

2. Rename existing mandatory columns

In [61]:
# bc_childcare.info()
# bc_childcare.head()

In [62]:
# bc_ss.info()
# bc_ss.head()

----

---

4. **<mark> Manually add missing mandatory columns </mark>**
    * clean non-null `ADDRESS_2` 
        * Swap with `ADDRESS_1` where sensical
    * parse `ADDRESS_1` using libpostal

*Swapping `full_address` and `ADDRESS2` where sensical*

In [63]:
# indices to swap: 2110, 4078, 4114, 4115, 4549, 4954, 4995 
swap = [2116, 4087, 4123, 4124, 4213, 4558, 4962, 5003]

# BC.iloc[swap]

In [64]:
tempfull = BC.iloc[swap].full_address.to_list()
tempadd = BC.iloc[swap].ADDRESS_2.to_list()

for tf, ta in zip(tempfull, tempadd):
    BC.full_address.replace(tf, ta, inplace = True)
    BC.ADDRESS_2.replace(ta, tf, inplace = True)

AttributeError: 'DataFrame' object has no attribute 'full_address'

##### libpostal Address Parser Check

    [x] addresses with units e.g. '104 - 3242 Westwood St'

    [x] addresses with numerical street names e.g. '7348 16Th Ave', '20216 53 Ave'

Parse `full_address` using `libpostal`

In [None]:
parsed = BC.full_address.map(parse_address)
str_num = [x[0][0] for x in parsed]
str_name = [x[-1][0] for x in parsed]

In [None]:
BC['street_number'] = str_num
BC['street_name'] = str_name

In [None]:
BC.groupby('SERVICE_TYPE_CD').nunique()

In [None]:
BC.info()

In [None]:
BC.to_csv('output/childcare/bc-childcare.csv')

---

# address_parser

* Use regex to extract unit number where applicable
* Use regex to extract the rest of the address (w/o unit number) to accurately parse Street Name and Street Number
* Street Name = road name + road suffix + road direction
        
*Prepare data for pasring by extracting `unit` with regex*. 

For addresses containing several numbers separated by a whitespace, partition into two parts: the unit, the rest.

In [None]:
# from address_parser import Parser
# from phonetics import *
# # parser = Parser()

In [None]:
# # identify addresses that may contain unit number as prefix
# unit_adr = BC.full_address.str.extract(r'(^\b[0-9]+\s[0-9]+\s[a-zA-Z]+\s[a-zA-Z]+\b)')

# unit = unit_adr[0].str.extract(r'(^\b[0-9]+)')
# rest = unit_adr[0].str.replace(r'(^\b[0-9]+\s)', '')


# BC['unit'] = unit
# BC['rest'] = rest

# # correct data types
# BC.rest = BC.rest.astype(str)
# BC.full_address = BC.full_address.astype(str)

*parser no parsing!*

In [None]:
# def street_name(address):
#         adr = parser.parse(address)
#         str_name = "{} {} {}".format(adr.road.name, adr.road.suffix, adr.road.direction)
#         return str_name.title()
    
    
# def street_number(address):
#     adr = parser.parse(address)
#     if adr.number.number == -1:
#         '''
#         If the number is incorrect, use the end_number.
#         This is likely an address with a range.
#         '''
#         return adr.number.end_number
#     else:
#         return adr.number.number

In [None]:
# def check_name(x):
#     name = street_name(x).lower().strip()
#     if name in x.lower():
# #          print("{} in {}".format(name, x.lower()))
#         return True
#     else:
# #         print("{} not in {}".format(name, x.lower()))
#         return False

In [None]:
# prefix = BC.full_address.str.extract(r'(^\b[0-9]+)')[0].to_list()

# if prefix not in unit[0].to_list():
#     # if number prefix is not in the unit list, its the address no.
#     # check the street name for these ones as well.
#     BC['street_number'] = prefix
#     BC['street_name'] = BC.full_address.map(lambda addr: street_name(addr))
#     BC['ref'] = BC.full_address.map(lambda x: check_name(x))
        
# else:
#     BC['street_name'] = BC.rest.map(lambda addr: street_name(addr))
    
# BC['rest_street_name'] = BC.rest.map(lambda addr: street_name(addr))
# BC['rest_street_number'] = BC.rest.map(lambda addr: street_number(addr))

### Correct parser street names

In [None]:
# BC.iloc[536]

In [None]:
# crct = BC.loc[(BC.rest != 'nan') & (BC.ref == False)].index

# rplc = BC.iloc[crct].street_name.to_list()
# wth = BC.iloc[crct].rest_street_name.to_list()

# rplc_strno = BC.iloc[crct].street_number.to_list()
# with_strno = BC.iloc[crct].rest_street_number.to_list()

# counter = 0

# for r,w, rno,wno in zip(rplc, wth, rplc_strno, with_strno):
#     # trackers
#     counter +=1 
#     ind =  BC[BC.street_name == r].index.to_list()
    
#     # corrections
#     BC.street_name.replace(r, w, inplace = True)
#     BC.street_number.replace(rno, wno, inplace = True)
    
#     print("....... ({} / {})\n replacing street name: {} with {} at index {} \n replacing street number: {} with {} \n .......".format(counter, len(rplc), r,w,ind, rno, wno))
    

#### Correct parsers units
Manually inspect where street numbers are the same as the unit numbers. keep only correct number.

In [None]:
# BC.loc[BC.street_number == BC.unit,'unit'] = None

In [None]:
# BC.loc[BC.street_number == BC.unit][['full_address','street_name','street_number','unit']].reset_index()

In [None]:
# BC.loc[(BC.rest != 'nan') & (BC.ref == False)][['full_address', 'rest', 'rest_street_name', 'ref', 'street_name', 'street_no', 'unit']]

In [None]:
# BC.iloc[10:50][['full_address', 'rest', 'ref', 'street_name', 'street_number', 'unit']]

#### Tidy & export
* drop unneccessary cols used to correct addresses

In [None]:
# BC.drop(columns = ['ref', 'rest_street_name', 'rest_street_number', 'rest'], inplace = True)

In [None]:
# BC[BC.full_address.duplicated()]

In [None]:
# BC['street_name'] = BC.full_address.map(lambda addr: street_name(addr))
# # BC['street_name_fill'] = BC.rest_addr2.map(lambda addr: street_name(addr))
# # BC['street_name_fill2'] = BC.rest_addr3.map(lambda addr: street_name(addr))
# BC['street_number'] = BC.rest_addr.map(lambda addr: street_number(addr))

In [None]:
#------------------------------------------------------
# Using original address renders incorrect Street Names
#------------------------------------------------------

# BC['street_number'] = BC.full_address.map(lambda addr: street_number(addr))
# BC['street_name'] = BC.full_address.map(lambda addr: street_name(addr))


#------------------------------------------
# Attempting to reformat og address renders 
# IndexError with parser
#------------------------------------------
# to_parse = []
# for address, unit in zip(rest[0].to_list(), test2[0].to_list()):
#     to_parse.append("{} unit {}".format(address, unit))

# BC['parse_me'] = to_parse
# BC.parse_me = BC.parse_me

# BC['street_number'] = BC.parse_me.map(lambda addr: street_number(addr))
# BC['street_name'] = BC.parse_me.map(lambda addr: street_name(addr))

In [None]:
# BC.iloc[170:220][['full_address', 'street_name', 'street_name_fill', 'street_name_fill2', 'street_number', 'unit']]

In [None]:
BC.to_csv('output/childcare/bc-childcare.csv')