#### Address cleaning

**IN:** merged file

**OUT:** cleaned file

In [32]:
from postal.parser import parse_address
import pandas as pd
import os
import re

In [33]:
# Switch the working directory to the folder holding the merged childcare file.
# NOTE(review): this is an absolute, machine-specific path; the relative
# read of 'childcare-merged.csv' in the next cell depends on it.
os.chdir('/Users/kt/Documents/work/STATCAN/ODECF/Wrangling-ODECF/output/childcare/merged/')

In [52]:
# Load the merged childcare file (path is relative to the current working directory).
df=pd.read_csv('childcare-merged.csv')
# Keep an untouched copy of the original address before full_address is modified below.
df['source_full_address'] = df['full_address']

In [54]:
# correct ontario street numbers that turn to dates
# NOTE(review): this writes to a column named 'Street Number', while the later
# concatenation step reads `df.street_number`. If 'Street Number' is not an
# existing column, .loc creates a new column instead of correcting the value
# that is later used -- confirm the intended column name.
# NOTE(review): the assigned value starts with a space (' 10-16') -- confirm intended.
df.loc[df.name == 'Petawawa Military Family Resource Centre Private Home Day Care', 'Street Number'] = ' 10-16'

#### Full address cleaning

**Globally:**
1. Remove phone numbers 
2. Remove postal codes and fill in `postal_code` column
3. General processing

**Targeted:** to remove city (and province) names from full address
4. Remove unnecessary commas 
    * AB, MB, SK, PE, YT, QC
    
5. Find and fix obvious errors in data
    * rename NB streets
    



1. Remove phone numbers 

In [35]:
# Phone number written as ' (XXX) XXX-XXXX'; the leading whitespace is part of the match
# so that removing the number does not leave a dangling space behind.
_PHONE_RE = re.compile(r'\s\(\d{3}\)\s\d{3}-\d{4}')


def phone_rmv(x):
    """Remove phone numbers structured as ' (XXX) XXX-XXXX' from an address.

    Parameters
    ----------
    x : object
        One value of the full address column. Only ``str`` values are
        processed; anything else (e.g. NaN/None for a missing address) is
        returned unchanged.

    Returns
    -------
    object
        The address with every matching phone number removed, or ``x`` itself
        when it is not a string or contains no phone number.
    """
    if not isinstance(x, str):
        # missing values arrive as float NaN / None: pass them through untouched
        return x
    # sub() removes every phone number in the string, not only the first one found
    return _PHONE_RE.sub('', x)
    
# Run phone_rmv over every value of the full address column and store the result back.
df['full_address'] = df['full_address'].map(phone_rmv)

2. Remove postal codes and fill in `postal_code` column

In [36]:
# Postal code written as 'A1A 1A1': letter-digit-letter, one whitespace, digit-letter-digit.
_POSTAL_CODE_RE = re.compile(r'[a-zA-Z]\d[A-Za-z]\s\d[A-Za-z]\d')

# One entry is appended per call to pc_extract: the postal code found, or None.
pcs = []


def pc_extract(x):
    """Extract the postal code from a full address.

    The first postal code found is appended to the module-level list ``pcs``
    (``None`` is appended when there is none, or when ``x`` is not a string),
    so that after mapping this function over a column ``pcs`` holds one entry
    per row, in row order.

    Parameters
    ----------
    x : object
        One value of the full address column.

    Returns
    -------
    object
        The address with the extracted postal code text removed, or ``x``
        unchanged when nothing was extracted.
    """
    found = _POSTAL_CODE_RE.search(x) if isinstance(x, str) else None
    if found is None:
        pcs.append(None)
        return x
    code = found[0]
    pcs.append(code)
    # str.replace removes every occurrence of the extracted code text
    return x.replace(code, '')
    
# Strip the postal code out of every address; pc_extract appends what it found
# to `pcs`, one entry per row in row order.
df['full_address'] = df['full_address'].map(pc_extract)

# fill missing postal codes from what was extracted from full address using pc_extract
# The result is assigned back to the column: calling fillna(inplace=True) on
# `df.postal_code` acts on an intermediate object and is not guaranteed to
# update `df` (it does not under pandas Copy-on-Write).
# Reusing df.index pairs the extracted codes with their rows by position.
df['postal_code'] = df['postal_code'].fillna(pd.Series(pcs, index=df.index))

3. General processing

In [49]:
# Work on strings only from here on: missing addresses become the literal
# string 'nan' (they are converted back to missing values further down).
df['full_address'] = df['full_address'].astype(str)

# replace bad regex
# The keys are regular expressions, so regex=True is required; without it
# Series.replace only matches whole cell values and none of these ever fire.
# The number following 'rd #', 'road #', 'hwy #', 'highway #' is captured and
# re-emitted with \1 (a literal '\d+' is not valid in a replacement string).
# NOTE(review): lowercase 'suite 12a' now becomes '#12a' here, so the later
# removal pattern r'\bsuite\b \d+[a-zA-Z]' no longer sees it -- confirm which
# of the two outcomes is wanted.
unit_and_route_fixes = {
    r'\bappartement\b ': 'unit #',
    r'\bapt\b ': 'unit #',
    r'\bsuite\b ': '#',
    r'\bSUITE\b ': '#',
    r'\brd\b #(\d+)': r'road \1',
    r'\broad\b #(\d+)': r'road \1',
    r'\bhwy\b #(\d+)': r'highway \1',
    r'\bhighway\b #(\d+)': r'highway \1',
}
# Assign the result back instead of using inplace=True on the column accessor,
# which is not guaranteed to modify `df`.
df['full_address'] = df['full_address'].replace(unit_and_route_fixes, regex=True)

# remove bad regex
# Patterns are removed one after another, in list order, so a more specific
# pattern must come before a shorter pattern that matches its prefix:
# r'\blocal\b [a-zA-Z]\d+' is listed before r'\blocal\b [a-zA-Z]', otherwise
# 'local A12' would be cut down to '12' and the longer pattern could never match.
bad = [
    'ecole new era school m6',
    r':',
    r'\&',
    r'\bbureau\b \d+',
    r'\bbureau\b [a-zA-Z]',
    r'\bbureaux\b \d+',
    r'\bBox\b \d+',
    r'\blocal\b [a-zA-Z]\d+',
    r'\blocal\b [a-zA-Z]',
    r'\blocal\b \d+',
    r'\bsuite\b \d+[a-zA-Z]',
    r'\bCP\b \d+',
    r'\bcp\b \d+',
    r'\bgym\b',
    r'\bgymnasium\b',
    r'\blibrary\b \d+',
    r'\,',          # every comma is removed here, for all providers
    r'\(.*\)',      # greedy: removes from the first '(' to the last ')'
    r'\bc p\b \d+'
]

for b in bad:
    # compile once per pattern rather than looking it up for every row
    bad_rx = re.compile(b)
    # rx is bound as a default argument so each lambda keeps its own pattern
    df['full_address'] = df['full_address'].map(lambda x, rx=bad_rx: rx.sub('', x))

    
#------------------------------


# Rows coming from the Quebec provider are treated separately: they lose their
# commas but keep their dashes.
from_quebec = df.provider == 'Province of Quèbec'

# remove commas from QC
quebec_addresses = df.loc[from_quebec, 'full_address']
df.loc[from_quebec, 'full_address'] = quebec_addresses.replace({',':''}, regex = True)

# replace dashes except from QC
other_addresses = df.loc[~from_quebec, 'full_address']
df.loc[~from_quebec, 'full_address'] = other_addresses.replace('-',' ',regex=True)

# replace periods
df['full_address'] = df['full_address'].astype('str').map(lambda text: text.replace('.',''))

#replace multiple spaces, then remove leading/trailing white space
df['full_address'] = df['full_address'].replace(' +',' ',regex=True).str.strip()



4. Remove unnecessary commas 
    * AB, MB, SK, PE, YT

In [38]:
# Provinces whose provider label is 'Province of <name>'; Yukon uses its own label.
slct = ['Alberta', 'Manitoba', 'Saskatchewan', 'Prince Edward Island']
comma_providers = ['Province of {}'.format(s) for s in slct] + ['Yukon Territory']

# Strip commas from the addresses of all selected providers in one pass.
comma_rows = df.provider.isin(comma_providers)
df.loc[comma_rows, 'full_address'] = df.loc[comma_rows, 'full_address'].replace({',': ''}, regex = True)

5. Find and fix obvious errors in data

**Find errors**

In [39]:
# errors: road mill, road quispamsis, road sainte anne, road beaverbrook, road road,
# ok: road ragged, road\s, road old shediac, road macdonald, 3359 Cloverside Road Avon Cloverside road Avonmore ON (godaycare), road allowance
ind = -1
def find_weird_roads(x):
    """Return the first 'rue <letters> <digit>...' fragment found in x.

    Returns None when x contains no such fragment or is not a string.
    """
    # Two further patterns were drafted alongside this one but are not applied:
    #   r'road [a-zA-Z]*'  (english)
    #   r'rue [0-9].*'     (french)
    french_name_then_number = r'rue [a-zA-Z]* [0-9].*'
    try:
        hit = re.search(french_name_then_number, x)
    except TypeError:
        # non-string input (e.g. a missing value)
        return None
    return None if hit is None else hit[0]
    
    
def find_weird_hwys(x):
    """Return the first 'highway <word>' or 'highway <number>' fragment in x.

    The two draft patterns (letters after 'highway ', digits after 'highway ')
    are combined into one expression; previously the digit pattern sat behind
    an earlier ``return`` and could never run, so 'highway 17' was reported
    as just 'highway '.

    Parameters
    ----------
    x : object
        One address value.

    Returns
    -------
    str or None
        The matched fragment, or None when x is not a string or does not
        contain 'highway '.
    """
    if not isinstance(x, str):
        return None
    # letters are tried first; otherwise any (possibly empty) run of digits
    hit = re.search(r'highway (?:[a-zA-Z]+|[0-9]*)', x)
    return None if hit is None else hit[0]

# for f in df.full_address:
#     ind += 1
#     if find_weird_roads(f) != None:
#         print(find_weird_roads(f), "index: {}".format(ind))



**Fix obvious errors**
* rearrange street names that come after street (NB data)

In [40]:
# Street type written before the street name -> street name followed by type.
nb_street_fixes = {'Street Goodine' : 'goodine street',
                   'Street Botsford' : 'botsford street',
                   'Street Drummond' : 'drummond street',  # was misspelled 'drummon street'
                   'Street Grandview' : 'grandview street',
                   'Drive Rough Water' : 'rough water drive',
                   'Street Mcann' : 'mcann street',
                   'Street Priestman' : 'priestman street',
                   'Drive Elmwood' : 'elmwood drive',
                   'Street Paul' : 'paul street',
                   'Street Boucher': 'boucher street',
                   'Street Lepage' : 'lepage street',
                   'Street Andrew' : 'andrew street',
                   'Street Willis' : 'willis street',
                   'Street Main' : 'main street',
                   'Road Quispamsis' : 'Quispamsis Road',
                   'Street Ryan' : 'ryan street',
                   'Street Lady Russel' : 'lady russell street',
                   'Street Bonaccord' : 'bonaccord street'
                   }

# Series.replace without regex=True only matches cells whose ENTIRE value equals
# a key, so these fixes never applied to addresses such as '12 Street Goodine'.
# Each key is therefore turned into an escaped, word-bounded pattern (so that
# 'Street Paul' does not rewrite part of a longer name) and replaced as a regex.
nb_street_patterns = {r'\b' + re.escape(wrong) + r'\b': right
                      for wrong, right in nb_street_fixes.items()}

# Assign the result back instead of using inplace=True on the column accessor,
# which is not guaranteed to modify `df`.
# NOTE(review): matching is case-sensitive and runs over every provider, not
# only the NB rows -- confirm that is intended.
df['full_address'] = df['full_address'].replace(nb_street_patterns, regex=True)

* concat ON data into full address

In [41]:
# Addresses that are the literal string 'nan' are reset to a missing value.
df.loc[df['full_address'].eq('nan'), 'full_address'] = None

In [42]:
# Rows still missing a full address get '<street_number> <street_name>'.
# The result is assigned back to the column: fillna(inplace=True) on
# `df.full_address` acts on an intermediate object and is not guaranteed to
# update `df` (it does not under pandas Copy-on-Write).
df['full_address'] = df['full_address'].fillna(df['street_number'] + ' ' + df['street_name'])

* rearrange roads for found errors

In [43]:
def rearrange_roads(x):
    '''Put a misplaced 'road' back after the street name.

    Handles the known errors: road mill, road quispamsis, road sainte anne,
    road beaverbrook, road road. Rules are checked in that order and only the
    first one found in x is applied. Values that do not support the
    substring test (e.g. missing values) are returned unchanged.
    '''
    swaps = (
        ('road mill', 'mill road'),
        ('road quispamsis', 'quispamsis road'),
        ('road sainte anne', 'sainte anne road'),
        ('road beaverbrook', 'beaverbrook road'),
        ('road road', 'road'),
    )
    try:
        for wrong, right in swaps:
            if wrong in x:
                return x.replace(wrong, right)
    except TypeError:
        # x is not a string-like container: leave it as it is
        pass
    return x

# Apply rearrange_roads to the New Brunswick provider's addresses only.
from_new_brunswick = df.provider == 'Province of New Brunswick'
df.loc[from_new_brunswick, 'full_address'] = df.loc[from_new_brunswick, 'full_address'].map(rearrange_roads)

* add hashtag to French floors

In [44]:
def fix_fr_floor(x):
    '''Add a hashtag in front of French floor designations for easy parsing.

    '2e étage' becomes '#2e étage'. The whole floor number is captured, so
    '10e étage' becomes '#10e étage' (a single-digit match would split it into
    '1#0e étage'). A floor that already carries a '#' is left alone, which
    makes it safe to run the function more than once.

    Parameters
    ----------
    x : object
        One address value; non-string values are returned unchanged.

    Returns
    -------
    object
        The address with '#' inserted before every '<digits>e étage'.
    '''
    if not isinstance(x, str):
        return x
    # (?<![#0-9]) : do not start in the middle of a number or after an existing '#'
    return re.sub(r'(?<![#0-9])([0-9]+e étage)', r'#\1', x)
    
# Apply fix_fr_floor to the Quebec provider's addresses only.
quebec_rows = df.provider == 'Province of Quèbec'
df.loc[quebec_rows, 'full_address'] = df.loc[quebec_rows, 'full_address'].map(fix_fr_floor)

In [50]:
# check removal of CP
# df.full_address.iloc[1302]

'5111 46 RUE'

In [45]:
# drop unnecessary columns
# errors='ignore' keeps this cell re-runnable: once the columns are gone, a
# second run would otherwise raise KeyError.
df = df.drop(columns = ['Unnamed: 0', 'Unnamed: 0.1'], errors = 'ignore')

In [51]:
# Write the pre-cleaned table for the next processing step.
# NOTE(review): absolute, machine-specific output path.
# NOTE(review): to_csv writes the row index by default, which shows up as an
# 'Unnamed: 0' column when the file is read back -- presumably the origin of the
# 'Unnamed: ...' columns dropped above; confirm with downstream readers before
# switching to index=False.
df.to_csv('/Users/kt/Documents/work/STATCAN/ODECF/Wrangling-ODECF/output/childcare/precleaned/childcare-precleaned.csv')