# Data Wrangle  - Data Audit and Clean 

### Downloaded the file from https://www.openstreetmap.org/relation/324211 

### 1. Creating the Sample Toronto OSM file.

In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import xml.etree.ElementTree as ET  # Use cElementTree or lxml if too slow

OSM_FILE = "toronto_canada.osm"  # Replace this with your osm file
SAMPLE_FILE = "torontosample.osm"

k = 15 # Parameter: take every k-th top level element

def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield element if it is the right type of tag

    Reference:
    http://stackoverflow.com/questions/3095434/inserting-newlines-in-xml-file-generated-via-xml-etree-elementtree-in-python
    """
    context = iter(ET.iterparse(osm_file, events=('start', 'end')))
    _, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()


with open(SAMPLE_FILE, 'wb') as output:
    output.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    output.write('<osm>\n  ')

    # Write every kth top level element
    for i, element in enumerate(get_element(OSM_FILE)):
        if i % k == 0:
            output.write(ET.tostring(element, encoding='utf-8'))

    output.write('</osm>')


### 2. Audit the Data

In [6]:
"""
It should return a dictionary with the tag name as the key and number of times this tag can be encountered in 
the map as value.

"""
import xml.etree.cElementTree as ET
import pprint

def count_tags(filename):
    dict_ = {}
    for event,elem in ET.iterparse(filename):
        if elem.tag not in dict_:
            dict_[elem.tag] = 1
        else:
            dict_[elem.tag] += 1
    return dict_

def test():
    tags = count_tags('toronto_canada.osm')
    #tags = count_tags('torontosample.osm')
    pprint.pprint(tags) 

if __name__ == "__main__":
    test()

{'bounds': 1,
 'member': 151372,
 'nd': 5875055,
 'node': 5127518,
 'osm': 1,
 'relation': 9677,
 'tag': 5030403,
 'way': 761285}


In [7]:
"""
From lecture I have 3 regular expressions to check for certain patterns in the tags.
I would like to change the data model and expand the "addr:street" type of keys to a dictionary like this:
{"address": {"street": "Some value"}}

So, I want to see. I have such tags, and if I have any tags with problematic characters.

I have a count of each of four tag categories in a dictionary:
1.  "lower"        - for tags that contain only lowercase letters and are valid
2.  "lower_colon"  - for otherwise valid tags with a colon in their names
3.  "problemchars" - for tags with problematic characters 
4.  "other"        - for other tags that do not fall into the other three categories.

"""
import xml.etree.cElementTree as ET
import pprint
import re

lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

osm_file = "toronto_canada.osm"
#osm_file = "torontosample.osm"

def key_type(element, keys):
    if element.tag == "tag":
        att = element.attrib['k'] 
        m = lower.search(att)
        p = problemchars.search(att)
        n = lower_colon.search(att)
        if m:
            keys["lower"] += 1
        elif n:
            keys["lower_colon"] += 1
        elif p:
            keys["problemchars"] += 1
        else:
            keys["other"] += 1
#pass    
    return keys

def process_map(filename):
    keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}
    for _, element in ET.iterparse(filename):
        keys = key_type(element, keys)

    return keys

def test():
    #keys = process_map('torontosample.osm')
    keys = process_map(osm_file)
    pprint.pprint(keys)

if __name__ == "__main__":
    test()

{'lower': 2970797,
 'lower_colon': 1939747,
 'other': 119222,
 'problemchars': 637}


In [None]:
3. I am auditing and finding the abbreviated street name.

In [8]:
"""
The OSM file used is an abbreviated version of the Toronto Mapzen file  
"""

import xml.etree.cElementTree as ET
from collections import defaultdict
import re

#osm_file = open("torontosample.osm", "r")
osm_file = open("toronto_canada.osm", "r")
street_type_re = re.compile(r'\S+\.?$', re.IGNORECASE)
street_types = defaultdict(int)

def audit_street_type(street_types, street_name):
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()

        street_types[street_type] += 1

def print_sorted_dict(d):
    keys = d.keys()
    keys = sorted(keys, key=lambda s: s.lower())  # Python 2.7 ver
    for k in keys:
        v = d[k]
        print "%s: %d" % (k, v) 

def is_street_name(elem):
    return (elem.tag == "tag") and (elem.attrib['k'] == "addr:street")

def audit():
    for event, elem in ET.iterparse(osm_file):
        if is_street_name(elem):
            audit_street_type(street_types, elem.attrib['v'])    
    print_sorted_dict(street_types)    

if __name__ == '__main__':
    audit()

#1: 2
#10: 1
#101: 1
#106: 1
#11: 2
#110: 2
#12A: 1
#17: 1
#18: 1
#185: 1
#19: 2
#2: 2
#200: 1
#202: 2
#204: 1
#23B: 1
#2801: 1
#282: 1
#2A: 1
#3: 1
#318: 1
#332: 1
#364: 1
#37: 1
#38: 1
#4: 4
#409: 1
#4B: 1
#5: 3
#5-6: 1
#57: 1
#6: 1
#7: 1
#8: 1
#A: 1
#A3: 1
#E8: 1
#G: 1
#PHW: 1
1: 329
10: 443
101: 4
102: 4
106: 2
107: 6
109: 19
10a: 4
11: 108
115c: 1
11a: 4
11b: 8
12: 139
124: 45
125: 12
12a: 4
12b: 4
13: 36
132: 2
13b: 4
14: 37
141: 2
146: 6
147: 4
14a: 4
14b: 6
14c: 2
14d: 4
14e: 2
14f: 2
15: 140
16: 60
17: 127
18: 120
19: 56
2: 517
20: 140
21: 76
22: 19
23: 36
24: 52
25: 159
26: 16
27: 379
28: 24
2b: 8
3: 387
30: 12
300: 1
302: 1
32: 52
34: 46
38: 1
39: 10
4: 359
400: 4
403: 2
42: 1
47: 98
48: 203
4b: 8
5: 283
50: 312
500: 1
52: 21
56: 2
57: 132
5700: 1
5a: 4
6: 459
6a: 6
7: 1190
7-12: 2
7;12: 5
7A: 84
8: 511
88: 53
89: 213
89): 1
8a: 4
9: 340
97: 32
99: 4
Abbey: 4
Access: 4
Acres: 14
Adjala: 180
Agostino: 4
Alley: 26
Alliston: 40
Amaranth: 42
Amberdale: 21
Ames: 8
Amici: 12
Anton

## 3. Imporving Stree Names, Postal Codes and Phone Numbers

In [None]:
4. Mapping the Street name to the expected name.
   After mapped to the expected names write the data to "toronto_canada1.osm" file

In [10]:

import xml.etree.cElementTree as ET
from collections import defaultdict
import re
import pprint
 
OSM_FILE = "toronto_canada.osm"
#OSM_FILE = "torontosample.osm"
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)


expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road", 
            "Trail", "Parkway", "Commons", "Terrace" ,"Expressway", "Crescent", "Close", "Common", "Heights", "Way"]

mapping = { "St": "Street",
            "St.": "Street",
            "ST": "Street",
            "Ben":"Bend",
            "Glenn":"Glen",
            "Hrbr":"Harbour",
            "Ho":"Hollow",
            "Mews":"Medows",
            "Pkwy":"Parkway",
            "Wood":"Woods",
            "Ave": "Avenue",
            "ave": "Avenue",
            "Rd.": "Road",
            "Blvd":"Boulevard",
            "Dr.":"Drive",
            "Ct":"Court",
            "Pl":"Place",
            "Sq":"Square",
            "ln":"Lane",
            "SW": "Southwest ",
            "SE": "Southeast ",
            "NW": "Northwest ",
            "NE": "Northeast ",
            'CT': 'Court',
            'Ct': 'Court',
            'Dr': 'Drive',
            'Dr.': 'Drive',
            'E': 'East',
            'Main St': 'Main Street',
            'N': 'North',
            'NE': 'Northeast',
            'NW': 'Northwest',
            'nw': 'Northwest',
            'PL': 'Place',
            'Pl': 'Place',
            'Rd': 'Road',
            'RD': 'Road',
            'Rd.': 'Road',
            'S': 'South',
            'S.': 'South',
            'SE': 'Southeast',
            'ST': 'Street',
            'SW': 'Southwest',
            'SW,': 'Southwest',
            'Se': 'Southeast',
            'southeast': 'Southeast',
            'St': 'Street',
            'st': 'Street',
            'Ter': 'Terrace',
            'W': 'West',
            'west': 'West',
            'HYW': 'Highway',
            'WY': 'Way',
            "Avebue":"Avenue",
            "Avenu":"Avenue"
            }


def audit_street_type(street_types, street_name):
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()
        if street_type not in expected:
            street_types[street_type].add(street_name)

def is_street_name(elem):
    return (elem.attrib['k'] == "addr:street")

def audit(osmfile):
    osm_file = open(osmfile, "r")
    street_types = defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=("start",)):

        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])
    osm_file.close()
    return street_types

def update_name(name, mapping):
    m = street_type_re.search(name)
    if m:
        street_type = m.group()
        if street_type not in expected:
            if street_type in mapping.keys():
                name = re.sub(street_type_re, mapping[street_type], name)
                               
    return name

#OSM_FILE_UPDATED = "toronto_canada1.osm"

"""
def test():
    st_types = audit(OSM_FILE)
    pprint.pprint(dict(st_types))

    for st_type, ways in st_types.iteritems():
        for name in ways:
            better_name = update_name(name, mapping)
            print name, "=>", better_name

if __name__ == '__main__':
    test()
    
"""
# Mapped to the expected names, save/write the data to "toronto_canada1.osm" file 

OSM_FILE_UPDATED = "toronto_canada1.osm"
# Takes as input osm file and tuple of nodes and yield nodes of types from tuple. 

def get_element(osm_file, tags=('node', 'way', 'relation')):
    context = iter(ET.iterparse(osm_file, events=('start', 'end')))
    _, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()
            
# Following function will update the abbreviations in osm file

def update_street(original_file, update_file):
    with open(update_file, 'wb') as output:
        output.write('<?xml version="1.0" encoding="UTF-8"?>\n')
        output.write('<osm>\n  ') 
        for i, element in enumerate(get_element(original_file)):
            for tag in element.iter("tag"):
                if is_street_name(tag):
                    tag.set('v',update_name(tag.attrib['v'], mapping))
            output.write(ET.tostring(element, encoding='utf-8'))
        output.write('</osm>')

update_street(OSM_FILE, OSM_FILE_UPDATED)


In [None]:
5. Auditing the for postcode inconsistent

In [11]:

import xml.etree.cElementTree as ET
from collections import defaultdict
import re

postcode_type_re = re.compile(r'\d{5}-??')

def audit_post_type(post_types, zip):
    m = postcode_type_re.search(zip)
    if m:
        post_type = m.group()
        if post_type not in post_types:
            post_types[post_type].add(zip)
    else:
        post_types['unknown'].add(zip)

def is_pcode(elem):
    return (elem.attrib['k'] == "addr:postcode")

def postcode_audit(osmfile):
    osm_file = open(osmfile, "r")    
    post_types = defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=("start",)):
        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_pcode(tag):
                    audit_post_type(post_types, tag.attrib['v'])                
    osm_file.close()
    return post_types

postcode_types = postcode_audit(OSM_FILE_UPDATED)

pprint.pprint(dict(postcode_types))

OSM_FILE_UPDATED_PC = "toronto_canada2.osm"
# This function replace abbrevition by right zip
def u_postcode(zip):
    m = postcode_type_re.search(zip)
    if m:
        return m.group()
    else:
        return 'unknown'

# This function replace wrong zip in osm file
def update_postcode(original_file, update_file):
    with open(update_file, 'wb') as output:
        output.write('<?xml version="1.0" encoding="UTF-8"?>\n')
        output.write('<osm>\n  ') 
        for i, element in enumerate(get_element(original_file)):
            for tag in element.iter("tag"):
                if is_pcode(tag):
                    tag.set('v',u_postcode(tag.attrib['v']))
            output.write(ET.tostring(element, encoding='utf-8'))
        output.write('</osm>')

update_postcode(OSM_FILE_UPDATED, OSM_FILE_UPDATED_PC)



{'14174': set(['14174']),
 '33913': set(['33913']),
 '96734': set(['96734']),
 'unknown': set(['B2W 5M5',
                 'B2Y4N4',
                 'K4A 1W9',
                 'L0B 1A0',
                 'L0B 1J0',
                 'L0B 1L0',
                 'L0C 1A0',
                 'L0C 1B0',
                 'L0C 1K0',
                 'L0E 1T0',
                 'L0G 1A0',
                 'L0G 1B0',
                 'L0G 1J0',
                 'L0G 1L0',
                 'L0G 1M0',
                 'L0G 1N0',
                 'L0G 1R0',
                 'L0G 1T0',
                 'L0G 1V0',
                 'L0G 1W0',
                 'L0G1A0',
                 'L0G1M0',
                 'L0G1T0',
                 'L0G1W0',
                 'L0H 1G0',
                 'L0J 1C0',
                 'L0L 1L0',
                 'L0M 1A0',
                 'L0N 1A0',
                 'L0N 1B0',
                 'L0N 1C0',
                 'L0N 1E0',
                 'L0N 1M0',
   

                 'L7A3Z6',
                 'L7B 0C6',
                 'L7B 0L9',
                 'L7B 0N1',
                 'L7B 1A2',
                 'L7B 1A3',
                 'L7B 1A6',
                 'L7B 1A8',
                 'L7B 1C9',
                 'L7B 1G2',
                 'L7B 1G3',
                 'L7B 1G4',
                 'L7B 1H1',
                 'L7B 1H5',
                 'L7B 1H7',
                 'L7B 1H8',
                 'L7B 1J2',
                 'L7B 1K2',
                 'L7B 1K5',
                 'L7B 1L1',
                 'L7B 1L5',
                 'L7B 1L6',
                 'L7B 1M3',
                 'L7C 0K7',
                 'L7C 0S1',
                 'L7C 0W5',
                 'L7C 0Z5',
                 'L7C 1E6',
                 'L7C 1H8',
                 'L7C 1H9',
                 'L7C 1N3',
                 'L7C 1W8',
                 'L7C 2A1',
                 'L7C 2H9',
                 'L7C 2J3',
                 'L7C

                 'M6G 3L6',
                 'M6G 3S8',
                 'M6G 3T5',
                 'M6G 3Z6',
                 'M6G3N1',
                 'M6H 1A6',
                 'M6H 1A7',
                 'M6H 1B5',
                 'M6H 1C3',
                 'M6H 1C5',
                 'M6H 1J6',
                 'M6H 1L4',
                 'M6H 1M3',
                 'M6H 1M4',
                 'M6H 1M7',
                 'M6H 1M9',
                 'M6H 1N4',
                 'M6H 1N7',
                 'M6H 1V4',
                 'M6H 1Y3',
                 'M6H 1Z7',
                 'M6H 2A6',
                 'M6H 2J6',
                 'M6H 2N9',
                 'M6H 2Z3',
                 'M6H 3G4',
                 'M6H 3L8',
                 'M6H 3M8',
                 'M6H 3P1',
                 'M6H 3S4',
                 'M6H 3Z6',
                 'M6H 4A9',
                 'M6H 4B1',
                 'M6H 4B4',
                 'M6H 4C7',
                 'M6H

In [None]:
6. Auditing for the inconsistent phone numbers or format

In [12]:

# Compiler for cleaning phone format 
phone_type_re = re.compile(r'\d{3}\)?-?\s?.?\d{3}\s?-?\s?.?\d{4}')
phone_re = re.compile('\.|\)|\s|-')

def audit_phone_type(phone_types, phone):
    m = phone_type_re.search(phone)
    if m:
        phone_type = m.group()
        if phone_type not in phone_types:
            new_phone = phone_re.sub('',phone_type)
            new_phone = ('+1-' + new_phone[:3] + '-' +
                         new_phone[3:6] + '-' + new_phone[6:])
            phone_types[new_phone].add(phone)
    else:
        phone_types['unknown'].add(phone)
        
def is_phone(elem):
    return (elem.attrib['k'] == "phone")

def phone_audit(osmfile):
    osm_file = open(osmfile, "r")    
    phone_types = defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=("start",)):
        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_phone(tag):
                    audit_phone_type(phone_types, tag.attrib['v'])                
    osm_file.close()
    return phone_types
    
phones_types = phone_audit(OSM_FILE_UPDATED_PC)
# To validate or check the phone numbers format after the changes, uncomment the next line and comment the above one
#phones_types = phone_audit(OSM_FILE_UPDATED_PH) 

pprint.pprint(dict(phones_types))

OSM_FILE_UPDATED_PH = "toronto_canada3.osm"

# Following function update phone numbers to the correct format 

def u_phone(phone):
    m = phone_type_re.search(phone)
    if m:
        new_phone = phone_re.sub('', m.group())
        return ('+1-' + new_phone[:3] + '-' + new_phone[3:6] +
                '-' + new_phone[6:])        
    else:
        return phone

# This function replace the incosistent phone numbers with the right format

def update_phone(original_file, update_file):
    with open(update_file, 'wb') as output:
        output.write('<?xml version="1.0" encoding="UTF-8"?>\n')
        output.write('<osm>\n  ') 
        for i, element in enumerate(get_element(original_file)):
            for tag in element.iter("tag"):
                if is_phone(tag):
                    tag.set('v',u_phone(tag.attrib['v']))
            output.write(ET.tostring(element, encoding='utf-8'))
        output.write('</osm>')

update_phone(OSM_FILE_UPDATED_PC, OSM_FILE_UPDATED_PH)
     


{'+1-128-993-89380': set(['+12899389380']),
 '+1-128-999-70644': set(['12899970644']),
 '+1-141-621-40008': set(['14162140008']),
 '+1-141-624-39362': set(['+14162439362']),
 '+1-141-625-11144': set(['+14162511144']),
 '+1-141-629-30404': set(['+14162930404']),
 '+1-141-629-87917': set(['+14162987917']),
 '+1-141-631-42901': set(['+14163142901']),
 '+1-141-635-17065': set(['+14163517065']),
 '+1-141-636-10331': set(['14163610331']),
 '+1-141-636-17454': set(['+14163617454']),
 '+1-141-636-40054': set(['+14163640054']),
 '+1-141-636-70633': set(['+14163670633']),
 '+1-141-639-72014': set(['+14163972014']),
 '+1-141-646-19887': set(['+14164619887']),
 '+1-141-646-34927': set(['+14164634927']),
 '+1-141-646-56434': set(['+14164656434']),
 '+1-141-650-42121': set(['+14165042121']),
 '+1-141-656-74576': set(['+14165674576']),
 '+1-141-659-59995': set(['14165959995']),
 '+1-141-662-32323': set(['+14166232323']),
 '+1-141-665-19999': set(['+14166519999']),
 '+1-141-667-45959': set(['+14166745

 '+1-416-253-6466': set(['+1 416 253-6466']),
 '+1-416-253-7081': set(['+1 416 253 7081']),
 '+1-416-253-7224': set(['416-253-7224']),
 '+1-416-253-7364': set(['416-253-7364']),
 '+1-416-253-8800': set(['+1 416 253-8800']),
 '+1-416-253-9956': set(['416-253-9956']),
 '+1-416-255-0171': set(['416-255-0171']),
 '+1-416-255-0188': set(['416-255-0188']),
 '+1-416-255-0343': set(['+1(416) 255-0343']),
 '+1-416-255-0555': set(['+1 416 255-0555']),
 '+1-416-255-1155': set(['+1 416-255-1155']),
 '+1-416-255-1285': set(['+1 416 255-1285']),
 '+1-416-255-1414': set(['(416) 255-1414']),
 '+1-416-255-1566': set(['416-255-1566']),
 '+1-416-255-1974': set(['416-255-1974']),
 '+1-416-255-2222': set(['+1 416 255-2222']),
 '+1-416-255-2388': set(['+1 416-255-2388']),
 '+1-416-255-2606': set(['416-255-2606']),
 '+1-416-255-2761': set(['416-255-2761']),
 '+1-416-255-3100': set(['416-255-3100']),
 '+1-416-255-3333': set(['+1-416-255-3333']),
 '+1-416-255-3500': set(['416-255-3500']),
 '+1-416-255-3945': s

 '+1-416-489-4899': set(['4164894899']),
 '+1-416-489-5896': set(['+1 416 489 5896']),
 '+1-416-489-7368': set(['4164897368']),
 '+1-416-489-7369': set(['+1 416 489 7369']),
 '+1-416-489-7405': set(['416-489-7405']),
 '+1-416-489-7447': set(['416-489-7447']),
 '+1-416-489-7488': set(['416-489-7488']),
 '+1-416-489-7600': set(['416-489-7600']),
 '+1-416-489-7653': set(['+1 416 489 7653']),
 '+1-416-489-7931': set(['416-489-7931']),
 '+1-416-489-8387': set(['416-489-8387']),
 '+1-416-489-8472': set(['+1 416 489 8472']),
 '+1-416-489-8922': set(['416-489-8922']),
 '+1-416-490-9890': set(['(416)490-9890']),
 '+1-416-491-1417': set(['+1 416 491-1417']),
 '+1-416-491-1828': set(['416-491-1828']),
 '+1-416-491-2597': set(['+1 416 491 2597']),
 '+1-416-491-2727': set(['+1 416 491 2727']),
 '+1-416-491-4212': set(['+1 416 491 4212']),
 '+1-416-491-8706': set(['416-491-8706']),
 '+1-416-492-1642': set(['(416) 492-1642']),
 '+1-416-492-3113': set(['(416) 492-3113']),
 '+1-416-492-3937': set(['416

 '+1-416-798-2900': set(['+1 416-798-2900', '416-798-2900']),
 '+1-416-798-3178': set(['(416) 798-3178']),
 '+1-416-798-3500': set(['416 798 3500']),
 '+1-416-798-8840': set(['416-798-8840']),
 '+1-416-798-9755': set(['1-416-798-9755']),
 '+1-416-799-8344': set(['+1 416 799-8344']),
 '+1-416-800-4520': set(['+1 416 800 4520']),
 '+1-416-800-6580': set(['416-800-6580']),
 '+1-416-804-0745': set(['416-804-0745']),
 '+1-416-807-6004': set(['416-807-6004']),
 '+1-416-815-1111': set(['416-815-1111']),
 '+1-416-815-5500': set(['(416) 815-5500']),
 '+1-416-815-7325': set(['416-815-7325']),
 '+1-416-815-7562': set(['416-815-7562']),
 '+1-416-815-8076': set(['+1-416-815-8076']),
 '+1-416-815-8387': set(['+1-416-815-8387']),
 '+1-416-815-9898': set(['(416) 815-9898']),
 '+1-416-823-4433': set(['416-823-4433']),
 '+1-416-824-8180': set(['416-824-8180']),
 '+1-416-827-2407': set(['416-827-2407']),
 '+1-416-828-2596': set(['416-828-2596']),
 '+1-416-828-3815': set(['+1 416 828-3815', '+1 416-828-38

 '+1-647-693-5831': set(['647 693 5831']),
 '+1-647-700-6256': set(['647-700-6256']),
 '+1-647-710-8799': set(['+1 647 710-8799']),
 '+1-647-720-5460': set(['+1 647 720-5460']),
 '+1-647-721-5264': set(['6477215264']),
 '+1-647-722-2370': set(['+1 (647) 722-2370',
                         '+1 647 722 2370',
                         '647-722-2370']),
 '+1-647-722-6329': set(['647-722-6329']),
 '+1-647-725-2211': set(['647-725-2211']),
 '+1-647-727-6909': set(['647-727-6909']),
 '+1-647-728-3858': set(['647-728-3858']),
 '+1-647-748-0099': set(['647-748-0099']),
 '+1-647-748-0220': set(['647-748-0220']),
 '+1-647-748-1122': set(['647-748-1122']),
 '+1-647-748-1500': set(['+1 647 7481500']),
 '+1-647-748-1924': set(['647-748-1924']),
 '+1-647-748-2121': set(['+1 647 748 2121']),
 '+1-647-748-2333': set(['6477482333']),
 '+1-647-748-2350': set(['647-748-2350']),
 '+1-647-748-2663': set(['+1 647 748-2663']),
 '+1-647-748-2917': set(['+1 647-748-2917']),
 '+1-647-748-3009': set(['647-748-300

 '+1-905-337-9331': set(['+1 905-337-9331']),
 '+1-905-337-9606': set(['+1 905 337-9606']),
 '+1-905-337-9899': set(['905-337-9899']),
 '+1-905-338-1255': set(['+1 905 338 1255']),
 '+1-905-338-2262': set(['905-338-2262']),
 '+1-905-338-6236': set(['+1 905 338-6236']),
 '+1-905-338-9000': set(['+1 905 338 9000']),
 '+1-905-338-9999': set(['905-338-9999']),
 '+1-905-339-0005': set(['+1 905-339-0005']),
 '+1-905-339-0404': set(['905-339-0404']),
 '+1-905-339-0812': set(['+1 905 339-0812']),
 '+1-905-339-0900': set(['905-339-0900']),
 '+1-905-339-0988': set(['+1 905 339-0988']),
 '+1-905-339-1066': set(['905-339-1066']),
 '+1-905-339-1604': set(['905 339 1604']),
 '+1-905-339-1755': set(['+905-339-1755']),
 '+1-905-339-2224': set(['+1 905 339-2224']),
 '+1-905-339-3678': set(['905-339-3678']),
 '+1-905-357-3331': set(['905-357-3331']),
 '+1-905-362-0153': set(['+1 905-362-0153']),
 '+1-905-362-0226': set(['905-362-0226']),
 '+1-905-362-1234': set(['+1-905-362-1234']),
 '+1-905-362-1340': 

 '+1-905-615-4760': set(['+1 905 615-4760']),
 '+1-905-615-4835': set(['+1 905 615 4835']),
 '+1-905-615-4850': set(['905 615 4850']),
 '+1-905-615-4855': set(['905-615-4855']),
 '+1-905-615-8288': set(['+1 905 615-8288']),
 '+1-905-615-9009': set(['9056159009']),
 '+1-905-616-1863': set(['+1 905-616-1863']),
 '+1-905-616-5003': set(['+1 905-616-5003']),
 '+1-905-618-9934': set(['+1 905-618-9934']),
 '+1-905-624-0688': set(['+1 905 624-0688']),
 '+1-905-624-1995': set(['+1 905 624-1995']),
 '+1-905-624-2100': set(['+1 905 624-2100']),
 '+1-905-624-3034': set(['+1 905-624-3034']),
 '+1-905-624-3200': set(['+1 905 624-3200']),
 '+1-905-624-3404': set(['+1 905 624-3404']),
 '+1-905-624-5040': set(['+1 905 624-5040']),
 '+1-905-624-5545': set(['+1 905 624-5545']),
 '+1-905-624-6424': set(['+1 905-624-6424']),
 '+1-905-624-6900': set(['+1 (905) 624-6900']),
 '+1-905-624-8388': set(['+1 905 624-8388']),
 '+1-905-624-8448': set(['+1 905 624-8448']),
 '+1-905-624-8681': set(['+1 905 624-8681']

 '+1-905-823-2000': set(['1 905 823 2000']),
 '+1-905-823-2626': set(['905 823 2626']),
 '+1-905-823-3900': set(['+1 905 823-3900']),
 '+1-905-823-4335': set(['905 823 4335']),
 '+1-905-823-4440': set(['1 905 823 4440']),
 '+1-905-823-4510': set(['905 823 4510']),
 '+1-905-823-5535': set(['905 823 5535']),
 '+1-905-823-5999': set(['1 905 823 5999']),
 '+1-905-823-6341': set(['1 905 823 6341']),
 '+1-905-823-6520': set(['+1 905 823 6520']),
 '+1-905-823-6878': set(['905 823 6878']),
 '+1-905-823-7600': set(['+1 905 823-7600']),
 '+1-905-823-7636': set(['+1 905 823-7636']),
 '+1-905-823-8430': set(['905 823 8430']),
 '+1-905-823-8787': set(['905 823 8787']),
 '+1-905-823-8818': set(['905-823-8818']),
 '+1-905-823-8988': set(['905-823-8988']),
 '+1-905-823-9252': set(['+1 905 823 9252']),
 '+1-905-823-9555': set(['905-823-9555']),
 '+1-905-824-0155': set(['+1 905 824-0155']),
 '+1-905-824-0360': set(['+1 905 824-0360']),
 '+1-905-824-0727': set(['+1 905 824-0727']),
 '+1-905-824-1025': se

 '+1-905-891-3875': set(['+1 905 891-3875']),
 '+1-905-891-3978': set(['+1 905 891-3978']),
 '+1-905-891-5314': set(['905-891-5314']),
 '+1-905-891-5424': set(['9058915424']),
 '+1-905-891-5500': set(['+1 905 891 5500']),
 '+1-905-891-5999': set(['905-891-5999']),
 '+1-905-891-6061': set(['1 905 891 6061']),
 '+1-905-891-7393': set(['905-891-7393']),
 '+1-905-891-7867': set(['905-891-7867']),
 '+1-905-891-8412': set(['+1 905-891-8412']),
 '+1-905-891-8463': set(['9058918463']),
 '+1-905-891-8688': set(['1 905 891 8688']),
 '+1-905-891-8888': set(['1 905 891 8888;1 905 990 9100']),
 '+1-905-891-9448': set(['1 905 891 9448']),
 '+1-905-891-9493': set(['905-891-9493']),
 '+1-905-891-9642': set(['905-891-9642']),
 '+1-905-891-9696': set(['905-891-9696']),
 '+1-905-893-1661': set(['+1 905 893-1661']),
 '+1-905-893-1711': set(['905-893-1711']),
 '+1-905-893-2111': set(['+1 905 893 2111']),
 '+1-905-893-2829': set(['905-893-2829']),
 '+1-905-893-5594': set(['+1-905-893-5594']),
 '+1-905-893-7

Seeing abnormal or not regular format phone numbers as following

 'unknown': set(['(416) 536-SODA',
                 '+1 905 -90-4110',
                 '1 905 891 326',
                 '439-0000'])}

In [None]:
7. Find out how many unique users have contributed to the map in this particular area.

In [13]:
#The function process_map should return a set of unique user IDs ("uid")

import xml.etree.cElementTree as ET
import pprint
import re

def get_user(element):
    if element.get('uid'):
        return element.get('uid')
    
def process_map(filename):
    users = set()
    for _, element in ET.iterparse(filename):
        users.add(get_user(element))
        users.discard(None)
    return users

def test():

    users = process_map('toronto_canada3.osm')
    pprint.pprint(users)


if __name__ == "__main__":
    test()
    

set(['1',
     '100337',
     '100419',
     '1007022',
     '1007528',
     '100917',
     '1009527',
     '101184',
     '1012362',
     '1015527',
     '1015528',
     '1015531',
     '1015532',
     '1015536',
     '1015542',
     '102064',
     '10225',
     '102410',
     '102723',
     '103253',
     '1035032',
     '10353',
     '103530',
     '1035507',
     '103574',
     '103816',
     '1044834',
     '104519',
     '1051550',
     '105255',
     '105603',
     '105839',
     '1058666',
     '1060930',
     '106293',
     '1063003',
     '1066031',
     '106858',
     '106874',
     '106914',
     '1069176',
     '10716',
     '1074988',
     '10786',
     '10788',
     '107884',
     '1080971',
     '1083420',
     '108634',
     '1087647',
     '109002',
     '10927',
     '1093914',
     '109925',
     '110046',
     '110253',
     '110263',
     '1103322',
     '1103744',
     '1106095',
     '1108251',
     '110915',
     '1110934',
     '1114651',
     '111504',
     '

     '363803',
     '3638627',
     '3642265',
     '3642441',
     '3642793',
     '3642847',
     '3642909',
     '3642945',
     '3643219',
     '3643482',
     '3645304',
     '364790',
     '364916',
     '3650730',
     '3651808',
     '3656118',
     '365897',
     '3659083',
     '3663623',
     '366834',
     '36694',
     '3673398',
     '36737',
     '3675101',
     '3675236',
     '3676570',
     '3679137',
     '368614',
     '368874',
     '3695876',
     '3696414',
     '3696841',
     '37074',
     '37137',
     '3714149',
     '3720404',
     '3720929',
     '3724915',
     '3741753',
     '3747854',
     '37548',
     '3755964',
     '3758399',
     '3760084',
     '376330',
     '3769434',
     '3770899',
     '3772277',
     '3781487',
     '3782676',
     '3786635',
     '3787045',
     '3788787',
     '3794462',
     '3796219',
     '379970',
     '3803883',
     '3804592',
     '38090',
     '3810551',
     '3813367',
     '3818009',
     '3820742',
     '3821385

     '5216739',
     '5217962',
     '5221799',
     '5221845',
     '5223029',
     '5223984',
     '5224340',
     '5224474',
     '5225518',
     '5228586',
     '522859',
     '5229053',
     '5231095',
     '5231316',
     '523148',
     '5234603',
     '5236645',
     '5236942',
     '5237354',
     '5237720',
     '5239907',
     '524092',
     '5242417',
     '524500',
     '5245700',
     '5245874',
     '5252604',
     '5253809',
     '5254945',
     '52552',
     '5256253',
     '526375',
     '526846',
     '5268983',
     '5270261',
     '5274020',
     '5274327',
     '5275093',
     '527535',
     '5275576',
     '5276164',
     '527721',
     '52797',
     '528001',
     '5280056',
     '5283995',
     '5286660',
     '52867',
     '528847',
     '5289320',
     '5294873',
     '52982',
     '530151',
     '5304006',
     '53073',
     '5310279',
     '5311834',
     '5315651',
     '5319316',
     '5326425',
     '5328301',
     '5328410',
     '5329887',
     '533089'

### 4. Prepair for Database SQL 

After auditing is complete the next step is to prepare the data to be inserted into a SQL database.
   To do so I will parse the elements in the OSM XML file, transforming them from document format to
   tabular format.

   Making it possible to write to .csv files.  These csv files can then easily be imported to a SQL database as tables.

The process for this transformation is as follows:
- Use iterparse to iteratively step through each top level element in the XML
- Shape each element into several data structures using a custom function
- Utilize a schema and validation library to ensure the transformed data is in the correct format
- Write each data structure to the appropriate .csv files

#### From the Case study

I've already provided the code needed to load the data, perform iterative parsing and write the
output to csv files. My task is to complete the shape_element function that will transform each
element into the correct format. To make this process easier we've already defined a schema (schema.py)
for the .csv files and the eventual tables. Using the cerberus library we can validate the output
against this schema to ensure it is correct.

Shape Element Function
The function should take as input an iterparse Element object and return a dictionary.

If the element top level tag is "node":
The dictionary returned should have the format {"node": .., "node_tags": ...}

The "node" field should hold a dictionary of the following top level node attributes:
- id
- user
- uid
- version
- lat
- lon
- timestamp
- changeset
All other attributes can be ignored

The "node_tags" field should hold a list of dictionaries, one per secondary tag. Secondary tags are
child tags of node which have the tag name/type: "tag". Each dictionary should have the following
fields from the secondary tag attributes:
- id: the top level node id attribute value
- key: the full tag "k" attribute value if no colon is present or the characters after the colon if one is.
- value: the tag "v" attribute value
- type: either the characters before the colon in the tag "k" value or "regular" if a colon is not present.

Additionally,

- if the tag "k" value contains problematic characters, the tag should be ignored
- if the tag "k" value contains a ":" the characters before the ":" should be set as the tag type
  and characters after the ":" should be set as the tag key
- if there are additional ":" in the "k" value they and they should be ignored and kept as part of
  the tag key. For example:

  <tag k="addr:street:name" v="Lincoln"/>
  should be turned into
  {'id': 12345, 'key': 'street:name', 'value': 'Lincoln', 'type': 'addr'}

- If a node has no secondary tags then the "node_tags" field should just contain an empty list.

In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import csv
import codecs
import pprint
import re
import xml.etree.cElementTree as ET
import cerberus
import sys
import schema

OSM_PATH = "toronto_canada3.osm"

NODES_PATH = "nodes.csv"
NODE_TAGS_PATH = "nodes_tags.csv"
WAYS_PATH = "ways.csv"
WAY_NODES_PATH = "ways_nodes.csv"
WAY_TAGS_PATH = "ways_tags.csv"

LOWER_COLON = re.compile(r'^([a-z]|_)+:([a-z]|_)+')
PROBLEMCHARS = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

SCHEMA = schema.schema

# Make sure the fields order in the csvs matches the column order in the sql table schema
NODE_FIELDS = ['id', 'lat', 'lon', 'user', 'uid', 'version', 'changeset', 'timestamp']
NODE_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_FIELDS = ['id', 'user', 'uid', 'version', 'changeset', 'timestamp']
WAY_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_NODES_FIELDS = ['id', 'node_id', 'position']


def shape_element(element, node_attr_fields=NODE_FIELDS, way_attr_fields=WAY_FIELDS,
                  problem_chars=PROBLEMCHARS, default_tag_type='regular'):
    """Clean and shape node or way XML element to Python dict"""

    node_attribs = {}
    way_attribs = {}
    way_nodes = []
    tags = []  

    if element.tag == 'node':
        for attrib in element.attrib:
            if attrib in NODE_FIELDS:
                node_attribs[attrib] = element.attrib[attrib]
        
        for child in element:
            node_tag = {}
            if LOWER_COLON.match(child.attrib['k']):
                node_tag['type'] = child.attrib['k'].split(':',1)[0]
                node_tag['key'] = child.attrib['k'].split(':',1)[1]
                node_tag['id'] = element.attrib['id']
                node_tag['value'] = child.attrib['v']
                tags.append(node_tag)
            elif PROBLEMCHARS.match(child.attrib['k']):
                continue
            else:
                node_tag['type'] = 'regular'
                node_tag['key'] = child.attrib['k']
                node_tag['id'] = element.attrib['id']
                node_tag['value'] = child.attrib['v']
                tags.append(node_tag)
        
        return {'node': node_attribs, 'node_tags': tags}
        
    elif element.tag == 'way':
        for attrib in element.attrib:
            if attrib in WAY_FIELDS:
                way_attribs[attrib] = element.attrib[attrib]
        
        position = 0
        for child in element:
            way_tag = {}
            way_node = {}
            
            if child.tag == 'tag':
                if LOWER_COLON.match(child.attrib['k']):
                    way_tag['type'] = child.attrib['k'].split(':',1)[0]
                    way_tag['key'] = child.attrib['k'].split(':',1)[1]
                    way_tag['id'] = element.attrib['id']
                    way_tag['value'] = child.attrib['v']
                    tags.append(way_tag)
                elif PROBLEMCHARS.match(child.attrib['k']):
                    continue
                else:
                    way_tag['type'] = 'regular'
                    way_tag['key'] = child.attrib['k']
                    way_tag['id'] = element.attrib['id']
                    way_tag['value'] = child.attrib['v']
                    tags.append(way_tag)
                    
            elif child.tag == 'nd':
                way_node['id'] = element.attrib['id']
                way_node['node_id'] = child.attrib['ref']
                way_node['position'] = position
                position += 1
                way_nodes.append(way_node)
        
        return {'way': way_attribs, 'way_nodes': way_nodes, 'way_tags': tags}

# ================================================== #
#               Helper Functions                     #
# ================================================== #
def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield element if it is the right type of tag"""

    context = ET.iterparse(osm_file, events=('start', 'end'))
    _, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()


def validate_element(element, validator, schema=SCHEMA):
    """Raise ValidationError if element does not match schema"""
    if validator.validate(element, schema) is not True:
        field, errors = next(validator.errors.iteritems())
        message_string = "\nElement of type '{0}' has the following errors:\n{1}"
        error_string = pprint.pformat(errors)
        
        raise Exception(message_string.format(field, error_string))


class UnicodeDictWriter(csv.DictWriter, object):
    """Extend csv.DictWriter to handle Unicode input"""

    def writerow(self, row):
        super(UnicodeDictWriter, self).writerow({
            k: (v.encode('utf-8') if isinstance(v, unicode) else v) for k, v in row.iteritems()
        })

    def writerows(self, rows):
        for row in rows:
            self.writerow(row)


# ================================================== #
#               Main Function                        #
# ================================================== #
def process_map(file_in, validate):
    """Iteratively process each XML element and write to csv(s)"""

    with codecs.open(NODES_PATH, 'w') as nodes_file, \
         codecs.open(NODE_TAGS_PATH, 'w') as nodes_tags_file, \
         codecs.open(WAYS_PATH, 'w') as ways_file, \
         codecs.open(WAY_NODES_PATH, 'w') as way_nodes_file, \
         codecs.open(WAY_TAGS_PATH, 'w') as way_tags_file:

        nodes_writer = UnicodeDictWriter(nodes_file, NODE_FIELDS)
        node_tags_writer = UnicodeDictWriter(nodes_tags_file, NODE_TAGS_FIELDS)
        ways_writer = UnicodeDictWriter(ways_file, WAY_FIELDS)
        way_nodes_writer = UnicodeDictWriter(way_nodes_file, WAY_NODES_FIELDS)
        way_tags_writer = UnicodeDictWriter(way_tags_file, WAY_TAGS_FIELDS)

        nodes_writer.writeheader()
        node_tags_writer.writeheader()
        ways_writer.writeheader()
        way_nodes_writer.writeheader()
        way_tags_writer.writeheader()

        validator = cerberus.Validator()

        for element in get_element(file_in, tags=('node', 'way')):
            el = shape_element(element)
            if el:
                if validate is True:
                    validate_element(el, validator)

                if element.tag == 'node':
                    nodes_writer.writerow(el['node'])
                    node_tags_writer.writerows(el['node_tags'])
                elif element.tag == 'way':
                    ways_writer.writerow(el['way'])
                    way_nodes_writer.writerows(el['way_nodes'])
                    way_tags_writer.writerows(el['way_tags'])


if __name__ == '__main__':
    # Note: Validation is ~ 10X slower. For the project consider using a small
    # sample of the map when validating.
    #process_map(OSM_PATH, validate=False)
    process_map(OSM_PATH, validate=True)
