# Data Wrangling - Creating and Auditing the Data Set

### The source file was downloaded from https://www.openstreetmap.org/relation/324211

### 1. Creating the Sample Toronto OSM file.

In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import xml.etree.ElementTree as ET  # Use cElementTree or lxml if too slow

OSM_FILE = "toronto_canada.osm"  # Input OSM file that gets sampled; replace this with your osm file
SAMPLE_FILE = "torontosample.osm"  # Output path the sampled elements are written to

k = 15 # Parameter: take every k-th top level element

def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Lazily yield every fully parsed element whose tag is listed in ``tags``.

    ``osm_file`` is handed straight to ``ET.iterparse``. After the consumer
    has finished with a yielded element, the root element is cleared so the
    already-processed part of the tree is released.

    Reference:
    http://stackoverflow.com/questions/3095434/inserting-newlines-in-xml-file-generated-via-xml-etree-elementtree-in-python
    """
    events = iter(ET.iterparse(osm_file, events=('start', 'end')))
    # The very first event is the 'start' of the document root.
    _, root = next(events)
    for kind, node in events:
        if kind != 'end' or node.tag not in tags:
            continue
        yield node
        # Runs once the caller asks for the next element.
        root.clear()


# Build the sample file by copying every k-th top level element of OSM_FILE.
# The output is opened in binary mode and ET.tostring(..., encoding='utf-8')
# returns encoded bytes, so the literal fragments are bytes literals too;
# writing plain str here raises TypeError on Python 3.
with open(SAMPLE_FILE, 'wb') as output:
    output.write(b'<?xml version="1.0" encoding="UTF-8"?>\n')
    output.write(b'<osm>\n  ')

    # Write every kth top level element
    for i, element in enumerate(get_element(OSM_FILE)):
        if i % k == 0:
            output.write(ET.tostring(element, encoding='utf-8'))

    output.write(b'</osm>')


### 2. Audit the Data

In [1]:
"""
count_tags should return a dictionary with each tag name as the key and the number of times that tag
is encountered in the map as the value.

"""
import xml.etree.cElementTree as ET
import pprint

def count_tags(filename):
    """Return a dict mapping each element tag name to its number of occurrences.

    ``filename`` is passed to ``ET.iterparse``; every element reported by the
    parser contributes one to the tally for its tag.
    """
    tally = {}
    for _event, node in ET.iterparse(filename):
        tally[node.tag] = tally.get(node.tag, 0) + 1
    return tally

def test():
    """Count the tags of the full extract and pretty-print the totals."""
    # Point count_tags at 'torontosample.osm' instead for a quicker run.
    tag_totals = count_tags('toronto_canada.osm')
    pprint.pprint(tag_totals)


if __name__ == "__main__":
    test()

{'bounds': 1,
 'member': 151372,
 'nd': 5875055,
 'node': 5127518,
 'osm': 1,
 'relation': 9677,
 'tag': 5030403,
 'way': 761285}


In [2]:
"""
From lecture I have 3 regular expressions to check for certain patterns in the tags.
I would like to change the data model and expand the "addr:street" type of keys to a dictionary like this:
{"address": {"street": "Some value"}}

So I want to see whether I have such tags, and whether any tags contain problematic characters.

I have a count of each of four tag categories in a dictionary:
1.  "lower"        - for tags that contain only lowercase letters and are valid
2.  "lower_colon"  - for otherwise valid tags with a colon in their names
3.  "problemchars" - for tags with problematic characters 
4.  "other"        - for other tags that do not fall into the other three categories.

"""
import xml.etree.cElementTree as ET
import pprint
import re

# Key made up solely of lowercase letters and underscores.
lower = re.compile(r'^([a-z]|_)*$')
# Same alphabet, split by exactly one colon.
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
# Any single character considered problematic in a key.
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

osm_file = "toronto_canada.osm"
#osm_file = "torontosample.osm"

def key_type(element, keys):
    """Classify the ``k`` attribute of a ``tag`` element and bump its counter.

    ``keys`` is a dict with the counters "lower", "lower_colon",
    "problemchars" and "other". The patterns are tried in that order and the
    first one that matches wins; a key matching none of them counts as
    "other". Elements whose tag is not "tag" leave ``keys`` untouched.

    Returns the same ``keys`` dict that was passed in.
    """
    if element.tag != "tag":
        return keys

    key_name = element.attrib['k']
    ordered_checks = (
        ("lower", lower),
        ("lower_colon", lower_colon),
        ("problemchars", problemchars),
    )
    for category, pattern in ordered_checks:
        if pattern.search(key_name):
            keys[category] += 1
            break
    else:
        # No pattern matched.
        keys["other"] += 1
    return keys

def process_map(filename):
    """Tally the key categories of every element parsed from ``filename``.

    Returns a dict with the counters "lower", "lower_colon", "problemchars"
    and "other", as updated by ``key_type`` for each parsed element.
    """
    tallies = dict.fromkeys(("lower", "lower_colon", "problemchars", "other"), 0)
    for _event, node in ET.iterparse(filename):
        tallies = key_type(node, tallies)
    return tallies

def test():
    """Run the key-category audit on ``osm_file`` and pretty-print the counts."""
    # Pass 'torontosample.osm' to process_map instead for a quicker run.
    category_counts = process_map(osm_file)
    pprint.pprint(category_counts)


if __name__ == "__main__":
    test()

{'lower': 2970797,
 'lower_colon': 1939747,
 'other': 119222,
 'problemchars': 637}


In [None]:
### 3. Auditing the street names to find abbreviated street types.

In [3]:
"""
The audit below runs on the full Toronto Mapzen extract (toronto_canada.osm); the abbreviated
sample (torontosample.osm) can be used instead for a faster run.
"""

import xml.etree.cElementTree as ET
from collections import defaultdict
import re

# The source is stored as a path instead of an already-open file object:
# ET.iterparse accepts a filename and opens the file itself, so no handle is
# left open at module level and the audit can be run more than once.
#osm_file = "torontosample.osm"
osm_file = "toronto_canada.osm"
# Matches the last run of non-whitespace characters at the end of the name.
street_type_re = re.compile(r'\S+\.?$', re.IGNORECASE)
# Running tally: trailing token -> number of street names ending with it.
street_types = defaultdict(int)

def audit_street_type(street_types, street_name):
    """Count the trailing token of ``street_name`` in ``street_types``.

    ``street_types`` must support ``+= 1`` on a missing key (for example a
    ``defaultdict(int)``). Names in which ``street_type_re`` finds no match,
    such as an empty string or a name ending in whitespace, are ignored.
    """
    match = street_type_re.search(street_name)
    if match is None:
        return
    street_types[match.group()] += 1

def print_sorted_dict(d):
    """Print one ``key: count`` line per entry of ``d``, sorted by key.

    Keys are ordered case-insensitively (by their lowercased form). Values
    are formatted with ``%d``, so they are expected to be integers.
    """
    for k in sorted(d, key=lambda s: s.lower()):
        # Parenthesised single-argument form prints identically on Python 2.7
        # and Python 3, where the bare print statement is a SyntaxError.
        print("%s: %d" % (k, d[k]))

def is_street_name(elem):
    """Return True when ``elem`` is a ``tag`` element whose key is "addr:street"."""
    if elem.tag != "tag":
        return False
    return elem.attrib['k'] == "addr:street"

def audit():
    """Tally the trailing token of every "addr:street" value and print the result.

    Parses the module-level ``osm_file`` source, feeds each street name to
    ``audit_street_type`` (which updates the module-level ``street_types``),
    and finally prints the tally sorted by key.
    """
    for _event, node in ET.iterparse(osm_file):
        if not is_street_name(node):
            continue
        audit_street_type(street_types, node.attrib['v'])
    print_sorted_dict(street_types)


if __name__ == '__main__':
    audit()

#1: 2
#10: 1
#101: 1
#106: 1
#11: 2
#110: 2
#12A: 1
#17: 1
#18: 1
#185: 1
#19: 2
#2: 2
#200: 1
#202: 2
#204: 1
#23B: 1
#2801: 1
#282: 1
#2A: 1
#3: 1
#318: 1
#332: 1
#364: 1
#37: 1
#38: 1
#4: 4
#409: 1
#4B: 1
#5: 3
#5-6: 1
#57: 1
#6: 1
#7: 1
#8: 1
#A: 1
#A3: 1
#E8: 1
#G: 1
#PHW: 1
1: 329
10: 443
101: 4
102: 4
106: 2
107: 6
109: 19
10a: 4
11: 108
115c: 1
11a: 4
11b: 8
12: 139
124: 45
125: 12
12a: 4
12b: 4
13: 36
132: 2
13b: 4
14: 37
141: 2
146: 6
147: 4
14a: 4
14b: 6
14c: 2
14d: 4
14e: 2
14f: 2
15: 140
16: 60
17: 127
18: 120
19: 56
2: 517
20: 140
21: 76
22: 19
23: 36
24: 52
25: 159
26: 16
27: 379
28: 24
2b: 8
3: 387
30: 12
300: 1
302: 1
32: 52
34: 46
38: 1
39: 10
4: 359
400: 4
403: 2
42: 1
47: 98
48: 203
4b: 8
5: 283
50: 312
500: 1
52: 21
56: 2
57: 132
5700: 1
5a: 4
6: 459
6a: 6
7: 1190
7-12: 2
7;12: 5
7A: 84
8: 511
88: 53
89: 213
89): 1
8a: 4
9: 340
97: 32
99: 4
Abbey: 4
Access: 4
Acres: 14
Adjala: 180
Agostino: 4
Alley: 26
Alliston: 40
Amaranth: 42
Amberdale: 21
Ames: 8
Amici: 12
Anton

In [8]:
#The function process_map should return a set of unique user IDs ("uid")

import xml.etree.cElementTree as ET
import pprint
import re

def get_user(element):
    """Return the element's ``uid`` attribute, or None when it is missing or empty."""
    uid = element.get('uid')
    return uid if uid else None
    
def process_map(filename):
    """Return the set of distinct ``uid`` values found while parsing ``filename``.

    Elements for which ``get_user`` yields None contribute nothing.
    """
    uids = set()
    for _event, node in ET.iterparse(filename):
        uid = get_user(node)
        if uid is not None:
            uids.add(uid)
    return uids

def test():
    """Collect the unique user IDs of 'toronto_canada3.osm' and pretty-print them."""
    unique_uids = process_map('toronto_canada3.osm')
    pprint.pprint(unique_uids)


if __name__ == "__main__":
    test()
    

set(['1',
     '100337',
     '100419',
     '1007022',
     '1007528',
     '100917',
     '1009527',
     '101184',
     '1012362',
     '1015527',
     '1015528',
     '1015531',
     '1015532',
     '1015536',
     '1015542',
     '102064',
     '10225',
     '102410',
     '102723',
     '103253',
     '1035032',
     '10353',
     '103530',
     '1035507',
     '103574',
     '103816',
     '1044834',
     '104519',
     '1051550',
     '105255',
     '105603',
     '105839',
     '1058666',
     '1060930',
     '106293',
     '1063003',
     '1066031',
     '106858',
     '106874',
     '106914',
     '1069176',
     '10716',
     '1074988',
     '10786',
     '10788',
     '107884',
     '1080971',
     '1083420',
     '108634',
     '1087647',
     '109002',
     '10927',
     '1093914',
     '109925',
     '110046',
     '110253',
     '110263',
     '1103322',
     '1103744',
     '1106095',
     '1108251',
     '110915',
     '1110934',
     '1114651',
     '111504',
     '

     '411528',
     '411561',
     '4120008',
     '41214',
     '412154',
     '4129036',
     '4129101',
     '413360',
     '4138265',
     '4138390',
     '413853',
     '414538',
     '4148813',
     '415250',
     '4153291',
     '416346',
     '4167103',
     '4174894',
     '4178914',
     '4178994',
     '418347',
     '4190651',
     '419113',
     '4193842',
     '419601',
     '4208231',
     '4210501',
     '42123',
     '4213567',
     '4215568',
     '4216140',
     '42191',
     '4221303',
     '4225057',
     '4225751',
     '4231697',
     '4240592',
     '42429',
     '4250350',
     '425214',
     '4254864',
     '4255535',
     '425846',
     '4258748',
     '4260390',
     '427384',
     '4274282',
     '4276249',
     '428339',
     '428720',
     '4288229',
     '4293558',
     '4294297',
     '4297322',
     '4302385',
     '4304988',
     '4308356',
     '43087',
     '4315127',
     '4315320',
     '4320188',
     '4320689',
     '4324046',
     '4324792',
  

     '5666377',
     '5666698',
     '5666860',
     '5667446',
     '5667994',
     '5668280',
     '5669016',
     '5669281',
     '5669613',
     '567139',
     '5671966',
     '5674088',
     '567601',
     '5677293',
     '567792',
     '5682957',
     '5683437',
     '5686313',
     '5687126',
     '56884',
     '5688958',
     '5692508',
     '5692965',
     '5697319',
     '5697688',
     '5698967',
     '5700381',
     '5703022',
     '570774',
     '571080',
     '571170',
     '5715057',
     '5715561',
     '57158',
     '5722011',
     '5725028',
     '5731246',
     '5737518',
     '5743216',
     '574654',
     '574684',
     '574730',
     '574775',
     '574850',
     '5751301',
     '5751745',
     '5756206',
     '5761375',
     '576174',
     '576338',
     '5764933',
     '5765055',
     '5765492',
     '5766343',
     '576659',
     '576728',
     '5768886',
     '5769800',
     '5771196',
     '5771543',
     '577490',
     '577807',
     '578519',
     '5788493'