In [40]:
import xml.etree.ElementTree as ET
from collections import defaultdict
from pprint import pprint
import operator
import re

# Pattern that picks out the last whitespace-delimited token of a street name
# (for example "St." in "Lower Almora St."); the audit treats it as the street type.
street_type_re = re.compile(r'\b\S+\.?$', flags=re.IGNORECASE)

# OSM extract read by the helpers in this notebook.
filename = 'sydney_sample.osm'

# Module-level list, initially empty, referenced by print_top_keys.
top_20keys = list()

# Function

In [52]:
"""Count total number of each tag"""
def count_tags(filename):
    """Count how many times each element tag occurs in an XML/OSM file.

    Args:
        filename: Path of the XML file to scan; it is handed to
            ``ET.iterparse``, which opens the path on its own.

    Returns:
        A plain dict mapping each tag name to its number of occurrences.
    """
    tag_types = defaultdict(int)

    # No separate open() here: the previous version opened a second handle
    # that was never read and never closed.
    for _, elem in ET.iterparse(filename):
        tag_types[elem.tag] += 1
        # Only the tag name is needed. Once the element's "end" event has
        # fired its attributes and children can be released.
        elem.clear()

    return dict(tag_types)

def count_keys(filename):
    """Count how often each 'k' attribute value appears on ``<tag>`` elements.

    Args:
        filename: Path of the XML/OSM file to scan; it is handed to
            ``ET.iterparse``, which opens the path on its own.

    Returns:
        A plain dict mapping each key name to its number of occurrences.
        ``<tag>`` elements without a 'k' attribute are ignored.
    """
    key_types = defaultdict(int)

    # No separate open() here: the previous version opened a second handle
    # that was never read and never closed.
    for _, elem in ET.iterparse(filename):
        # Only <tag> elements carry the key names being counted.
        if elem.tag == "tag" and 'k' in elem.attrib:
            key_types[elem.get('k')] += 1
        # The attribute has been read, so the element's content can be released.
        elem.clear()

    return dict(key_types)
    
# Print the number of occurrences of every tag
def print_all_tags(filename):
    """Pretty-print the per-tag counts that ``count_tags`` reports for ``filename``."""
    pprint(count_tags(filename))
    
# Collect and print the most frequent tag keys
def print_top_keys(filename, keyNum = 20):
    """Print the most frequent tag keys of an OSM file, most common first.

    The printed keys are also stored in the module-level ``top_20keys`` list.
    The list is overwritten in place, so re-running the cell does not
    accumulate duplicates.

    Args:
        filename: Path of the OSM file, forwarded to ``count_keys``.
        keyNum: Maximum number of keys to print. If the file has fewer
            distinct keys, all of them are printed.
    """
    keys = count_keys(filename)

    # Highest count first; equal counts are ordered by key name so the output
    # is the same on every run.
    ranked = sorted(keys.items(), key=lambda item: (-item[1], item[0]))

    # Slicing (rather than indexing 0..keyNum-1) cannot run past the end of
    # the list when there are fewer than keyNum keys.
    names = [name for name, _ in ranked[:max(keyNum, 0)]]

    # The earlier code referenced ``top_20keys.append`` without calling it,
    # so the list was never filled.
    top_20keys[:] = names

    for name in names:
        print(name)


def key_values(top_keys, Max, osmfile=None):
    """Collect up to ``Max`` distinct values for each key in ``top_keys``.

    Args:
        top_keys: Iterable of tag keys ('k' attribute values) to sample.
        Max: Maximum number of distinct values kept per key.
        osmfile: Path of the OSM file to scan. Defaults to the module-level
            ``filename``, so existing two-argument calls behave as before.

    Returns:
        A dict with one entry per key in ``top_keys``. Each value is the list
        of distinct 'v' attribute values seen for that key, in first-seen
        order and capped at ``Max`` items. Keys never seen map to ``[]``.
    """
    if osmfile is None:
        osmfile = filename

    result = {k: [] for k in top_keys}

    # ET.iterparse opens the path itself; the previous extra open() handle was
    # never read and never closed.
    for _, elem in ET.iterparse(osmfile):
        # Only <tag> elements that carry both a key and a value are of interest.
        if elem.tag == "tag" and 'k' in elem.attrib and 'v' in elem.attrib:
            k_val = elem.get('k')
            v_val = elem.get('v')
            if k_val in result:
                seen = result[k_val]
                if len(seen) < Max and v_val not in seen:
                    seen.append(v_val)
        # Attributes have been read, so the element's content can be released.
        elem.clear()

    return result

"""Max Speed"""


def is_maxspeed_name(elem):
    """Return True when ``elem``'s 'k' attribute equals "maxspeed".

    An element without a 'k' attribute yields False instead of raising
    KeyError. This matches the other helpers in this notebook, which check
    ``'k' in elem.attrib`` before reading it.
    """
    return elem.attrib.get('k') == "maxspeed"

# Matches "<number><optional space><unit>" maxspeed values such as "30 mph"
# or "12knots".
_MAXSPEED_UNIT_RE = re.compile(r'^\s*(\d+(?:\.\d+)?)\s*(mph|knots)\s*$')

# Conversion factors from the supported units to km/h.
_KMH_PER_UNIT = {"mph": 1.609344, "knots": 1.852}

# Value substituted for maxspeed="signals" (same default as before).
_SIGNALS_DEFAULT_KMH = 80


def update_maxspeed(speed):
    """Normalise one raw ``maxspeed`` value to an integer speed.

    Args:
        speed: The raw 'v' attribute of a maxspeed tag.

    Returns:
        * the integer itself when ``speed`` already parses as an int;
        * 80 for the literal value ``'signals'``;
        * the value converted to km/h (rounded) for "<number> mph" and
          "<number> knots", with or without a space before the unit;
        * ``None`` for anything else (e.g. ``'undefined'`` or values
          containing ``';'``), which callers should treat as "drop".

    Every value that had to be rewritten is printed as ``old => new``.
    """
    try:
        return int(speed)
    except (TypeError, ValueError):
        # Not a plain integer; fall through to the special cases below.
        pass

    if speed == 'signals':
        new_sp = _SIGNALS_DEFAULT_KMH
    else:
        match = _MAXSPEED_UNIT_RE.match(speed or '')
        if match is None:
            # Unrecognised value: nothing sensible to convert.
            return None
        amount = float(match.group(1))
        new_sp = int(round(amount * _KMH_PER_UNIT[match.group(2)]))

    print("%s => %s" % (speed, new_sp))
    return new_sp


def update_name(speed):
    """Backward-compatible alias for ``update_maxspeed``.

    A later definition ``update_name(name, mapping)`` in this notebook rebinds
    this name, which is why ``improve_maxspeed`` calls ``update_maxspeed``
    directly instead of going through this alias.
    """
    return update_maxspeed(speed)


def improve_maxspeed(osmfile):
    """Scan an OSM file and print the normalisation of every maxspeed value.

    Args:
        osmfile: Path of the OSM XML file to read.
    """
    # "rb" lets the XML parser handle the file's encoding itself; the with
    # block closes the file even if parsing fails.
    with open(osmfile, "rb") as osm_file:
        # Children are only guaranteed to be attached once the parent's "end"
        # event fires; on "start" the <tag> children may not be there yet.
        for _, elem in ET.iterparse(osm_file, events=("end",)):
            if elem.tag == "node" or elem.tag == "way":
                for tag in elem.iter("tag"):
                    if is_maxspeed_name(tag):
                        update_maxspeed(tag.attrib.get('v'))
                # This node/way is fully processed; release its content.
                elem.clear()

"""Update Street Name"""

# Street-type words that are already spelled out in full; audit_street_type
# does not report names ending in one of these.
expected = [
    "Street", "Avenue", "Boulevard", "Drive", "Court", "Highway", "Place",
    "Parade", "Square", "Lane", "Road", "Trail", "Parkway", "Commons", "Row",
    "East", "West", "Esplanade", "Crescent",
]

# Abbreviated street type -> full street type. Extend this table as the audit
# reveals further abbreviations.
mapping = dict([
    ("St", "Street"),
    ("st", "Street"),
    ("St.", "Street"),
    ("Ave", "Avenue"),
    ("Ave.", "Avenue"),
    ("Av.", "Avenue"),
    ("Hwy", "Highway"),
    ("Rd", "Road"),
    ("Rd.", "Road"),
])

def audit_street_type(street_types, street_name):
    """Record ``street_name`` under its street type when that type is unexpected.

    Args:
        street_types: Mapping of street type -> set of street names; it is
            updated in place (the caller passes a ``defaultdict(set)``).
        street_name: Full street name to inspect.

    The street type is whatever ``street_type_re`` matches in the name. Nothing
    is recorded when there is no match, when the type is listed in
    ``expected``, or when the whole name is identical to the matched type.
    """
    found = street_type_re.search(street_name)
    if not found:
        return

    suffix = found.group()
    if suffix in expected:
        return

    # A name that consists of nothing but its own suffix is skipped.
    if street_name == suffix:
        return

    street_types[suffix].add(street_name)


def is_street_name(elem):
    """Return True when ``elem``'s 'k' attribute equals "addr:street".

    An element without a 'k' attribute yields False instead of raising
    KeyError. This matches the other helpers in this notebook, which check
    ``'k' in elem.attrib`` before reading it.
    """
    return elem.attrib.get('k') == "addr:street"


def audit(osmfile):
    """Collect street names whose street type is not one of the expected ones.

    Args:
        osmfile: Path of the OSM XML file to read.

    Returns:
        A ``defaultdict(set)`` mapping each unexpected street type to the set
        of street names that end with it, as recorded by
        ``audit_street_type``.
    """
    street_types = defaultdict(set)

    # "rb" lets the XML parser handle the file's encoding itself; the with
    # block closes the file even if parsing fails.
    with open(osmfile, "rb") as osm_file:
        # Children are only guaranteed to be attached once the parent's "end"
        # event fires; on "start" the <tag> children may not be there yet.
        for _, elem in ET.iterparse(osm_file, events=("end",)):
            if elem.tag == "node" or elem.tag == "way":
                for tag in elem.iter("tag"):
                    if is_street_name(tag):
                        audit_street_type(street_types, tag.attrib['v'])
                # This node/way is fully processed; release its content.
                elem.clear()

    return street_types

def update_name(name, mapping):
    """Expand an abbreviated street type at the end of a street name.

    Args:
        name: Street name, e.g. ``"Station St"``.
        mapping: Dict from abbreviated street type to its full form.

    Returns:
        The name with its last word replaced when that word is a key of
        ``mapping`` (words are re-joined with single spaces). Otherwise
        ``name`` is returned unchanged, including when it is empty or
        contains only whitespace.
    """
    words = name.split()
    # Guard on ``words``: an empty or whitespace-only name has no last word,
    # and indexing [-1] on it used to raise IndexError.
    if words and words[-1] in mapping:
        words[-1] = mapping[words[-1]]
        name = " ".join(words)

    return name

def improve_street_names(filename):
    """Audit the street names in ``filename`` and print the proposed fixes.

    First pretty-prints the audit result (unexpected street type -> names).
    Then, if at least one name was flagged, prints a banner followed by one
    ``old => new`` line per flagged name, where ``new`` comes from
    ``update_name`` with the module-level ``mapping``.
    """
    flagged = audit(filename)

    pprint(dict(flagged))

    banner_done = False
    for names in flagged.values():
        for original in names:
            corrected = update_name(original, mapping)
            # The banner is emitted once, right before the first result line.
            if not banner_done:
                print("--------------------------")
                print("After update names")
                print("--------------------------")
                banner_done = True
            print("%s => %s" % (original, corrected))

# Main

In [9]:
# Optional exploration steps. Each one is switched off with `if False:`;
# change the condition to True to run it.

# Pretty-print how many times each element tag occurs in `filename`.
if False:
    print_all_tags(filename)
    
# Print the most frequent tag keys found in `filename`.
if False:
    print_top_keys(filename)
    


In [11]:
"""Find up to 20 distinct values for each top key"""

# Upper bound on the number of distinct values sampled per key.
MAX_COUNT = 20

# Tag keys whose values are sampled by key_values.
top_keys = [
    'highway',
    'source',
    'name',
    'building',
    'created_by',
    'oneway',
    'maxspeed',
    'addr:street',
    'amenity',
    'surface',
    'foot',
    'leisure',
    'layer',
    'bicycle',
    'ref',
    'source:name',
    'service',
    'railway',
    'landuse',
    'operator',
]

In [35]:
# Print up to MAX_COUNT distinct values for each key listed in top_keys.
# Switched off with `if False:`; change the condition to True to run it.
if False:
    pprint(key_values(top_keys, MAX_COUNT))   

# Run the maxspeed clean-up pass over `filename`.
# Switched off with `if False:`; change the condition to True to run it.
if False:
    improve_maxspeed(filename)

In [53]:
# Audit the street names in `filename` and print each flagged name next to its
# corrected form. This step is currently enabled (`if True:`).
if True:
    improve_street_names(filename)

{'Av.': set(['Ryan Av.']),
 'Hwy': set(['Pacific Hwy']),
 'St': set(['Station St']),
 'St.': set(['Lower Almora St.'])}
--------------------------
After update names
--------------------------
Ryan Av. => Ryan Avenue
Station St => Station Street
Lower Almora St. => Lower Almora Street
Pacific Hwy => Pacific Highway
