# Clean Open Street Map in Palo Alto Area

<h2> Brief Overview </h2>

<p>Audit and clean the dataset of Palo Alto area from OpenStreetMap, converting it from XML to JSON format. Then, import the clean JSON file into a MongoDB database and run some queries against it.</p>

In [7]:
import xml.etree.cElementTree as ET
from collections import defaultdict
import re
import pprint
import json
import codecs

In [8]:
osm_file = open("OpenStreetMap.osm", "r")

street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
zipcode = re.compile(r'\d{5}(?:-\d{4})?')
#zip_types = defaultdict(set)

expected = ["Street", "Avenue", "Boulevard", "Drive", "Court",
           "Place", "Parkway", "Path", "Pass", "Circle", "Road",
           "Road", "Way", "Lane", "Mall"]

mapping = {"Dr.": "Drive",
           "St": "Street",
           "Ave": "Avenue",
           "Rd": "Road",
           "avenue": "Avenue"}

In [9]:
def audit_street_type(street_types, street_name):
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()
        if street_type not in expected:
            street_types[street_type].add(street_name)
            
def audit_zip_type(zip_types, postal):
    z = zipcode.search(postal)
    if z:
        zip_types['normal'].add(postal)
    else:
        zip_types['non_normal'].add(postal)
        
        
def is_street_name(elem):
    return (elem.attrib['k'] == "addr:street")

def is_zip(elem):
    return (elem.attrib['k'] == "addr:postcode")

def audit_street_name(osm_file):
    street_types = defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=("start",)):
        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])
    osm_file.close()
    return street_types
    #pprint.pprint(dict(street_types))

def update_name(name, mapping):
    name_list = name.split(" ")
    suffix = name_list[-1]
    if suffix in mapping:
        new_suffix = mapping[suffix]
        name_list[-1] = new_suffix
    name = " ".join(name_list)
    return name

def audit_zipcode_name():
    zip_types = defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=("start",)):
        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_zip(tag):
                    audit_zip_type(zip_types, tag.attrib['v'])
    pprint.pprint(dict(zip_types))
    
def better_name():
    st_types = audit_street_name(osm_file)
    #pprint.pprint(dict(st_types))
    
    for st_type, ways in st_types.iteritems():
        for name in ways:
            better_name = update_name(name, mapping)
            name = better_name
            print name, "=>", better_name

#better_name()

#audit_street_name()
audit_zipcode_name()
    

{'normal': set(['94025',
                '94036',
                '94301',
                '94301-2019',
                '94303',
                '94304',
                '94304-1050',
                '94305',
                '94305-6015',
                '94305;94309',
                '94306'])}


In [None]:
# Convert into json file
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

CREATED = [ "version", "changeset", "timestamp", "user", "uid"]
POS = ['lat', 'lon']
#elem_dict = defaultdict(list)

def shape_element(element):
    #node = defaultdict()
    node = {}
    if element.tag == "node" or element.tag == "way":
        for subelement in element:
            if 'k' in subelement.attrib:
                if re.search(problemchars, subelement.attrib['k']):
                    pass
                elif subelement.attrib['k'].count(":")>1:
                    pass
                elif subelement.attrib['k'].startswith("addr:") and subelement.attrib['k'].count(":")==1:
                    if 'address' not in node: 
                        node['address'] = {}
             
                    name = subelement.attrib['k'].split(":")[1]
                    node['address'][name] = subelement.attrib['v']
                    #print node
                elif not subelement.attrib['k'].startswith("addr:"):
                    node[subelement.attrib['k']] = subelement.attrib['v']
            if 'ref' in subelement.attrib:
                if 'node_refs' not in node:
                    node['node_refs'] = []
                node['node_refs'].append(subelement.attrib['ref'])
                        
        node['type'] = element.tag
        created = {}
        if element.tag == 'node':
            node['pos'] = extract_pos(element)
        for elem in element.attrib.keys():
            if elem in CREATED:
                created[elem] = element.attrib[elem]
                node['created'] = created
            elif elem in POS:
                continue
            else:
                node[elem] = element.attrib[elem]
        print node        
        return node
    else:
        return None

def extract_pos(element):
    try:
        return [float(element.attrib['lat']),  float(element.attrib['lon'])]
    except (ValueError, KeyError):
        print 'Failed to parse position ...'
    return None
    
"""
def extract_pos(element):
    lat, lon = None, None
    try:
        lat = float(element.attrib['lat'])
    except:
        print 'Could not parse lat as float', element.attrib['lat']
    try:
        lon = float(element.attrib['lon'])
    except:
        print 'Could not parse lon as float', element.attrib['lon']
    pos = []
    if lat:
        pos.append(lat)
    if lon:
        pos.append(lon)
    return pos
"""

def process_map(file_in, pretty = False):
    # You do not need to change this file
    file_out = "{0}.json".format(file_in)
    data = []
    with codecs.open(file_out, "w") as fo:
        for _, element in ET.iterparse(file_in):
            el = shape_element(element)
            if el:
                data.append(el)
                if pretty:
                    fo.write(json.dumps(el, indent=2)+"\n")
                else:
                    fo.write(json.dumps(el) + "\n")
    return data

def main():
    # NOTE: if you are running this code on your computer, with a larger dataset, 
    # call the process_map procedure with pretty=False. The pretty=True option adds 
    # additional spaces to the output, making it significantly larger.
    data = process_map("OpenStreetMap.osm")
    #pprint.pprint(data)
    
#main()

{'id': '26050447', 'type': 'node', 'pos': [37.4103695, -122.1233837], 'created': {'changeset': '3022229', 'user': 'AM909', 'version': '13', 'uid': '82317', 'timestamp': '2009-11-03T05:32:39Z'}}
{'id': '26050448', 'type': 'node', 'pos': [37.4060528, -122.1199372], 'created': {'changeset': '3022229', 'user': 'AM909', 'version': '9', 'uid': '82317', 'timestamp': '2009-11-03T05:32:37Z'}}
{'id': '26050449', 'type': 'node', 'pos': [37.4055105, -122.1193643], 'created': {'changeset': '865564', 'user': 'StellanL', 'version': '8', 'uid': '28775', 'timestamp': '2009-03-28T08:53:35Z'}}
{'id': '26050450', 'type': 'node', 'pos': [37.4021958, -122.115435], 'created': {'changeset': '3022229', 'user': 'AM909', 'version': '9', 'uid': '82317', 'timestamp': '2009-11-03T05:32:37Z'}}
{'id': '26050451', 'type': 'node', 'pos': [37.4020498, -122.1152392], 'created': {'changeset': '865564', 'user': 'StellanL', 'version': '8', 'uid': '28775', 'timestamp': '2009-03-28T08:53:35Z'}}
{'id': '26068491', 'type': 'nod

In [54]:
# Separate block of function
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')


def key_type(element, keys):
    if element.tag == "tag":
        key = element.attrib['k']
        #print key
        if lower.search(key):
            keys['lower'] += 1
        elif lower_colon.search(key):
            keys['lower_colon'] += 1
        elif problemchars.search(key):
            keys['problemchars'] += 1
        else:
            keys['other'] += 1
            print key
        
    return keys


def process_map(filename):
    keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}
    for _, element in ET.iterparse(filename):
        keys = key_type(element, keys)

    return keys


def main():
    keys = process_map('OpenStreetMap.osm')
    pprint.pprint(keys)

#main()

COUNTYFP
STATEFP
Tiger:MTFCC
COUNTYFP
STATEFP
Tiger:MTFCC
gnis:Class
gnis:County
gnis:County_num
gnis:ST_alpha
gnis:ST_num
gnis:Class
gnis:County
gnis:County_num
gnis:ST_alpha
gnis:ST_num
gnis:Class
gnis:County
gnis:County_num
gnis:ST_alpha
gnis:ST_num
gnis:Class
gnis:County
gnis:County_num
gnis:ST_alpha
gnis:ST_num
gnis:Class
gnis:County
gnis:County_num
gnis:ST_alpha
gnis:ST_num
FIXME
name2
addr:1:housenumber
service:bicycle:chain_tool
socket:type1
source:hgv:national_network
turn:lanes:forward
name_1
tiger:name_base_1
tiger:name_type_1
tiger:zip_left_1
tiger:zip_left_2
turn:lanes:backward
tiger:zip_left_1
tiger:zip_right_1
name_1
tiger:name_base_1
tiger:name_type_1
tiger:zip_left_1
name_1
tiger:name_base_1
tiger:name_type_1
name_1
tiger:name_base_1
tiger:name_type_1
name_1
tiger:name_base_1
tiger:name_type_1
name_1
tiger:name_base_1
tiger:name_type_1
tiger:name_base_1
tiger:zip_left_1
name_1
turn:lanes:forward
tiger:zip_left_1
tiger:zip_left_2
tiger:zip_right_1
tiger:zip_right_2
name