In [1]:
#creates sample file out of original file

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import xml.etree.ElementTree as ET  # Use cElementTree or lxml if too slow

OSM_FILE = "austin_texas.osm"  # Actual file name
SAMPLE_FILE = "austin_sample.osm" #sample file name to be created

k = 10# Parameter: take every k-th top level element

def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield element if it is the right type of tag

    Reference:
    http://stackoverflow.com/questions/3095434/inserting-newlines-in-xml-file-generated-via-xml-etree-elementtree-in-python
    """
    context = iter(ET.iterparse(osm_file, events=('start', 'end')))
    _, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()


with open(SAMPLE_FILE, 'wb') as output:
    output.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    output.write('<osm>\n  ')

    # Write every kth top level element
    for i, element in enumerate(get_element(OSM_FILE)):
        if i % k == 0:
            output.write(ET.tostring(element, encoding='utf-8'))

    output.write('</osm>')

In [1]:
#function to calculate number of times a tag has appeared

import xml.etree.cElementTree as ET
import pprint

def count_tags(filename):
    tree = ET.parse(filename)
    d = dict()
    for elem in tree.iter():
        if elem.tag in d:
            d[elem.tag] += 1
        else:
            d[elem.tag] = 1
    return d

In [2]:
count_tags('austin_sample.osm')

{'member': 2466,
 'nd': 701474,
 'node': 639940,
 'osm': 1,
 'relation': 241,
 'tag': 239116,
 'way': 67065}

In [3]:
#function to check for certain patterns in tag keys

import re

lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')


def key_type(element, keys):
    if element.tag == "tag":
        ln = lower.search(element.attrib['k'])
        lcn = lower_colon.search(element.attrib['k'])
        pcn = problemchars.search(element.attrib['k'])
        if(ln):
            keys['lower'] += 1
        elif(lcn):
            keys['lower_colon'] += 1
        elif(pcn):
            keys['problemchars'] += 1
        else:
            keys['other'] += 1
    return keys

def process_map(filename):
    keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}
    for _, element in ET.iterparse(filename):
        keys = key_type(element, keys)

    return keys

In [4]:
process_map('austin_sample.osm')

{'lower': 131004, 'lower_colon': 106900, 'other': 1212, 'problemchars': 0}

In [7]:
#function to find unique users that have contributed to map of austin area

def get_user(element):
    return element.attrib['uid']


def process_map(filename):
    users = set()
    for _, element in ET.iterparse(filename):
        if((element.tag == 'way') or (element.tag == 'node') or (element.tag == 'relation')) :
            u = get_user(element)
            users.add(u)
        else:
            pass

    return users

In [8]:
process_map('austin_sample.osm')

{'1007528',
 '1012362',
 '104962',
 '105002',
 '1051550',
 '105255',
 '105454',
 '1058177',
 '1058308',
 '105839',
 '1058397',
 '1058433',
 '1058666',
 '105946',
 '106663',
 '1072516',
 '1073361',
 '107681',
 '10786',
 '1087647',
 '108775',
 '109362',
 '110639',
 '110797',
 '1110270',
 '1132286',
 '113375',
 '113450',
 '1136560',
 '1149057',
 '11547',
 '11566',
 '115918',
 '1171541',
 '118021',
 '118134',
 '119002',
 '1195151',
 '119748',
 '119881',
 '121241',
 '121264',
 '1213926',
 '1219875',
 '1227690',
 '1240849',
 '124836',
 '1249761',
 '1260280',
 '129841',
 '129867',
 '130472',
 '1306',
 '131059',
 '13203',
 '132444',
 '1326818',
 '135163',
 '135807',
 '1363265',
 '136520',
 '136828',
 '1376118',
 '13832',
 '139731',
 '1406824',
 '142426',
 '1425439',
 '1425613',
 '142739',
 '1429602',
 '1430450',
 '143162',
 '1432330',
 '1434650',
 '143480',
 '143523',
 '1443840',
 '145231',
 '1464344',
 '147510',
 '1484179',
 '14850',
 '1494110',
 '1501513',
 '152074',
 '152289',
 '1529361',
 

In [5]:
#Code to audit and update fields such as street, phone, zipcode etc

from collections import defaultdict


#OSMFILE = "sample2.osm"
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
PHONENUM = re.compile(r'\+1\s\d{3}\-\d{3}\-\d{4}')


expected = ["street", "avenue", "boulevard", "drive", "court", "place", "expressway", "lane", "road", "way", "bend", 
            "branch", "trail", "parkway", "commons", "circle", "cove", "plaza", "loop", "park", "path", "pass","highway",
            "east", "west", "north", "south", "terrace", "crossing","ridge", "trace", "hollow", "view", "walk", "vista"]

#mapping of street abbreviation
st_mapping = { "St": "Street",
            "St.": "Street",
            "Ave": "Avenue",
            "Ave.": "Avenue",
            "Rd.": "Road",
            "Ln": "Lane",
            "Dr.":"Drive",
            "Dr" : "Drive",
            "Cv" : "Cove",
            "Rd" : "Road",
            "Tr" : "Trail",
            "Ps" : "Pass",
            "Ct" : "Court",
            "Pl" : "Place",
            "Trl" : "Trail",
            "Cir" : "Circle",
            "Blvd" : "Boulevard"
            }

#function to audit street names
def audit_street_type(street_types, street_name):
    m = street_type_re.search(street_name)    
    if m:
        street_type = m.group()
        if str(street_type[0]).isdigit():
            return
        if street_type.lower() not in expected:
            street_types[street_type].add(street_name)
            
def is_street_name(elem):
    return (elem.attrib['k'] == "addr:street")


def audit_street(osmfile):
    osm_file = open(osmfile, "r")
    street_types = defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=("start",)):

        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])
    osm_file.close()
    return street_types

#function to update street names
def update_street_name(name):
    m = street_type_re.search(name)
    if m:
        street_type = m.group()
        if street_type in st_mapping.keys():
            new_name = re.sub(street_type_re, st_mapping[street_type], name)
            return new_name
        else:
            return name

#functions to audit phone numbers
def is_phone(elem):
    return ((elem.attrib['k'] == "phone") | (elem.attrib['k'] == "contact:phone"))


def isvalid_ph(phone_number):
    m = PHONENUM.match(phone_number)
    if m is None:
        return phone_number
    else:
        pass
       
        

def audit_phone(osmfile):
    osm_file = open(osmfile, "r")
    postcode_types = defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=("start",)):
        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_phone(tag):
                    if isvalid_ph(tag.attrib['v']):
                        print tag.attrib['v']          
    osm_file.close()
    
    
#function to format phone numbers as '+1 999-999-9999' 
def update_phone(phone_num):
    m = PHONENUM.match(phone_num)
    
    if m is None:
        #remove brackets
        if "(" in phone_num or ")" in phone_num:
            phone_num = re.sub("[()]", "", phone_num)
        #remove hyphen
        if "-" in phone_num:
            phone_num = re.sub("-", "", phone_num)
        #remove all spaces
        if " " in phone_num:
            phone_num = re.sub(" ", "", phone_num)
        #format phone 
        if re.match(r'\+1\d{10}', phone_num) is not None:
            phone_num = phone_num[:2] + " " + phone_num[2:5] + "-" + phone_num[5:8] + "-" + phone_num[8:]
        # add '+' and format phone number
        elif re.match(r'\d{11}', phone_num) is not None:
            phone_num = "+" + phone_num[:1] + " " + phone_num[1:4] + "-" + phone_num[4:7] + "-" + phone_num[7:]
        #add country code and format phone number
        elif re.match(r'\d{10}', phone_num) is not None:
            phone_num = "+1" + " " + phone_num[:3] + "-" + phone_num[3:6] + "-" + phone_num[6:]
        #checking if number of digits are less than 10
        elif sum(d.isdigit() for d in phone_num) < 10:
            return None
        
    return phone_num

#function to audit direction
def is_direction(elem):
    return ((elem.attrib['k'] == "tiger:name_direction_prefix") | (elem.attrib['k'] == "tiger:name_direction_suffix"))

def audit_direction(osmfile):
    osm_file = open(osmfile, "r")
    postcode_types = defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=("start",)):
        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_direction(tag):
                    print tag.attrib['v']
                                  
    osm_file.close()

#function to audit postcodes
def is_post_code(elem):
    return ((elem.attrib['k'] == "tiger:zip_left") | (elem.attrib['k'] == "tiger:zip_right") | (elem.attrib['k'] == "addr:postcode"))

def audit_postcode(osmfile):
    osm_file = open(osmfile, "r")
    postcode_types = defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=("start",)):
        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_post_code(tag):
                    #checking if zipcode is of 5 digit
                    if ((len(tag.attrib['v']) == 5) & (tag.attrib['v'].isdigit())):
                        continue
                    else:
                        print tag.attrib['v']             
    osm_file.close()

In [9]:
audit_postcode('austin_sample.osm')

78704-7205
78705; 78731; 78703
78705; 78731; 78703
78634; 76574
78731:78759
76574:76578
78703;78701
78703;78701
78705; 78731; 78703
78705; 78731; 78703
78745:78748
78745:78748
78705; 78731; 78703
78705; 78731; 78703
78705; 78731; 78703
78705; 78731; 78703
78745:78748
78745:78748
78745:78748
78745:78748
78727:78728
78745:78748
78745:78748
78731;78756
78756;78731
78727; 78727:78729
78727; 78727:78729
78378:78733


In [13]:
audit_phone('austin_sample.osm')


(512) 443-1057
(512) 444-7770
(512) 501-6738
512-420-0001
+1 512 368 1818
512-266-3168
512-858-4083
512-328-4033
512-472-6266
+15 12 554 4678
512-264-9787
(512) 828-5500
(512) 973-8753
(512) 926-0586
(512) 255-0617
(512) 238-7905
(512)491-8859
512-258-5700
+1 512 262 6500
+1 512 268 3697
(512) 236-1219
(469) 227-2652
512-345-7238
+1 512 357 4143
(512) 451-2306
512-422-9994
512-712-5661
512-234-1868
+1 512 246 1748
+1 512 218 5062
1-512-454-2399
+1 512 7746002
512-459-2300
5124780098
512-759-4422
+1-512-480-2255
+15125706300
(512) 249-0400
512-451-7979
+1 512 634 0070
(512) 528-7700
512-443-3083
512-477-7196
+1-512-471-5482
(512) 288-5440
512-858-4663
(512) 926-1491
+1 512 268 3232
+1 512 268 1963
+1 512 733 9660
Main: (512) 899-4300 Catering: (512) 899-4343
512-288-3344
(512) 285-3375
5128211561
+1 512 847 2296


In [14]:
audit_direction('austin_sample.osm')

W
N
E
E
W
S
E
N
N
W
S
N
N
N
E
N
W
N
W
N
E
E
E
E
S
E
N
N
S
N
W
S
E
N
E
N
W
S
W
W
S
W
N
N
E
W
E
E
W
S
S:W
E
E
W
N
E
E
W
N
E
W
W
S
E
E
E
E
E
W
W
W
E
W
W
E
N
W
E
S
W
E
W
W
W
E
N
E
W
E
W
N
E
S
S
W
W
E
W
E
E
W
W
E
E
S
E
E
E
W
E
W
N
N
N
E
N
W
N
E
E
E
E
W
W
E
W
W
E
N
W
W
W
W
N
N
N
W
E
E
N
E
E
W
W
E
S
E
W
N
N
W
W
E
E
W
S
E
W
E
E
E
N
S
W
W
W
S
S
E
E
N
N
E
E
E
W
S
S
E
S
E
S
W
N
S
S
W
E
E
W
W
E
W
W
E
E
E
W
W
E
S
N:S
S
E
N
S
E
N
W
N
W
S
S
S
E
E
E
S
E
E
E
W
W
W
E
W
E
W
S
N
E
E
S
W
N
E
N:S;N
E
E
E
SW; NW
W
S
N
E
N
E
E
S
E
S
W
S
W
N
S
E
N
W
E
N
S
E
E
W
W
S
S
S
W
W
N
E
E
W
E
E
S
N
N
S
E
E
N
W
E
S
S:W
S
W
E
N
W
W
N
E
E
E
N
S
W;E
W
E
E
N
N
W
S
N
N
N
E
W
E
E
E
E
E
E
W
W
N
W
E
N
W
W
E
S
E
W
E
W
E
W
E
E
N
W
S
S
W
E
W
S
W
E
W
W
N
W
E
W
W
N;S;N
N
W
E
W
W
E
E
E
E
W
N
W
N
W
W
E
W
E
W
E
W
W
E
W
N
W
W
S
W
W
N
E
N
W
N
W
W
E
E
E
W
W
E
N
E
N
N
S
N
N
S
S
E
S
N:W
W
E
S
S
S
S
S
S
W
W
E
E
W
W
E
S
W
N
N
S
S
N
N
N
N
N
N
W
W
W
W
W
W
W
W
W
W
W
N
W
E
E
S
E
E
N
N
S
N
E
N
N
W
E
E
E
N
E
W
E
E
E
E
N
S
S
S
W
W
N
N
W
W
S
N
N
E
N
S

In [15]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import string
import csv
import codecs
import pprint
import re
import xml.etree.cElementTree as ET

import cerberus

import schema

OSM_PATH = "austin_sample.osm"

NODES_PATH = "nodes.csv"
NODE_TAGS_PATH = "nodes_tags.csv"
WAYS_PATH = "ways.csv"
WAY_NODES_PATH = "ways_nodes.csv"
WAY_TAGS_PATH = "ways_tags.csv"

LOWER_COLON = re.compile(r'^([a-z]|_)+:([a-z]|_)+')
PROBLEMCHARS = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
default_tag_type = 'regular'

SCHEMA = schema.schema

# Make sure the fields order in the csvs matches the column order in the sql table schema
NODE_FIELDS = ['id', 'lat', 'lon', 'user', 'uid', 'version', 'changeset', 'timestamp']
NODE_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_FIELDS = ['id', 'user', 'uid', 'version', 'changeset', 'timestamp']
WAY_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_NODES_FIELDS = ['id', 'node_id', 'position']

#Function to add speed unit when missing
def add_speed_unit(speed):
    unit = speed.split(' ')
    if unit[-1] == 'mph':
        return speed
    else:
        return speed + " mph"

#Function to update street names, phone, direction, state, speed before loading to csv files
def clean_data(element, elem_child):
    tag_data = {}
    tag_data['id'] = element.attrib['id']
    if LOWER_COLON.search(elem_child.attrib['k'].lower()):
        tag_data['type'] = elem_child.attrib['k'].split(':',1)[0]
        tag_data['key'] = elem_child.attrib['k'].split(':',1)[1]
        tag_data['value'] = elem_child.attrib['v']
    else:
        tag_data['key'] = elem_child.attrib['k']
        tag_data['value'] = elem_child.attrib['v']
        tag_data['type'] = default_tag_type
    
    #update street names
    if is_street_name(elem_child):
        tag_data['value'] = update_street_name(elem_child.attrib['v'])
     
    #change format of phone number
    if is_phone(elem_child):
        tag_data['value'] = update_phone(elem_child.attrib['v'])
        
    #Update TX, tx , Texas etc to common state code
    if (elem_child.attrib['k'] == "addr:state"):
        tag_data['value'] = 'TX'
        
    #update directions    
    if ((elem_child.attrib['k'] == "tiger:name_direction_suffix") | (elem_child.attrib['k'] == "tiger:name_direction_prefix")):
        tag_data['value'] = elem_child.attrib['v']
        if 'N' in tag_data['value']:
            tag_data['value'] = tag_data['value'].replace('N','North')
        if 'E' in tag_data['value']:
            tag_data['value'] = tag_data['value'].replace('E','East')
        if 'W' in tag_data['value']:
            tag_data['value'] = tag_data['value'].replace('W','West')
        if 'S' in tag_data['value']:
            tag_data['value'] = tag_data['value'].replace('S','South')
            
    #update speed units            
    if(elem_child.attrib['k'] == "maxspeed"):
        tag_data['value'] = add_speed_unit(elem_child.attrib['v'])
            
    return tag_data
    


def shape_element(element, node_attr_fields=NODE_FIELDS, way_attr_fields=WAY_FIELDS,
                  problem_chars=PROBLEMCHARS):
    """Clean and shape node or way XML element to Python dict"""

    node_attribs = {}
    way_attribs = {}
    way_nodes = []
    tags = []  # Handle secondary tags the same way for both node and way elements

    # YOUR CODE HERE
    if element.tag == 'node':
        for attrib in element.attrib:
            if attrib in NODE_FIELDS:
                node_attribs[attrib] = element.attrib[attrib]
                
        for child in element.iter():
            if child.tag == 'tag':
                if problem_chars.search(child.attrib['k']):
                    continue
                else:
                    new_data = clean_data(element, child)
                    if new_data:
                        tags.append(new_data)
        return {'node': node_attribs, 'node_tags': tags}
        
    elif element.tag == 'way':
        for attrib in element.attrib:
            if attrib in WAY_FIELDS:
                way_attribs[attrib] = element.attrib[attrib]
                
        position = 0
        for child in element.iter():   
            if child.tag == 'tag':
                if problem_chars.search(child.attrib['k']):
                    continue
                else:
                    new_data = clean_data(element, child)
                    if new_data:
                        tags.append(new_data)
            elif child.tag == 'nd':
                way_node = {}
                way_node['id'] = element.attrib['id']
                way_node['node_id'] = child.attrib['ref']
                way_node['position'] = position
                position += 1
                way_nodes.append(way_node)
                
        return {'way': way_attribs, 'way_nodes': way_nodes, 'way_tags': tags}


# ================================================== #
#               Helper Functions                     #
# ================================================== #
def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield element if it is the right type of tag"""

    context = ET.iterparse(osm_file, events=('start', 'end'))
    _, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()


def validate_element(element, validator, schema=SCHEMA):
    """Raise ValidationError if element does not match schema"""
    if validator.validate(element, schema) is not True:
        field, errors = next(validator.errors.iteritems())
        message_string = "\nElement of type '{0}' has the following errors:\n{1}"
        error_string = pprint.pformat(errors)
        
        raise Exception(message_string.format(field, error_string))


class UnicodeDictWriter(csv.DictWriter, object):
    """Extend csv.DictWriter to handle Unicode input"""

    def writerow(self, row):
        super(UnicodeDictWriter, self).writerow({
            k: (v.encode('utf-8') if isinstance(v, unicode) else v) for k, v in row.iteritems()
        })

    def writerows(self, rows):
        for row in rows:
            self.writerow(row)


# ================================================== #
#               Main Function                        #
# ================================================== #
def process_map(file_in, validate):
    """Iteratively process each XML element and write to csv(s)"""

    with codecs.open(NODES_PATH, 'w') as nodes_file, \
         codecs.open(NODE_TAGS_PATH, 'w') as nodes_tags_file, \
         codecs.open(WAYS_PATH, 'w') as ways_file, \
         codecs.open(WAY_NODES_PATH, 'w') as way_nodes_file, \
         codecs.open(WAY_TAGS_PATH, 'w') as way_tags_file:

        nodes_writer = UnicodeDictWriter(nodes_file, NODE_FIELDS)
        node_tags_writer = UnicodeDictWriter(nodes_tags_file, NODE_TAGS_FIELDS)
        ways_writer = UnicodeDictWriter(ways_file, WAY_FIELDS)
        way_nodes_writer = UnicodeDictWriter(way_nodes_file, WAY_NODES_FIELDS)
        way_tags_writer = UnicodeDictWriter(way_tags_file, WAY_TAGS_FIELDS)

        nodes_writer.writeheader()
        node_tags_writer.writeheader()
        ways_writer.writeheader()
        way_nodes_writer.writeheader()
        way_tags_writer.writeheader()

        validator = cerberus.Validator()

        for element in get_element(file_in, tags=('node', 'way')):
            el = shape_element(element)
            if el:
                if validate is True:
                    validate_element(el, validator)

                if element.tag == 'node':
                    nodes_writer.writerow(el['node'])
                    node_tags_writer.writerows(el['node_tags'])
                elif element.tag == 'way':
                    ways_writer.writerow(el['way'])
                    way_nodes_writer.writerows(el['way_nodes'])
                    way_tags_writer.writerows(el['way_tags'])


if __name__ == '__main__':
    # Note: Validation is ~ 10X slower. For the project consider using a small
    # sample of the map when validating.
    process_map(OSM_PATH, validate=True)