# Sample

In [3]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import xml.etree.ElementTree as ET  # Use cElementTree or lxml if too slow

OSM_FILE = "boston_massachusetts.osm"  # Replace this with your osm file
SAMPLE_FILE = "boston_massachusetts_sample.osm"

k = 100 # Parameter: take every k-th top level element

def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield element if it is the right type of tag

    Reference:
    http://stackoverflow.com/questions/3095434/inserting-newlines-in-xml-file-generated-via-xml-etree-elementtree-in-python
    """
    context = iter(ET.iterparse(osm_file, events=('start', 'end')))
    _, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()


with open(SAMPLE_FILE, 'wb') as output:
    output.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    output.write('<osm>\n  ')

    # Write every kth top level element
    for i, element in enumerate(get_element(OSM_FILE)):
        if i % k == 0:
            output.write(ET.tostring(element, encoding='utf-8'))

    output.write('</osm>')

## File sizes

In [45]:
import os
print "file {!r} is {!s} MB".format(i,round(os.path.getsize("boston_massachusetts.osm")/(1024*1024.0),1))

file 2241802 is 414.2 MB


## Quiz: Iterative Parsing

In [9]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Your task is to use the iterative parsing to process the map file and
find out not only what tags are there, but also how many, to get the
feeling on how much of which data you can expect to have in the map.
Fill out the count_tags function. It should return a dictionary with the 
tag name as the key and number of times this tag can be encountered in 
the map as value.

Note that your code will be tested with a different data file than the 'example.osm'
"""
import xml.etree.cElementTree as ET
import pprint

def count_tags(filename):
        # YOUR CODE HERE
    tags = {}
    context = ET.iterparse(filename)
    for event, elem in context:
        if elem.tag not in tags.keys():
            tags[elem.tag] = 1
        else:
            tags[elem.tag] +=1
    return tags

#def test():
#
#    tags = count_tags('example.osm')
#    pprint.pprint(tags)
#    assert tags == {'bounds': 1,
#                     'member': 3,
#                     'nd': 4,
#                     'node': 20,
#                     'osm': 1,
#                     'relation': 1,
#                     'tag': 7,
#                     'way': 1}
#
#    
#
#if __name__ == "__main__":
#    test()
tags = count_tags(SAMPLE_FILE)
#tags = count_tags(OSM_FILE)
pprint.pprint(tags)

{'member': 188,
 'nd': 22220,
 'node': 19315,
 'osm': 1,
 'relation': 13,
 'tag': 9123,
 'way': 3091}


## Quiz: Tag Types

In [47]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import xml.etree.cElementTree as ET
import pprint
import re
"""
Your task is to explore the data a bit more.
Before you process the data and add it into your database, you should check the
"k" value for each "<tag>" and see if there are any potential problems.

We have provided you with 3 regular expressions to check for certain patterns
in the tags. As we saw in the quiz earlier, we would like to change the data
model and expand the "addr:street" type of keys to a dictionary like this:
{"address": {"street": "Some value"}}
So, we have to see if we have such tags, and if we have any tags with
problematic characters.

Please complete the function 'key_type', such that we have a count of each of
four tag categories in a dictionary:
  "lower", for tags that contain only lowercase letters and are valid,
  "lower_colon", for otherwise valid tags with a colon in their names,
  "problemchars", for tags with problematic characters, and
  "other", for other tags that do not fall into the other three categories.
See the 'process_map' and 'test' functions for examples of the expected format.
"""

lower = re.compile(r'^([a-z]|_)*$', re.IGNORECASE)
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$', re.IGNORECASE)
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]', re.IGNORECASE)


def key_type(element, keys):
    if element.tag == "tag":
        # YOUR CODE HERE
        v = element.attrib["k"]
        #print problemchars.match(v)
        if lower.match(v) != None:
            keys["lower"] +=1
        elif lower_colon.match(v) != None:
            keys["lower_colon"] +=1
        elif problemchars.match(v) != None:
            keys["problemchars"] +=1
        else:
            #print v
            keys["other"] +=1    
        pass
        
    return keys



def process_map(filename):
    keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}
    for _, element in ET.iterparse(filename):
        keys = key_type(element, keys)

    return keys



#def test():
#    # You can use another testfile 'map.osm' to look at your solution
#    # Note that the assertion below will be incorrect then.
#    # Note as well that the test function here is only used in the Test Run;
#    # when you submit, your code will be checked against a different dataset.
#    keys = process_map('example.osm')
#    pprint.pprint(keys)
#    assert keys == {'lower': 5, 'lower_colon': 0, 'other': 1, 'problemchars': 1}
#
#
#if __name__ == "__main__":
#    test()
keys = process_map(SAMPLE_FILE)
#keys = process_map(OSM_FILE)
pprint.pprint(keys)


{'lower': 7930, 'lower_colon': 1148, 'other': 45, 'problemchars': 0}


## Quiz: Exploring Users

In [274]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import xml.etree.cElementTree as ET
import pprint
import re
"""
Your task is to explore the data a bit more.
The first task is a fun one - find out how many unique users
have contributed to the map in this particular area!

The function process_map should return a set of unique user IDs ("uid")
"""

def get_user(element):
    uid = element.attrib['uid']
    return uid


def process_map(filename):
    users = set()
    for _, element in ET.iterparse(filename):
        if element.tag in ['node', 'way', 'relation']:
            uid = get_user(element)
            users.add(uid)
        pass

    return users


#def test():
#
#    users = process_map('example.osm')
#    pprint.pprint(users)
#    assert len(users) == 6
#
#
#
#if __name__ == "__main__":
#    test()
users = process_map(SAMPLE_FILE)
#users = process_map(OSM_FILE)

print len(users)
pprint.pprint(users)


1310
set(['1',
     '100042',
     '100049',
     '100054',
     '1001936',
     '1001987',
     '1002877',
     '1002879',
     '100643',
     '100884',
     '101433',
     '102066',
     '1024769',
     '103253',
     '1034',
     '103574',
     '1041828',
     '104829',
     '104962',
     '1051550',
     '105191',
     '106602',
     '106909',
     '1069176',
     '1072240',
     '107257',
     '1080494',
     '108069',
     '1082652',
     '108403',
     '1084189',
     '108775',
     '1093220',
     '110263',
     '110489',
     '11126',
     '111652',
     '111714',
     '112350',
     '1127439',
     '113450',
     '1137433',
     '1137518',
     '1137571',
     '1137842',
     '114161',
     '115141',
     '115653',
     '116029',
     '1163952',
     '118021',
     '118103',
     '1182159',
     '118856',
     '1194974',
     '1195220',
     '1197653',
     '119838',
     '1198553',
     '119881',
     '1200061',
     '1203164',
     '1203395',
     '120468',
     '1206577',


## Quiz: Improving Street Names

In [197]:
"""
Your task in this exercise has two steps:

- audit the OSMFILE and change the variable 'mapping' to reflect the changes needed to fix 
    the unexpected street types to the appropriate ones in the expected list.
    You have to add mappings only for the actual problems you find in this OSMFILE,
    not a generalized solution, since that may and will depend on the particular area you are auditing.
- write the update_name function, to actually fix the street name.
    The function takes a string with street name as an argument and should return the fixed name
    We have provided a simple test so that you see what exactly is expected
"""
import xml.etree.cElementTree as ET
from collections import defaultdict
import re
import pprint

#OSMFILE = "example.osm"
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)


expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road", 
            "Trail", "Parkway", "Commons", 'Circle','Highway','Center','Turnpike','Way']




def audit_street_type(street_types, street_name):
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()
        if street_type not in expected:
            street_types[street_type].add(street_name)


def is_street_name(elem):
    return (elem.attrib['k'] == "addr:street")


def audit(osmfile):
    osm_file = open(osmfile, "r")
    street_types = defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=("start",)):

        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])
    osm_file.close()
    return street_types


#def test():
#    st_types = audit(OSMFILE)
#    assert len(st_types) == 3
#    pprint.pprint(dict(st_types))
#
#    for st_type, ways in st_types.iteritems():
#        for name in ways:
#            better_name = update_name(name, mapping)
#            print name, "=>", better_name
#            if name == "West Lexington St.":
#                assert better_name == "West Lexington Street"
#            if name == "Baldwin Rd.":
#                assert better_name == "Baldwin Road"
#
#
#if __name__ == '__main__':
#    test()
    
street_types = audit(SAMPLE_FILE)
#street_types = audit(OSM_FILE)
print len(street_types)
pprint.pprint(dict(street_types))

61
{'1100': set(['First Street, Suite 1100']),
 '12': set(['Harvard St #12']),
 '1302': set(['Cambridge Street #1302']),
 '1702': set(['Franklin Street, Suite 1702']),
 '3': set(['Kendall Square - 3']),
 '303': set(['First Street, Suite 303']),
 '501': set(['Bromfield Street #501']),
 '6': set(['South Station, near Track 6']),
 '846028': set(['PO Box 846028']),
 'Artery': set(['Southern Artery']),
 'Ave': set(['738 Commonwealth Ave',
             'Blue Hill Ave',
             'Boston Ave',
             'College Ave',
             'Commonwealth Ave',
             'Concord Ave',
             'Everett Ave',
             'Francesca Ave',
             'Gibson Ave',
             'Harrison Ave',
             'Highland Ave',
             'Josephine Ave',
             'Lexington Ave',
             'Massachusetts Ave',
             'Morrison Ave',
             'Mystic Ave',
             'Sagamore Ave',
             'Somerville Ave',
             "St. Paul's Ave",
             'Washington Ave',
 

In [198]:
# UPDATE THIS VARIABLE
mapping_street = {"Ave": "Avenue","Ave.":"Avenue","Ct":"Court","Dr":"Drive","Ext":"Exit",
           "HIghway":"Highway","Hwy":"Highway","Pkwy":"Parkway","Pl":"Place","Rd":"Road",
           "ST":"Street","Sq.":"Square","St":"Street","St,":"Street","St.":"Street",
           "Street.":"Street","rd.":"Road","st":"Street","street":"Street"}
def update_street(name, mapping):
    name = name.split(" ")
    if name[-1] in mapping.keys():
        name[-1] = mapping[name[-1]]
    name = " ".join(name)
    #name = name.title()
    
    # YOUR CODE HERE
  #  for key in mapping:
   #     name = name.replace(key, mapping[key])
        #print name
    return name

for typ , ways in street_types.iteritems():
    for name in ways:
        better_name = update_street(name, mapping_street)
        print name, "=>", better_name


South Station, near Track 6 => South Station, near Track 6
Cambridge Street #1302 => Cambridge Street #1302
Elm => Elm
Walnut St, => Walnut Street
Winsor => Winsor
Maverick St. => Maverick Street
Pearl St. => Pearl Street
Banks St. => Banks Street
Tremont St. => Tremont Street
Centre St. => Centre Street
Marshall St. => Marshall Street
Prospect St. => Prospect Street
Main St. => Main Street
Albion St. => Albion Street
Saint Mary's St. => Saint Mary's Street
Boylston St. => Boylston Street
Stuart St. => Stuart Street
Elm St. => Elm Street
Furnace Brook => Furnace Brook
Jamaicaway => Jamaicaway
East Boston Greenway => East Boston Greenway
Oakland Rd => Oakland Road
Abby Rd => Abby Road
Bristol Rd => Bristol Road
Squanto Rd => Squanto Road
Goodnough Rd => Goodnough Road
Soldiers Field Rd => Soldiers Field Road
Aberdeen Rd => Aberdeen Road
Cambrdige => Cambrdige
Boston street => Boston Street
Webster Street, Coolidge Corner => Webster Street, Coolidge Corner
Dartmouth => Dartmouth
Newton S

## Quiz: Improving State Names

In [204]:

def audit_state_type(state_types, state_name):
    if state_name not in state_types:
        state_types[state_name] = 1
    else:
        state_types[state_name] += 1
        
def is_state_name(elem):
    return (elem.attrib['k'] == "addr:state")

def audit(osmfile):
    osm_file = open(osmfile, "r")
    state_types = defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=("start",)):

        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_state_name(tag):
                    audit_state_type(state_types, tag.attrib['v'])
    osm_file.close()
    return state_types
  
state_types = audit(SAMPLE_FILE)
#state_types = audit(OSM_FILE)
print len(state_types)
pprint.pprint(dict(state_types))

7
{'MA': 2029,
 'MA- MASSACHUSETTS': 60,
 'MASSACHUSETTS': 1,
 'Ma': 6,
 'Massachusetts': 13,
 'WA': 1,
 'ma': 6}


In [207]:
mapping_state = { "MA- MASSACHUSETTS": "MA",
            "MASSACHUSETTS": "MA",
            "Ma": "MA",
            "Massachusetts": "MA",
            "ma": "MA"
            }

def update_state(name, mapping):
    if name in mapping.keys():
        name = mapping[name]
    return name

for state_type, num in state_types.iteritems():
    better_name = update_state(state_type, mapping_state)
    print state_type, "=>", better_name

ma => MA
MA => MA
MA- MASSACHUSETTS => MA
Massachusetts => MA
Ma => MA
WA => WA
MASSACHUSETTS => MA


## Quiz: Improving ZIP Code

In [206]:
def audit_zipcode(zipcode_types, zipcode):
#    m = street_type_re.search(street_name)
#    if m:
#        street_type = m.group()
#        if street_type not in expected:
#            street_types[street_type].add(street_name)

    if zipcode not in zipcode_types:
        zipcode_types[zipcode] = 1
    else:
        zipcode_types[zipcode] += 1
        
def is_zipcode(elem):
    return (elem.attrib['k'] == "addr:postcode")

def audit(osmfile):
    osm_file = open(osmfile, "r")
    zipcode_types = defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=("start",)):

        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_zipcode(tag):
                    audit_zipcode(zipcode_types, tag.attrib['v'])
    osm_file.close()
    return zipcode_types
  
zipcode_types = audit(SAMPLE_FILE)
#zipcode_types = audit(OSM_FILE)
print len(zipcode_types)
pprint.pprint(dict(zipcode_types))

123
{'01125': 1,
 '01238': 1,
 '01240': 1,
 '01250': 1,
 '01821': 1,
 '01854': 1,
 '01944': 1,
 '02026': 6,
 '02026-5036': 1,
 '02043': 3,
 '02108': 16,
 '02109': 12,
 '02110': 17,
 '02110-1301': 1,
 '02111': 23,
 '02113': 2,
 '02114': 82,
 '02114-3203': 2,
 '02115': 15,
 '02116': 53,
 '02118': 11,
 '02119': 7,
 '02120': 8,
 '02121': 2,
 '02122': 7,
 '02124': 10,
 '02125': 10,
 '02126': 8,
 '02127': 13,
 '02128': 26,
 '02129': 10,
 '02130': 177,
 '02130-4803': 1,
 '02131': 7,
 '02131-3025': 2,
 '02131-4931': 1,
 '02132': 18,
 '02132-1239': 1,
 '02132-3226': 1,
 '02134': 57,
 '02134-1305': 9,
 '02134-1306': 2,
 '02134-1307': 28,
 '02134-1311': 3,
 '02134-1312': 2,
 '02134-1313': 4,
 '02134-1316': 3,
 '02134-1317': 4,
 '02134-1318': 2,
 '02134-1319': 5,
 '02134-1321': 4,
 '02134-1322': 5,
 '02134-1327': 1,
 '02134-1409': 4,
 '02134-1420': 9,
 '02134-1433': 11,
 '02134-1442': 5,
 '02135': 263,
 '02136': 6,
 '02136-2460': 1,
 '02138': 55,
 '02138-1901': 1,
 '02138-2701': 8,
 '02138-2706': 

In [215]:
# UPDATE THIS VARIABLE
zipcode_re = re.compile(r'\d+')

def update_zipcode(zipcode):
    zipcode = zipcode_re.findall(zipcode)
    
    if zipcode != [] and len(zipcode[0]) == 5:
        zipcode = zipcode[0]
        if int(zipcode) <= 1431 or int(zipcode) >= 2770:
            zipcode = '0'
    else:
        zipcode = '0'
    return zipcode

for zipcode_type, num in zipcode_types.iteritems():
    better_zipcode = update_zipcode(zipcode_type)
    print zipcode_type, "=>", better_zipcode


0239 => 0
02186 => 02186
02184 => 02184
02134-1327 => 02134
02189 => 02189
02134-1322 => 02134
02134-1321 => 02134
02138-1901 => 02138
02132-3226 => 02132
01821 => 01821
02134-1433 => 02134
02108 => 02108
02026 => 02026
02476 => 02476
02474 => 02474
02472 => 02472
02139 => 02139
02134-1319 => 02134
02478 => 02478
02136-2460 => 02136
02131-3025 => 02131
02136 => 02136
02140-1340 => 02140
02134 => 02134
02205 => 02205
02132 => 02132
02131 => 02131
02130 => 02130
01854 => 01854
02110-1301 => 02110
02138 => 02138
02138-2903 => 02138
02138-2901 => 02138
02134-1442 => 02134
01250 => 0
02132-1239 => 02132
02445-7638 => 02445
02446 => 02446
02445 => 02445
02138-2742 => 02138
02120 => 02120
02121 => 02121
02210 => 02210
02124 => 02124
02125 => 02125
02126 => 02126
02215 => 02215
02128 => 02128
02129 => 02129
02474-8735 => 02474
01240 => 0
02114-3203 => 02114
02458 => 02458
02459 => 02459
MA 02118 => 02118
MA 02116 => 02116
01125 => 0
02109 => 02109
MA 02186 => 02186
02155 => 02155
02151 => 0215

In [154]:
print type(update_zipcode('10101'))


<type 'str'>


## Quiz: Preparing For Database

In [251]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
After auditing is complete the next step is to prepare the data to be inserted into a SQL database.
To do so you will parse the elements in the OSM XML file, transforming them from document format to
tabular format, thus making it possible to write to .csv files.  These csv files can then easily be
imported to a SQL database as tables.

The process for this transformation is as follows:
- Use iterparse to iteratively step through each top level element in the XML
- Shape each element into several data structures using a custom function
- Utilize a schema and validation library to ensure the transformed data is in the correct format
- Write each data structure to the appropriate .csv files

We've already provided the code needed to load the data, perform iterative parsing and write the
output to csv files. Your task is to complete the shape_element function that will transform each
element into the correct format. To make this process easier we've already defined a schema (see
the schema.py file in the last code tab) for the .csv files and the eventual tables. Using the 
cerberus library we can validate the output against this schema to ensure it is correct.

## Shape Element Function
The function should take as input an iterparse Element object and return a dictionary.

### If the element top level tag is "node":
The dictionary returned should have the format {"node": .., "node_tags": ...}

The "node" field should hold a dictionary of the following top level node attributes:
- id
- user
- uid
- version
- lat
- lon
- timestamp
- changeset
All other attributes can be ignored

The "node_tags" field should hold a list of dictionaries, one per secondary tag. Secondary tags are
child tags of node which have the tag name/type: "tag". Each dictionary should have the following
fields from the secondary tag attributes:
- id: the top level node id attribute value
- key: the full tag "k" attribute value if no colon is present or the characters after the colon if one is.
- value: the tag "v" attribute value
- type: either the characters before the colon in the tag "k" value or "regular" if a colon
        is not present.

Additionally,

- if the tag "k" value contains problematic characters, the tag should be ignored
- if the tag "k" value contains a ":" the characters before the ":" should be set as the tag type
  and characters after the ":" should be set as the tag key
- if there are additional ":" in the "k" value they and they should be ignored and kept as part of
  the tag key. For example:

  <tag k="addr:street:name" v="Lincoln"/>
  should be turned into
  {'id': 12345, 'key': 'street:name', 'value': 'Lincoln', 'type': 'addr'}

- If a node has no secondary tags then the "node_tags" field should just contain an empty list.

The final return value for a "node" element should look something like:

{'node': {'id': 757860928,
          'user': 'uboot',
          'uid': 26299,
       'version': '2',
          'lat': 41.9747374,
          'lon': -87.6920102,
          'timestamp': '2010-07-22T16:16:51Z',
      'changeset': 5288876},
 'node_tags': [{'id': 757860928,
                'key': 'amenity',
                'value': 'fast_food',
                'type': 'regular'},
               {'id': 757860928,
                'key': 'cuisine',
                'value': 'sausage',
                'type': 'regular'},
               {'id': 757860928,
                'key': 'name',
                'value': "Shelly's Tasty Freeze",
                'type': 'regular'}]}

### If the element top level tag is "way":
The dictionary should have the format {"way": ..., "way_tags": ..., "way_nodes": ...}

The "way" field should hold a dictionary of the following top level way attributes:
- id
-  user
- uid
- version
- timestamp
- changeset

All other attributes can be ignored

The "way_tags" field should again hold a list of dictionaries, following the exact same rules as
for "node_tags".

Additionally, the dictionary should have a field "way_nodes". "way_nodes" should hold a list of
dictionaries, one for each nd child tag.  Each dictionary should have the fields:
- id: the top level element (way) id
- node_id: the ref attribute value of the nd tag
- position: the index starting at 0 of the nd tag i.e. what order the nd tag appears within
            the way element

The final return value for a "way" element should look something like:

{'way': {'id': 209809850,
         'user': 'chicago-buildings',
         'uid': 674454,
         'version': '1',
         'timestamp': '2013-03-13T15:58:04Z',
         'changeset': 15353317},
 'way_nodes': [{'id': 209809850, 'node_id': 2199822281, 'position': 0},
               {'id': 209809850, 'node_id': 2199822390, 'position': 1},
               {'id': 209809850, 'node_id': 2199822392, 'position': 2},
               {'id': 209809850, 'node_id': 2199822369, 'position': 3},
               {'id': 209809850, 'node_id': 2199822370, 'position': 4},
               {'id': 209809850, 'node_id': 2199822284, 'position': 5},
               {'id': 209809850, 'node_id': 2199822281, 'position': 6}],
 'way_tags': [{'id': 209809850,
               'key': 'housenumber',
               'type': 'addr',
               'value': '1412'},
              {'id': 209809850,
               'key': 'street',
               'type': 'addr',
               'value': 'West Lexington St.'},
              {'id': 209809850,
               'key': 'street:name',
               'type': 'addr',
               'value': 'Lexington'},
              {'id': '209809850',
               'key': 'street:prefix',
               'type': 'addr',
               'value': 'West'},
              {'id': 209809850,
               'key': 'street:type',
               'type': 'addr',
               'value': 'Street'},
              {'id': 209809850,
               'key': 'building',
               'type': 'regular',
               'value': 'yes'},
              {'id': 209809850,
               'key': 'levels',
               'type': 'building',
               'value': '1'},
              {'id': 209809850,
               'key': 'building_id',
               'type': 'chicago',
               'value': '366409'}]}
"""

import csv
import codecs
import pprint
import re
import xml.etree.cElementTree as ET

import cerberus

import schema

#OSM_PATH = "example.osm"
OSM_PATH = "boston_massachusetts_sample.osm"
#OSM_PATH = "boston_massachusetts.osm"


NODES_PATH = "nodes.csv"
NODE_TAGS_PATH = "nodes_tags.csv"
WAYS_PATH = "ways.csv"
WAY_NODES_PATH = "ways_nodes.csv"
WAY_TAGS_PATH = "ways_tags.csv"

LOWER_COLON = re.compile(r'^([a-z]|_)+:([a-z]|_)+')
PROBLEMCHARS = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

SCHEMA = schema.schema

# Make sure the fields order in the csvs matches the column order in the sql table schema
NODE_FIELDS = ['id', 'lat', 'lon', 'user', 'uid', 'version', 'changeset', 'timestamp']
NODE_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_FIELDS = ['id', 'user', 'uid', 'version', 'changeset', 'timestamp']
WAY_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_NODES_FIELDS = ['id', 'node_id', 'position']


def shape_element(element, node_attr_fields=NODE_FIELDS, way_attr_fields=WAY_FIELDS,
                  problem_chars=PROBLEMCHARS, default_tag_type='regular'):
    """Clean and shape node or way XML element to Python dict"""

    node_attribs = {}
    way_attribs = {}
    way_nodes = []
    tags = []  # Handle secondary tags the same way for both node and way elements

    # YOUR CODE HERE
    if element.tag == 'node':
        node_attribs['id'] = element.attrib['id']
        node_attribs['user'] = element.attrib['user']
        node_attribs['uid'] = element.attrib['uid']
        node_attribs['version'] = element.attrib['version']
        node_attribs['lat'] = element.attrib['lat']
        node_attribs['lon'] = element.attrib['lon']
        node_attribs['timestamp'] = element.attrib['timestamp']
        node_attribs['changeset'] = element.attrib['changeset']

        for tag in element.iter("tag"):
            d={}
            d['id'] = node_attribs['id']
            k = tag.attrib['k']
            if PROBLEMCHARS.match(k) == None:
                if LOWER_COLON.match(k) != None:
                    d['type'] = k.split(':')[0]
                    d['key'] = ':'.join(k.split(':')[1:])
                else:
                    d['type'] = 'regular'
                    d['key'] = k
                    
            if  k == "addr:street":
                d['value'] = update_street(tag.attrib['v'], mapping_street)
            elif  k == "addr:state":
                d['value'] = update_state(tag.attrib['v'], mapping_state)
            elif  k == "addr:postcode":
                d['value'] = update_zipcode(tag.attrib['v'])
            else:
                d['value'] = tag.attrib['v']
            tags.append(d)
        return {'node': node_attribs, 'node_tags': tags}
    
    elif element.tag == 'way':
        way_attribs['id'] = element.attrib['id']
        way_attribs['user'] = element.attrib['user']
        way_attribs['uid'] = element.attrib['uid']
        way_attribs['version'] = element.attrib['version']
        way_attribs['timestamp'] = element.attrib['timestamp']
        way_attribs['changeset'] = element.attrib['changeset']

        for tag in element.iter("tag"):
            d={}
            d['id'] = way_attribs['id']
            k = tag.attrib['k']
            if PROBLEMCHARS.match(k) == None:
                if LOWER_COLON.match(k) != None:
                    d['type'] = k.split(':')[0]
                    d['key'] = ':'.join(k.split(':')[1:])
                else:
                    d['type'] = 'regular'
                    d['key'] = k
                    
            if  k == "addr:street":
                d['value'] = update_street(tag.attrib['v'], mapping_street)
            elif  k == "addr:state":
                d['value'] = update_state(tag.attrib['v'], mapping_state)
            elif  k == "addr:postcode":
                d['value'] = update_zipcode(tag.attrib['v'])
            else:
                d['value'] = tag.attrib['v']
            tags.append(d)
        
        index = 0
        for tag in element.iter("nd"):
            d={}
            d['id'] = way_attribs['id']
            d['node_id'] = tag.attrib['ref']
            d['position'] = index
            way_nodes.append(d)
            index +=1
        return {'way': way_attribs, 'way_nodes': way_nodes, 'way_tags': tags}



# ================================================== #
#               Helper Functions                     #
# ================================================== #
def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield element if it is the right type of tag"""

    context = ET.iterparse(osm_file, events=('start', 'end'))
    _, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()


def validate_element(element, validator, schema=SCHEMA):
    """Raise ValidationError if element does not match schema"""
    if validator.validate(element, schema) is not True:
        field, errors = next(validator.errors.iteritems())
        message_string = "\nElement of type '{0}' has the following errors:\n{1}"
        error_string = pprint.pformat(errors)
        
        raise Exception(message_string.format(field, error_string))


class UnicodeDictWriter(csv.DictWriter, object):
    """Extend csv.DictWriter to handle Unicode input"""

    def writerow(self, row):
        super(UnicodeDictWriter, self).writerow({
            k: (v.encode('utf-8') if isinstance(v, unicode) else v) for k, v in row.iteritems()
        })

    def writerows(self, rows):
        for row in rows:
            self.writerow(row)


# ================================================== #
#               Main Function                        #
# ================================================== #
def process_map(file_in, validate):
    """Iteratively process each XML element and write to csv(s)"""

    with codecs.open(NODES_PATH, 'w') as nodes_file, \
         codecs.open(NODE_TAGS_PATH, 'w') as nodes_tags_file, \
         codecs.open(WAYS_PATH, 'w') as ways_file, \
         codecs.open(WAY_NODES_PATH, 'w') as way_nodes_file, \
         codecs.open(WAY_TAGS_PATH, 'w') as way_tags_file:

        nodes_writer = UnicodeDictWriter(nodes_file, NODE_FIELDS)
        node_tags_writer = UnicodeDictWriter(nodes_tags_file, NODE_TAGS_FIELDS)
        ways_writer = UnicodeDictWriter(ways_file, WAY_FIELDS)
        way_nodes_writer = UnicodeDictWriter(way_nodes_file, WAY_NODES_FIELDS)
        way_tags_writer = UnicodeDictWriter(way_tags_file, WAY_TAGS_FIELDS)

        nodes_writer.writeheader()
        node_tags_writer.writeheader()
        ways_writer.writeheader()
        way_nodes_writer.writeheader()
        way_tags_writer.writeheader()

        validator = cerberus.Validator()

        for element in get_element(file_in, tags=('node', 'way')):
            el = shape_element(element)
            if el:
                if validate is True:
                    validate_element(el, validator)

                if element.tag == 'node':
                    nodes_writer.writerow(el['node'])
                    node_tags_writer.writerows(el['node_tags'])
                elif element.tag == 'way':
                    ways_writer.writerow(el['way'])
                    way_nodes_writer.writerows(el['way_nodes'])
                    way_tags_writer.writerows(el['way_tags'])


if __name__ == '__main__':
    # Note: Validation is ~ 10X slower. For the project consider using a small
    # sample of the map when validating.
    process_map(OSM_PATH, validate=True)


## create sql table for nodes


In [None]:
# .import is not woring in Python shell. It should be run from sqlite cmd.

import csv
import sqlite3

#create sql table for nodes
con = sqlite3.connect('boston_massachusetts_sample.db')
con.text_factory = str
cur = con.cursor()
cur.execute('''CREATE TABLE nodes (
    id  INTEGER PRIMARY KEY NOT NULL,
    lat FLOAT,
    lon FLOAT,
    user TEXT,
    uid INTEGER,
    version INTEGER,
    changeset INTEGER,
    timestamp TIMESTAMP);''')
cur.execute('''.mode csv''')
cur.execute(''' .import nodes.csv nodes ''')
QUERY = '''PRAGMA table_info(nodes)'''
rows = cur.execute(QUERY).fetchall()
print(rows)

In [256]:
import csv
import sqlite3

#create sql table for nodes
#con = sqlite3.connect('boston_massachusetts_sample.db')
con = sqlite3.connect('boston_massachusetts.db')
con.text_factory = str
cur = con.cursor()

cur.execute('''DROP TABLE nodes;''')
cur.execute('''CREATE TABLE nodes (
    id INTEGER PRIMARY KEY NOT NULL,
    lat REAL,
    lon REAL,
    user TEXT,
    uid INTEGER,
    version INTEGER,
    changeset INTEGER,
    timestamp TEXT);''') 

with open ('nodes.csv', 'rb') as table:
    dicts = csv.DictReader(table)
    #header = reader.next()
    #dicts =({'id': int(line[0]), 'lat': float(line[1]), 'lon':float(line[2]), 'user':line[3],'uid':int(line[4]),
    #         'version':int(line[5]), 'changeset':int(line[6]),'timestamp':line[7]} for line in reader) 
    to_db = ((i['id'], i['lat'],i['lon'],i['user'],i['uid'],i['version'],i['changeset'],i['timestamp']) for i in dicts) 
    #to_db = (list(i.values()) for i in dicts) 
    cur.executemany("INSERT INTO nodes (id, lat, lon, user, uid, version, changeset, timestamp) VALUES (?,?,?,?,?,?,?,?);", to_db)
    #cur.executemany("INSERT INTO nodes (id, lat, lon, user, uid, version, changeset, timestamp) VALUES (list(dicts.values()));")
con.commit()
#QUERY = '''PRAGMA table_info(nodes)'''
QUERY = '''SELECT * FROM nodes LIMIT 3;'''
rows = cur.execute(QUERY).fetchall()
#print(rows)
pprint.pprint(rows)

[(30730952,
  42.3678097,
  -71.0218711,
  'wambag',
  326503,
  2,
  14335103,
  '2012-12-19T19:24:31Z'),
 (30730953,
  42.3677364,
  -71.0218568,
  'wambag',
  326503,
  2,
  14335103,
  '2012-12-19T19:24:31Z'),
 (30730954,
  42.3676084,
  -71.0218168,
  'wambag',
  326503,
  2,
  14335103,
  '2012-12-19T19:24:31Z')]


## create sql table for nodes_tags

In [258]:
#create sql table for nodes_tags
#cur.execute('''DROP TABLE nodes_tags;''')
cur.execute('''CREATE TABLE nodes_tags (
    id INTEGER,
    key TEXT,
    value TEXT,
    type TEXT,
    FOREIGN KEY (id) REFERENCES nodes(id));''') 

with open ('nodes_tags.csv', 'rb') as table:
    dicts = csv.DictReader(table)
    to_db = ((i['id'], i['key'],i['value'],i['type']) for i in dicts)  
    cur.executemany("INSERT INTO nodes_tags (id, key,value,type) VALUES (?,?,?,?);", to_db)
con.commit()
#QUERY = '''PRAGMA table_info(nodes_tags)'''
QUERY = '''SELECT * FROM nodes_tags LIMIT 5;'''
rows = cur.execute(QUERY).fetchall()
#print(rows)
pprint.pprint(rows)

[(31419556, 'name', 'Firebrand Saints', 'regular'),
 (31419556, 'amenity', 'restaurant', 'regular'),
 (31419650,
  'url',
  'http://www.mbta.com/schedules_and_maps/subway/lines/stations/?stopId=12412',
  'regular'),
 (31419650, 'name', 'Kendall/MIT', 'regular'),
 (31419650, 'railway', 'subway_entrance', 'regular')]


## create sql table for ways


In [259]:
#create sql table for ways
#cur.execute('''DROP TABLE ways;''')
cur.execute('''CREATE TABLE ways (
    id INTEGER PRIMARY KEY NOT NULL,
    user TEXT,
    uid INTEGER,
    version TEXT,
    changeset INTEGER,
    timestamp TEXT);''') 

with open ('ways.csv', 'rb') as table:
    dicts = csv.DictReader(table)
    to_db = ((i['id'], i['user'],i['uid'],i['version'],i['changeset'],i['timestamp']) for i in dicts)  
    cur.executemany("INSERT INTO ways (id, user,uid,version,changeset,timestamp) VALUES (?,?,?,?,?,?);", to_db)
con.commit()
#QUERY = '''PRAGMA table_info(ways)'''
QUERY = '''SELECT * FROM ways LIMIT 5;'''
rows = cur.execute(QUERY).fetchall()
#print(rows)
pprint.pprint(rows)

[(4790718, 'JessAk71', 381909, '3', 6963203, '2011-01-14T00:06:42Z'),
 (4790735, 'claysmalley', 119881, '4', 9069918, '2011-08-19T22:12:46Z'),
 (4824114, 'effektz', 3983283, '20', 39519438, '2016-05-23T20:54:09Z'),
 (4824115, 'ezr2', 1733549, '13', 32296955, '2015-06-29T23:22:38Z'),
 (4824116, 'effektz', 3983283, '24', 39518613, '2016-05-23T20:16:38Z')]


## create sql table for ways_tags


In [260]:
#create sql table for ways_tags
#cur.execute('''DROP TABLE ways_tags;''')
cur.execute('''CREATE TABLE ways_tags (
    id INTEGER NOT NULL,
    key TEXT NOT NULL,
    value TEXT NOT NULL,
    type TEXT,
    FOREIGN KEY (id) REFERENCES ways(id));''') 

with open ('ways_tags.csv', 'rb') as table:
    dicts = csv.DictReader(table)
    to_db = ((i['id'], i['key'],i['value'],i['type']) for i in dicts)  
    cur.executemany("INSERT INTO ways_tags(id, key,value,type) VALUES (?,?,?,?);", to_db)
con.commit()
#QUERY = '''PRAGMA table_info(ways_tags)'''
QUERY = '''SELECT * FROM ways_tags LIMIT 5;'''
rows = cur.execute(QUERY).fetchall()
#print(rows)
pprint.pprint(rows)

[(4790718, 'foot', 'yes', 'regular'),
 (4790718, 'name', 'Skybridge to Hilton', 'regular'),
 (4790718, 'layer', '1', 'regular'),
 (4790718, 'bridge', 'yes', 'regular'),
 (4790718, 'highway', 'footway', 'regular')]


## create sql table for ways_nodes 


In [261]:
#create sql table for ways_nodes 
#cur.execute('''DROP TABLE ways_nodes;''')
cur.execute('''CREATE TABLE ways_nodes (
    id INTEGER NOT NULL,
    node_id INTEGER NOT NULL,
    position INTEGER NOT NULL,
    FOREIGN KEY (id) REFERENCES ways(id),
    FOREIGN KEY (node_id) REFERENCES nodes(id));''') 

with open ('ways_nodes.csv', 'rb') as table:
    dicts = csv.DictReader(table)
    to_db = ((i['id'], i['node_id'],i['position']) for i in dicts)  
    cur.executemany("INSERT INTO ways_nodes(id, node_id,position) VALUES (?,?,?);", to_db)
con.commit()
#QUERY = '''PRAGMA table_info(ways_nodes)'''
QUERY = '''SELECT * FROM ways_nodes LIMIT 5;'''
rows = cur.execute(QUERY).fetchall()
#print(rows)
pprint.pprint(rows)

[(4790718, 30730967, 0),
 (4790718, 30730968, 1),
 (4790718, 325383304, 2),
 (4790735, 1404985339, 0),
 (4790735, 1404985374, 1)]


In [263]:
import os
files_lst = ['nodes.csv','ways.csv','nodes_tags.csv','ways_tags.csv',
             'boston_massachusetts.osm', 'boston_massachusetts.db']
for i in files_lst: 
    print "file {!r} is {!s} MB".format(i,round(os.path.getsize(i)/(1024*1024.0),1))

file 'nodes.csv' is 151.8 MB
file 'ways.csv' is 20.0 MB
file 'nodes_tags.csv' is 16.9 MB
file 'ways_tags.csv' is 21.5 MB
file 'boston_massachusetts.osm' is 414.2 MB
file 'boston_massachusetts.db' is 234.6 MB


## Top Postal Codes

In [264]:
QUERY=('''SELECT tags.value, COUNT(*) as count 
        FROM (SELECT * FROM nodes_tags 
        UNION ALL 
        SELECT * FROM ways_tags) tags
        WHERE tags.key='postcode'
        GROUP BY tags.value
        ORDER BY count DESC;''')
rows = cur.execute(QUERY).fetchall()
#print(rows)
pprint.pprint(rows)

[('02139', 431),
 ('02135', 268),
 ('02130', 182),
 ('02134', 162),
 ('02474', 133),
 ('02144', 117),
 ('02138', 97),
 ('02114', 86),
 ('02143', 67),
 ('02145', 64),
 ('02215', 63),
 ('02116', 59),
 ('02169', 54),
 ('02150', 42),
 ('02446', 41),
 ('02142', 39),
 ('02210', 33),
 ('02472', 33),
 ('02155', 28),
 ('02467', 28),
 ('02128', 27),
 ('02149', 25),
 ('02111', 24),
 ('02140', 24),
 ('02132', 20),
 ('02141', 19),
 ('02110', 18),
 ('02108', 17),
 ('02445', 17),
 ('02115', 15),
 ('02127', 14),
 ('02109', 13),
 ('02131', 13),
 ('02118', 12),
 ('02186', 11),
 ('02124', 10),
 ('02125', 10),
 ('02129', 10),
 ('0', 9),
 ('02478', 9),
 ('02120', 8),
 ('02126', 8),
 ('02459', 8),
 ('02476', 8),
 ('02026', 7),
 ('02119', 7),
 ('02122', 7),
 ('02136', 7),
 ('02151', 7),
 ('02171', 5),
 ('02458', 5),
 ('02152', 4),
 ('02170', 4),
 ('02043', 3),
 ('02184', 3),
 ('02199', 3),
 ('02113', 2),
 ('02121', 2),
 ('01821', 1),
 ('01854', 1),
 ('01944', 1),
 ('02148', 1),
 ('02159', 1),
 ('02174', 1),


## Sort cities by count, descending

In [265]:
QUERY=('''SELECT tags.value, COUNT(*) as count 
FROM (SELECT * FROM nodes_tags UNION ALL 
      SELECT * FROM ways_tags) tags
WHERE tags.key LIKE '%city'
GROUP BY tags.value
ORDER BY count DESC;''')
rows = cur.execute(QUERY).fetchall()
#print(rows)
pprint.pprint(rows)


[('Boston', 949),
 ('Cambridge', 556),
 ('Malden', 413),
 ('1', 411),
 ('Arlington', 285),
 ('Somerville', 245),
 ('Jamaica Plain', 96),
 ('2', 92),
 ('Quincy', 55),
 ('15', 52),
 ('Chelsea', 43),
 ('Brookline', 40),
 ('Medford', 34),
 ('Weymouth', 34),
 ('4', 33),
 ('14', 32),
 ('19', 29),
 ('6', 28),
 ('Chestnut Hill', 21),
 ('West Roxbury', 21),
 ('Allston', 18),
 ('Watertown', 18),
 ('10', 17),
 ('3', 16),
 ('18', 15),
 ('Arlington, MA', 15),
 ('20', 14),
 ('17', 12),
 ('Charlestown', 12),
 ('Dorchester', 12),
 ('Brookline, MA', 11),
 ('Everett', 11),
 ('16', 10),
 ('24', 10),
 ('Belmont', 10),
 ('Brighton', 10),
 ('Newton', 10),
 ('Roslindale', 10),
 ('12', 9),
 ('13', 9),
 ('Boston, MA', 9),
 ('East Boston', 9),
 ('Milton', 9),
 ('Cambridge, MA', 8),
 ('40', 6),
 ('Arlington. MA', 6),
 ('Hingham', 6),
 ('Hyde Park', 6),
 ('Mattapan', 6),
 ('Revere', 6),
 ('11', 5),
 ('21', 5),
 ('25', 5),
 ('5', 5),
 ('50', 5),
 ('Dedham', 5),
 ('yes', 5),
 ('8', 4),
 ('Cambridge, Massachusetts',

## Number of nodes 

In [266]:
QUERY=('''SELECT COUNT(*) FROM nodes;''')
rows = cur.execute(QUERY).fetchall()
#print(rows)
pprint.pprint(rows)

[(1931442,)]


## Number of ways 

In [267]:
QUERY=('''SELECT COUNT(*) FROM ways;''')
rows = cur.execute(QUERY).fetchall()
#print(rows)
pprint.pprint(rows)

[(309066,)]


## Number of  unique users 

In [268]:
QUERY=('''SELECT COUNT(DISTINCT(e.uid))          
FROM (SELECT uid FROM nodes UNION ALL SELECT uid FROM ways) e;''')
rows = cur.execute(QUERY).fetchall()
#print(rows)
pprint.pprint(rows)

[(1295,)]


## Top 10 contributing users

In [269]:
QUERY=('''SELECT e.user, COUNT(*) as num
FROM (SELECT user FROM nodes UNION ALL SELECT user FROM ways) e
GROUP BY e.user
ORDER BY num DESC
LIMIT 10;''')
rows = cur.execute(QUERY).fetchall()
#print(rows)
pprint.pprint(rows)

[('crschmidt', 1202965),
 ('jremillard-massgis', 430112),
 ('OceanVortex', 92067),
 ('wambag', 80064),
 ('morganwahl', 69535),
 ('ryebread', 67063),
 ('MassGIS Import', 63277),
 ('ingalls_imports', 32461),
 ('Ahlzen', 27154),
 ('mapper999', 14967)]


## Number of users appearing only once (having 1 post)

In [270]:
QUERY=('''SELECT COUNT(*) 
FROM
    (SELECT e.user, COUNT(*) as num
     FROM (SELECT user FROM nodes UNION ALL SELECT user FROM ways) e
     GROUP BY e.user
     HAVING num=1)  u;''')
rows = cur.execute(QUERY).fetchall()
#print(rows)
pprint.pprint(rows)

[(355,)]


## Additional Data Exploration

## Top 10 appearing amenities



In [271]:
QUERY=('''SELECT value, COUNT(*) as num
FROM nodes_tags
WHERE key='amenity'
GROUP BY value
ORDER BY num DESC
LIMIT 10;''')
rows = cur.execute(QUERY).fetchall()
#print(rows)
pprint.pprint(rows)

[('bench', 1060),
 ('restaurant', 602),
 ('school', 509),
 ('place_of_worship', 287),
 ('library', 280),
 ('bicycle_parking', 273),
 ('cafe', 248),
 ('fast_food', 184),
 ('bicycle_rental', 138),
 ('post_box', 114)]


## Biggest religion (no surprise here)



In [272]:
QUERY=('''SELECT nodes_tags.value, COUNT(*) as num
FROM nodes_tags 
    JOIN (SELECT DISTINCT(id) FROM nodes_tags WHERE value='place_of_worship') i
    ON nodes_tags.id=i.id
WHERE nodes_tags.key='religion'
GROUP BY nodes_tags.value
ORDER BY num DESC
LIMIT 1;''')
rows = cur.execute(QUERY).fetchall()
#print(rows)
pprint.pprint(rows)

[('christian', 255)]


## Most popular cuisines



In [273]:
QUERY=('''SELECT nodes_tags.value, COUNT(*) as num
FROM nodes_tags 
    JOIN (SELECT DISTINCT(id) FROM nodes_tags WHERE value='restaurant') i
    ON nodes_tags.id=i.id
WHERE nodes_tags.key='cuisine'
GROUP BY nodes_tags.value
ORDER BY num DESC;''')
rows = cur.execute(QUERY).fetchall()
#print(rows)
pprint.pprint(rows)

[('pizza', 38),
 ('american', 35),
 ('italian', 31),
 ('chinese', 29),
 ('mexican', 27),
 ('indian', 21),
 ('thai', 19),
 ('asian', 13),
 ('japanese', 12),
 ('regional', 12),
 ('sandwich', 11),
 ('seafood', 9),
 ('ice_cream', 8),
 ('international', 7),
 ('sushi', 7),
 ('french', 5),
 ('ramen', 5),
 ('burger', 4),
 ('diner', 4),
 ('fish', 4),
 ('turkish', 4),
 ('mediterranean', 3),
 ('tapas', 3),
 ('vegetarian', 3),
 ('barbecue', 2),
 ('coffee_shop', 2),
 ('german', 2),
 ('greek', 2),
 ('kebab', 2),
 ('persian', 2),
 ('pub', 2),
 ('steak_house', 2),
 ('vietnamese', 2),
 ('Burmese', 1),
 ('Greek-American', 1),
 ('Israeli', 1),
 ('Italian panini, subs, pizza, calzone, zuppe', 1),
 ('Italian,_pizza,_pasta', 1),
 ('Modern American', 1),
 ('Scottish', 1),
 ('Tapas', 1),
 ('afghan', 1),
 ('argentinian', 1),
 ('bagel', 1),
 ('brazilian', 1),
 ('creole', 1),
 ('crepe', 1),
 ('deli', 1),
 ('eritrean', 1),
 ('ethiopian', 1),
 ('falafel', 1),
 ('french_cambodian', 1),
 ('frozen_yogurt', 1),
 ('kor

## Conclusion

# =========================