# Open Street Map
## PKENNEDY
### First things first, let's import some initial relevant libraries and set our path

In [1]:
from xml.etree import cElementTree as ET  # Use cElementTree or lxml if too slow
from collections import defaultdict
import pprint
import re
DATA_DIR = '/Users/patrickkennedy/Downloads'
atx_osm = DATA_DIR + '/austin_texas.osm'


### For testing purposes, let's write every 5th element to a smaller file (sample.osm) that we can audit and clean

In [36]:
#For the submission of a smaller data set (~8mb)

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#NOTE: code provided in instructions

OSM_FILE = atx_osm
SAMPLE_FILE = "sample.osm"


def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Lazily yield every fully parsed element whose tag is listed in ``tags``.

    The document root is captured from the first parser event and cleared
    after each yielded element, so already-consumed elements are dropped
    from the in-memory tree.

    Reference:
    http://stackoverflow.com/questions/3095434/inserting-newlines-in-xml-file-generated-via-xml-etree-elementtree-in-python
    """
    events = ET.iterparse(osm_file, events=('start', 'end'))
    # The very first event is the 'start' of the root element.
    _, root = next(events)
    for kind, node in events:
        if kind != 'end' or node.tag not in tags:
            continue
        yield node
        # Runs when the consumer asks for the next element.
        root.clear()


with open(SAMPLE_FILE, 'wb') as output:
    # XML prolog followed by the opening root tag.
    output.write('<?xml version="1.0" encoding="UTF-8"?>\n'
                 '<osm>\n  ')

    # Keep top level elements 0, 5, 10, ... and skip everything in between.
    for i, element in enumerate(get_element(OSM_FILE)):
        if i % 5:
            continue
        output.write(ET.tostring(element, encoding='utf-8'))

    output.write('</osm>')


### Great, now let's dig in...

In [3]:
#Given street names may offer varying standards, let's set some initial definitions...
#we will use both the expected list and mapping dictionary to identify if street names follow the expected conventions

# Street types (and directional suffixes) that are already in the desired form.
expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road",
            "Trail", "Parkway", "Commons", "Cove", "Highway", "North", "East", "West", "South"]

# Abbreviation / lower-case variant -> canonical spelling.
# Keys are matched against whole space-separated tokens of a street name.
mapping = { "St": "Street",
            "St.": "Street",
            "Ave": "Avenue",
            "Ave.": "Avenue",
            "Rd." : "Road",
            "Rd" : "Road",
            "Blvd" : "Boulevard",
            "Blvd." : "Boulevard",
            "Ln." : "Lane",
            "Ln" : "Lane",
            "Cv" : "Cove",
            "Cv." : "Cove",
            "Dr" : "Drive",
            # "Dr." must expand to the full word, just like "Dr" does.
            "Dr." : "Drive",
            "Ct" : "Court",
            "Ct." : "Court",
            "Ovlk" : "Overlook",
            "Ovlk." : "Overlook",
            "Pkwy" : "Parkway",
            "Pkwy." : "Parkway",
            "Pl" : "Place",
            "Pl." : "Place",
            "Tr": "Trail",
            "Trl": "Trail",
            "Tr.": "Trail",
            "Trl.": "Trail",
            "street" : "Street",
            "cove" : "Cove",
            "lane" : "Lane",
            "court" : "Court",
            "pass" : "Pass",
            "HWY" : "Highway",
            "Hwy" : "Highway",
            "N" : "North",
            "N." : "North",
            "E" : "East",
            "E." : "East",
            "W" : "West",
            "W." : "West",
            "S" : "South",
            "S." : "South",
            "US" : "U.S."
            }


In [4]:
#Auditing street name data

#use regex to ID the street name
# Last whitespace-delimited token of the name, with an optional trailing dot.
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)


def audit_street_type(street_types, street_name):
    """Record ``street_name`` under its trailing token when that token is unexpected.

    ``street_types`` maps a trailing token to the set of street names that
    end with it; it is updated in place. Names whose trailing token is
    already listed in ``expected`` (or that have no match) are ignored.
    """
    match = street_type_re.search(street_name)
    if not match:
        return
    suffix = match.group()
    if suffix in expected:
        return
    street_types[suffix].add(street_name)

#if the attribute key matches 'addr:street', it is a street
def is_street_name(elem):
    """Return True when the element's ``k`` attribute is exactly ``addr:street``."""
    key = elem.attrib['k']
    return key == "addr:street"

#open the osm file and iterate using event based parsing
#navigate to the street, and audit, returning the street_types set
def audit(osmfile):
    """Collect the street names of an OSM file that end in an unexpected type.

    Parameters:
        osmfile: path of the OSM XML file to scan.

    Returns:
        A defaultdict(set) filled by ``audit_street_type``: trailing token
        of the street name -> set of full street names ending with it.
    """
    street_types = defaultdict(set)
    # Binary mode leaves decoding to the XML parser; the ``with`` block
    # guarantees the handle is closed even if parsing fails.
    with open(osmfile, "rb") as osm_file:
        # Children are only guaranteed to be attached to their parent once
        # the parent's "end" event fires, so audit on "end" rather than "start".
        for event, elem in ET.iterparse(osm_file, events=("end",)):
            if elem.tag == "node" or elem.tag == "way":
                for tag in elem.iter("tag"):
                    if is_street_name(tag):
                        audit_street_type(street_types, tag.attrib['v'])

    return street_types

#for each street name, check each element and replace each split element
#of the street name with a value in the mapping dictionary if the mapping key matches the split element
def update_name(name, mapping):
    """Return ``name`` with every space-separated token found in ``mapping`` replaced.

    Tokens are produced by splitting on single spaces, so repeated spaces
    are preserved; tokens that are not keys of ``mapping`` are kept as is.
    """
    return " ".join(mapping.get(token, token) for token in name.split(" "))

#test to see if the functions above work and print out the initial streets with their corrected versions
def test():
    """Audit the full Austin file and print each flagged street name next to its fix."""
    flagged = audit(atx_osm)
    for names in flagged.values():
        for original in names:
            print("%s => %s" % (original, update_name(original, mapping)))

if __name__ == '__main__':
    test()

Merimac => Merimac
Clara Van => Clara Van
Capri => Capri
Chelsea Moor => Chelsea Moor
Royal Birkdale Ovlk => Royal Birkdale Overlook
Lions Lair => Lions Lair
Apache => Apache
Farm-to-Market Road 812 => Farm-to-Market Road 812
Bee Cave Road Suite 163 => Bee Cave Road Suite 163
Adventurer => Adventurer
Affirmed => Affirmed
West 35th Street Cutoff => West 35th Street Cutoff
Ferguson Cutoff => Ferguson Cutoff
House Wren => House Wren
N I-35 Suite 298 => North I-35 Suite 298
Melody => Melody
East Highway 290 => East Highway 290
Highway 290 => Highway 290
W. Highway 290 => West Highway 290
East Hwy 290 => East Highway 290
C R 290 => C R 290
W US Highway 290 => West U.S. Highway 290
West Highway 290 => West Highway 290
W Hwy 290 => West Highway 290
U.S. 290 => U.S. 290
E Hwy 290 => East Highway 290
US Highway 290 => U.S. Highway 290
County Road 290 => County Road 290
W Highway 290 => West Highway 290
W HWY 290 => West Highway 290
US 290 => U.S. 290
Pony Chase => Pony Chase
Pecan Chase => Peca

In [5]:
#auditing postcode data

#regex to ID if there are letters in the postcode
postcode_letter_re = re.compile('[a-zA-Z]')

#open the osm file, iterate by events, find postcode elem
#if appropriate attribute is a postcode then check to see if either:
#the post code contains letters or if the post code is longer than 5 characters
#if either condition is true, add it to a postcode_types set and return the set
def audit(osmfile):
    """Collect suspicious ``addr:postcode`` values from an OSM file.

    Parameters:
        osmfile: path of the OSM XML file to scan.

    Returns:
        A defaultdict(set). Values containing a letter are grouped under
        the first letter found; letter-free values longer than 5
        characters are grouped under their length (an int).
    """
    postcode_types = defaultdict(set)
    # Binary mode leaves decoding to the XML parser; ``with`` closes the
    # handle even if parsing fails.
    with open(osmfile, "rb") as osm_file:
        # A parent's children are only guaranteed to be present at its
        # "end" event, so inspect elements there instead of at "start".
        for event, elem in ET.iterparse(osm_file, events=("end",)):
            if elem.tag == "node" or elem.tag == "way":
                for tag in elem.iter("tag"):
                    if tag.attrib['k'] == 'addr:postcode':
                        value = tag.attrib['v']
                        m = postcode_letter_re.search(value)
                        if m:
                            postcode_types[m.group()].add(value)
                        elif len(value) > 5:
                            postcode_types[len(value)].add(value)

    return postcode_types


#take the offending code, split it, ID if letters need to be truncated from front
#or if the +4 needs to be stripped from back
#if there is only text in the zip field, return None
#return only the 5 digit zip
def update_code(code):
    """Reduce a raw postcode string to a bare ZIP where possible.

    The value is split on spaces and dashes. A single token containing a
    letter yields None; a leading token containing a letter is skipped in
    favour of the second token; a final token shorter than 5 characters is
    treated as a +4 extension and dropped. Anything else is returned as is.
    """
    pieces = re.split(' |-', code)

    if len(pieces) <= 1:
        # Only one token: it is either the ZIP itself or plain text.
        if postcode_letter_re.search(pieces[0]):
            return None
        return code

    if postcode_letter_re.search(pieces[0]):
        # Leading text such as a state abbreviation.
        code = pieces[1]
    if len(pieces[-1]) < 5:
        # Short trailing token, e.g. the +4 part.
        code = pieces[1] if len(pieces) > 2 else pieces[0]
    return code

#testing the code and as with the street names, show original and bettered versions
if __name__ == '__main__':
    # Show every flagged postcode alongside its cleaned form.
    pc_types = audit(atx_osm)
    for pc_type, ways in pc_types.items():
        for code in ways:
            better_code = update_code(code)
            print("%s => %s" % (code, better_code))
        


78640-4520 => 78640
76574-4649 => 76574
78754-5701 => 78754
78724-1199 => 78724
78704-7205 => 78704
78758-7008 => 78758
78728-1275 => 78728
78753-4150 => 78753
78758-7013 => 78758
78640-6137 => 78640
78704-5639 => 78704
TX 78745 => 78745
TX 78758 => 78758
TX 78735 => 78735
TX 78613 => 78613
TX 78759-3504 => 78759
TX 78724 => 78724
TX 78728 => 78728
Texas => None
tx => None


In [54]:
#what other data to audit? let's see if phones have differing standards...
#yep, they sure do!


#auditing phone data

#check if the phone number has text
phone_letter_re = re.compile('[a-zA-Z]')

#as with the postcode audit... open osm, iterate event-wise, find phone elem,
#if phone has letters or if phone is essentially not blank, add this to phone_types set
#and return the set
def audit(osmfile):
    """Collect every non-empty ``phone`` value from an OSM file.

    Parameters:
        osmfile: path of the OSM XML file to scan.

    Returns:
        A defaultdict(set). Values containing a letter are grouped under
        the first letter found; every other non-empty value is stored
        under itself.
    """
    phone_types = defaultdict(set)
    # Binary mode leaves decoding to the XML parser; ``with`` closes the
    # handle even if parsing fails.
    with open(osmfile, "rb") as osm_file:
        # A parent's children are only guaranteed to be present at its
        # "end" event, so inspect elements there instead of at "start".
        for event, elem in ET.iterparse(osm_file, events=("end",)):
            if elem.tag == "node" or elem.tag == "way":
                for tag in elem.iter("tag"):
                    if tag.attrib['k'] == 'phone':
                        value = tag.attrib['v']
                        m = phone_letter_re.search(value)
                        if m:
                            phone_types[m.group()].add(value)
                        elif len(value) > 0:
                            phone_types[value].add(value)

    return phone_types


#helper function that formats the phone number
#takes a phone number split by digit, assembles into basic
#pieces: country code, area code, first 3 and last 4
#then it returns the assembled list of length 4
def assemble_number(data):
    """Group a sequence of digit characters into the four parts of a phone number.

    Parameters:
        data: sequence of one-character strings, country code first.

    Returns:
        [country code, next 3 digits, next 3 digits, all remaining digits].
    """
    return [
        data[0],
        "".join(data[1:4]),
        "".join(data[4:7]),
        "".join(data[7:]),
    ]


#update previously unstandardized phone numbers into a single standard form
def update_code(code):
    """Standardize a raw phone value to the form ``1-XXX-XXX-XXXX``.

    All non-digit characters are discarded. The digits are then consumed
    one number at a time: a number that starts with '1' takes 11 digits,
    any other number takes 10 digits and gets the country code '1'
    prepended.

    Parameters:
        code: raw phone string; may hold several numbers back to back.

    Returns:
        - a single formatted string when exactly one number is found,
        - a list of formatted strings when several numbers are found,
        - None when the value has no digits or the digits cannot be split
          into complete numbers (too short, or leftover digits), because
          we cannot tell which digit was mistyped or dropped.
    """
    digits = [ch for ch in code.strip() if ch.isdigit()]
    if not digits:
        # Nothing to format (e.g. free text); avoids indexing an empty list.
        return None

    numbers = []
    start = 0
    while start < len(digits):
        if digits[start] == '1':
            # Country code already present.
            chunk = digits[start:start + 11]
            start += 11
        else:
            chunk = ['1'] + digits[start:start + 10]
            start += 10
        if len(chunk) != 11:
            # Incomplete number: refuse to guess rather than emit a
            # truncated result.
            return None
        numbers.append("-".join(assemble_number(chunk)))

    if len(numbers) == 1:
        return numbers[0]
    return numbers
    
#as with postcodes, print the offending number and its bettered self
if __name__ == '__main__':
    # Show every audited phone value alongside its standardized form.
    phone_types = audit(atx_osm)
    for phone_type, ways in phone_types.items():
        for phone in ways:
            better_code = update_code(phone)
            print("%s => %s" % (phone, better_code))
        


+1 512 585 6703 => 1-512-585-6703
(512) 828-5500 => 1-512-828-5500
(512) 334-4684 => 1-512-334-4684
(512) 891-8906 => 1-512-891-8906
1 512 268 3232 => 1-512-268-3232
512-447-1447 => 1-512-447-1447
(512) 502-8445 => 1-512-502-8445
+1-512-246-0460 => 1-512-246-0460
(512) 285-2867 => 1-512-285-2867
+1 512 295 2921 => 1-512-295-2921
+1 512 479 0485 => 1-512-479-0485
(512) 528-0150 => 1-512-528-0150
+1 512 295 7818 => 1-512-295-7818
512-791-5961 => 1-512-791-5961
(512) 251-4553 => 1-512-251-4553
(512) 652-1200 => 1-512-652-1200
+1-512-973-8143 => 1-512-973-8143
+1 512 670 5174 => 1-512-670-5174
(512) 459-6513 => 1-512-459-6513
512-386-3400 => 1-512-386-3400
(512) 759-4700 => 1-512-759-4700
+1 512 295 5004 => 1-512-295-5004
(512) 442-2354 => 1-512-442-2354
(512) 451-2306 => 1-512-451-2306
512.386.1295 => 1-512-386-1295
512-280-1201 => 1-512-280-1201
+1 512 256 0666 => 1-512-256-0666
+1 512 310 9024 => 1-512-310-9024
512-343-8020 => 1-512-343-8020
+1-512-666-5286;+1-855-444-8301 => ['1-512-66

### Ok we are done with 3 ways to clean this particular dataset, let's do it for real this time

In [56]:
#using the structure of quiz 6.5 to form the shape of the json doc

import codecs
import json

CREATED = [ "version", "changeset", "timestamp", "user", "uid"]

def _standardize_postcode(code):
    """Reduce a raw ``addr:postcode`` value to a bare ZIP where possible.

    Splits on spaces and dashes. A single token containing a letter gives
    None; a leading token containing a letter is skipped in favour of the
    second token; a final token shorter than 5 characters is treated as a
    +4 extension and dropped. Anything else is returned unchanged.
    """
    parts = re.split(' |-', code)
    if len(parts) == 1:
        return None if postcode_letter_re.search(parts[0]) else code
    if postcode_letter_re.search(parts[0]):
        code = parts[1]
    if len(parts[-1]) < 5:
        code = parts[1] if len(parts) > 2 else parts[0]
    return code


def _standardize_phone(raw):
    """Format a raw phone value as ``1-XXX-XXX-XXXX``.

    Returns one string for a single number, a list of strings for several
    numbers, and None when the digits cannot be split into complete
    numbers (no digits, too few digits, or leftover digits).
    """
    digits = [ch for ch in raw.strip() if ch.isdigit()]
    if not digits:
        return None

    numbers = []
    start = 0
    while start < len(digits):
        if digits[start] == '1':
            # Country code already present: the number spans 11 digits.
            chunk = digits[start:start + 11]
            start += 11
        else:
            chunk = ['1'] + digits[start:start + 10]
            start += 10
        if len(chunk) != 11:
            return None
        numbers.append("-".join(assemble_number(chunk)))

    return numbers[0] if len(numbers) == 1 else numbers


def shape_element(element):
    """Turn a ``node`` or ``way`` XML element into a JSON-ready dictionary.

    Parameters:
        element: an ElementTree element produced while parsing the OSM file.

    Returns:
        None for any element that is not a ``node`` or ``way``; otherwise a
        dict with:
          - ``id``, ``type`` and (when present) ``visible`` from the
            element's attributes,
          - ``created``: the attributes listed in ``CREATED``,
          - ``pos``: ``[lat, lon]`` as floats, or ``[]`` when the element
            has no usable coordinates,
          - ``address``: cleaned ``housenumber`` / ``street`` / ``postcode``
            taken from ``addr:*`` child tags,
          - ``phone``: the standardized phone value from the ``phone``
            child tag (None when it cannot be standardized),
          - every other child tag with fewer than two colons in its key,
            stored as ``key: value``.

    Raises:
        KeyError: if the element lacks ``id`` or any attribute in ``CREATED``.
    """
    #only looking at node and way tags
    if element.tag != "node" and element.tag != "way":
        return None

    node = {
        'id': element.attrib['id'],
        'type': element.tag,
        'pos': [],
        'created': dict((field, element.attrib[field]) for field in CREATED),
    }
    if 'visible' in element.attrib:
        node['visible'] = element.attrib['visible']

    address = {}
    for child in element:
        key = child.attrib.get('k')
        # Skip children without a key (e.g. <nd>) and keys with 2+ colons.
        if key is None or key.count(':') >= 2:
            continue

        if key.startswith("addr:"):
            field = key.replace("addr:", "")
            if field == 'housenumber':
                address['housenumber'] = child.attrib['v']
            elif field == 'street':
                street = child.attrib['v']
                # Only names with a recognisable trailing token are rewritten.
                if street_type_re.search(street):
                    street = update_name(street, mapping)
                address['street'] = street
            elif field == 'postcode':
                address['postcode'] = _standardize_postcode(child.attrib['v'])
            node['address'] = address
        elif key == 'phone':
            # Phone numbers live in child tags, not in the element's own
            # attributes, so this is where they have to be standardized.
            node['phone'] = _standardize_phone(child.attrib['v'])
        else:
            # NOTE(review): a tag keyed 'id', 'type', 'pos', 'created' or
            # 'address' overwrites the field set above - confirm intended.
            node[key] = child.attrib['v']

    # Ways carry no lat/lon; leave 'pos' empty unless both values parse.
    try:
        lat = float(element.attrib['lat'])
        lon = float(element.attrib['lon'])
    except (KeyError, ValueError):
        pass
    else:
        node['pos'].extend([lat, lon])

    return node


def process_map(file_in, pretty = False):
    """Shape every element of ``file_in`` and write the results as JSON lines.

    The output file is ``<file_in>.json`` with one JSON document per shaped
    element (indented by 2 when ``pretty`` is true). Elements for which
    ``shape_element`` returns a falsy value are skipped.

    Returns the list of shaped documents.
    """
    out_path = "{0}.json".format(file_in)
    indent = 2 if pretty else None
    shaped = []
    with codecs.open(out_path, "w") as sink:
        for _event, xml_elem in ET.iterparse(file_in):
            doc = shape_element(xml_elem)
            if not doc:
                continue
            shaped.append(doc)
            sink.write(json.dumps(doc, indent=indent) + "\n")
    return shaped

def test():
    """Convert the full Austin extract to JSON (compact output).

    pretty=True would add whitespace and make the already large output
    file significantly bigger, so it is left off here.
    """
    process_map(atx_osm, False)


if __name__ == "__main__":
    test()

### Now I have a json file... ho, ho, ho 
###### (rip Alan Rickman)

In [69]:
#loading json into mongodb instance

import json

if __name__ == "__main__":

    from pymongo import MongoClient

    # Default MongoClient() connection; the database is named 'atx'.
    client = MongoClient()
    db = client.atx

    # The file holds one JSON document per line.
    with open(DATA_DIR + '/austin_texas.osm.json') as f:
        data = [json.loads(line) for line in f]
       
        


In [72]:
#insert the data into the instance
# Collection.insert() is deprecated in PyMongo 3 (and removed in 4);
# insert_many() is its replacement for a list of documents, and
# .inserted_ids is the list of new _id values that insert() used to return.
db.atx.insert_many(data).inserted_ids

  if __name__ == '__main__':


[ObjectId('569dc100c8ac4e03274ae65e'),
 ObjectId('569dc100c8ac4e03274ae65f'),
 ObjectId('569dc100c8ac4e03274ae660'),
 ObjectId('569dc100c8ac4e03274ae661'),
 ObjectId('569dc100c8ac4e03274ae662'),
 ObjectId('569dc100c8ac4e03274ae663'),
 ObjectId('569dc100c8ac4e03274ae664'),
 ObjectId('569dc100c8ac4e03274ae665'),
 ObjectId('569dc100c8ac4e03274ae666'),
 ObjectId('569dc100c8ac4e03274ae667'),
 ObjectId('569dc100c8ac4e03274ae668'),
 ObjectId('569dc100c8ac4e03274ae669'),
 ObjectId('569dc100c8ac4e03274ae66a'),
 ObjectId('569dc100c8ac4e03274ae66b'),
 ObjectId('569dc100c8ac4e03274ae66c'),
 ObjectId('569dc100c8ac4e03274ae66d'),
 ObjectId('569dc100c8ac4e03274ae66e'),
 ObjectId('569dc100c8ac4e03274ae66f'),
 ObjectId('569dc100c8ac4e03274ae670'),
 ObjectId('569dc100c8ac4e03274ae671'),
 ObjectId('569dc100c8ac4e03274ae672'),
 ObjectId('569dc100c8ac4e03274ae673'),
 ObjectId('569dc100c8ac4e03274ae674'),
 ObjectId('569dc100c8ac4e03274ae675'),
 ObjectId('569dc100c8ac4e03274ae676'),
 ObjectId('569dc100c8ac4e

In [73]:
#testing it out
# Fetch a single stored document as a sanity check of the import.
first_doc = db.atx.find_one()
print(first_doc)

{u'created': {u'changeset': u'8497118', u'user': u'Tylan', u'version': u'15', u'uid': u'388279', u'timestamp': u'2011-06-20T18:36:15Z'}, u'_id': ObjectId('569dc100c8ac4e03274ae65e'), u'type': u'node', u'id': u'26546004', u'pos': [30.4695355, -97.7972587]}


In [102]:
# number of docs
print(db.atx.find().count())
# number of nodes
print(db.atx.find({"type":"node"}).count())
# number of ways
print(db.atx.find({"type":"way"}).count())
# number of distinct users
print(len(db.atx.distinct("created.user")))


1031

In [113]:
def make_pipeline(name="least_active_users"):
    """Return one of the aggregation pipelines used in the report.

    Parameters:
        name: which pipeline to build. Defaults to ``"least_active_users"``,
            the pipeline this function has always returned. Available names:
              - ``"top_contributor"``: user with the most documents.
              - ``"smallest_contribution_count"``: groups users by how many
                documents they created and returns the lowest count
                together with the number of users sharing it.
              - ``"top_amenities"``: the 10 most frequent ``amenity`` values.
              - ``"top_religion"``: most frequent ``religion`` among
                ``place_of_worship`` amenities.
              - ``"top_cuisines"``: the 3 most frequent ``cuisine`` values
                among ``restaurant`` amenities.
              - ``"least_active_users"``: 100 users sorted by ascending
                document count.

    Returns:
        A list of aggregation stage dictionaries.

    Raises:
        ValueError: if ``name`` is not one of the names listed above.
    """
    # Counting documents per contributing user is shared by several pipelines.
    per_user = {"$group": {"_id": "$created.user",
                           "count": {"$sum": 1}}}

    pipelines = {
        "top_contributor": [
            per_user,
            {"$sort": {"count": -1}},
            {"$limit": 1}],

        "smallest_contribution_count": [
            per_user,
            {"$group": {"_id": "$count",
                        "num_users": {"$sum": 1}}},
            {"$sort": {"_id": 1}},
            {"$limit": 1}],

        "top_amenities": [
            {"$match": {"amenity": {"$exists": 1}}},
            {"$group": {"_id": "$amenity",
                        "count": {"$sum": 1}}},
            {"$sort": {"count": -1}},
            {"$limit": 10}],

        # Matching on a concrete amenity value already implies the field
        # exists, so no separate "$exists" test is needed (a dict cannot
        # hold the "amenity" key twice anyway).
        "top_religion": [
            {"$match": {"amenity": "place_of_worship"}},
            {"$group": {"_id": "$religion",
                        "count": {"$sum": 1}}},
            {"$sort": {"count": -1}},
            {"$limit": 1}],

        "top_cuisines": [
            {"$match": {"amenity": "restaurant"}},
            {"$group": {"_id": "$cuisine",
                        "count": {"$sum": 1}}},
            {"$sort": {"count": -1}},
            {"$limit": 3}],

        "least_active_users": [
            per_user,
            {"$sort": {"count": 1}},
            {"$limit": 100}],
    }

    if name not in pipelines:
        raise ValueError("unknown pipeline %r; choose one of %s"
                         % (name, ", ".join(sorted(pipelines))))
    return pipelines[name]

if __name__ == '__main__':
    import pprint
    pipeline = make_pipeline()
    result = list(db.atx.aggregate(pipeline))
    # Short result sets are shown in full, longer ones are cut to 100 entries.
    pprint.pprint(result if len(result) < 150 else result[:100])

[{u'_id': u'justcaldwell', u'count': 1},
 {u'_id': u'jarcher', u'count': 1},
 {u'_id': u'mrw6060', u'count': 1},
 {u'_id': u'Jeff Yutzler', u'count': 1},
 {u'_id': u'Red Stella', u'count': 1},
 {u'_id': u'coinfaq', u'count': 1},
 {u'_id': u'RobCo', u'count': 1},
 {u'_id': u'Hartmut Holzgraefe', u'count': 1},
 {u'_id': u'OpenBrian', u'count': 1},
 {u'_id': u'geojwh', u'count': 1},
 {u'_id': u'mhm44', u'count': 1},
 {u'_id': u'taranis83', u'count': 1},
 {u'_id': u'adi_z', u'count': 1},
 {u'_id': u'wieland', u'count': 1},
 {u'_id': u'jacmin', u'count': 1},
 {u'_id': u'marook', u'count': 1},
 {u'_id': u'ahmef', u'count': 1},
 {u'_id': u'Pete Trujillo', u'count': 1},
 {u'_id': u'mp2526', u'count': 1},
 {u'_id': u'Robert Butler', u'count': 1},
 {u'_id': u'wolfie78', u'count': 1},
 {u'_id': u'Davio', u'count': 1},
 {u'_id': u'BryanSK', u'count': 1},
 {u'_id': u'manishchacko', u'count': 1},
 {u'_id': u'joshparnham', u'count': 1},
 {u'_id': u'livingeasy', u'count': 1},
 {u'_id': u'Chris Baines'

In [110]:
#some math for stats included on the file
# Sum of eight counts quoted in the report.
# NOTE(review): presumably per-user document counts of the top contributors - confirm.
2747363+1302500+941550+354652+301759+159464+157899+48236

6013423

In [111]:
# Share of 6319385 covered by the eight-count sum 6013423; the float
# divisor forces true division under Python 2.
# NOTE(review): 6319385 is presumably the total document count - confirm.
6013423/6319385.0

0.9515835797312555

In [114]:
# 100 expressed as a fraction of the same 6319385 total (float divisor
# forces true division under Python 2).
100/6319385.0

1.5824324677163996e-05

In [None]:
#Lesson 6 Quizzes
#6.1



#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Your task is to use the iterative parsing to process the map file and
find out not only what tags are there, but also how many, to get the
feeling on how much of which data you can expect to have in the map.
Fill out the count_tags function. It should return a dictionary with the 
tag name as the key and number of times this tag can be encountered in 
the map as value.

Note that your code will be tested with a different data file than the 'example.osm'
"""
import xml.etree.cElementTree as ET
import pprint

def count_tags(filename):
    """Return a plain dict mapping each element tag name to how often it occurs.

    The file is parsed iteratively; every element is counted once, when
    the parser reports it.
    """
    tally = {}
    for _event, elem in ET.iterparse(filename):
        tally[elem.tag] = tally.get(elem.tag, 0) + 1
    return tally

def test():
    """Check count_tags against the known tag counts of example.osm."""
    known_counts = {
        'bounds': 1,
        'member': 3,
        'nd': 4,
        'node': 20,
        'osm': 1,
        'relation': 1,
        'tag': 7,
        'way': 1,
    }

    tags = count_tags('example.osm')
    pprint.pprint(tags)
    assert tags == known_counts



if __name__ == "__main__":
    test()


In [None]:
#6.2



#!/usr/bin/env python
# -*- coding: utf-8 -*-
import xml.etree.cElementTree as ET
import pprint
import re
"""
Your task is to explore the data a bit more.
Before you process the data and add it into MongoDB, you should check the "k"
value for each "<tag>" and see if they can be valid keys in MongoDB, as well as
see if there are any other potential problems.

We have provided you with 3 regular expressions to check for certain patterns
in the tags. As we saw in the quiz earlier, we would like to change the data
model and expand the "addr:street" type of keys to a dictionary like this:
{"address": {"street": "Some value"}}
So, we have to see if we have such tags, and if we have any tags with
problematic characters.

Please complete the function 'key_type', such that we have a count of each of
four tag categories in a dictionary:
  "lower", for tags that contain only lowercase letters and are valid,
  "lower_colon", for otherwise valid tags with a colon in their names,
  "problemchars", for tags with problematic characters, and
  "other", for other tags that do not fall into the other three categories.
See the 'process_map' and 'test' functions for examples of the expected format.
"""


# Keys made only of lowercase letters and underscores.
lower = re.compile(r'^([a-z]|_)*$')
# Same, with exactly one colon separating two such parts.
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
# Any single character that is problematic in a key.
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')


def key_type(element, keys):
    """Classify the ``k`` attribute of a ``<tag>`` element and bump its counter.

    Parameters:
        element: an ElementTree element; anything other than ``tag`` is ignored.
        keys: dict with integer counters under ``lower``, ``lower_colon``,
            ``problemchars`` and ``other``; updated in place.

    Returns:
        The same ``keys`` dict.
    """
    if element.tag != "tag":
        return keys

    key = element.attrib['k']
    if lower.match(key):
        keys['lower'] += 1
    elif lower_colon.match(key):
        keys['lower_colon'] += 1
    # search(), not match(): the pattern is unanchored and a problematic
    # character can appear anywhere in the key, not just at its start.
    elif problemchars.search(key):
        keys['problemchars'] += 1
    else:
        keys['other'] += 1

    return keys



def process_map(filename):
    """Parse ``filename`` and return the four key-category counters filled by key_type."""
    counts = dict.fromkeys(("lower", "lower_colon", "problemchars", "other"), 0)
    for _event, elem in ET.iterparse(filename):
        counts = key_type(elem, counts)
    return counts



def test():
    """Check the key-category counts for example.osm.

    Another file such as 'map.osm' can be used for exploration, but the
    assertion below only holds for example.osm. This function is only used
    for the test run; submissions are checked against a different dataset.
    """
    known_counts = {'lower': 5, 'lower_colon': 0, 'other': 1, 'problemchars': 1}
    keys = process_map('example.osm')
    pprint.pprint(keys)
    assert keys == known_counts


if __name__ == "__main__":
    test()

In [None]:
#6.3



#!/usr/bin/env python
# -*- coding: utf-8 -*-
import xml.etree.cElementTree as ET
import pprint
import re
"""
Your task is to explore the data a bit more.
The first task is a fun one - find out how many unique users
have contributed to the map in this particular area!

The function process_map should return a set of unique user IDs ("uid")
"""

def get_user(element):
    """Unfilled placeholder from the quiz template: ignores ``element`` and returns None."""
    return None


def process_map(filename):
    """Return the set of distinct ``uid`` values found in an OSM file.

    Only ``node``, ``way`` and ``relation`` elements are inspected. Each
    element is looked at once, when the parser reports it, instead of
    re-walking its whole subtree on every event (which re-scanned the
    entire document when the root element closed).

    Raises:
        KeyError: if such an element has no ``uid`` attribute.
    """
    users = set()
    for _, element in ET.iterparse(filename):
        if element.tag in ('node', 'way', 'relation'):
            users.add(element.attrib['uid'])

    return users


def test():
    """Check that example.osm was edited by exactly six distinct users."""
    users = process_map('example.osm')
    pprint.pprint(users)
    user_total = len(users)
    assert user_total == 6



if __name__ == "__main__":
    test()

In [None]:
#6.4



"""
Your task in this exercise has two steps:

- audit the OSMFILE and change the variable 'mapping' to reflect the changes needed to fix 
    the unexpected street types to the appropriate ones in the expected list.
    You have to add mappings only for the actual problems you find in this OSMFILE,
    not a generalized solution, since that may and will depend on the particular area you are auditing.
- write the update_name function, to actually fix the street name.
    The function takes a string with street name as an argument and should return the fixed name
    We have provided a simple test so that you see what exactly is expected
"""
import xml.etree.cElementTree as ET
from collections import defaultdict
import re
import pprint

# File audited by this quiz cell.
OSMFILE = "example.osm"

# Last whitespace-delimited token of a street name, optional trailing dot.
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)


# Street types that are already spelled the way we want them.
expected = [
    "Street", "Avenue", "Boulevard", "Drive",
    "Court", "Place", "Square", "Lane",
    "Road", "Trail", "Parkway", "Commons",
]

# UPDATE THIS VARIABLE
# Abbreviated street type -> full spelling.
mapping = {
    "St": "Street", "St.": "Street",
    "Ave": "Avenue", "Ave.": "Avenue",
    "Rd.": "Road", "Rd": "Road",
}


def audit_street_type(street_types, street_name):
    """Add ``street_name`` to ``street_types`` under its trailing token if that token is unexpected.

    Nothing happens when the name has no trailing token or when the token
    is listed in ``expected``. ``street_types`` is modified in place.
    """
    found = street_type_re.search(street_name)
    if found is None:
        return
    ending = found.group()
    if ending in expected:
        return
    street_types[ending].add(street_name)


def is_street_name(elem):
    """Tell whether ``elem`` is the tag that carries a street name (k == 'addr:street')."""
    tag_key = elem.attrib['k']
    return tag_key == "addr:street"


def audit(osmfile):
    """Collect street names with an unexpected street type from an OSM file.

    Parameters:
        osmfile: path of the OSM XML file to scan.

    Returns:
        A defaultdict(set) filled by ``audit_street_type``: trailing token
        of the street name -> set of full street names ending with it.
    """
    street_types = defaultdict(set)
    # Binary mode leaves decoding to the XML parser; the ``with`` block
    # guarantees the handle is closed even if parsing fails.
    with open(osmfile, "rb") as osm_file:
        # Child <tag> elements are only guaranteed to be attached once the
        # parent's "end" event fires, so audit on "end" rather than "start".
        for event, elem in ET.iterparse(osm_file, events=("end",)):
            if elem.tag == "node" or elem.tag == "way":
                for tag in elem.iter("tag"):
                    if is_street_name(tag):
                        audit_street_type(street_types, tag.attrib['v'])

    return street_types


def update_name(name, mapping):
    """Return ``name`` with its final space-separated token expanded via ``mapping``.

    Only the last token is looked up; when it is not a key of ``mapping``
    the name is returned unchanged.
    """
    # rpartition keeps everything before the last space (and the space itself).
    head, space, last_word = name.rpartition(" ")
    if last_word not in mapping:
        return name
    return head + space + mapping[last_word]


def test():
    """Audit OSMFILE, print every proposed street-name fix and spot-check two."""
    st_types = audit(OSMFILE)
    assert len(st_types) == 3
    pprint.pprint(dict(st_types))

    for ways in st_types.values():
        for name in ways:
            better_name = update_name(name, mapping)
            # Single-argument print() behaves the same on Python 2 and 3.
            print(" ".join([name, "=>", better_name]))
            if name == "West Lexington St.":
                assert better_name == "West Lexington Street"
            elif name == "Baldwin Rd.":
                assert better_name == "Baldwin Road"


# Run the street-name audit checks when this code is executed directly
# (the guard is false when the code is imported as a module).
if __name__ == '__main__':
    test()

In [None]:
#6.5



#!/usr/bin/env python
# -*- coding: utf-8 -*-
import xml.etree.cElementTree as ET
import pprint
import re
import codecs
import json
"""
Your task is to wrangle the data and transform the shape of the data
into the model we mentioned earlier. The output should be a list of dictionaries
that look like this:

{
"id": "2406124091",
"type": "node",
"visible":"true",
"created": {
          "version":"2",
          "changeset":"17206049",
          "timestamp":"2013-08-03T16:43:42Z",
          "user":"linuxUser16",
          "uid":"1219059"
        },
"pos": [41.9757030, -87.6921867],
"address": {
          "housenumber": "5157",
          "postcode": "60625",
          "street": "North Lincoln Ave"
        },
"amenity": "restaurant",
"cuisine": "mexican",
"name": "La Cabana De Don Luis",
"phone": "1 (773)-271-5176"
}

You have to complete the function 'shape_element'.
We have provided a function that will parse the map file, and call the function with the element
as an argument. You should return a dictionary, containing the shaped data for that element.
We have also provided a way to save the data in a file, so that you could use
mongoimport later on to import the shaped data into MongoDB. 

Note that in this exercise we do not use the 'update street name' procedures
you worked on in the previous exercise. If you are using this code in your final
project, you are strongly encouraged to use the code from previous exercise to 
update the street names before you save them to JSON. 

In particular the following things should be done:
- you should process only 2 types of top level tags: "node" and "way"
- all attributes of "node" and "way" should be turned into regular key/value pairs, except:
    - attributes in the CREATED array should be added under a key "created"
    - attributes for latitude and longitude should be added to a "pos" array,
      for use in geospatial indexing. Make sure the values inside "pos" array are floats
      and not strings. 
- if second level tag "k" value contains problematic characters, it should be ignored
- if second level tag "k" value starts with "addr:", it should be added to a dictionary "address"
- if second level tag "k" value does not start with "addr:", but contains ":", you can process it
  same as any other tag.
- if there is a second ":" that separates the type/direction of a street,
  the tag should be ignored, for example:

<tag k="addr:housenumber" v="5158"/>
<tag k="addr:street" v="North Lincoln Avenue"/>
<tag k="addr:street:name" v="Lincoln"/>
<tag k="addr:street:prefix" v="North"/>
<tag k="addr:street:type" v="Avenue"/>
<tag k="amenity" v="pharmacy"/>

  should be turned into:

{...
"address": {
    "housenumber": "5158",
    "street": "North Lincoln Avenue"
},
"amenity": "pharmacy",
...
}

- for "way" specifically:

  <nd ref="305896090"/>
  <nd ref="1719825889"/>

should be turned into
"node_refs": ["305896090", "1719825889"]
"""


lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
# Characters that make a tag key unusable as a document field name.
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

# Element attributes that are grouped under the "created" sub-document.
CREATED = [ "version", "changeset", "timestamp", "user", "uid"]


def shape_element(element):
    """Convert an OSM ``node`` or ``way`` element into a plain dictionary.

    The resulting document contains:

    - ``id`` and ``type`` (the element tag), plus ``visible`` when present;
    - ``created``: the attributes listed in ``CREATED`` that the element
      actually carries (missing ones are skipped instead of raising);
    - ``pos``: ``[lat, lon]`` as floats, or an empty list when the element
      has no usable coordinates (e.g. ways);
    - ``address``: every ``addr:<field>`` tag, keyed by ``<field>``
      (only present when at least one such tag exists);
    - ``node_refs``: the ``ref`` of every ``<nd>`` child, in document order
      (only present when at least one exists);
    - every other tag as a top-level ``key: value`` pair.

    Tags are ignored when their key contains a character matched by
    ``problemchars`` or contains two or more colons
    (e.g. ``addr:street:type``).

    NOTE(review): a tag keyed e.g. "type" or "id" overwrites the
    element-level field of the same name.

    Args:
        element: an ``xml.etree`` element, fully parsed (children present).

    Returns:
        The shaped dictionary, or ``None`` for any other element type.

    Raises:
        KeyError: if the element has no ``id`` attribute, or a tag has a
            ``k`` attribute but no ``v`` attribute.
    """
    if element.tag not in ("node", "way"):
        return None

    attrib = element.attrib
    node = {'id': attrib['id'], 'type': element.tag}
    if 'visible' in attrib:
        node['visible'] = attrib['visible']

    # Real OSM data may omit e.g. "user"/"uid"; keep only what is present.
    node['created'] = dict((key, attrib[key]) for key in CREATED if key in attrib)

    # Set both coordinates or none, so "pos" is never half-filled.
    try:
        node['pos'] = [float(attrib['lat']), float(attrib['lon'])]
    except (KeyError, ValueError):
        node['pos'] = []

    address = {}
    node_refs = []
    for child in element:
        if child.tag == "nd":
            ref = child.attrib.get('ref')
            if ref is not None:
                node_refs.append(ref)
            continue

        key = child.attrib.get('k')
        if key is None or problemchars.search(key):
            continue
        if key.count(':') >= 2:
            # e.g. "addr:street:type" - redundant breakdown, ignored.
            continue

        value = child.attrib['v']
        if key.startswith("addr:"):
            field = key[len("addr:"):]
            if field:
                address[field] = value
        else:
            node[key] = value

    if address:
        node['address'] = address
    if node_refs:
        node['node_refs'] = node_refs

    return node


def process_map(file_in, pretty = False):
    """Shape every element of ``file_in`` and dump the results as JSON lines.

    Each element yielded by ``ET.iterparse`` is passed to ``shape_element``;
    truthy results are written, one JSON document followed by a newline, to
    ``"<file_in>.json"`` and also collected in memory.

    Args:
        file_in: path of the OSM XML file to read.
        pretty: when true, indent each JSON document by two spaces.

    Returns:
        List of all shaped documents, in the order they were written.
    """
    file_out = "{0}.json".format(file_in)
    # indent=None is json.dumps' default, i.e. the compact single-line form.
    indent = 2 if pretty else None
    data = []
    with codecs.open(file_out, "w") as fo:
        for _, element in ET.iterparse(file_in):
            shaped = shape_element(element)
            if not shaped:
                continue
            data.append(shaped)
            fo.write(json.dumps(shaped, indent=indent) + "\n")
    return data

def test():
    """Shape 'example.osm' and compare the first and last documents with known values."""
    # NOTE: if you are running this code on your computer, with a larger dataset,
    # call the process_map procedure with pretty=False. The pretty=True option adds
    # additional spaces to the output, making it significantly larger.
    data = process_map('example.osm', True)
    #pprint.pprint(data)

    expected_created = {
        "changeset": "11129782",
        "user": "bbmiller",
        "version": "7",
        "uid": "451048",
        "timestamp": "2012-03-28T18:31:23Z",
    }
    correct_first_elem = {
        "id": "261114295",
        "visible": "true",
        "type": "node",
        "pos": [41.9730791, -87.6866303],
        "created": expected_created,
    }
    expected_address = {
        "street": "West Lexington St.",
        "housenumber": "1412",
    }
    expected_node_refs = [
        "2199822281", "2199822390", "2199822392", "2199822369",
        "2199822370", "2199822284", "2199822281",
    ]

    assert data[0] == correct_first_elem
    assert data[-1]["address"] == expected_address
    assert data[-1]["node_refs"] == expected_node_refs

# Run the shaping checks when this code is executed directly
# (the guard is false when the code is imported as a module).
if __name__ == "__main__":
    test()