In [2]:
import xml.etree.cElementTree as ET
import pprint as pp
import re
import codecs
import json
from pymongo import MongoClient

In [3]:
OSM_FILE = "plaisir.osm"

In [4]:
SAMPLE_FILE = "sample.osm"

k = 10 # Parameter: take every k-th top level element

def get_element(osm_file, tags=('node', 'way', 'relation')):
    context = iter(ET.iterparse(osm_file, events=('start', 'end')))
    _, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()


with open(SAMPLE_FILE, 'wb') as output:
    output.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    output.write('<osm>\n  ')

    # Write every kth top level element
    for i, element in enumerate(get_element(OSM_FILE)):
        if i % k == 0:
            output.write(ET.tostring(element, encoding='utf-8'))

    output.write('</osm>')

# First view of the file

In [5]:
tree = ET.parse("sample.osm")
root = tree.getroot()

In [6]:
print "\nChildren of root:"
childs = set()
for child in root:
    childs.add(child.tag)
print childs


Children of root:
set(['node', 'relation', 'way'])


In [7]:
print "\nChildren of childrens of root"
children = {}
children["node"] = set()
children["relation"] = set()
children["way"] = set()
for child in root:
    for i in child:
        children[child.tag].add(i.tag)
print children


Children of childrens of root
{'node': set(['tag']), 'relation': set(['member', 'tag']), 'way': set(['tag', 'nd'])}


# Audit of the file

In [8]:
tags = {}
for event, elem in ET.iterparse("sample.osm"):
    if elem.tag not in tags:
        tags[elem.tag]= 1
    else:
        tags[elem.tag] += 1
print tags

{'node': 33069, 'nd': 46051, 'member': 1718, 'tag': 14808, 'relation': 45, 'way': 5618, 'osm': 1}


**What I will do to get a better view of the file?**
- Build a dictionnary to count :
    - the number and types of amenity, 
    - the number and types of shops, 
    - the number and types of sport.

In [9]:
tags_details = {}
keys = ["amenity","shop","sport","place","service","building"]

def create_tags_details(binder, list_keys, filename):
    for key in list_keys:
        binder[key] = {}
    for event, elem in ET.iterparse(filename, events = ("start",)):
        if elem.tag == "tag":
            for tag in elem.iter("tag"):
                for key in list_keys:
                    if elem.attrib["k"] == key:
                        if tag.attrib["v"] not in binder[key]:
                            binder[key][tag.attrib["v"]] = 1
                        else:    
                            binder[key][tag.attrib["v"]] += 1
    return pp.pprint(binder)

In [10]:
create_tags_details(tags_details,keys,"sample.osm")

{'amenity': {'atm': 2,
             'bank': 2,
             'bar': 2,
             'bench': 12,
             'bus_station': 1,
             'cafe': 1,
             'car_wash': 1,
             'clock': 1,
             'college': 1,
             'community_centre': 1,
             'doctors': 1,
             'fast_food': 3,
             'fountain': 1,
             'fuel': 2,
             'hospital': 1,
             'kindergarten': 1,
             'parking': 28,
             'parking_entrance': 2,
             'pharmacy': 2,
             'place_of_worship': 2,
             'post_box': 3,
             'recycling': 3,
             'restaurant': 4,
             'school': 13,
             'shelter': 1,
             'swimming_pool': 8,
             'telephone': 6,
             'theatre': 1,
             'vending_machine': 2,
             'waste_basket': 11},
 'building': {'chapel': 1,
              'church': 1,
              'commercial': 1,
              'house': 16,
              'industrial'

**What questions I want to answer?**
- Is there a correlation between the number of house and the number of bus stop in a given area?
- What is the most popular type of shop in Plaisir?
- What is the sport with the most facilities in Plaisir?
- Are there more restaurant or Fastfood in Plaisir

## Data cleaning plan

- Clean addr:street fields
- Create a list with one document per tag (we will only care about node and way) and the following structure:


{
"id": "2406124091",
"type: "node",
"visible":"true",
"created": {
          "version":"2",
          "changeset":"17206049",
          "timestamp":"2013-08-03T16:43:42Z",
          "user":"linuxUser16",
          "uid":"1219059"
        },
"pos": [41.9757030, -87.6921867],
"address": {
          "housenumber": "5157",
          "postcode": "60625",
          "street": "North Lincoln Ave"
        },
"amenity": "restaurant",
"cuisine": "mexican",
"name": "La Cabana De Don Luis",
"phone": "1 (773)-271-5176"
}

In [11]:
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
startswith = re.compile(r'addr:')

CREATED = [ "version", "changeset", "timestamp", "user", "uid"]
POS = ["lon","lat"]
BUILDING_TYPES = ["amenity","shop","sport","place","service","building"]

def shape_element(element):
    node = {}
    pos = []
    node_refs = []
    created = {}
    address = {}
    types = {}
    if element.tag == "node" or element.tag == "way" :
        types['type'] = element.tag
        if 'lat' in element.attrib.keys() and 'lon' in element.attrib.keys():
            try:
                lat = float(element.attrib['lat'])
                lon = float(element.attrib['lon'])
                pos.insert(0,lat)
                pos.insert(1,lon)
            except:
                pass
        for k, m in element.attrib.items():
            if k not in POS:
                if k in CREATED:
                    created[k] = m
                else:
                    node[k] = m

        for child in element:
            if child.tag == "nd":
                node_refs.append(child.attrib['ref'])
            elif child.tag == "tag":
                if child.attrib['k'].startswith("addr:"):
                    key = re.sub('addr:', '', child.attrib['k']).strip()
                    if lower_colon.match(key):
                        break
                    else:
                        address[key] = child.attrib['v']
                elif child.attrib['k'] in BUILDING_TYPES:
                    types[child.attrib['k']] = child.attrib['v']
        if types:
            node['types'] = types
        if created:
            node['created'] = created
        if pos:
            node['pos'] = pos
        if address:
            node['address'] = address
        if node_refs:
            node['node_refs'] = node_refs
        return node
    else:
        return None
    
def process_map(file_in, pretty = False):
    file_out = "{0}.json".format(file_in)
    data = []
    with codecs.open(file_out, "w") as fo:
        for _, element in ET.iterparse(file_in):
            el = shape_element(element)
            if el:
                data.append(el)
                if pretty:
                    fo.write(json.dumps(el, indent=2)+"\n")
                else:
                    fo.write(json.dumps(el) + "\n")
    return data

In [12]:
data = process_map('plaisir.osm', True)

In [13]:
##pp.pprint(data)

## Insert data in MongoDb

In [18]:
def insert_data(data, db):
    for item in data:
        db.plaisir_osm.insert_one(item)

from pymongo import MongoClient
client = MongoClient("mongodb://localhost:27017")
db = client.osm_udacity

insert_data(data, db)
print db.plaisir_osm.find_one()

DuplicateKeyError: E11000 duplicate key error collection: osm_udacity.plaisir_osm index: _id_ dup key: { : ObjectId('5805c87aecf40b794d9ad942') }