In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import xml.etree.cElementTree as ET
from pprint import pprint

OSMFILE = 'toronto_canada.osm'
tag_tags = []

def count_tags(filename):
        tags = {}
        tag = []
        for event, elem in ET.iterparse(filename):
            tag.append(elem.tag)
        for i in tag:
            if i in tags:
                tags[i] += 1
            else:
                tags[i] = 1         
        return tags

def test():
    tags = count_tags(OSMFILE)
    pprint(tags)
    tag_tags.append(tags['tag'])
   

if __name__ == "__main__":
    test()

In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import xml.etree.cElementTree as ET
import re
import operator
from pprint import pprint

OSMFILE = 'toronto_canada.osm'

lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
lower_2colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
omit_keys = []

def key_type(element, keys, other, problem_characters):
    if element.tag == "tag":
        k = element.get("k")
        if re.search(lower,k):
            keys['lower'] += 1
        elif re.search(lower_colon,k):
            keys['lower_colon'] += 1
        elif re.search(lower_2colon,k):
            keys['lower_2colon'] += 1
        elif re.search(problemchars,k):
            keys['problemchars'] += 1
            if k in problem_characters:
                problem_characters[k] += 1
            else:
                problem_characters[k] = 1
        else:
            keys['other'] += 1
            if k in other:
                other[k] += 1
            else:
                other[k] = 1
    return keys



def process_map(filename):
    keys = {"lower": 0, "lower_colon": 0, "lower_2colon": 0, "problemchars": 0, "other": 0}
    other = {}
    problem_characters = {}
    for _, element in ET.iterparse(filename):
        keys = key_type(element, keys, other, problem_characters)
    sort_other = sorted(other.items(), key=operator.itemgetter(1), reverse=True)
    sort_problem_characters = sorted(problem_characters.items(), 
                                     key=operator.itemgetter(1), reverse=True)
    #The two print statements below will display problem tags and problem characters
    #as well as the amount of times that these tag appears
    #print "OTHER:", sort_other
    #print "OTHER:", sort_problem_characters
    
    gct_count = 0
    
    for key in other:
        omit_keys.append(key)
        if 'geobase' in key or 'canvec' in key or 'tiger' in key:
            #print key, other[key]
            gct_count += other[key]
    for key in problem_characters:
        omit_keys.append(key)
        
    unique_not_gct = 0 
    for i in omit_keys:
        if 'geobase' not in i and 'canvec' not in i and 'tiger' not in i:
            unique_not_gct += 1
            #print i
    
    print                                                                                
    print "Problem tags make up %f%% of the total %d tags in the 'Toronto Canada' dataset" % (100*float(keys['other'])/tag_tags[0], 
                                                                                               tag_tags[0])
    print "%d out of the %d problem tags (%d%%) come from geobase, canvec and tiger tags" % (gct_count, keys['other'], 
                                                                                             100*float(gct_count)/keys['other'])
    print "i.e. 'geobase:acquisitionTechnique', 'canvec:UUID', 'tiger:name_base_1'"
    print "The remaining %d (%f%%) tags can either be changed by hand or ignored;" % (keys['other']-gct_count, 
                                                                                    (keys['other']-gct_count)/float(tag_tags[0]))
    print"of the remaining problem tags %d are unique." % unique_not_gct
    print
    #The print statment below will show all tags that are being omitted from this dataset
    #print omit_keys
    return keys



def test():
    keys = process_map(OSMFILE)
    pprint(keys)


if __name__ == "__main__":
    test()

In [None]:
import xml.etree.cElementTree as ET
from collections import defaultdict
import re
from pprint import pprint

OSMFILE = "toronto_canada.osm"
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
updated_street_names = {}

# Values following "Commons" were added after running against the Toronto dataset
# Compared to https://en.wikipedia.org/wiki/Street_suffix
expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road", 
            "Trail", "Parkway", "Commons", "Crescent", "Line", "Circle", "Gardens", "Close", "Concession",
            "Gate", "Grove", "Heights", "Hill", "Path", "Run", "Sideroad", "Terrace", "Way", "Townline",
            "North", "East", "South", "West"]
# Names ending with North, East, South, West 
# 'Highway  5  West' => 'Highway 5 West' 
# 'Sideroad 5 Tosorontio' => 'Tosorontio Sideroad 5'


# UPDATE THIS VARIABLE
mapping = { "St": "Street",
            "St.": "Street",
            "street": "Street",
            "STREET": "Street",
            "Ave": "Avenue",
            "Ave.": "Avenue",
            "avenue": "Avenue",
            "Rd": "Road",
            "Rd.": "Road",
            "road": "Road",
            "Blvd": "Boulevard",
            "Blvd.": "Boulevard",
            "Fernway": "Fern Way",
            "By-pass": "Bypass",
            "Cir": "Circle",
            "Cres": "Crescent",
            "Cresent": "Crescent",
            #"Crest": "Crescent", -> Individual cases...
            "Crt": "Court",
            "Ct": "Court",
            "Cv": "Court",
            "Dr": "Drive",
            "Driver": "Drive",
            "E": "East",
            "E.": "East",
            "Hrbr": "Harbour Way",
            "Lan": "Lane",
            "Lanes": "Lane",
            "Ldg": "Landing",
            "N": "North",
            "S": "South",
            "S.": "South",
            "Trl": "Trail",
            "W": "West",
            "W.": "West",
            }


def audit_street_type(street_types, street_name):
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()
        if street_type not in expected:
            street_types[street_type].add(street_name)


def is_street_name(elem):
    return (elem.attrib['k'] == "addr:street")


def audit(osmfile):
    osm_file = open(osmfile, "r")
    street_types = defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=("start",)):

        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])
    osm_file.close()
    return street_types


def update_name(name, mapping):
    #re.sub(pattern, repl, string, count=0, flags=0)
    m = street_type_re.search(name)
    if m:
        street_type = m.group()
        if street_type not in expected:
            if street_type in mapping:
                name = re.sub(street_type, mapping[street_type], name)
    return name


def test():
    st_types = audit(OSMFILE)
    print "The dataset contains %d street type tags" % len(st_types)
    #pprint(dict(st_types))
    
    count = 0
    for st_type, ways in st_types.iteritems():
        for name in ways:
            better_name = update_name(name, mapping)
            if better_name != name:
                # The print statment below shows all street names that were altered
                #print name, "=>", better_name
                count += 1
                updated_street_names[name] = better_name
    print "of these, %d tags were updated for constistancy" % count
    # the variable updated_street_names contains a dictionary using the old names as the key 
    # and the updated names as the values
    #print updated_street_names


if __name__ == '__main__':
    test()

In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import csv
import codecs
import re
import xml.etree.cElementTree as ET

import cerberus

import schema

OSM_PATH = "toronto_canada.osm"

NODES_PATH = "nodes.csv"
NODE_TAGS_PATH = "node_tags.csv"
WAYS_PATH = "ways.csv"
WAY_NODES_PATH = "way_nodes.csv"
WAY_TAGS_PATH = "way_tags.csv"

LOWER_COLON = re.compile(r'^([a-z]|_)+:([a-z]|_)+')
PROBLEMCHARS = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

SCHEMA = schema.schema

# Make sure the fields order in the csvs matches the column order in the sql table schema
NODE_FIELDS = ['id', 'lat', 'lon', 'user', 'uid', 'version', 'changeset', 'timestamp']
NODE_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_FIELDS = ['id', 'user', 'uid', 'version', 'changeset', 'timestamp']
WAY_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_NODES_FIELDS = ['id', 'node_id', 'position']


def shape_element(element, node_attr_fields=NODE_FIELDS, way_attr_fields=WAY_FIELDS,
                  problem_chars=PROBLEMCHARS, default_tag_type='regular'):
    """Clean and shape node or way XML element to Python dict"""

    node_attribs = {}
    way_attribs = {}
    way_nodes = []
    tags = []  # Handle secondary tags the same way for both node and way elements

    # YOUR CODE HERE
    if element.tag == 'node':
        for field in NODE_FIELDS:
            node_attribs[field] = element.attrib[field]
        for tag in element.iter("tag"):    
            tags = create_tag_dict(element, tag, tags, default_tag_type)
        #print {'node': node_attribs, 'node_tags': tags} ...debugging
        return {'node': node_attribs, 'node_tags': tags}
    
    elif element.tag == 'way':
        for field in WAY_FIELDS:
            way_attribs[field] = element.attrib[field]
        
        position = 0
        for node in element.iter("nd"):
            node_dict = {'id': element.attrib['id'], 'node_id': node.attrib['ref'], 'position': position}
            position += 1
            way_nodes.append(node_dict)
            
        for tag in element.iter("tag"):    
            tags = create_tag_dict(element, tag, tags, default_tag_type)
        #print {'way': way_attribs, 'way_nodes': way_nodes, 'way_tags': tags}    
        return {'way': way_attribs, 'way_nodes': way_nodes, 'way_tags': tags}

def create_tag_dict(element, tag, tags, tag_type):
    k = tag.attrib['k']
    if not re.search(PROBLEMCHARS, k):
        # The following three lines omit problem keys found earlier as well as update
        # old street names with the improved street names from the previous script
        if k not in omit_keys:
            if k in updated_street_names:
                k = updated_street_names[k]
            tag_dict = {}
            tag_dict['id'] = element.attrib['id']
            if ":" not in k:
                tag_dict['key'] = k
            else:
                tag_type = tag.attrib['k'].split(':',1)[0]
                k = tag.attrib['k'].split(':',1)[1]
                tag_dict['key'] = k
            tag_dict['value'] = tag.attrib['v']
            tag_dict['type'] = tag_type
            tags.append(tag_dict)
    return tags

        

# ================================================== #
#               Helper Functions                     #
# ================================================== #
def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield element if it is the right type of tag"""

    context = ET.iterparse(osm_file, events=('start', 'end'))
    _, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()


def validate_element(element, validator, schema=SCHEMA):
    """Raise ValidationError if element does not match schema"""
    if validator.validate(element, schema) is not True:
        field, errors = next(validator.errors.iteritems())
        message_string = "\nElement of type '{0}' has the following errors:\n{1}"
        error_strings = (
            "{0}: {1}".format(k, v if isinstance(v, str) else ", ".join(v))
            for k, v in errors.iteritems()
        )
        raise cerberus.ValidationError(
            message_string.format(field, "\n".join(error_strings))
        )


class UnicodeDictWriter(csv.DictWriter, object):
    """Extend csv.DictWriter to handle Unicode input"""

    def writerow(self, row):
        super(UnicodeDictWriter, self).writerow({
            k: (v.encode('utf-8') if isinstance(v, unicode) else v) for k, v in row.iteritems()
        })

    def writerows(self, rows):
        for row in rows:
            self.writerow(row)


# ================================================== #
#               Main Function                        #
# ================================================== #
def process_map(file_in, validate):
    """Iteratively process each XML element and write to csv(s)"""

    with codecs.open(NODES_PATH, 'w') as nodes_file, \
         codecs.open(NODE_TAGS_PATH, 'w') as node_tags_file, \
         codecs.open(WAYS_PATH, 'w') as ways_file, \
         codecs.open(WAY_NODES_PATH, 'w') as way_nodes_file, \
         codecs.open(WAY_TAGS_PATH, 'w') as way_tags_file:

        nodes_writer = UnicodeDictWriter(nodes_file, NODE_FIELDS)
        node_tags_writer = UnicodeDictWriter(node_tags_file, NODE_TAGS_FIELDS)
        ways_writer = UnicodeDictWriter(ways_file, WAY_FIELDS)
        way_nodes_writer = UnicodeDictWriter(way_nodes_file, WAY_NODES_FIELDS)
        way_tags_writer = UnicodeDictWriter(way_tags_file, WAY_TAGS_FIELDS)

        nodes_writer.writeheader()
        node_tags_writer.writeheader()
        ways_writer.writeheader()
        way_nodes_writer.writeheader()
        way_tags_writer.writeheader()

        validator = cerberus.Validator()

        for element in get_element(file_in, tags=('node', 'way')):
            el = shape_element(element)
            if el:
                if validate is True:
                    validate_element(el, validator)

                if element.tag == 'node':
                    nodes_writer.writerow(el['node'])
                    node_tags_writer.writerows(el['node_tags'])
                elif element.tag == 'way':
                    ways_writer.writerow(el['way'])
                    way_nodes_writer.writerows(el['way_nodes'])
                    way_tags_writer.writerows(el['way_tags'])


if __name__ == '__main__':
    # Note: Validation is ~ 10X slower. For the project consider using a small
    # sample of the map when validating.
    process_map(OSM_PATH, validate=False)


In [None]:
import sqlite3
import csv
from pprint import pprint

nodes_csv = 'nodes.csv'
node_tags_csv = 'node_tags.csv'
ways_csv = 'ways.csv'
way_tags_csv = 'way_tags.csv'
way_nodes_csv = 'way_nodes.csv'


# Connect to the database, if it doesn't exist it will be created
sqlite_file = 'toronto_canada.db'
db = sqlite3.connect(sqlite_file)
db.text_factory = str

# Create a cursor object
c = db.cursor()

# Create the tables
c.execute('''
CREATE TABLE nodes (
    id INTEGER PRIMARY KEY NOT NULL,
    lat REAL,
    lon REAL,
    user TEXT,
    uid INTEGER,
    version INTEGER,
    changeset INTEGER,
    timestamp TEXT)
''')
c.execute('''
CREATE TABLE node_tags (
    id INTEGER,
    key TEXT,
    value TEXT,
    type TEXT,
    FOREIGN KEY (id) REFERENCES nodes(id))
''')
c.execute('''
CREATE TABLE ways (
    id INTEGER PRIMARY KEY NOT NULL,
    user TEXT,
    uid INTEGER,
    version TEXT,
    changeset INTEGER,
    timestamp TEXT)
''')
c.execute('''
CREATE TABLE way_tags (
    id INTEGER NOT NULL,
    key TEXT NOT NULL,
    value TEXT NOT NULL,
    type TEXT,
    FOREIGN KEY (id) REFERENCES ways(id))
''')
c.execute('''
CREATE TABLE way_nodes (
    id INTEGER NOT NULL,
    node_id INTEGER NOT NULL,
    position INTEGER NOT NULL,
    FOREIGN KEY (id) REFERENCES ways(id),
    FOREIGN KEY (node_id) REFERENCES nodes(id))    
''')

# Commit changes
db.commit()

# Read in the data
with open(nodes_csv, 'rb') as read_csv:
    reader = csv.DictReader(read_csv) #comma is default delimiter
    nodes_db = [(i['id'], i['lat'], i['lon'], i['user'], i['uid'], i['version'], i['changeset'], i['timestamp']) for i in reader]
with open(node_tags_csv, 'rb') as read_csv:
    reader = csv.DictReader(read_csv) #comma is default delimiter
    node_tags_db = [(i['id'], i['key'], i['value'], i['type']) for i in reader]
with open(ways_csv, 'rb') as read_csv:
    reader = csv.DictReader(read_csv) #comma is default delimiter
    ways_db = [(i['id'], i['user'], i['uid'], i['version'], i['changeset'], i['timestamp']) for i in reader]
with open(way_tags_csv, 'rb') as read_csv:
    reader = csv.DictReader(read_csv) #comma is default delimiter
    way_tags_db = [(i['id'], i['key'], i['value'], i['type']) for i in reader]
with open(way_nodes_csv, 'rb') as read_csv:
    reader = csv.DictReader(read_csv) #comma is default delimiter
    way_nodes_db = [(i['id'], i['node_id'], i['position']) for i in reader]
    
    
# Insert formatted data
c.executemany('''
    INSERT INTO nodes(id, lat, lon, user, uid, version, changeset, timestamp) 
    VALUES (?, ?, ?, ?, ?, ?, ?, ?);''', nodes_db)
c.executemany('''
    INSERT INTO node_tags(id, key, value, type) 
    VALUES (?, ?, ?, ?);''', node_tags_db)
c.executemany('''
    INSERT INTO ways(id, user, uid, version, changeset, timestamp) 
    VALUES (?, ?, ?, ?, ?, ?);''', ways_db)
c.executemany('''
    INSERT INTO way_tags(id, key, value, type) 
    VALUES (?, ?, ?, ?);''', way_tags_db)
c.executemany('''
    INSERT INTO way_nodes(id, node_id, position) 
    VALUES (?, ?, ?);''', way_nodes_db)
db.commit()

# Check that the data is imported correctly
#c.execute ('SELECT * FROM nodes')
#all_rows = c.fetchall()
#print('1):')
#pprint(all_rows)

db.close()




In [5]:
#TEMP
import sqlite3
from pprint import pprint
sqlite_file = 'toronto_canada.db'

In [None]:
db = sqlite3.connect(sqlite_file)
c = db.cursor()
#c.execute ('SELECT user, COUNT(*) FROM nodes GROUP BY user ORDER BY COUNT(*) DESC')
#c.execute ('SELECT user, COUNT(*) FROM ways GROUP BY user ORDER BY COUNT(*) DESC')
#c.execute ('SELECT * FROM way_nodes')
all_rows = c.fetchall()
pprint(all_rows)
db.close()

In [None]:
db = sqlite3.connect(sqlite_file)
c = db.cursor()
#c.execute ('SELECT * FROM way_nodes')
c.execute ('PRAGMA PAGE_SIZE')
page_size = c.fetchall()
c.execute ('PRAGMA PAGE_COUNT')
page_count = c.fetchall()
db.close()
#print page_size, page_count
# Page size shows the size of the pages in bytes
# Page count shows the amount of pages that are allocated to the database
page_size = page_size[0][0]
page_count = page_count[0][0]
print "The size of this database is %d kB" % (page_size*page_count/1024)

In [25]:
db = sqlite3.connect(sqlite_file)
c = db.cursor()
c.execute ('''
    SELECT COUNT(DISTINCT uid)
    FROM nodes
''')
all_rows = c.fetchall()
pprint(all_rows)
db.close()

[(1741,)]


In [24]:
db = sqlite3.connect(sqlite_file)
c = db.cursor()
c.execute ('''
    SELECT COUNT(DISTINCT uid)
    FROM (SELECT uid FROM nodes
    UNION SELECT uid FROM ways)
''')
all_rows = c.fetchall()
#pprint(all_rows)
db.close()
unique_uid = all_rows[0][0]
print "%d unique users have contributed to the node and way tags for this database" % unique_uid

1985 unique users have contributed to the node and way tags for this database


In [26]:
db = sqlite3.connect(sqlite_file)
c = db.cursor()
c.execute ('''
    SELECT count(id) FROM nodes
''')
all_rows = c.fetchall()
pprint(all_rows)
db.close()

[(4873350,)]


In [27]:
db = sqlite3.connect(sqlite_file)
c = db.cursor()
c.execute ('''
    SELECT count(id) FROM ways
''')
all_rows = c.fetchall()
pprint(all_rows)
db.close()

[(710944,)]


In [30]:
db = sqlite3.connect(sqlite_file)
c = db.cursor()
c.execute ('''
    SELECT *
    FROM (SELECT count(id) FROM ways
    UNION SELECT count(id) FROM nodes)
''')
idWaysNodes = c.fetchall()
c.execute ('''
    SELECT COUNT(id)
    FROM (SELECT id FROM ways
    UNION SELECT id FROM nodes)
''')
idUnique = c.fetchall()
db.close()
# changing the order in which ways and nodes appear in the union 
# does not seem to change the order of the final table
print "There are %d node tags and %d way tags combining to a total of %d way/node tags." % (idWaysNodes[1][0], idWaysNodes[0][0], 
                                                                                            idWaysNodes[1][0] + idWaysNodes[0][0])
print "Of these tags, %d are not unique between the nodes and the ways tables." % (idWaysNodes[1][0] + idWaysNodes[0][0]
                                                                                   - idUnique[0][0])


There are 4873350 node tags and 710944 way tags combining to a total of 5584294 way/node tags.
Of these tags, 506 are not unique between the nodes and the ways tables.


In [31]:
db = sqlite3.connect(sqlite_file)
c = db.cursor()
# Verify the number of non-unique id's between the nodes and ways tables.
c.execute ('''
    SELECT COUNT(id)
    FROM nodes,
    (SELECT id as wayid FROM ways) as subq
    WHERE nodes.id = wayid
    ''')
all_rows = c.fetchall()
pprint(all_rows)
db.close()

[(506,)]


In [32]:
db = sqlite3.connect(sqlite_file)
c = db.cursor()
# sample of common way and node tags
c.execute ('''
    SELECT id
    FROM nodes,
    (SELECT id as wayid FROM ways) as subq
    WHERE nodes.id = wayid
    LIMIT 1
    ''')
all_rows = c.fetchall()
pprint(all_rows)
db.close()

[(21428979,)]


In [33]:
db = sqlite3.connect(sqlite_file)
c = db.cursor()
c.execute ('''
    SELECT id, lat, lon, user, uid, version, changeset, timestamp
    FROM nodes
    WHERE id = '21428979'
    ''')
all_rows = c.fetchall()
pprint(all_rows)
db.close()

[(21428979,
  43.6131187,
  -79.5521881,
  u'permute',
  158267,
  12,
  30030302,
  u'2015-04-07T03:05:37Z')]


In [34]:
db = sqlite3.connect(sqlite_file)
c = db.cursor()
c.execute ('''
    SELECT id, user, uid, version, changeset, timestamp
    FROM ways
    WHERE id = '21428979'
    ''')
all_rows = c.fetchall()
pprint(all_rows)
db.close()

[(21428979, u'andrewpmk', 1679, u'3', 20777634, u'2014-02-25T19:30:51Z')]


In [35]:
db = sqlite3.connect(sqlite_file)
c = db.cursor()
c.execute ('''
    SELECT type, COUNT(*)
    FROM node_tags
    GROUP BY type
    ORDER BY COUNT(*) DESC
    ''')
all_rows = c.fetchall()
pprint(all_rows)
db.close()

[(u'addr', 1463356),
 (u'regular', 833650),
 (u'opendata', 4109),
 (u'payment', 1342),
 (u'source', 1044),
 (u'species', 941),
 (u'genus', 931),
 (u'colour', 799),
 (u'name', 610),
 (u'crossing', 601),
 (u'fire_hydrant', 582),
 (u'tower', 492),
 (u'fuel', 358),
 (u'contact', 337),
 (u'traffic_signals', 318),
 (u'gnis', 263),
 (u'recycling', 220),
 (u'generator', 154),
 (u'works', 112),
 (u'internet_access', 97),
 (u'toilets', 81),
 (u'diet', 64),
 (u'cost', 63),
 (u'wetap', 60),
 (u'is_in', 57),
 (u'atm', 35),
 (u'healthcare', 34),
 (u'seamark', 31),
 (u'service', 30),
 (u'wheelchair', 25),
 (u'building', 23),
 (u'survey', 22),
 (u'storage', 19),
 (u'lamp', 13),
 (u'phone', 13),
 (u'capacity', 12),
 (u'maxspeed', 12),
 (u'communication', 11),
 (u'disused', 10),
 (u'surveillance', 10),
 (u'beacon', 9),
 (u'camera', 9),
 (u'destination', 8),
 (u'railway', 6),
 (u'currency', 5),
 (u'light', 5),
 (u'mast', 5),
 (u'monitoring', 5),
 (u'opening_hours', 5),
 (u'dance', 4),
 (u'exit_to', 4),
 

In [36]:
db = sqlite3.connect(sqlite_file)
c = db.cursor()
c.execute ('''
    SELECT key, COUNT(*)
    FROM node_tags
    GROUP BY key
    ORDER BY COUNT(*) DESC
    LIMIT 20
    ''')
all_rows = c.fetchall()
pprint(all_rows)
db.close()

[(u'source', 483041),
 (u'street', 472140),
 (u'housenumber', 472136),
 (u'city', 414610),
 (u'highway', 85423),
 (u'country', 50682),
 (u'name', 35732),
 (u'state', 26598),
 (u'amenity', 25980),
 (u'province', 23890),
 (u'created_by', 20635),
 (u'crossing', 20245),
 (u'operator', 17359),
 (u'shop', 12181),
 (u'natural', 11226),
 (u'shelter', 10177),
 (u'power', 8872),
 (u'website', 7052),
 (u'type', 7044),
 (u'note', 5010)]


In [43]:
db = sqlite3.connect(sqlite_file)
c = db.cursor()
# Shows the most common keys within way_tags
c.execute ('''
    SELECT key, COUNT(*)
    FROM way_tags
    GROUP BY key
    ORDER BY COUNT(*) DESC
    LIMIT 10
    ''')
all_rows = c.fetchall()
pprint(all_rows)
db.close()

[(u'source', 464856),
 (u'highway', 242805),
 (u'interpolation', 226512),
 (u'surface', 161827),
 (u'lanes', 150670),
 (u'name', 145372),
 (u'building', 102326),
 (u'attribution', 87188),
 (u'access', 59372),
 (u'is_in', 49761)]


In [38]:
db = sqlite3.connect(sqlite_file)
c = db.cursor()
c.execute ('''
    SELECT value, COUNT(*)
    FROM node_tags
    WHERE key = 'amenity'
    GROUP BY value
    ORDER BY COUNT(*) DESC
    LIMIT 10
    ''')
all_rows = c.fetchall()
pprint(all_rows)
db.close()

[(u'fast_food', 2996),
 (u'restaurant', 2744),
 (u'bench', 2352),
 (u'post_box', 1964),
 (u'cafe', 1409),
 (u'parking', 1233),
 (u'waste_basket', 1171),
 (u'bank', 1046),
 (u'fuel', 1001),
 (u'pharmacy', 743)]


In [39]:
db = sqlite3.connect(sqlite_file)
c = db.cursor()
c.execute ('''
    SELECT value, COUNT(*)
    FROM way_tags
    WHERE key = 'amenity'
    GROUP BY value
    ORDER BY COUNT(*) DESC
    LIMIT 10
    ''')
all_rows = c.fetchall()
pprint(all_rows)
db.close()

[(u'parking', 30252),
 (u'school', 2340),
 (u'place_of_worship', 1445),
 (u'restaurant', 309),
 (u'fast_food', 290),
 (u'cafe', 226),
 (u'bank', 207),
 (u'car_wash', 130),
 (u'community_centre', 127),
 (u'fire_station', 117)]


In [9]:
db = sqlite3.connect(sqlite_file)
c = db.cursor()
# what are the way id's that have only a single node
c.execute ('''
    SELECT id, node_id, position FROM 
    (SELECT id, node_id, position, COUNT(*) as nodeCount
    FROM way_nodes
    GROUP BY id)
    WHERE nodeCount = 1
    ''')
all_rows = c.fetchall()
pprint(all_rows)
db.close()

[(4055112, 21488808, 0),
 (5715385, 43007918, 0),
 (5715901, 43013058, 0),
 (5716071, 43014448, 0),
 (5717530, 43026496, 0),
 (5718641, 43044724, 0),
 (5718776, 43046596, 0),
 (5718892, 43033354, 0),
 (8046513, 60169823, 0),
 (13866550, 130161213, 0),
 (23634848, 60226522, 0),
 (24540440, 26307677, 0),
 (25336264, 60169620, 0),
 (27052915, 1236861279, 0),
 (27209568, 298579089, 0),
 (27238513, 298935188, 0),
 (27238760, 298936587, 0),
 (27238784, 298936377, 0),
 (31513894, 31934822, 0),
 (33931591, 388655609, 0),
 (33932698, 388744304, 0),
 (33933093, 388854128, 0),
 (33933120, 388720583, 0),
 (33933886, 388808456, 0),
 (33939353, 388735979, 0),
 (33940953, 388869618, 0),
 (33944343, 388897093, 0),
 (33947565, 388655609, 0),
 (33949064, 388829216, 0),
 (33949318, 388904235, 0),
 (33949742, 388821332, 0),
 (33950831, 388725017, 0),
 (33952601, 388774538, 0),
 (33957561, 388672393, 0),
 (33961358, 388744304, 0),
 (33961913, 388926463, 0),
 (33963782, 388787143, 0),
 (33965528, 388694030,

In [45]:
db = sqlite3.connect(sqlite_file)
c = db.cursor()
c.execute ('''
    SELECT COUNT(*) FROM 
    (SELECT COUNT(*) as nodeCount
    FROM way_nodes
    GROUP BY id)
    WHERE nodeCount = 1
    ''')
all_rows = c.fetchall()
db.close()
print "%d ways have only a single node assigned to them" % all_rows[0][0]

545 ways have only a single node assigned to them


In [13]:
db = sqlite3.connect(sqlite_file)
c = db.cursor()
#???
c.execute ('''
    SELECT key, COUNT(*)
    FROM (SELECT key FROM node_tags
    UNION SELECT key FROM way_tags)
    GROUP BY key
    ORDER BY COUNT(*) DESC
    LIMIT 20
    ''')
all_rows = c.fetchall()
pprint(all_rows)
db.close()

[(u'abandoned', 1),
 (u'abutters', 1),
 (u'access', 1),
 (u'accessible', 1),
 (u'account_cards', 1),
 (u'ad_screens', 1),
 (u'addr', 1),
 (u'address', 1),
 (u'admin_level', 1),
 (u'administrative', 1),
 (u'advanced_green', 1),
 (u'advertising', 1),
 (u'advised', 1),
 (u'advisory', 1),
 (u'advisory_speed', 1),
 (u'aerialway', 1),
 (u'aerodrome', 1),
 (u'aeroway', 1),
 (u'after_hours_return', 1),
 (u'air', 1)]


In [14]:
db = sqlite3.connect(sqlite_file)
c = db.cursor()
# Cafes
c.execute ('''
    SELECT id, value, key, type
    FROM node_tags
    WHERE value = 'cafe'
    ''')
all_rows = c.fetchall()
pprint(all_rows)
db.close()

[(48553537, u'cafe', u'amenity', u'regular'),
 (53542349, u'cafe', u'amenity', u'regular'),
 (54965507, u'cafe', u'amenity', u'regular'),
 (59566254, u'cafe', u'amenity', u'regular'),
 (59566311, u'cafe', u'amenity', u'regular'),
 (60930953, u'cafe', u'amenity', u'regular'),
 (60930956, u'cafe', u'amenity', u'regular'),
 (60931257, u'cafe', u'amenity', u'regular'),
 (79877659, u'cafe', u'amenity', u'regular'),
 (80927400, u'cafe', u'amenity', u'regular'),
 (95378352, u'cafe', u'amenity', u'regular'),
 (118421701, u'cafe', u'amenity', u'regular'),
 (158701760, u'cafe', u'amenity', u'regular'),
 (158707495, u'cafe', u'amenity', u'regular'),
 (176544089, u'cafe', u'amenity', u'regular'),
 (211287594, u'cafe', u'amenity', u'regular'),
 (211290791, u'cafe', u'amenity', u'regular'),
 (211313114, u'cafe', u'amenity', u'regular'),
 (241263058, u'cafe', u'amenity', u'regular'),
 (243421095, u'cafe', u'amenity', u'regular'),
 (245020011, u'cafe', u'amenity', u'regular'),
 (245020012, u'cafe', u'

In [15]:
db = sqlite3.connect(sqlite_file)
c = db.cursor()
c.execute ('''
    SELECT STRFTIME('%Y', timestamp) as year, COUNT(*)
    FROM nodes
    GROUP BY year
    ORDER BY year DESC
    ''')
all_rows = c.fetchall()
pprint(all_rows)
db.close()

[(u'2016', 379884),
 (u'2015', 320166),
 (u'2014', 456962),
 (u'2013', 394806),
 (u'2012', 769813),
 (u'2011', 598622),
 (u'2010', 1444220),
 (u'2009', 438753),
 (u'2008', 52188),
 (u'2007', 17162),
 (u'2006', 774)]


In [16]:
db = sqlite3.connect(sqlite_file)
c = db.cursor()
c.execute ('''
    SELECT STRFTIME('%Y', timestamp) as year, COUNT(*)
    FROM ways
    GROUP BY year
    ORDER BY year DESC
    ''')
all_rows = c.fetchall()
pprint(all_rows)
db.close()

[(u'2016', 107786),
 (u'2015', 90670),
 (u'2014', 68349),
 (u'2013', 75598),
 (u'2012', 65214),
 (u'2011', 87542),
 (u'2010', 190070),
 (u'2009', 24653),
 (u'2008', 923),
 (u'2007', 138),
 (u'2006', 1)]


In [17]:
db = sqlite3.connect(sqlite_file)
c = db.cursor()
# Number of ways and nodes added to the database each year 
c.execute ('''
    SELECT STRFTIME('%Y', timestamp) as year, COUNT(*)
    FROM (SELECT timestamp FROM nodes
    UNION SELECT timestamp FROM ways)
    GROUP BY year
    ORDER BY year DESC
    ''')
all_rows = c.fetchall()
pprint(all_rows)
db.close()

[(u'2016', 52011),
 (u'2015', 57476),
 (u'2014', 52523),
 (u'2013', 52881),
 (u'2012', 144792),
 (u'2011', 99937),
 (u'2010', 241083),
 (u'2009', 114439),
 (u'2008', 17638),
 (u'2007', 4653),
 (u'2006', 372)]


In [18]:
db = sqlite3.connect(sqlite_file)
c = db.cursor()
# Which months of which years had the most ways and nodes added to the database
c.execute ('''
    SELECT STRFTIME('%Y-%m', timestamp) as yearMonth, COUNT(*)
    FROM (SELECT timestamp FROM nodes
    UNION SELECT timestamp FROM ways)
    GROUP BY yearMonth
    ORDER BY COUNT(*) DESC
    LIMIT 5
    ''')
all_rows = c.fetchall()
pprint(all_rows)
db.close()

[(u'2010-09', 75669),
 (u'2010-04', 59329),
 (u'2010-07', 58310),
 (u'2012-04', 33734),
 (u'2012-05', 29342)]


In [19]:
db = sqlite3.connect(sqlite_file)
c = db.cursor()
# Which year-month had the highest amount of node/way entries
c.execute ('''
    SELECT STRFTIME('%Y-%m', timestamp) as yearMonth
    FROM (SELECT timestamp FROM nodes
    UNION SELECT timestamp FROM ways)
    GROUP BY yearMonth
    ORDER BY COUNT(*) DESC
    LIMIT 1
    ''')
all_rows = c.fetchall()
pprint(all_rows)
db.close()

[(u'2010-09',)]


In [20]:
db = sqlite3.connect(sqlite_file)
c = db.cursor()
# How many node or way tags were created in the year-month with the highest amount of entries
c.execute ('''
    SELECT COUNT(*)
    FROM (SELECT timestamp FROM nodes
    UNION SELECT timestamp FROM ways),
    (SELECT STRFTIME('%Y-%m', timestamp) as yearMonth
    FROM (SELECT timestamp FROM nodes
    UNION SELECT timestamp FROM ways)
    GROUP BY yearMonth
    ORDER BY COUNT(*) DESC
    LIMIT 1) as subq
    WHERE STRFTIME('%Y-%m', timestamp) = yearMonth
    ''')
all_rows = c.fetchall()
pprint(all_rows)
db.close()

[(75669,)]


In [21]:
db = sqlite3.connect(sqlite_file)
c = db.cursor()
# In the year-month with the highest way/node entries, which users were adding to the database and how many
# entries did each user contribute
c.execute ('''
    SELECT user, COUNT(*)
    FROM (SELECT timestamp, user FROM nodes
    UNION SELECT timestamp, user FROM ways),
    (SELECT STRFTIME('%Y-%m', timestamp) as yearMonth
    FROM (SELECT timestamp FROM nodes
    UNION SELECT timestamp FROM ways)
    GROUP BY yearMonth
    ORDER BY COUNT(*) DESC
    LIMIT 1) as subq
    WHERE STRFTIME('%Y-%m', timestamp) = yearMonth
    GROUP BY user
    ORDER BY COUNT(*) DESC
    ''')
all_rows = c.fetchall()
pprint(all_rows)
db.close()

[(u'andrewpmk', 72535),
 (u'Victor Bielawski', 2949),
 (u'bdustan', 101),
 (u'mfagan', 25),
 (u'MikeyCarter', 23),
 (u'tixuwuoz', 18),
 (u'bgibbard', 13),
 (u'brandoncote', 10),
 (u'salocinbake', 8),
 (u'emvee', 7),
 (u'Andre68', 2),
 (u'Sven L', 1),
 (u'de239', 1)]


In [22]:
db = sqlite3.connect(sqlite_file)
c = db.cursor()
c.execute ('''
    SELECT user, COUNT(*)
    FROM (SELECT timestamp, user FROM nodes
    UNION SELECT timestamp, user FROM ways)
    WHERE STRFTIME('%Y-%m', timestamp) = '2010-09'
    GROUP BY user
    ORDER BY COUNT(*) DESC
    ''')
all_rows = c.fetchall()
pprint(all_rows)
db.close()

[(u'andrewpmk', 72535),
 (u'Victor Bielawski', 2949),
 (u'bdustan', 101),
 (u'mfagan', 25),
 (u'MikeyCarter', 23),
 (u'tixuwuoz', 18),
 (u'bgibbard', 13),
 (u'brandoncote', 10),
 (u'salocinbake', 8),
 (u'emvee', 7),
 (u'Andre68', 2),
 (u'Sven L', 1),
 (u'de239', 1)]


In [23]:
db = sqlite3.connect(sqlite_file)
c = db.cursor()
c.execute ('''
    SELECT user, COUNT(*)
    FROM (SELECT timestamp, user FROM nodes
    UNION SELECT timestamp, user FROM ways)
    GROUP BY user
    ORDER BY COUNT(*) DESC
    LIMIT 5
    ''')
all_rows = c.fetchall()
pprint(all_rows)
db.close()

[(u'andrewpmk', 533475),
 (u'MikeyCarter', 64494),
 (u'Victor Bielawski', 40151),
 (u'rw__', 28356),
 (u'Kevo', 25771)]
