In [1]:
# Put the parent directory on the import path so `osmdigest` can be imported without installing it
import sys
sys.path.insert(0, "..")

# Example files

These are downloaded from http://download.geofabrik.de/ which offers snapshots of various parts of the planet in a variety of formats.

I have found that "xz" offers better compression than bzip2.  Linux users can install the "xz" package, or on Windows, use 7zip.  Python itself can easily recompress a file.

The examples below use the data, in uncompressed format, for the Isle of Wight (a small island off the south coast of England, known to me from childhood holidays).  See http://download.geofabrik.de/europe/great-britain/england.html

In [2]:
import os, lzma
# Directory holding the downloaded OSM extracts; adjust for the local machine.
basedir = os.path.join("/media/disk", "OSM_Data")
# Windows alternative: basedir = os.path.join("f:\\", "OSM_Data")
filename = "isle-of-wight-latest.osm.xz"

# Show the first four lines of the xz-compressed XML, decoding it as text on the fly.
with lzma.open(os.path.join(basedir, filename), mode="rt", encoding="utf-8") as f:
    for line_index in range(4):
        print(next(f), end="")

<?xml version='1.0' encoding='UTF-8'?>
<osm version="0.6" generator="osmconvert 0.8.5" timestamp="2017-04-25T20:43:28Z">
	<bounds minlat="50.50555" minlon="-1.659074" maxlat="50.80102" maxlon="-1.0313699"/>
	<node id="195206" lat="50.6275781" lon="-1.1730057" version="10" timestamp="2016-03-29T12:53:40Z" changeset="38143882" uid="3099236" user="iwhs"/>


# Look at the generated data

In [3]:
import osmdigest.detail as detail
import datetime

In [4]:
# Distinct tag keys seen on each kind of OSM element.
possible_node_tags = set()
possible_way_tags = set()
possible_relation_tags = set()

# Walk everything the detailed parser yields.  The header objects are printed;
# for nodes, ways and relations the tag keys are collected and the types of the
# remaining sub-objects are sanity-checked with assertions.
# Alternative input that was left disabled:
#with detail.Parser(os.path.join(basedir, filename)) as gen:
start = datetime.datetime.now()
with detail.Parser("isle-of-wight-latest.osm") as gen:
    for x in gen:
        if isinstance(x, (detail.OSM, detail.Bounds)):
            print(x)
        elif isinstance(x, detail.Node):
            for y in x.subobjs:
                # Nodes are expected to carry tags only.
                assert isinstance(y, detail.Tag)
                possible_node_tags.add(y.key)
        elif isinstance(x, detail.Way):
            for y in x.subobjs:
                if isinstance(y, detail.Tag):
                    possible_way_tags.add(y.key)
                else:
                    # Anything on a way that is not a tag must be a node reference.
                    assert isinstance(y, detail.NodeRef)
        elif isinstance(x, detail.Relation):
            for y in x.subobjs:
                if isinstance(y, detail.Tag):
                    possible_relation_tags.add(y.key)
                else:
                    # Anything on a relation that is not a tag must be a member.
                    assert isinstance(y, detail.Member)
                    assert y.type in {"way", "node", "relation"}
        else:
            # Reaching here means the parser yielded a type not handled above;
            # report the object so the failure is diagnosable.
            raise TypeError("Shouldn't see this: {!r}".format(x))
print("Took {}".format(datetime.datetime.now()-start))

OSM(version=0.6, generator=osmconvert 0.8.5, timestamp=2017-04-25 20:43:28)
Bounds(latitude:[50.50555,50.80102], longitude:[-1.659074,-1.0313699]
Took 0:05:19.125741


In [5]:
# Number of distinct node tag keys, plus five of them (the set is unordered, so which five is arbitrary).
len(possible_node_tags), list(possible_node_tags)[:5]

(335, ['fixme', 'dog', 'traffic_signals', 'ford', 'email'])

In [6]:
# Number of distinct way tag keys, plus five of them (the set is unordered, so which five is arbitrary).
len(possible_way_tags), list(possible_way_tags)[:5]

(484,
 ['email',
  'access:conditional',
  'construction',
  'opening_time',
  'building:roof'])

In [7]:
# Number of distinct relation tag keys, plus five of them (the set is unordered, so which five is arbitrary).
len(possible_relation_tags), list(possible_relation_tags)[:5]

(151, ['fixme', 'email', 'ref:nuts:3', 'junction', 'name'])

# Parse the data in a reduced way

In [8]:
import osmdigest.digest as digest

In [9]:
# Fresh sets so this run's results are independent of any earlier cell.
possible_node_tags = set()
possible_way_tags = set()
possible_relation_tags = set()

# Time a full pass over the objects produced by `digest.parse_sax`,
# printing the header objects and collecting tag keys per element kind.
start = datetime.datetime.now()
for element in digest.parse_sax("isle-of-wight-latest.osm"):
    if isinstance(element, (digest.OSM, digest.Bounds)):
        print(element)
    elif isinstance(element, digest.Node):
        possible_node_tags.update(element.tags.keys())
    elif isinstance(element, digest.Way):
        possible_way_tags.update(element.tags.keys())
    elif isinstance(element, digest.Relation):
        possible_relation_tags.update(element.tags.keys())
print("Took {}".format(datetime.datetime.now()-start))

OSM(version=0.6, generator=osmconvert 0.8.5, timestamp=2017-04-25 20:43:28)
Bounds(latitude:[50.50555,50.80102], longitude:[-1.659074,-1.0313699]
Took 0:05:38.765608


In [10]:
# Distinct node tag key count and a five-key sample; set order is arbitrary, so the sample is not sorted.
len(possible_node_tags), list(possible_node_tags)[:5]

(335, ['fixme', 'dog', 'traffic_signals', 'ford', 'email'])

In [11]:
# Distinct way tag key count and a five-key sample; set order is arbitrary, so the sample is not sorted.
len(possible_way_tags), list(possible_way_tags)[:5]

(484,
 ['email',
  'access:conditional',
  'construction',
  'opening_time',
  'building:roof'])

In [12]:
# Distinct relation tag key count and a five-key sample; set order is arbitrary, so the sample is not sorted.
len(possible_relation_tags), list(possible_relation_tags)[:5]

(151, ['fixme', 'email', 'ref:nuts:3', 'junction', 'name'])

## Use `xml.etree` instead

A different Python standard library `xml` parser.

In [13]:
# Fresh sets so this run's results are independent of any earlier cell.
possible_node_tags = set()
possible_way_tags = set()
possible_relation_tags = set()

# Element type -> set receiving its tag keys.  Checked in this order and the
# first match wins, mirroring an if/elif chain.
tag_sinks = (
    (digest.Node, possible_node_tags),
    (digest.Way, possible_way_tags),
    (digest.Relation, possible_relation_tags),
)

# Time a full pass over the objects produced by `digest.parse`.
start = datetime.datetime.now()
for item in digest.parse("isle-of-wight-latest.osm"):
    if isinstance(item, (digest.OSM, digest.Bounds)):
        print(item)
        continue
    for element_type, sink in tag_sinks:
        if isinstance(item, element_type):
            sink.update(item.tags.keys())
            break
print("Took {}".format(datetime.datetime.now()-start))

OSM(version=0.6, generator=osmconvert 0.8.5, timestamp=2017-04-25 20:43:28)
Bounds(latitude:[50.50555,50.80102], longitude:[-1.659074,-1.0313699]
Took 0:00:10.848116


In [14]:
# How many distinct node tag keys were found, with five examples (taken in the set's arbitrary order).
len(possible_node_tags), list(possible_node_tags)[:5]

(335, ['fixme', 'dog', 'traffic_signals', 'ford', 'email'])

In [15]:
# How many distinct way tag keys were found, with five examples (taken in the set's arbitrary order).
len(possible_way_tags), list(possible_way_tags)[:5]

(484,
 ['email',
  'access:conditional',
  'construction',
  'opening_time',
  'building:roof'])

In [16]:
# How many distinct relation tag keys were found, with five examples (taken in the set's arbitrary order).
len(possible_relation_tags), list(possible_relation_tags)[:5]

(151, ['fixme', 'email', 'ref:nuts:3', 'junction', 'name'])

# Via a callback

Just to show that it's not the SAX library itself which is the bottleneck.

In [17]:
class Handler(digest.OSMDataHandler):
    """Handler that prints the header objects and gathers distinct tag keys.

    Tag keys are accumulated in three sets, one each for nodes, ways and
    relations.
    """

    def __init__(self):
        self.possible_node_tags = set()
        self.possible_way_tags = set()
        self.possible_relation_tags = set()

    def start(self, osm):
        """Print the object passed to `start`."""
        print(osm)

    def bounds(self, bounds):
        """Print the bounds object."""
        print(bounds)

    def node(self, x):
        """Add every tag key of `x` to the node tag set."""
        self.possible_node_tags.update(x.tags.keys())

    def way(self, x):
        """Add every tag key of `x` to the way tag set."""
        self.possible_way_tags.update(x.tags.keys())

    def relation(self, x):
        """Add every tag key of `x` to the relation tag set."""
        self.possible_relation_tags.update(x.tags.keys())

# Time handler construction plus a full callback-driven parse of the extract.
start = datetime.datetime.now()
handler = Handler()
digest.parse_callback("isle-of-wight-latest.osm", handler)
elapsed = datetime.datetime.now() - start
print("Took {}".format(elapsed))

OSM(version=0.6, generator=osmconvert 0.8.5, timestamp=2017-04-25 20:43:28)
Bounds(latitude:[50.50555,50.80102], longitude:[-1.659074,-1.0313699]
Took 0:00:12.288264


In [18]:
# Node tag keys gathered by the handler: the count, and five keys in the set's arbitrary order.
len(handler.possible_node_tags), list(handler.possible_node_tags)[:5]

(335, ['fixme', 'dog', 'traffic_signals', 'ford', 'email'])

In [19]:
# Way tag keys gathered by the handler: the count, and five keys in the set's arbitrary order.
len(handler.possible_way_tags), list(handler.possible_way_tags)[:5]

(484,
 ['email',
  'access:conditional',
  'construction',
  'opening_time',
  'building:roof'])

In [20]:
# Relation tag keys gathered by the handler: the count, and five keys in the set's arbitrary order.
len(handler.possible_relation_tags), list(handler.possible_relation_tags)[:5]

(151, ['fixme', 'email', 'ref:nuts:3', 'junction', 'name'])

# Convert the callback to a generator at the OSM data level

This works fairly well.

In [24]:
import osmdigest.utils.cbtogen as cbtogen

In [28]:
class Handler(digest.OSMDataHandler):
    """Handler that passes every object it receives on to `delegate.notify`.

    Note: this reuses the name `Handler`, so it replaces any class of that
    name defined earlier in the notebook.
    """

    def __init__(self, delegate):
        self.delegate = delegate

    def _forward_to_delegate(self, obj):
        """Hand `obj` to the delegate unchanged."""
        self.delegate.notify(obj)

    def start(self, osm):
        self._forward_to_delegate(osm)

    def bounds(self, bounds):
        self._forward_to_delegate(bounds)

    def node(self, x):
        self._forward_to_delegate(x)

    def way(self, x):
        self._forward_to_delegate(x)

    def relation(self, x):
        self._forward_to_delegate(x)
        
def func():
    """Run the callback-based parse, reporting to the module-level `handler`."""
    digest.parse_callback("isle-of-wight-latest.osm", handler)

# `handler` notifies `generator`; `generator` in turn is given `func` to run.
generator = cbtogen.CallbackToGenerator()
handler = Handler(generator)
generator.set_callback_function(func)

# Fresh sets so this run's results are independent of any earlier cell.
possible_node_tags = set()
possible_way_tags = set()
possible_relation_tags = set()

# Iterate the objects delivered through the callback-to-generator bridge,
# timing only the work done inside the `with` block.
with generator:
    start = datetime.datetime.now()
    for x in generator:
        if isinstance(x, (digest.OSM, digest.Bounds)):
            print(x)
        elif isinstance(x, digest.Node):
            possible_node_tags.update(x.tags.keys())
        elif isinstance(x, digest.Way):
            possible_way_tags.update(x.tags.keys())
        elif isinstance(x, digest.Relation):
            possible_relation_tags.update(x.tags.keys())
    print("Took {}".format(datetime.datetime.now()-start))

# Distinct tag key counts for nodes, ways and relations.
len(possible_node_tags), len(possible_way_tags), len(possible_relation_tags)

OSM(version=0.6, generator=osmconvert 0.8.5, timestamp=2017-04-25 20:43:28)
Bounds(latitude:[50.50555,50.80102], longitude:[-1.659074,-1.0313699]
Took 0:00:34.578027


(335, 484, 151)

# Conclusion

Using 2 threads to convert from a callback to a generator might be fun, but it's not performant.  At all.

I suspect what happens is that the inter-thread communication (and whatever context switching Python does, as CPython is essentially single threaded) adds a certain overhead.  If we put this overhead at the XML parsing level, then we generate a huge number of temporary objects which are pushed onto the queue only to be removed and essentially ignored (e.g. "character" messages).  Those events which aren't ignored are often converted into a much smaller number of OSM specific objects (e.g. a way with many tags and node references yields only one OSM object but could be 50 XML events).