In [1]:
# Prepend the parent directory to the module search path so the package
# can be imported from the source tree without installing it.
import sys
sys.path.insert(0, "..")

# Pythonify

If you have a reasonable amount of RAM, then it's possible to load quite big XML files fully into memory and to generate Python dictionaries from them.  These can then be saved out in compressed pickled format.

This is a low-tech, non-portable solution for making later data extraction fast, but it's also quick and easy.

In [2]:
import os

import osmdigest.pythonify as pythonify

# Directory holding the OSM extract, and the name of the extract to read.
# The extraction cells join these two into the input path.
basedir = os.path.join("/media/disk", "OSM_Data")
filename = "illinois-latest.osm.xz"

## Extract tags

In [3]:
# Build the in-memory tags object from the compressed OSM extract.
tags = pythonify.Tags(os.path.join(basedir, filename))

In [6]:
# Save the tags as a compressed pickle, then show the file size in MiB.
tags_pickle = "illinois_tags.pic.xz"
pythonify.pickle(tags, tags_pickle)
os.path.getsize(tags_pickle) / 1024**2

20.2431640625

In [9]:
# First five (tag value, node id) pairs for nodes that carry a "name" key.
tags.nodes_from_key("name")[:5]

[('Aurora Toll Plaza', 701092),
 ('Aurora Toll Plaza', 235231121),
 ('Aurora Toll Plaza', 461515840),
 ('Aurora Toll Plaza', 461515849),
 ('River Road Toll Plaza', 701654)]

In [10]:
# Wrap the tags so they can be looked up by element id.
tags_by_id = pythonify.TagsById(tags)

In [11]:
# All tags on node 701092, the first "Aurora Toll Plaza" entry listed above.
tags_by_id.node(701092)

{'barrier': 'toll_booth',
 'name': 'Aurora Toll Plaza',
 'operator': 'Illinois State Toll Highway Authority',
 'ref': '61'}

## Load tags back in

In [3]:
# Restore the tags object from the pickle written in the "Extract tags" section.
tags = pythonify.unpickle("illinois_tags.pic.xz")

In [4]:
# Show five of the tag keys that occur on relations.
list(tags.all_relation_tag_keys)[:5]

['roundtrip', 'name:ba', 'tower:type', 'name:diq', 'name:kl']

In [6]:
# (tag value, relation id) pairs for relations that carry a "tower:type" key.
tags.relations_from_key("tower:type")

[('climbing', 5813084)]

In [7]:
# Full tag dictionary of relation 5813084, the "tower:type" hit found above.
pythonify.TagsById(tags).relation(5813084)

{'layer': '2',
 'leisure': 'pitch',
 'name': 'Climbing Park',
 'sport': 'climbing',
 'tower:type': 'climbing',
 'type': 'multipolygon'}

## Extract nodes

This is typically the most memory-intensive operation.

In [10]:
# Build the in-memory nodes object from the compressed OSM extract.
nodes = pythonify.Nodes(os.path.join(basedir, filename))

In [11]:
# Save the nodes as a compressed pickle, then show the file size in MiB.
nodes_pickle = "illinois_nodes.pic.xz"
pythonify.pickle(nodes, nodes_pickle)
os.path.getsize(nodes_pickle) / 1024**2

140.4869499206543

## Extract ways and relations

In [3]:
# Build the in-memory ways object from the compressed OSM extract.
ways = pythonify.Ways(os.path.join(basedir, filename))

In [5]:
# Save the ways as a compressed pickle, then show the file size in MiB.
ways_pickle = "illinois_ways.pic.xz"
pythonify.pickle(ways, ways_pickle)
os.path.getsize(ways_pickle) / 1024**2

31.613712310791016

In [8]:
# Build the in-memory relations object from the compressed OSM extract.
relations = pythonify.Relations(os.path.join(basedir, filename))

In [9]:
# Save the relations as a compressed pickle, then show the file size in MiB.
relations_pickle = "illinois_relations.pic.xz"
pythonify.pickle(relations, relations_pickle)
os.path.getsize(relations_pickle) / 1024**2

1.08172607421875

## Load back node data and recompress

In [3]:
# Restore the nodes object from the pickle written in the "Extract nodes" section.
nodes = pythonify.unpickle("illinois_nodes.pic.xz")

In [4]:
# Convert the unpickled nodes into the NodesPacked form (the packed pickle
# written below is roughly half the size of the unpacked one).
# Note: this rebinds `nodes`, so re-running this cell on its own would feed the
# already-converted object back into from_Nodes; re-run the load cell first.
nodes = pythonify.NodesPacked.from_Nodes(nodes)

In [7]:
# Print the first ten (node id, coordinates) entries produced by iteration.
node_iter = iter(nodes)
for _ in range(10):
    print(next(node_iter))

(219850, (-87.9101245, 41.7585879))
(219851, (-87.9076432, 41.7593116))
(700724, (-88.0158606, 41.7120272))
(700725, (-88.0116119, 41.7142377))
(700726, (-88.007417, 41.716384))
(700727, (-88.0091658, 41.7154871))
(700728, (-88.0029645, 41.7187545))
(700729, (-88.0005612, 41.7199717))
(700731, (-87.9887166, 41.7258174))
(700732, (-87.9915919, 41.7245362))


In [8]:
# Look up one node's coordinate pair by id (the last id printed above).
# NOTE(review): values look like (longitude, latitude) -- confirm against osmdigest.
nodes[700732]

(-87.9915919, 41.7245362)

In [9]:
# Save the packed nodes as a compressed pickle, then show the file size in MiB.
packed_pickle = "illinois_nodes_packed.pic.xz"
pythonify.pickle(nodes, packed_pickle)
os.path.getsize(packed_pickle) / 1024**2

75.53979110717773

## Load back way data

In [11]:
# Restore the ways object from the pickle written in the "Extract ways and relations" section.
ways = pythonify.unpickle("illinois_ways.pic.xz")

In [15]:
# Print the first item produced by iterating over the ways:
# a (way id, list of node ids) pair.
print(next(iter(ways)))

(3819179, [20326165, 33235915, 2754748538, 2754748544, 33235916, 33235917, 33235918, 33235919, 33235920, 33235921, 33235922, 2754748527, 33235924, 33235925, 33235926, 33235927, 4320607048, 33235928, 2754748488, 2754748494, 2754748497, 4320607050, 4320607049, 2754748491, 33235929, 33235888, 33235889, 33235890, 158765688, 19222947, 19222948, 19222949, 19222950, 19222951, 19222952, 19222953, 19222954, 19222955, 19222956, 19222957, 19222958, 19222959, 19222960, 19222961, 158756608, 19222962, 19222963, 19222964, 19222965, 19222966, 19222967, 2754748511, 2754748502, 2754748500, 2754748505, 2754748508, 19222968, 19222969, 19222970, 19222971, 19222972, 2754748530, 2754748533, 2754748536, 19222973, 19222974, 19222975, 19222976, 19222977, 19222978, 2754748563, 2754748566, 19222979, 19222980, 19222981, 19222982, 19222983, 19222984, 19222985, 19222986, 4320607224, 19222987, 2754748586, 19222988, 19222989, 19222990, 2754748600, 19222991, 19222992, 19222993, 19222997, 19223002, 19223007, 19223008, 1

In [17]:
# Coordinates of the first two node ids listed in the way printed above.
nodes[20326165], nodes[33235915]

((-85.201651, 46.0517207), (-85.2023348, 46.051174))

# Process California data in one go

In [2]:
import os

import osmdigest.pythonify as pythonify

# Directory holding the OSM extract, and the name of the California extract.
basedir = os.path.join("/media/disk", "OSM_Data")
filename = "california-latest.osm.xz"

In [None]:
# Process the whole extract with a single call.
# NOTE(review): the second argument is presumably the base path/name for the
# generated pickle files -- confirm against osmdigest.pythonify.
source_path = os.path.join(basedir, filename)
target = os.path.join(basedir, "california")
pythonify.pythonify_and_pickle(source_path, target)