In [1]:
import sys, os
sys.path.insert(0, os.path.abspath('../'))
from utils.osm_helper import node2pt, way2line, rltn2cltn, osm_container
import geopandas as gp
from osmread import Node, Way, Relation
import datetime
import pandas as pd

In [2]:
# Path to the OSM extract, relative to this notebook.
OSM_DC_BBOX_PATH = '../data/dc_bbox.osm'
# Load the extract into the project's osm_container (the saved run below took
# a little over a minute between its "begin" and "finish" log lines).
OSM_DC_BBOX = osm_container(OSM_DC_BBOX_PATH)
# Print the per-element-type sizes reported by the container.
print OSM_DC_BBOX.data_size()

begin reading osm 2016-11-09 22:59:10.244000
finish reading osm 2016-11-09 23:00:23.133000
["len of <class 'osmread.elements.Node'> = 2546260", "len of <class 'osmread.elements.Way'> = 306332", "len of <class 'osmread.elements.Relation'> = 3427"]


In [169]:
# Text file whose first line holds the id collections, keyed by element type name.
DC_OSM_IDS_PATH ='../data/dc_osm_ids_within_polygon.txt'
DC_OSM_IDS={}
with open(DC_OSM_IDS_PATH) as f:
    # SECURITY: eval() executes whatever expression the first line contains.
    # Only use this with a trusted, locally generated file. If the line is a
    # pure literal, ast.literal_eval would be the safe replacement
    # -- TODO confirm the file format before switching.
    DC_OSM_IDS = eval(f.readlines()[0])
# Report how many ids were loaded under each key.
for key, value in DC_OSM_IDS.items():
    print key, len(value)
    
    
def dc_obj(osm_objs, osm_ids, osmtype, osmtype_str):
    """Select the elements of one OSM type whose id is listed in ``osm_ids``.

    osm_objs    -- container exposing ``osm_objs``, a mapping from element
                   type to the elements of that type.
    osm_ids     -- mapping from a type name to a collection of ids.
    osmtype     -- key used to fetch the elements from ``osm_objs.osm_objs``.
    osmtype_str -- key used to fetch the accepted ids from ``osm_ids``.

    Returns a list of the matching elements, in the container's order.
    """
    selected = []
    for element in osm_objs.osm_objs[osmtype]:
        if element.id in osm_ids[osmtype_str]:
            selected.append(element)
    return selected

# Keep, for each element type, only the elements whose id is listed in DC_OSM_IDS.
DC_NODES = dc_obj(OSM_DC_BBOX, DC_OSM_IDS,Node, 'Node')
DC_WAYS = dc_obj(OSM_DC_BBOX, DC_OSM_IDS, Way, 'Way')
DC_RLTNS = dc_obj(OSM_DC_BBOX, DC_OSM_IDS, Relation, 'Relation')
# Displayed in the order (ways, nodes, relations).
len(DC_WAYS), len(DC_NODES), len(DC_RLTNS)

Node 1652906
Relation 2556
Way 194767


(194767, 1652906, 2556)

In [177]:
# TODO: move these tag-filter helpers into utils/osm_helper.py
def have_tag_value(obj, tag='*', value='*'):
    """Return True when ``obj`` carries ``tag`` with an accepted ``value``.

    obj   -- any object with a ``tags`` mapping (may be empty or None).
    tag   -- tag key to look for; ``'*'`` accepts every object that has at
             least one tag, regardless of ``value``.
    value -- ``'*'`` to accept any value of ``tag``, a single string for an
             exact match, or a container (set/list/tuple) of accepted values.

    An object without tags never matches, whatever the query is.
    """
    if not obj.tags:  # no tags at all: discard whatever the query is
        return False
    if tag == '*':  # having any tag is enough
        return True
    if tag not in obj.tags:
        return False
    if value == '*':
        return True
    if isinstance(value, (str, type(u''))):
        # A bare string must be compared for equality: ``in`` on a string is
        # a substring test, so 'primary' would match value='primary_link'.
        return obj.tags[tag] == value
    return obj.tags[tag] in value

def filter_obj(obj, have_one=[('*','*')], donthave=[]):
    """Decide whether ``obj`` passes a tag query.

    have_one -- (tag, value) pairs; the object must match at least one.
    donthave -- (tag, value) pairs; matching any of them rejects the object.

    Exclusions are checked first, so they win over ``have_one``. Each pair is
    evaluated with ``have_tag_value``. Neither default list is modified.
    """
    if any(have_tag_value(obj, tag, value) for tag, value in donthave):
        return False
    return any(have_tag_value(obj, tag, value) for tag, value in have_one)

def filter_osm_data(osm_objs,have_one=[('*','*')], donthave=[], special_filters=None):
    """Return the objects of ``osm_objs`` that pass the requested filter.

    When ``special_filters`` is a non-empty collection of predicates, an
    object is kept only if every predicate returns a true value; ``have_one``
    and ``donthave`` are ignored in that case. Otherwise each object is
    tested with ``filter_obj(o, have_one, donthave)``.

    The result is a new list that preserves the input order.
    """
    if special_filters:
        # all() stops at the first failing predicate, like an explicit break.
        return [o for o in osm_objs if all(check(o) for check in special_filters)]
    return [o for o in osm_objs if filter_obj(o, have_one, donthave)]

def filter_osm_data_to_df(osm_objs,have_one=[('*','*')], donthave=[], special_filters=None):
    """Filter OSM objects and tabulate the queried tags as a DataFrame.

    osm_objs        -- iterable of objects with ``id`` and ``tags``.
    have_one        -- (tag, value) pairs passed to ``filter_osm_data``; the
                       tag names also become the value columns of the result.
    donthave        -- (tag, value) pairs passed to ``filter_osm_data``.
    special_filters -- optional predicates passed to ``filter_osm_data``.

    Returns a DataFrame with an ``id`` column followed by one column per tag
    named in ``have_one``; a tag missing on an object yields ``''``.
    """
    # Forward the caller's special_filters: it used to be overridden with
    # None here, so custom predicates were silently ignored.
    objs = filter_osm_data(osm_objs, have_one, donthave, special_filters=special_filters)
    attr = [x[0] for x in have_one]
    # Special filters may let untagged objects through, hence the `or {}`.
    rows = [[o.id] + [(o.tags or {}).get(k, '') for k in attr] for o in objs]
    return pd.DataFrame(rows, columns=['id'] + attr)



In [178]:
# specific filter
def filter_is_motorway(obj):
    """True for objects with a ``highway`` tag that is not a foot/bike value.

    Despite the name, this does not test for ``highway=motorway``: any
    ``highway`` value outside the excluded foot/bike set is accepted.
    """
    foot_bike_values = {'path', 'pedestrian', 'footway', 'steps', 'cycleway', 'crossing'}
    return filter_obj(obj, [('highway', '*')], [('highway', foot_bike_values)])
def filter_isnot_motorway(obj):
    """Logical negation of ``filter_is_motorway`` for the same object."""
    accepted = filter_is_motorway(obj)
    return not accepted

def filter_is_bike_walk_way(obj):
    """True for objects whose ``highway`` tag has one of the foot/bike values."""
    foot_bike_values = {'path', 'pedestrian', 'footway', 'steps', 'cycleway', 'crossing'}
    return filter_obj(obj, [('highway', foot_bike_values)])
def filter_isnot_bike_walk_way(obj):
    """Logical negation of ``filter_is_bike_walk_way`` for the same object."""
    accepted = filter_is_bike_walk_way(obj)
    return not accepted

def filter_is_admin(obj):
    """True for objects tagged ``boundary`` or ``place`` that are not excluded.

    An object is rejected when it has a ``leisure``, ``amenity`` or
    ``natural`` tag, or when its ``boundary`` / ``place`` value is one of the
    excluded values listed below.
    """
    wanted = [('boundary', '*'), ('place', '*')]
    unwanted = [
        ('leisure', '*'),
        ('amenity', '*'),
        ('boundary', ['national_park', 'protected_area']),
        ('natural', '*'),
        ('place', ['island', 'islet', 'square', 'farm']),
    ]
    return filter_obj(obj, wanted, unwanted)

def filter_isnot_admin(obj):
    """Logical negation of ``filter_is_admin`` for the same object."""
    accepted = filter_is_admin(obj)
    return not accepted

In [186]:
# With the default query ('*', '*'), filter_osm_data keeps every element
# that has at least one tag.
DC_NODES_WITH_TAG = filter_osm_data(DC_NODES)
DC_WAYS_WITH_TAG = filter_osm_data(DC_WAYS)
DC_RLTNS_WITH_TAG = filter_osm_data(DC_RLTNS)
len(DC_NODES_WITH_TAG), len(DC_WAYS_WITH_TAG), len(DC_RLTNS_WITH_TAG)

(37912, 190931, 2552)

In [179]:
# Split the DC ways into non-foot/bike highways and foot/bike highways.
motor_way = filter_osm_data(DC_WAYS,special_filters=[filter_is_motorway])
bike_walk_way = filter_osm_data(DC_WAYS, special_filters=[filter_is_bike_walk_way])
# NOTE(review): 18506 and 13457 are hard-coded counts; 18506 is presumably the
# number of all highway-tagged ways from an earlier run -- TODO replace the
# literals with computed values so the cross-check survives a data change.
len(motor_way), len(bike_walk_way), 18506-13457

(13457, 5049, 5049)

In [180]:
# Apply the same highway filter to nodes instead of ways.
nodes_highway = filter_osm_data(DC_NODES, special_filters=[filter_is_motorway])
len(nodes_highway)

2344

In [181]:
# admin_way is needed for the cross-check below. It is computed here so the
# cell does not depend on a cell that appears later in the notebook
# (otherwise Restart & Run All fails with a NameError).
admin_way = filter_osm_data(DC_WAYS, special_filters=[filter_is_admin])
# Ways that are neither accepted by filter_is_motorway nor by filter_is_admin.
not_motor_way_or_admin = filter_osm_data(DC_WAYS,special_filters=[filter_isnot_motorway,filter_isnot_admin])
# The subtraction only matches when no way passes both filters at once.
len(not_motor_way_or_admin), len(DC_WAYS)-len(motor_way)-len(admin_way)

(181144, 181144)

In [182]:
# Ways rejected by filter_is_motorway; the second value re-derives the same
# count by subtraction as a cross-check.
not_motor_way = filter_osm_data(DC_WAYS,special_filters=[filter_isnot_motorway])
len(not_motor_way), len(DC_WAYS)-len(motor_way)

(181310, 181310)

In [183]:
# Relations and ways accepted by filter_is_admin.
admin_rltn = filter_osm_data(DC_RLTNS, special_filters=[filter_is_admin])
admin_way = filter_osm_data(DC_WAYS, special_filters=[filter_is_admin])
len(admin_rltn),len(admin_way)

(154, 166)

In [293]:
import shapely.geometry as shpgeo

def lon_lats_closed(lon_lats):
    """Tell whether a coordinate sequence starts and ends on the same point.

    lon_lats -- sequence of (lon, lat) pairs.

    Returns False for an empty sequence instead of raising IndexError. A
    single-point sequence counts as closed, since its first and last entries
    are the same element.
    """
    if len(lon_lats) == 0:
        return False
    return lon_lats[0] == lon_lats[-1]

def way2lineOrpoly(osm_data, way):
    """Build a shapely geometry for ``way`` from its nodes' coordinates.

    Every id in ``way.nodes`` is resolved with
    ``osm_data.get_osm_node_by_id`` and turned into a (lon, lat) pair. A
    closed coordinate list with more than three entries becomes a Polygon
    passed through ``buffer(0)``; anything else becomes a LineString.

    NOTE(review): ``buffer(0)`` can apparently yield an empty polygon for
    some rings -- confirm whether callers should handle that case.
    """
    resolved = []
    for node_id in way.nodes:
        resolved.append(osm_data.get_osm_node_by_id(node_id))
    coords = [(n.lon, n.lat) for n in resolved]

    if lon_lats_closed(coords) and len(coords) > 3:
        return shpgeo.Polygon(coords).buffer(0)
    return shpgeo.LineString(coords)

def flatten_rltn_shpcltn(shpcltn):
    """Concatenate all value sequences of the mapping ``shpcltn`` into one list.

    The groups are visited in the order ``shpcltn.values()`` yields them.
    """
    return [shp for group in shpcltn.values() for shp in group]

def build_gpdf_list_from_osm(osm_data, nodes, ways, rltns):
    columns = ['id','type','geometry']
    shpobjs_from_nodes = [(node.id,'Node', node2pt(node)) for node in nodes]
    shpobjs_from_ways =  [(way.id,'Way', way2lineOrpoly(osm_data, way)) for way in ways]
    shpobjs_from_rltns = [(rltn.id, 'Relation', shpobj) for rltn in rltns for shpobj in flatten_rltn_shpcltn(rltn2cltn(osm_data,rltn))]
    print len(shpobjs_from_nodes),len(shpobjs_from_ways),len(shpobjs_from_rltns)
    shpobjs = []
    shpobjs.extend(shpobjs_from_nodes)
    shpobjs.extend(shpobjs_from_ways)
    shpobjs.extend(shpobjs_from_rltns)
    gpdf = gp.GeoDataFrame(shpobjs, columns=columns)
    return gpdf

In [187]:
# One combined frame holding every tagged node, way and relation geometry.
gpdf = build_gpdf_list_from_osm(OSM_DC_BBOX, DC_NODES_WITH_TAG,DC_WAYS_WITH_TAG, DC_RLTNS_WITH_TAG)

37912 190931 7360


In [188]:
# Sanity check: row count should equal the sum of the three counts printed above.
gpdf.shape

(236203, 3)

In [294]:
# Build separate frames per element type (the node frame is currently disabled).
# gpdf_nodes = build_gpdf_list_from_osm(OSM_DC_BBOX, DC_NODES_WITH_TAG,[], [])
gpdf_ways = build_gpdf_list_from_osm(OSM_DC_BBOX, [],DC_WAYS_WITH_TAG, [])
gpdf_rltns = build_gpdf_list_from_osm(OSM_DC_BBOX, [],[], DC_RLTNS_WITH_TAG)

0 190931 0
0 0 7356


In [290]:
def remove_equal_shpobj(objs):
    import rtree
    size = len(objs)
    equal_pair = []
    keep = []
    exclude_idx = set()
    
    tree_idx = rtree.index.Index()
    objs_bounds = [o.bounds for o in objs]
    for i in xrange(size):
        try:
            tree_idx.insert(i, objs_bounds[i])
        except Exception as e:
            print i, objs_bounds[i], objs[i]
            raise e

    for i in xrange(size):
        if i in exclude_idx:
            continue
        keep.append(i)
        js = tree_idx.intersection(objs[i].bounds)
        for j in js:
            if i!=j and objs[i].equals(objs[j]):
                equal_pair.append((i,int(j)))
                exclude_idx.add(j)
                
    return keep, equal_pair

In [273]:
# Trial run on the first 10000 rows; the returned positions are relative to
# that slice and therefore address the same rows of gpdf.
keep_idx, equal_pair_idx = remove_equal_shpobj(gpdf.iloc[:10000].geometry.values)
# Translate positions to OSM ids via the 'id' column. Series.name would only
# give the row's index label, which is not the OSM id in this frame.
equal_pair_osmids = [(gpdf['id'].iloc[i], gpdf['id'].iloc[j]) for i, j in equal_pair_idx]


In [283]:
from utils.other_utils import find_tree
def merge_within(shp_gpdf):
    import geopandas as gp
    import pandas as pd
    keep, equal_pair= remove_equal_shpobj(shp_gpdf.geometry.values)
    equal_pair_index = [(shp_gpdf.iloc[i].name, shp_gpdf.iloc[j].name) for i, j in equal_pair]
    gpdf_no_equal = shp_gpdf.iloc[keep]
    print 'keep =',len(keep), 'equal pair =',len(equal_pair), gpdf_no_equal.shape, datetime.datetime.now()
    sjoin = gp.tools.sjoin(gpdf_no_equal,gpdf_no_equal,op='within')
    print 'sjoin.shape =',sjoin.shape, datetime.datetime.now()
    messy_tree_df = pd.DataFrame(zip(sjoin.index.values, sjoin.index_right.values), columns=['child','parent'])
    print 'messy tree shape =', messy_tree_df.shape, datetime.datetime.now()
    clean_tree_df = find_tree(messy_tree_df.copy())
    print 'clean tree shape =', clean_tree_df.shape, datetime.datetime.now()
    top_level_shp_idx = clean_tree_df[clean_tree_df.parent=='root'].child.values
    return gpdf[gpdf.index.isin(top_level_shp_idx)], messy_tree_df, equal_pair_index

In [292]:
# Inspect a single way row by position (debugging aid; the saved output
# shows an empty polygon geometry for it).
gpdf_ways.iloc[187330 ]

id              374179850
type                  Way
geometry    POLYGON EMPTY
Name: 187330, dtype: object

In [295]:
# Run merge_within per element type, printing a timestamp between the steps
# (the node run is currently disabled).
# NOTE(review): the saved run of this cell stopped with a MemoryError.
# print datetime.datetime.now()
# merged_gpdf_nodes, messy_tree_df_nodes, equal_pair_index_nodes = merge_within(gpdf_nodes)
print datetime.datetime.now()
merged_gpdf_ways, messy_tree_df_ways, equal_pair_index_ways = merge_within(gpdf_ways)
print datetime.datetime.now()
merged_gpdf_rltns, messy_tree_df_rltns, equal_pair_index_rltns = merge_within(gpdf_rltns)
print datetime.datetime.now()

2016-11-10 20:51:20.763000


MemoryError: 

In [284]:
# Run merge_within on the combined frame.
# NOTE(review): the saved run of this cell ended in an AttributeError after
# the equality-removal step.
merged_gpdf, messy_tree_df, equal_pair_index = merge_within(gpdf)

TopologyException: side location conflict at -77.047313000000003 38.907462299999999
ERROR:shapely.geos:TopologyException: side location conflict at -77.047313000000003 38.907462299999999


keep = 233022 equal pair = 3181 (233022, 3) 2016-11-10 20:16:43.145000


AttributeError: 'PreparedGeometry' object has no attribute 'is_valid'

In [278]:
# Attach gpdf's 'id' column to both ends of every child/parent pair, joining
# on gpdf's index: id_x belongs to the child, id_y to the parent.
temp_df = messy_tree_df.merge(gpdf[['id']], how='left', left_on='child', right_index=True).merge(gpdf[['id']], how='left',left_on='parent', right_index=True)
temp_df

Unnamed: 0,child,parent,id_x,id_y
0,0,0,30066952,30066952
1,1,1,30066953,30066953
2,2,2,30066954,30066954
3,3,3,30066955,30066955
4,4,4,49715946,49715946
5,5,5,49715951,49715951
6,6,6,49715974,49715974
7,7,7,49716326,49716326
8,8,8,49716545,49716545
9,9,9,49716602,49716602


In [279]:
# Spot check: which of these four ids are still present in the merged frame.
merged_gpdf[merged_gpdf.id.isin([358960137, 358960140, 358960142, 358955201])]

Unnamed: 0,id,type,geometry
5411,358955201,Node,POINT (-77.0920599 38.9362035)


In [280]:
# Number of rows left after merging.
merged_gpdf.shape

(9960, 3)

In [281]:
# How often each index label occurs as a parent in the within-join pairs.
messy_tree_df.parent.value_counts()

2047    1
5424    1
1338    1
7481    1
5432    1
9526    1
3379    1
1330    1
7473    1
9518    1
5408    1
3371    1
1322    1
7465    1
5416    1
9510    1
3363    1
1314    1
3387    1
9534    1
5440    1
7489    1
7513    1
5464    1
9558    1
3411    1
1362    1
7505    1
5456    1
9550    1
       ..
6790    1
645     1
2692    1
8833    1
4735    1
6782    1
637     1
2684    1
8825    1
4727    1
4751    1
2708    1
6830    1
661     1
685     1
2732    1
8873    1
4775    1
6822    1
677     1
2724    1
8865    1
4767    1
6814    1
669     1
2716    1
8857    1
4759    1
6806    1
0       1
dtype: int64