# Load raw Foursquare (FS) and OSM data

In [1]:
from shapely.ops import cascaded_union
from ast import literal_eval
import sys, os
sys.path.insert(0, os.path.abspath('../../'))
from Cycling_Safe.utils.osm_helper import *
from Cycling_Safe.utils.str_similarity import similarity
import geopandas as gp
from osmread import Node, Way, Relation
import datetime
import pandas as pd


# Locations of the OSM inputs, relative to the notebook directory.
data_dir = '../data/'
osm_data_dir = '{}osm/'.format(data_dir)
category_in_dc_path = '{}osm_category_in_dc.txt'.format(osm_data_dir)

# Locations of the Foursquare inputs; SUFFIX selects which extraction is used.
FRSQ_DIR = '../data/frsq/'
FRSQ_GRAPH_DIR = '{}graphs/'.format(FRSQ_DIR)
SUFFIX = '_nearby'
frsq_venues_path = FRSQ_DIR + 'frsq_extracted_venues' + SUFFIX + '.geojson'
frsq_venues_mapped_path = FRSQ_DIR + 'frsq_extracted_venues_mapped' + SUFFIX + '.geojson'

In [2]:
# Table linking OSM objects to semantic categories (first CSV column is the index).
category_in_dc_df = pd.read_csv(category_in_dc_path, index_col=0)
# NOTE(review): eval() executes whatever text is stored in the `category` column.
# It is kept because the column is parsed into Python objects that are later
# used as sets; only run this on a trusted file.
category_in_dc_df['category'] = category_in_dc_df['category'].apply(eval)
exclude = 'exclude'


def _is_usable_category(categories):
    """True when the category collection is non-empty and not marked as excluded."""
    return (len(categories) > 0) and (exclude not in categories)


category_in_dc_df['has_category'] = category_in_dc_df['category'].apply(_is_usable_category)
# Shapes of: rows with a usable category, rows with any category at all.
(category_in_dc_df[category_in_dc_df['has_category']].shape,
 category_in_dc_df[category_in_dc_df['category'].apply(len) > 0].shape)

((33003, 4), (35284, 4))

In [3]:

# Read the OSM extract into the project's container object (osm_container comes
# from Cycling_Safe.utils.osm_helper) and report how many elements it holds.
OSM_DC_BBOX_PATH = '../data/osm/osm_dc_bbox.osm'
OSM_DC_BBOX = osm_container(OSM_DC_BBOX_PATH)
print(OSM_DC_BBOX.data_size())


begin reading osm 2016-12-12 13:27:44.910000
finish reading osm 2016-12-12 13:28:55.385000
["len of <class 'osmread.elements.Node'> = 2546260", "len of <class 'osmread.elements.Relation'> = 3427", "len of <class 'osmread.elements.Way'> = 306332"]


In [4]:
def keep_obj(osm_objs, df,osmtype, osmtype_str):
    """Return the objects of one OSM element type that have a usable category.

    osm_objs    -- container exposing ``osm_objs[osmtype]`` as an iterable of
                   objects with an ``id`` attribute.
    df          -- frame with ``type``, ``has_category`` and ``id`` columns.
    osmtype     -- key used to look up the objects in ``osm_objs.osm_objs``.
    osmtype_str -- value of ``df.type`` that identifies the same element type.
    """
    selected = df[(df.type == osmtype_str) & (df['has_category'] == True)]
    wanted_ids = set(selected.id.values)
    kept = []
    for candidate in osm_objs.osm_objs[osmtype]:
        if candidate.id in wanted_ids:
            kept.append(candidate)
    return kept

# Pair each osmread element class with the label used in the category table,
# then filter every element type in the same way.
_osm_kinds = ((Node, 'Node'), (Way, 'Way'), (Relation, 'Relation'))
nodes_with_category, ways_with_category, rltns_with_category = [
    keep_obj(OSM_DC_BBOX, category_in_dc_df, osm_cls, osm_label)
    for osm_cls, osm_label in _osm_kinds
]
len(nodes_with_category), len(ways_with_category), len(rltns_with_category)

(11769, 20017, 1217)

# Transform Foursquare and OSM data into GeoDataFrames

In [5]:
sys.path.insert(0, os.path.abspath('../../'))
from Cycling_Safe.utils.osm.osm2shp import *

## resolve osm relation

In [6]:
def _relation_records(rltn, shapes, offset):
    """Build one (id, id_keep, tag, geometry, type) tuple per shape of a relation.

    ``id_keep`` is '<relation id>_<running index>', where the running index
    starts at ``offset`` so it stays unique across the relation's shapes.
    """
    return [(rltn.id, '{}_{}'.format(rltn.id, offset + i), rltn.tags, shp, 'Relation')
            for i, shp in enumerate(shapes)]


def _relation_frame(records):
    """Turn relation records into a frame carrying the category of each relation.

    The merge uses the columns shared with category_in_dc_df; afterwards the
    original relation id is dropped and ``id_keep`` takes over as ``id``.
    """
    frame = pd.DataFrame(records, columns=['id','id_keep','tag','geometry','type'])
    frame = frame.merge(category_in_dc_df)
    frame = frame.drop(['id','has_category'], axis=1)
    frame.columns = ['id','tag','geometry','type','category']
    return frame


rltn_pts = []
rltn_lns = []

for r in rltns_with_category:
    # rltn2dictShp is a project helper; it is indexed here by geometry kind.
    shps = rltn2dictShp(OSM_DC_BBOX,r)
    cnt = 0
    # Points go to rltn_pts; lines and polygons are both collected in rltn_lns.
    for geom_kind, bucket in (('Point', rltn_pts),
                              ('LineString', rltn_lns),
                              ('Polygon', rltn_lns)):
        members = shps[geom_kind]
        bucket.extend(_relation_records(r, members, cnt))
        cnt += len(members)

rltn_pts = _relation_frame(rltn_pts)
rltn_lns = _relation_frame(rltn_lns)


## get pts

In [7]:
# Buffer radius applied to point geometries after they are projected to
# EPSG:3559 (presumably metres, the unit of that projected CRS -- TODO confirm).
pts_bfr = 5

In [8]:
# One record per categorised OSM node; node2pt is a project helper
# (presumably returns a point geometry -- TODO confirm).
pts = [(nd.id, nd.tags, node2pt(nd),'Node') for nd in nodes_with_category]
pts_gpdf = gp.GeoDataFrame(pts, columns=['id','tag', 'geometry','type'])

# Attach the category by merging on the columns shared with the category
# table, then add the point geometries that came out of the relations.
pts_gpdf = pts_gpdf.merge(category_in_dc_df).drop('has_category',axis=1)
pts_gpdf = pts_gpdf.append(rltn_pts,ignore_index=True)
# pts_gpdf.tag = pts_gpdf.tag.apply(str)

# Declare the coordinates as EPSG:4326, project to EPSG:3559 so the buffer
# distance is expressed in that CRS's units, buffer, and project back.
pts_gpdf.crs = {'init': u'epsg:4326'}
pts_gpdf_crs = pts_gpdf.to_crs(epsg=3559)
pts_gpdf_crs.geometry = pts_gpdf_crs.buffer(pts_bfr)

pts_gpdf_bfr = pts_gpdf_crs.to_crs(epsg=4326)
# Positional rename: relies on the column order produced by the steps above.
pts_gpdf_bfr.columns=['id','tag','geometry','type','category']

In [9]:
# Load the Foursquare venues (presumably already mapped to the semantic
# categories, judging by the file name -- TODO confirm).
venues_gpdf = gp.read_file(frsq_venues_mapped_path)

# Project to EPSG:3559, buffer every venue by pts_bfr, then project back.
venues_gpdf_crs = venues_gpdf.to_crs(epsg=3559)
venues_gpdf_crs.geometry = venues_gpdf_crs.buffer(pts_bfr)

venues_gpdf_bfr = venues_gpdf_crs.to_crs(epsg=4326)
venues_gpdf_bfr = venues_gpdf_bfr.drop(['checkins','tips','users'],axis=1)
venues_gpdf_bfr['type']='fq'
# NOTE(review): positional rename -- assumes the remaining columns arrive in
# exactly this order from the GeoJSON file; verify against the file's schema.
venues_gpdf_bfr.columns=['tag','geometry','id','category','name','type']
# Drop venues without a mapped category and wrap the remaining category in a
# one-element set so it can be combined with the set-valued OSM categories.
venues_gpdf_bfr = venues_gpdf_bfr[venues_gpdf_bfr.category!='no category']
venues_gpdf_bfr.category = venues_gpdf_bfr.category.apply(lambda x: set([x]))

## get lines or polygons, append relation lines

In [10]:
# One record per categorised OSM way; way2lineOrpoly is a project helper
# (presumably yields a line or polygon geometry -- TODO confirm).
lns = [(way.id, way.tags, way2lineOrpoly(OSM_DC_BBOX, way),'Way') for way in ways_with_category]
lns_gpdf = gp.GeoDataFrame(lns, columns=['id','tag', 'geometry','type'])

# Merge against only those category rows that passed the has_category filter.
lns_gpdf = lns_gpdf.merge(category_in_dc_df[category_in_dc_df.has_category]).drop('has_category', axis=1)
print lns_gpdf.shape

# Add the line and polygon geometries resolved from relations.
lns_gpdf = lns_gpdf.append(rltn_lns,ignore_index=True)
print lns_gpdf.shape

# lns_gpdf.tag = lns_gpdf.tag.apply(str)

# Positional rename: relies on the column order produced by the steps above.
lns_gpdf.columns=['id','tag','geometry','type','category']

Ring Self-intersection at or near point -77.020059900000007 38.815362499999999


(20017, 5)
(23956, 5)




# find overlap

## overlap between osm nodes and fq

In [11]:
# Spatial join of the buffered OSM points (left) with the buffered Foursquare
# venues (right); columns present on both sides get _left/_right suffixes.
fq_pts = gp.tools.sjoin(pts_gpdf_bfr, venues_gpdf_bfr)

In [12]:
# (rows, columns) of the joined OSM-point / venue pairs.
fq_pts.shape

(3952, 11)

In [13]:
# Display the joined OSM-point / venue pairs.
fq_pts
# Inspection snippets kept for reference ("over kill" example):
#   pairs whose Foursquare category is fully contained in the OSM category
# fq_pts[(fq_pts.apply(lambda x: len(x.category_right - x.category_left)==0, axis=1))]
#   all pairs of one specific OSM node
# fq_pts[(fq_pts.id_left==2276095966)]

Unnamed: 0,id_left,tag_left,geometry,type_left,category_left,index_right,tag_right,id_right,category_right,name,type_right
8,49716664,"{u'source': u'Bing', u'highway': u'crossing'}",POLYGON ((-77.00900476913097 38.88750470445696...,Node,set([cycling facilities]),11142,Government Building,4be09e4c98f2a5931cd8c25a,set([professional service]),Office of Rep. Chris Van Hollen (MD-08),fq
22,49719415,{u'highway': u'traffic_signals'},POLYGON ((-77.05696555480179 38.90525342812081...,Node,set([cycling facilities]),1506,Men's Store,5391f13f498e8a555acb7023,set([retail shop]),Ike Behar,fq
58,49732053,{u'highway': u'traffic_signals'},POLYGON ((-77.05805425481758 38.90523442865798...,Node,set([cycling facilities]),1475,Bank,4bc0fb9874a9a593f019d1f6,set([professional service]),Wells Fargo,fq
64,49732286,{u'highway': u'traffic_signals'},"POLYGON ((-77.02800236543074 38.8920883138304,...",Node,set([cycling facilities]),10258,Art Gallery,4c27ccfc97d00f4725f13eea,set([art]),Michael Jackson's Fedora @Apollo Exhibit,fq
9321,3016605404,{u'highway': u'traffic_signals'},POLYGON ((-77.02814476543053 38.89208861390066...,Node,set([cycling facilities]),10258,Art Gallery,4c27ccfc97d00f4725f13eea,set([art]),Michael Jackson's Fedora @Apollo Exhibit,fq
72,49734010,{u'highway': u'stop'},POLYGON ((-77.01360948760406 38.86462800672898...,Node,set([cycling facilities]),12979,Government Building,4de4542fd164df8575287ffe,set([professional service]),National Response Center,fq
76,49734611,{u'highway': u'traffic_signals'},POLYGON ((-77.04331293640715 38.92800152138454...,Node,set([cycling facilities]),14338,Bike Rental / Bike Share,4e54f80ad4c0fe0342802d27,set([cycling facilities]),Adams Mill & Harvard Bikestation,fq
77,49734612,"{u'source': u'survey', u'highway': u'traffic_s...",POLYGON ((-77.03876393272729 38.93255011914005...,Node,set([cycling facilities]),13548,Laundry Service,4be0a15bcb81c9b6bb7e668b,set([retail shop]),Lamont Cleaners,fq
10139,3378524030,"{u'operator': u'United States Postal Service',...",POLYGON ((-77.03892363276702 38.93250101921883...,Node,set([professional service]),13548,Laundry Service,4be0a15bcb81c9b6bb7e668b,set([retail shop]),Lamont Cleaners,fq
11197,4082394066,"{u'stop': u'minor', u'highway': u'stop'}",POLYGON ((-77.03888013276854 38.93249911919739...,Node,set([cycling facilities]),13548,Laundry Service,4be0a15bcb81c9b6bb7e668b,set([retail shop]),Lamont Cleaners,fq


In [14]:
def _osm_name(tags):
    """Name stored in an OSM tag dict, or '' when the object has no name tag."""
    return tags.get('name','')


def _name_similarity(row):
    """Similarity between the OSM name and the Foursquare venue name of one pair."""
    return similarity(row.name_left, row['name'])


fq_pts['name_left'] = fq_pts['tag_left'].apply(_osm_name)
fq_pts['name_sml'] = fq_pts.apply(_name_similarity, axis=1)

In [15]:
def connected_nodes(edges):
    """Group the endpoints of ``edges`` into connected components.

    edges -- iterable of 2-item pairs; each pair links two node identifiers.

    Returns a list with one list of node identifiers per connected component
    of the undirected graph built from ``edges``.

    Uses ``nx.connected_components`` directly: the previously used
    ``nx.connected_component_subgraphs`` was removed in networkx 2.4 and built
    a full subgraph per component only to read its node list.
    """
    import networkx as nx
    G = nx.Graph()
    G.add_edges_from(edges)
    return [list(component) for component in nx.connected_components(G)]
def similar_name(row,threshold=0.8):
    """True when the pair's name similarity (``row.name_sml``) exceeds ``threshold``."""
    score = row.name_sml
    return score > threshold
def no_name_but_same_category(row):
    """True when no name score exists and the right categories are covered by the left.

    A ``name_sml`` of -1.0 is treated as "no usable name comparison"
    (presumably the sentinel returned by ``similarity`` -- TODO confirm).
    Only then is the category check applied: every category on the right
    side must also be present on the left side.
    """
    if row.name_sml == -1.0:
        uncovered = row.category_right - row.category_left
        return len(uncovered) == 0
    return False

In [16]:

# Every joined (OSM id, Foursquare id) pair is an edge; connected components
# group all objects that touch each other directly or transitively.
pairs = fq_pts[['id_left','id_right']].values
conn_nodes = connected_nodes(pairs)
print('conn components:  {}'.format(len(conn_nodes)))


def mask_df(df, x):
    """Rows of ``df`` whose left or right id belongs to the id collection ``x``."""
    return df[(df.id_left.isin(x))|df.id_right.isin(x)]


over_lap_dfs = []
for idx, component in enumerate(conn_nodes):
    x = set(component)
    mask = mask_df(fq_pts,x)
    # The selected rows must mention exactly the ids of this component.
    assert set(mask[['id_left','id_right']].values.flatten())==x, (idx, x,mask)
    # Prefer pairs with similar names; only when a component has none, fall
    # back to pairs without a name score whose categories agree.
    matched = mask[mask.apply(similar_name,axis=1)].copy()
    criteria = 'similar name'
    if not matched.shape[0]>0:
        matched = mask[mask.apply(no_name_but_same_category,axis=1)].copy()
        criteria = 'same category'
    matched['criteria'] = criteria
    matched['component_idx'] = idx
    over_lap_dfs.append(matched)

fq_pts_overlap = pd.concat(over_lap_dfs)
print(fq_pts_overlap.shape)
print(fq_pts_overlap.criteria.value_counts())
print(fq_pts_overlap.component_idx.value_counts().value_counts())

conn components:  1882
(531, 15)
similar name     433
same category     98
dtype: int64
1    407
2     44
3      6
5      2
4      2
dtype: int64


In [17]:
# Distinct OSM ids and Foursquare ids that take part in an accepted overlap.
# Series.unique() is used instead of np.unique because `np` is never imported
# in this notebook (it was only reachable through a star import). The ids come
# back in order of first appearance rather than sorted, which does not matter
# for the membership tests they are used in.
overlap_nodeid = fq_pts_overlap.id_left.unique()
overlap_fqid = fq_pts_overlap.id_right.unique()
overlap_nodeid.shape, overlap_fqid.shape

((513L,), (521L,))

In [18]:
# Keep the OSM points and Foursquare venues that are not part of any accepted
# overlap pair; the overlapping ones are re-added later as merged objects.
pts_not_overlap = pts_gpdf_bfr[~pts_gpdf_bfr.id.isin(overlap_nodeid)]
fqs_not_overlap = venues_gpdf_bfr[~venues_gpdf_bfr.id.isin(overlap_fqid)]

In [19]:
# Build one merged object per accepted (OSM object, Foursquare venue) pair:
# the categories are united, both tags are kept in one string, and the
# geometry is the union of the two buffered shapes.
overlap_merged = []
for idx, row in fq_pts_overlap.iterrows():
    oid, fqid = row.id_left, row.id_right
    category = row.category_left | row.category_right
    tag = str(row.tag_left) + '<br> FS:'+ row.tag_right
    # row.geometry is the OSM-side geometry of the join; the venue geometry is
    # looked up once by id (first match is used).
    venue_geom = venues_gpdf_bfr[venues_gpdf_bfr.id==fqid].geometry.values[0]
    union_pts = cascaded_union([row.geometry, venue_geom])
    tp = row.type_left+'_fq'
    overlap_merged.append((oid, tag, category, union_pts,tp))
overlap_merged = gp.GeoDataFrame(overlap_merged,columns=['id','tag','category','geometry','type'])

In [21]:
# Final point layer: untouched OSM points, untouched Foursquare venues and the
# merged overlap objects, stacked with a fresh 0..n-1 index.
pts_merged = pd.concat([pts_not_overlap, fqs_not_overlap, overlap_merged],ignore_index=True)
pts_not_overlap.shape, fqs_not_overlap.shape, overlap_merged.shape, pts_merged.shape

((11268, 5), (17576, 6), (531, 5), (29375, 6))

## overlap between osm ways

In [22]:
from Cycling_Safe.utils.geofunc import remove_equal_shpobj, merge_within

In [23]:
def get_sub_tree(tree_df, node_id, looping=0):
    """Collect ``node_id`` together with all of its descendants.

    tree_df -- frame with a ``node`` column and a ``parent`` column holding the
               id of each node's parent.
    node_id -- id of the node whose subtree is wanted; must occur in
               ``tree_df.node``.
    looping -- current recursion depth; an AssertionError is raised once it
               reaches 100, as a guard against cycles in the parent links.

    Returns ``(ids, shape)``: the set of ids in the subtree (including
    ``node_id`` itself) and the shape of the rows matching ``node_id``.
    """
    is_node = (tree_df.node == node_id)
    is_child = (tree_df.parent == node_id)
    assert looping < 100, 'looping too much nodeid:' + str(node_id)

    node_rows = tree_df[is_node]
    collected = {node_rows.node.values[0]}

    child_ids = tree_df[is_child].node.values
    if len(child_ids) > 0:
        collected.update(child_ids)
        # Descend into every child and fold its subtree into the result.
        for child in child_ids:
            collected |= get_sub_tree(tree_df, child, looping + 1)[0]
    return collected, node_rows.shape

In [24]:
# merge_within is a project helper (Cycling_Safe.utils.geofunc). The cells
# below use its four results as: the kept objects (tree), two parent tables,
# and index pairs of objects considered equal.
# NOTE(review): meanings inferred from later usage and the log output --
# confirm against geofunc.
tree, tree_all_parent, tree_direct_parent, equal_pair_index = merge_within(lns_gpdf)
# Identity apply; presumably forces a fresh copy of the frame -- TODO confirm it is needed.
tree = tree.apply(lambda x: x)

begin merge within 2016-12-12 13:30:13.921000
keep = 23319 equal pair = 637 (23319, 5) 2016-12-12 13:30:23.747000
sjoin.shape = (34072, 10) 2016-12-12 13:31:05.812000
messy tree shape = (34072, 2) 2016-12-12 13:31:05.839000
clean tree shape = (23319, 3) 2016-12-12 13:31:18.782000


In [25]:
# Equal-object pairs as a frame: `keep` is the index that was retained,
# `equal` the index of an object treated as equal to it.
df_epi = pd.DataFrame(equal_pair_index,columns=['keep','equal'])

In [26]:
# Root nodes (parent == -1) that occur as some other node's parent, i.e. the
# roots that actually have children.
has_children = tree_direct_parent.node.isin(tree_direct_parent.parent.value_counts().index.tolist())
is_root = (tree_direct_parent.parent == -1)
nodes_have_children = tree_direct_parent[has_children & is_root].node.tolist()
print(len(nodes_have_children))
print(datetime.datetime.now())
# Subtrees are stored under their own root id as they are computed. Zipping
# the root list with a list of successes would shift every later subtree onto
# the wrong root as soon as one call fails.
sub_trees = {}
shape_error = []
assert_error = []
for node_id in nodes_have_children:
    try:
        subtree, shape = get_sub_tree(tree_direct_parent, node_id)
    except Exception as e:
        # Keep going, but remember which root failed and why.
        assert_error.append((node_id, e))
        continue
    # A root is expected to match exactly one row of 7 columns; anything else
    # is recorded for inspection but the subtree is still kept.
    if shape != (1, 7):
        shape_error.append((node_id, subtree, shape))
    sub_trees[node_id] = subtree
print(datetime.datetime.now())
print(len(assert_error))

1209
2016-12-12 13:31:18.832000
2016-12-12 13:31:32.250000
0


In [27]:
# For every kept object, unite its own categories with those of the objects
# recorded as equal to it and, for roots with children, with the categories of
# everything in its subtree.
new_category = []
for idx, row in tree.iterrows():
    equal_ids = df_epi[df_epi.keep==idx].equal
    equal_objs = lns_gpdf.loc[equal_ids].category.values
    if len(equal_objs)>0:
        merged = set.union(*equal_objs)
    else:
        merged = set()
    merged.update(row.category)
    if idx in sub_trees:
        members = list(sub_trees[idx])
        merged.update(set.union(*lns_gpdf.loc[members].category.values))
    new_category.append(merged)


In [28]:
# Replace each kept object's category with the merged category set; relies on
# new_category having been built in the row order of `tree`.
tree['category'] = new_category

In [29]:
# Final line/polygon layer, declared as EPSG:4326 coordinates.
lns_merged = gp.GeoDataFrame(tree)

lns_merged.crs={'init': 'epsg:4326', 'no_defs': True}
lns_merged.shape

(14554, 5)

## (discarded) overlap between ways and points

In [30]:
# lns_pts_overlap = gp.tools.sjoin(lns_merged, pts_merged)
# # lns_pts_overlap = lns_pts_overlap[lns_pts_overlap.category_left==lns_pts_overlap.category_right]
# lns_pts_overlap = lns_pts_overlap[lns_pts_overlap.apply(lambda x: len(x.category_right - x.category_left)==0, axis=1)]
# pts_in_lns = pd.unique(lns_pts_overlap.index_right)
# pts_not_in_lns = pts_merged.drop(pts_in_lns)

### visualize data on the map

In [32]:
# Street segments; the cells below rely on its STREETSEGID and geometry columns.
dc_segments = gp.read_file('../data/opendc_segments.geojson')

In [33]:
# Segment ids flagged as outliers are stored as a Python expression on the
# first line of dc_outliners.txt. The file is opened in a `with` block so the
# handle is closed again.
# NOTE(review): eval() executes whatever that line contains -- only use with a
# trusted file; ast.literal_eval would be safer if the line is a plain literal.
with open('dc_outliners.txt') as outliners_file:
    outliners_segid = eval(outliners_file.readlines()[0])
outliners_segid = set(outliners_segid)
dc_outliners = dc_segments[dc_segments.STREETSEGID.isin(outliners_segid)][['STREETSEGID','geometry']].copy()

In [34]:
# Colour used for the outlier segments on the map.
dc_outliners['color']='#F00'

In [40]:
# Visualisation copies: add a colour per object type and turn the set/dict
# columns into strings before handing the frames to the map writer.
lns_merged_vis = lns_merged.copy()
# A type missing from the mapping raises KeyError.
lns_merged_vis['color'] = lns_merged_vis.apply(lambda x: {'Way':'#00F','Relation':'#0FF'}[x.type], axis=1)
lns_merged_vis.category = lns_merged_vis.category.apply(str)
lns_merged_vis.tag = lns_merged_vis.tag.apply(str)



pts_merged_vis = pts_merged.copy()
pts_merged_vis['color'] = pts_merged_vis.apply(lambda x: {'Node':'#F00', 'fq':'#0F0', 'Node_fq':'#FF0','Relation_fq':'#FF0', 'Relation':'#0FF'}[x.type],axis=1)
pts_merged_vis.category = pts_merged_vis.category.apply(str)
# Tags are either dicts (keys and values are UTF-8 encoded, then stringified)
# or plain strings (UTF-8 encoded as they are).
pts_merged_vis.tag = pts_merged_vis.tag.apply(lambda x: str({k.encode('utf-8'):v.encode('utf-8') for k, v in x.items()}) if type(x)==dict else x.encode('utf-8'))

# Reload the helper module so edits to it are picked up without a kernel restart.
import Cycling_Safe.utils.leaflet_creation_v2;reload(Cycling_Safe.utils.leaflet_creation_v2)
from Cycling_Safe.utils.leaflet_creation_v2 import get_color_for_df, create_map_visualization
html_title = 'osm_frsq_with_category'
file_path = '../data/'
file_name = 'osm_frsq_with_category'
# Presumably the initial map centre and zoom level -- TODO confirm against
# create_map_visualization.
lon, lat = -77.0163424758, 38.9047829846
zoom = 12
init_layer = ['light']
map_layer = ['light','streets','satellite']
# binding_data and gpdfs are parallel lists: one entry per frame passed to the map.
binding_data = [['pts', 'points'], ['osm_lns','osm lines'],['dc_outliners','dc_outliners']]
gpdfs = [pts_merged_vis, lns_merged_vis,dc_outliners]
create_map_visualization(html_title, file_path, file_name, lat, lon, zoom, init_layer, map_layer, binding_data, gpdfs)

## output category distribution

### get data near segment

In [82]:
def crs_prepossess(gpdf, init_crs, bfr_crs):
    """Return a copy of ``gpdf`` projected to the CRS with EPSG code ``bfr_crs``.

    gpdf     -- frame exposing ``copy()``, a ``crs`` attribute and ``to_crs``.
    init_crs -- EPSG code assumed for the data when ``gpdf`` has no CRS set.
    bfr_crs  -- EPSG code of the target CRS.

    The input frame is left untouched; the CRS default is set on the copy only.
    """
    gpdf_crs = gpdf.copy()
    # Identity test: a CRS object may define its own __eq__, so "== None" is
    # not a reliable way to detect a missing CRS.
    if gpdf_crs.crs is None:
        gpdf_crs.crs = {'init': u'epsg:{}'.format(init_crs)}
    return gpdf_crs.to_crs(epsg=bfr_crs)

def bfr_20m(seg):
    """Return the geometry of ``seg`` buffered by 20 units of its CRS."""
    geom = seg.geometry
    return geom.buffer(20)

def get_objs_nearby(segments, objs, bfr_func, init_crs, bfr_crs):
    """Find the objects that intersect a buffered version of each segment.

    segments -- frame of segments.
    objs     -- frame of candidate objects.
    bfr_func -- function applied to every segment row; its result becomes the
                (buffered) geometry of that segment.
    init_crs -- EPSG code assumed for a frame that has no CRS set.
    bfr_crs  -- EPSG code of the CRS in which buffering and the join are done.

    Returns ``(sjoin, objs_nearby)``: the spatial join of buffered segments
    with the projected objects, and the rows of the original ``objs`` whose
    index appears in that join.
    """
    seg_crs = crs_prepossess(segments, init_crs, bfr_crs)
    obj_crs = crs_prepossess(objs, init_crs, bfr_crs)
    # Use the caller-supplied buffer function (it used to be ignored in favour
    # of a hard-coded bfr_20m).
    seg_crs.geometry = seg_crs.apply(bfr_func,axis=1)

    sjoin = gp.tools.sjoin(seg_crs, obj_crs,op='intersects')
    obj_ids_nearby = set(sjoin.index_right)
    objs_nearby = objs[objs.index.isin(obj_ids_nearby)]
    return sjoin,objs_nearby


In [112]:

# Stack the merged lines/polygons and the merged points into one frame with a
# fresh 0..n-1 index, then find which of them intersect a buffered street segment.
semantic_data = lns_merged.append(pts_merged,ignore_index=True)
semantic2seg, semantic_near = get_objs_nearby(dc_segments, semantic_data, bfr_20m, 4326, 3559)

In [114]:
# Shapes of the segment/object join and of the distinct objects near any segment.
semantic2seg.shape, semantic_near.shape

((113138, 28), (38233, 6))

### output category count distribution

In [115]:
# Category collection of every nearby object, each converted to a list.
list_mapped = semantic_near.category.apply(list).values
len(list_mapped)

38233

In [116]:
from collections import defaultdict
def get_sum_categories(list_mapped):
    """Count how often each category occurs across a collection of category groups.

    list_mapped -- iterable whose items are themselves iterables of categories.

    Returns a ``defaultdict(int)`` mapping category -> number of occurrences.
    """
    tally = defaultdict(int)
    for categories in list_mapped:
        for label in list(categories):
            tally[label] = tally[label] + 1
    return tally
# Count the nearby objects per category and write the table to CSV.
# Python 2: dict.items() returns a list of pairs, which DataFrame accepts directly.
pd.DataFrame(get_sum_categories(list_mapped).items(),columns=['category','osm_plus_fq']).to_csv('../data/category_osm_fq.csv')

### output category segment distribution

In [117]:
# Category counts of the nearby objects, computed separately for every segment.
semantic_per_segment = {
    segid: dict(get_sum_categories(grp.category.values))
    for segid, grp in semantic2seg.groupby('STREETSEGID')
}
len(semantic_per_segment), dc_segments.shape

(12522, (13522, 22))

In [118]:
# Left merge keeps every street segment; segments without any nearby object
# end up with NaN in `category`.
# Python 2: dict.items() returns a list of (segment id, counts) pairs.
dc_segment_category = dc_segments[['STREETSEGID']].merge(pd.DataFrame(semantic_per_segment.items(),columns=['STREETSEGID','category']),how='left')

In [119]:
def _category_total(counts):
    """Sum of all category counts of a segment; 0 for the float (NaN) placeholder."""
    if type(counts) == float:
        return 0
    return sum(counts.values())


dc_segment_category['total'] = dc_segment_category['category'].apply(_category_total)

In [120]:
# One output column per category label.
labels = ['art','outdoors and recreation', 'retail shop', 'professional service', 'food', 'nightlife spot','residence','schools&university','cycling facilities','transportation']


def _label_count(counts, label):
    """Count stored for ``label``; 0 when it is absent or the segment has no counts (float NaN)."""
    if type(counts) == float:
        return 0
    return counts.get(label,0)


for label in labels:
    dc_segment_category[label] = dc_segment_category['category'].apply(_label_count, args=(label,))

In [121]:
# Write the per-segment totals and per-label counts, without the raw dict column.
dc_segment_category.drop('category',axis=1).to_csv('../data/category_osm_fq_per_segment.csv')