Goal: process the streets into links (edges) and nodes, where the nodes are the intersections.

In [1]:
import os
import pandas as pd
import json

from mapboxgl.utils import create_color_stops, df_to_geojson
from mapboxgl.viz import CircleViz,MapViz
token = os.getenv('MAPBOX_TOKEN')

# let's plot the missing nodes
# Generate data breaks and color stops from colorBrewer
def show_points(to_show,data):
    """Plot the selected nodes as circles on a Mapbox map.

    Parameters
    ----------
    to_show : iterable of node ids
        The nodes to plot. Any iterable (list, set, ...) is accepted. Every
        id must occur in ``data['node']``; ``.loc`` raises ``KeyError``
        otherwise.
    data : pandas.DataFrame
        Must contain the columns ``node``, ``name``, ``street``, ``lat`` and
        ``lon``.

    Returns
    -------
    None
        The map is rendered through ``viz.show()``.
    """
    # pandas >= 2.0 rejects a set as a .loc indexer, and callers pass sets of
    # node ids, so materialise the ids as a list first.
    node_ids = list(to_show)

    # A node id that occurs on several rows of `data` yields all of those rows.
    nodes = data.set_index('node')
    features = df_to_geojson(nodes.loc[node_ids].reset_index(),
                             properties=['node','name','street'],
                             lat='lat', lon='lon')

    # Create the viz from the GeoJSON features
    viz = CircleViz(features,
                    access_token=token,
                    radius=1,
                    stroke_color='black',
                    stroke_width=0.2,
                    center=(-73.97, 40.77),
                    zoom=12,
                    below_layer='waterway-label')
    viz.show()

# Load the street data; the path is relative to the notebook's working directory.
streets = pd.read_csv('nyc_streets.csv')

In [2]:
# Quick look at the data: rows per road type, overall shape, then the first rows.
print(streets['type'].value_counts(), streets.shape, sep='\n')
streets.head(n=5)

residential      12733
secondary         8605
primary           6639
tertiary          1567
unclassified      1127
living_street       60
Name: type, dtype: int64
(30731, 7)


Unnamed: 0,street,name,type,oneway,node,lat,lon
0,5668966,West 106th Street,secondary,,4205830390,40.798208,-73.960425
1,5668966,West 106th Street,secondary,,3602678205,40.798324,-73.960705
2,5668966,West 106th Street,secondary,,4205830391,40.798485,-73.96109
3,5668968,West 80th Street,residential,yes,42421778,40.785593,-73.982011
4,5668968,West 80th Street,residential,yes,8727756094,40.785083,-73.980801


In [3]:
# Is there any way with a single node (one that could not form an edge)?
# List the number of rows per way, smallest counts first.
streets['street'].value_counts().iloc[::-1]

# No: the smallest count is 2.

962829745      2
584971404      2
420877078      2
156994969      2
584971388      2
            ... 
404253364     67
68674962      71
664893485     80
5669386       94
828561963    143
Name: street, Length: 5481, dtype: int64

An intersection is a node that is shared by more than one street.

### Useful info
The nodes defining the geometry of the way are enumerated in the correct order, and indicated only by reference using their unique identifier. These nodes must have been already defined separately with their coordinates. 

### Tasks
- crawl nodes and create edges
- join edges that are part of the same street
- find nodes that are intersections - some are just streets broken up into many ways
- join streets to leave legit intersections
- remove intersections at the same geographical location

In [4]:
# first create a links data set
def create_links(df):
    """Turn the ordered node rows of one way into consecutive links.

    Each row is paired with the row that follows it, so ``n`` rows produce
    ``n - 1`` links (none for a single-row frame).

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of a single way, in node order. Must contain the columns
        ``street``, ``name``, ``type``, ``oneway`` and ``node``; any other
        columns are ignored.

    Returns
    -------
    pandas.DataFrame
        Columns ``street, name, type, oneway, start, end``, indexed like the
        first ``n - 1`` rows of ``df``.
    """
    # Select by name rather than by position so the result does not depend
    # on the column order of the frame that is passed in.
    link_columns = ['street', 'name', 'type', 'oneway', 'node']
    node_ids = df['node'].values

    links = df.iloc[:-1][link_columns].rename(columns={'node': 'start'})
    # the end of each link is the node on the following row
    links['end'] = node_ids[1:]
    return links

# Build the links one way at a time, in street-id order. Iterating over the
# groups hands create_links every column, including 'street'; groupby.apply
# operating on the grouping column is deprecated since pandas 2.2.
edges = pd.concat(
    [create_links(way) for _, way in streets.groupby('street')],
    ignore_index=True,
)
# expose the running row number as a unique edge identifier
edges = edges.reset_index().rename(columns=dict(index='edge_id'))
edges.head()

Unnamed: 0,edge_id,street,name,type,oneway,start,end
0,0,5668966,West 106th Street,secondary,,4205830390,3602678205
1,1,5668966,West 106th Street,secondary,,3602678205,4205830391
2,2,5668968,West 80th Street,residential,yes,42421778,8727756094
3,3,5668968,West 80th Street,residential,yes,8727756094,42421776
4,4,5668968,West 80th Street,residential,yes,42421776,8727756090


In [5]:
# Check whether any street was dropped while building the edges.
street_ids = edges['street'].unique()
# One entry per row of `streets` whose street id has no edge. The vectorised
# isin replaces a per-row `not in` scan of the id array.
missing = streets.loc[~streets['street'].isin(street_ids), 'street'].tolist()
print(len(missing))

0


Now let's try to keep only the real intersections, i.e. the nodes where a street meets a different street.

In [39]:
%%time
# Shared state for joining edges into chains in this cell.
# `deadends` is only read in this cell (its length is printed at the end);
# nothing here adds to it.
deadends = set()
# ids of the edges that have already been placed in a chain
done = set()

# collect the not-yet-used edges touching a node
def get_connected(node):
    """
    Look up the edges that touch ``node`` and are not yet in ``done``.

    Reads the module-level ``edges`` frame and ``done`` set.

    Returns a ``(result, found)`` pair. When at least one edge matches,
    ``result`` is a DataFrame with one row per edge, the node at its other
    end in the ``next`` column, and ``found`` is True. Otherwise ``result``
    is an empty list and ``found`` is False.
    """
    remaining = edges[~edges['edge_id'].isin(done)]

    # edges arriving at the node: the neighbour is their start
    arriving = remaining[remaining['end'] == node].drop(columns=['end'])
    # edges leaving the node: the neighbour is their end
    leaving = remaining[remaining['start'] == node].drop(columns=['start'])

    pieces = [
        frame.rename(columns={far_side: 'next'})
        for frame, far_side in ((arriving, 'start'), (leaving, 'end'))
        if not frame.empty
    ]

    if not pieces:
        return pieces, False
    return pd.concat(pieces, axis=0), True

# walk along connected edges until an intersection or a dead end
def get_next(node,back=False):
    """
    Follow the chain of unused edges leading away from ``node``.

    At every step ``get_connected`` is asked for the edges at the current
    node that are not yet in ``done``. If there is exactly one, it is added
    to ``done`` and the walk moves to the node at its other end. The walk
    stops when there is none (dead end) or more than one (treated as an
    intersection).

    Implemented as a loop rather than by recursion, so the length of a chain
    is not limited by the interpreter's recursion depth.

    NOTE(review): only edges not yet in ``done`` are counted, so a node whose
    other edges were consumed by earlier chains is walked through as if it
    were not an intersection.

    Parameters
    ----------
    node : node id at which the walk starts.
    back : unused; kept so existing calls keep working.

    Returns
    -------
    list of ``(edge_id, node)`` tuples, one per edge followed, in walking
    order; ``node`` is the node reached through that edge. Empty if the walk
    stops immediately.
    """
    chain = []
    while True:
        nexts,is_valid = get_connected(node)

        # nothing left to follow, or several candidates: stop here
        if not is_valid or len(nexts) > 1:
            return chain

        step = nexts.iloc[0]
        next_edge = step['edge_id']
        node = step['next']
        done.add(next_edge)
        chain.append((next_edge,node))

# Build the chains: start from every edge that is not used yet and extend it
# in both directions with get_next.
new_edges = []
for idx, edge in edges.iterrows():

    # because we are constructing complete chains,
    # an edge that is already in done belongs to an earlier chain: skip it
    # (idx is the row label of `edges`; it is compared with the ids in `done`)
    if idx in done:
        continue

    done.add(idx)

    # NOTE(review): get_next keeps joining through any node that has exactly
    # one unused edge left; street names are not compared anywhere in this
    # cell.

    # extend backwards from the start node; get_next lists the steps
    # nearest-first, so reverse them to read in chain order
    start = edge['start']
    reverse = get_next(start)[::-1]
    # extend forwards from the end node
    end = edge['end']
    forward = get_next(end)

    # stick them all together
    chain = reverse + [(idx,start),] + [(idx,end),] + forward
    new_edges.append(tuple(chain))

# nothing in this cell adds to deadends, so this prints 0
print(len(deadends))
print(len(new_edges))

0
3494
CPU times: user 2min 49s, sys: 2.92 s, total: 2min 52s
Wall time: 2min 47s


In [265]:
# Collect the first and last node of every chain in new_edges; these end
# nodes are treated as the intersections.
# `skip` holds positions in new_edges that are left out (those chains are
# printed in a later cell).
# NOTE(review): the positions are hard-coded and depend on the order in which
# new_edges was built - re-check them after re-running the chain-building cell.
skip = [287,507,508,543,2320,2321,2676,2677,3396,]
intersections = set()
for i,path in enumerate(new_edges):
    if i not in skip:
        intersections.update((path[0][1],path[-1][1]))
print(len(intersections))

4204


In [263]:
# plot the collected intersection nodes on the map
show_points(intersections,streets)

In [266]:
# print the chains that were left out via `skip`, for inspection
for s in skip:
    print(new_edges[s])

((996, 42434140), (996, 3919359298), (997, 4235738201), (998, 42434142))
((1755, 42434839), (1755, 5804849353), (1756, 42438779))
((2641, 42430131), (2640, 4890621100), (1757, 42438779), (1757, 4890553386), (1758, 42438781))
((11488, 42439249), (11489, 3919350757), (1916, 595407639), (1916, 3919350751), (1917, 42434140))
((14944, 42440737), (14945, 8309479683), (14951, 4205565491), (14952, 6139477287), (14953, 8309479673), (14938, 42436942), (14939, 8309479675), (14940, 4205565490), (14941, 8309479599), (9355, 42442870), (9355, 8309479600), (9356, 3420766639), (9357, 8309479268), (9358, 42442862), (9359, 4236104525), (9360, 3420766638), (9361, 42430898), (9362, 3420766637), (9363, 3420766636), (9364, 42442857), (9365, 3420766635), (9366, 3420766634), (9367, 8309479234), (9368, 42434142))
((23730, 7490266268), (23729, 3834459546), (23728, 8309479527), (23727, 4235738199), (23726, 4376122106), (9369, 42434142), (9369, 3420766633), (9370, 3420766632), (9371, 42434201))
((11186, 42455051),