In [8]:
def crs_prepossess(gpdf, init_crs, bfr_crs):
    """
    Return a copy of ``gpdf`` reprojected to the EPSG code ``bfr_crs``.

    When ``gpdf`` carries no CRS, the copy is first tagged with the EPSG code
    ``init_crs`` so the reprojection has a source CRS to start from.
    The frame passed in is never modified; all work happens on a copy.

    gpdf: object exposing ``copy()``, a ``crs`` attribute and
          ``to_crs(epsg=...)`` (a geopandas.GeoDataFrame in this notebook)
    init_crs: EPSG code assumed for ``gpdf`` when its ``crs`` is None
    bfr_crs: EPSG code of the target CRS
    """
    gpdf_crs = gpdf.copy()
    # Identity test on purpose: ``== None`` would dispatch to the CRS object's
    # own __eq__, which is free to answer anything.
    if gpdf_crs.crs is None:
        gpdf_crs.crs = {'init': u'epsg:{}'.format(init_crs)}
    return gpdf_crs.to_crs(epsg=bfr_crs)

def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance, in meters, between two points
    on the earth (specified in decimal degrees).

    Uses the haversine formula on a sphere of radius 6367 km.

    lon1, lat1: longitude / latitude of the first point
    lon2, lat2: longitude / latitude of the second point
    """
    from math import radians, cos, sin, asin, sqrt
    # convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    # Rounding can push ``a`` marginally outside [0, 1] for (near-)antipodal
    # points, which would make sqrt/asin raise a math domain error.
    a = min(1.0, max(0.0, a))
    c = 2 * asin(sqrt(a))
    km = 6367 * c
    return km * 1000


def ptfromln(pt, ln):
    """
    Distance in meters between a point and a line, both given in a lon/lat
    CRS (EPSG:4326).

    The point is snapped onto the line with ``ln.project`` followed by
    ``ln.interpolate``; the result is the ``haversine`` distance between the
    snapped position and the point itself.
    """
    along = ln.project(pt)
    snapped = ln.interpolate(along)
    snap_lon, snap_lat = snapped.coords[0]
    pt_lon, pt_lat = pt.coords[0]
    return haversine(snap_lon, snap_lat, pt_lon, pt_lat)

def pts2seg(gp_pts, gp_segs, init_crs=4326, bfr_crs=3559, near_dis_thres=5, buffer_dis=50):
    """
    Match every point to the street segment(s) it lies on or next to.

    pts and segs are assumed as geopandas.GeoDataFrame with crs:4326, which
    means (lon,lat) geometries, each with a unique index. gp_pts must carry an
    'OBJECTID' column and gp_segs a 'STREETSEGID' column.

    1. check crs and change crs to epsg:3559 (NAD83(NSRS2007) / Maryland)
    2. get segid of near seg(s) based on var:near_dis_thres for each point
    3. for those points without any near segs
     - widen their buffer by var:buffer_dis meters, repeatedly, until at
       least one seg intersects it
     - use func:ptfromln to get on earth distance from point to each
       candidate seg
     - get one segid of the nearest seg

    Returns a pandas.DataFrame indexed by the points' OBJECTID with a single
    column 'segid' holding the list of matched STREETSEGID values. Points that
    never intersect a segment are absent from the result.

    NOTE(review): a point that can never intersect any segment (e.g. an empty
    geometry) still keeps the widening loop running -- confirm the inputs
    cannot contain such rows.
    """
    import geopandas as gp
    import pandas as pd

    if len(gp_segs) == 0:
        # Nothing can ever match; without this guard the widening loop below
        # would never terminate.
        return pd.DataFrame(columns=['segid'])

    gp_pts_crs = crs_prepossess(gp_pts, init_crs, bfr_crs)
    gp_segs_crs = crs_prepossess(gp_segs, init_crs, bfr_crs)

    gp_pts_crs_bfr = gp_pts_crs.copy()
    gp_pts_crs_bfr.geometry = gp_pts_crs_bfr.buffer(near_dis_thres)

    # Joins are tracked through index labels only (point label in the index,
    # segment label in 'index_right'); ids are looked up at the very end, so
    # nothing depends on the column suffixes sjoin happens to generate.
    close_jn = gp.tools.sjoin(gp_pts_crs_bfr, gp_segs_crs)[['index_right']]

    processed_pts = set(close_jn.index)
    mask = ~gp_pts_crs_bfr.index.isin(processed_pts)
    far_jns = []
    while mask.any():
        # Buffers are cumulative: every pass widens the still-unmatched
        # points by another buffer_dis.
        gp_pts_crs_bfr.loc[mask, 'geometry'] = gp_pts_crs_bfr[mask].buffer(buffer_dis)
        jn = gp.tools.sjoin(gp_pts_crs_bfr[mask], gp_segs_crs)[['index_right']]
        far_jns.append(jn)
        # The mask is computed on index labels, so index labels are what must
        # be recorded as processed.
        processed_pts |= set(jn.index)
        mask = ~gp_pts_crs_bfr.index.isin(processed_pts)

    pt_ids = gp_pts['OBJECTID']
    seg_ids = gp_segs['STREETSEGID']
    pieces = []

    # Near points: keep every intersecting segment.
    if len(close_jn):
        close_pairs = pd.DataFrame({
            'OBJECTID': pt_ids.loc[close_jn.index].values,
            'segid': seg_ids.loc[close_jn['index_right'].values].values,
        })
        pieces.append(close_pairs.groupby('OBJECTID')['segid'].apply(list))

    # Far points: keep only the candidate with the smallest on-earth distance.
    if far_jns:
        far_jn = pd.concat(far_jns)
        # Distances use the original lon/lat geometries, as ptfromln expects.
        pt_geoms = gp_pts.geometry.loc[far_jn.index].values
        seg_geoms = gp_segs.geometry.loc[far_jn['index_right'].values].values
        far_jn['dis'] = [ptfromln(pt, ln) for pt, ln in zip(pt_geoms, seg_geoms)]

        nearest_pt, nearest_seg = [], []
        for pt_idx, grp in far_jn.groupby(level=0):
            nearest_pt.append(pt_idx)
            nearest_seg.append(grp['index_right'].iloc[grp['dis'].values.argmin()])
        if nearest_pt:
            pieces.append(pd.Series(
                [[sid] for sid in seg_ids.loc[nearest_seg].tolist()],
                index=pt_ids.loc[nearest_pt].values))

    if not pieces:
        return pd.DataFrame(columns=['segid'])
    return pd.concat(pieces).to_frame('segid')


In [10]:
# Load the point and segment layers from GeoJSON and match points to segments.
# NOTE(review): pts2seg as defined in this notebook imports pandas locally, so
# the NameError recorded under this cell presumably comes from an earlier
# definition -- re-run to confirm.
import geopandas as gp
pts_path = '../data/test/Moving_Violations_in_May_2016.geojson'
segs_path = '../data/opendc_segments.geojson'
pts = gp.read_file(pts_path)
segs = gp.read_file(segs_path)
pts_seg = pts2seg(pts, segs)

NameError: name 'pd' is not defined

In [24]:
# Inspect the point record whose OBJECTID is 1491386.
pts[pts.OBJECTID==1491386]

Unnamed: 0,ACCIDENTINDICATOR,ADDRESS_ID,AGENCYID,FINEAMT,LOCATION,OBJECTID,PENALTY1,PENALTY2,ROW_,ROW_ID,STREETSEGID,TICKETISSUEDATE,TICKETTYPE,TOTALPAID,VIOLATIONCODE,VIOLATIONDESC,XCOORD,YCOORD,geometry
2013,No,813951,25,150,3700 blk Massachusetts Ave SE nw/b,1491386,,,5252975,5252975,12021,2016-05-01T00:00:00,Photo,0,T120,SPEED 16-20 MPH OVER THE SPEED LIMIT,404220.2,134016.91,POINT (-76.95136873129999 38.8739779324)


In [23]:
# Inspect the segment record whose STREETSEGID is 12021.
segs[segs.STREETSEGID==12021]

Unnamed: 0,BEGINMEASURE,DIRECTIONALITY,ENDMEASURE,FACILITYID,FROMADDRESSLEFTTHEO,FROMADDRESSRIGHTTHEO,FROMNODEID,OBJECTID,OBJECTID_1,QUADRANT,...,SHAPE_Length,SOURCEID,STREETID,STREETSEGID,STREETTYPE,TOADDRESSLEFTTHEO,TOADDRESSRIGHTTHEO,TONODEID,UPDATETIMESTAMP,geometry
10102,0,2,304.873163,SEGID-12021,3630,3701,20623,10103,11560,SE,...,304.873176,35960360,305960,12021,AVE,3764,3765,17170,2005-11-01T12:55:48,LINESTRING (-76.95292520293411 38.874614383151...


In [72]:
# Step-by-step replay of the first part of pts2seg with notebook-level names,
# so the intermediate frames stay available for inspection in later cells.
gp_pts = pts
gp_segs = segs
init_crs,bfr_crs = 4326, 3559
near_dis_thres=5
buffer_dis=50
# Reproject both layers to bfr_crs (tagging them with init_crs first if they carry no CRS).
gp_pts_crs = crs_prepossess(gp_pts,init_crs,bfr_crs)
gp_segs_crs = crs_prepossess(gp_segs,init_crs,bfr_crs)

# Turn every point into a buffer of radius near_dis_thres.
gp_pts_crs_bfr = gp_pts_crs.copy()
gp_pts_crs_bfr.geometry = gp_pts_crs_bfr.buffer(near_dis_thres)

# Spatial join of buffers against segments; only the matched segment's index label is kept.
close_jn = gp.tools.sjoin(gp_pts_crs_bfr, gp_segs_crs)[['index_right']]



In [73]:

# pandas is needed here but, read top to bottom, is only imported further
# down the notebook; import it in place so a fresh-kernel run reaches this cell.
import pandas as pd

# Points whose first buffer already intersected a segment need no further search.
processed_pts = set(pd.unique(close_jn.index))
mask = (~gp_pts_crs_bfr.index.isin(processed_pts))
far_jns = []
while mask.any():
    # Buffers are cumulative: each pass widens the still-unmatched points by buffer_dis.
    gp_pts_crs_bfr.loc[mask, 'geometry'] = gp_pts_crs_bfr[mask].buffer(buffer_dis)
    jn = gp.tools.sjoin(gp_pts_crs_bfr[mask], gp_segs_crs)[['index_right']]
    far_jns.append(jn)
    processed_pts |= set(pd.unique(jn.index))
    mask = (~gp_pts_crs_bfr.index.isin(processed_pts))

# pd.concat raises on an empty list, which happens when every point matched on the first pass.
far_jns = pd.concat(far_jns) if far_jns else pd.DataFrame(columns=['index_right'])

In [None]:
# Show the far-point candidates ordered by the matched segment's index label.
# DataFrame.sort is no longer available in pandas; sort_values is its replacement.
far_jns.sort_values('index_right')

In [74]:
# Attach geometries from the un-reprojected frames to every far point/segment
# pair: first the segment geometry (looked up through 'index_right'), then the
# point geometry (looked up through the index). After the second merge the
# point geometry is 'geometry_x' and the segment geometry is 'geometry_y'.
mr_far_jns = pd.merge(gp_segs[['geometry']],far_jns , left_index=True, right_on=['index_right'])
mr_far_jns = pd.merge(gp_pts[['geometry']],mr_far_jns, left_index=True, right_index=True)
# Distance from each point to each of its candidate segments, via ptfromln.
mr_far_jns['dis']=mr_far_jns.apply(lambda x: ptfromln(x.geometry_x, x.geometry_y),axis=1)
# group = mr_far_jns.groupby(level=0).apply(lambda x: x.ix[x.dis.idxmin()].index_right)

# result = close_jn.groupby('OBJECTID_left')['STREETSEGID_right'].apply(list).append(mr_far_jns.groupby('OBJECTID').apply(lambda x: [x.ix[x.dis.idxmin()].STREETSEGID_y]))
# return pd.DataFrame(result, columns=['segid'])

In [75]:
# Display the far point/segment candidate pairs together with their distances.
mr_far_jns

Unnamed: 0,geometry_x,geometry_y,index_right,dis
1,POINT (-76.9750248217 38.9342833419),LINESTRING (-76.97417599719914 38.933802834687...,8873,49.605006
1,POINT (-76.9750248217 38.9342833419),LINESTRING (-76.9744959463971 38.9341100881109...,8874,23.913758
1,POINT (-76.9750248217 38.9342833419),LINESTRING (-76.97622450517733 38.934105407830...,8892,19.674623
1,POINT (-76.9750248217 38.9342833419),LINESTRING (-76.9744959463971 38.9341100881109...,8893,49.605006
16,POINT (-77.09134686100001 38.9414896542),LINESTRING (-77.09075381584708 38.941256787736...,4769,25.771976
16,POINT (-77.09134686100001 38.9414896542),LINESTRING (-77.09026716990941 38.941106649445...,5535,42.072270
16,POINT (-77.09134686100001 38.9414896542),LINESTRING (-77.0908644199483 38.9415400262199...,5536,33.752943
16,POINT (-77.09134686100001 38.9414896542),LINESTRING (-77.0908644199483 38.9415400262199...,7231,39.305995
20,POINT (-77.00855635870001 38.8724597385),LINESTRING (-77.00908598332917 38.872983851278...,11951,45.990637
21,POINT (-77.00855635870001 38.8724597385),LINESTRING (-77.00908598332917 38.872983851278...,11951,45.990637


In [80]:
# Peek at the first point group only: print its index label, the position of
# its smallest distance, and the 'index_right' of that nearest candidate.
for pt_idx, grp in mr_far_jns.groupby(level=0):
    nearest_pos = grp.dis.values.argmin()
    print(pt_idx)
    print(nearest_pos)
    print(grp.iloc[nearest_pos].index_right)
    break

1
2
8892


In [65]:
# Collect (point index label, 'index_right' of the nearest candidate segment)
# pairs. The debug prints and the trailing break are kept, so only the first
# point group is processed for now.
far_jn_pt2seg = []
for pt_idx, grp in mr_far_jns.groupby(level=0):
    print(pt_idx)
    dis = grp.dis.values
    print(type(dis))
    nearest_pos = dis.argmin()
    print(dis[nearest_pos])
    nearest_seg = grp.iloc[nearest_pos].index_right
    print(nearest_seg)
    # list.append needs the item to store; calling it bare raises TypeError.
    far_jn_pt2seg.append((pt_idx, nearest_seg))
    break


    


# .apply(lambda x: x.ix[x.dis.idxmin()].index_right)




1
<type 'numpy.ndarray'>
19.6746226073
8892


In [42]:
# Total number of segment ids held across all 'segid' lists.
pts_seg.segid.apply(len).sum()

80866L

In [15]:
import pandas as pd
from itertools import chain
# Flatten every list stored in pts_seg into one sequence and count how often each value occurs.
seg_counts = pd.DataFrame(pd.Series(list(chain.from_iterable(pts_seg.values.ravel()))).value_counts())

In [16]:
# Display the occurrence count per segment id.
seg_counts

Unnamed: 0,0
4361,7911
13794,5865
6993,3873
14658,3557
14688,2801
9851,2122
5134,1609
292,1590
6364,1397
5435,1396
