In [1]:
import pandas as pd
import glob

In [2]:
csvs = []
for fn in glob.glob('data/clean/*.csv'):
    csvs.append(pd.read_csv(fn))

In [3]:
len(csvs)

3598

In [4]:
data = pd.concat(csvs, ignore_index=True)

In [8]:
data[['bikenumber', 'bikedate', 'tripnumber']].nunique()

bikenumber    90
bikedate      27
tripnumber    15
dtype: int64

In [33]:
data['key'] = data[['bikenumber', 'bikedate', 'tripnumber']].apply(lambda x: '{}_{}_{}'.format(*x), axis=1)

In [9]:
a= data.groupby(['bikenumber', 'bikedate', 'tripnumber']).count()
target = ['{}_{}_{}'.format(*x) for x in a[a.durationms==1].index.tolist()]

In [40]:
data[data.key.isin(target)]

Unnamed: 0,durationms,starttrip,startstation,endtrip,endstation,subscriptiontype,bikenumber,time,latitude,longitude,altitude,speed,course,type,distance,essential,bikedate,tripnumber,key
94,301225,4/22/2015 1:40,MLK Library/9th & G St NW,4/22/2015 1:45,12th & L St NW,Member,B00175,01:44:37,38.898026,-77.023727,208.42,864,146,-2,0.0,1.0,422,1,B00175_422_1
3542,235960,4/22/2015 10:19,Washington Blvd & Walter Reed Dr,4/22/2015 10:23,Columbia Pike & S Courthouse Rd,Member,B00221,10:23:43,38.873348,-77.08223,54.610001,3672,295,-2,0.0,1.0,422,1,B00221_422_1
8377,1044151,4/28/2015 7:25,19th & East Capitol St SE,4/28/2015 7:42,Maryland & Independence Ave SW,Member,B00292,07:39:29,38.889278,-76.978226,-95.650002,0,180,-2,0.0,1.0,428,1,B00292_428_1
8637,623243,4/29/2015 6:04,Adams Mill & Columbia Rd NW,4/29/2015 6:14,Georgetown Harbor / 30th St NW,Member,B00292,06:12:49,38.921001,-77.04216,437.57999,0,61,0,0.0,1.0,429,1,B00292_429_1
10181,233514,4/23/2015 16:04,23rd & Crystal Dr,4/23/2015 16:08,Potomac Ave & 35th St S,Member,B00319,16:08:43,38.853363,-77.049728,-27.85,0,265,0,0.0,1.0,423,2,B00319_423_2
26069,271892,5/1/2015 7:07,Massachusetts Ave & Dupont Circle NW,5/1/2015 7:12,New Hampshire Ave & T St NW,Member,B00417,07:11:49,38.909931,-77.044441,4.47,0,70,0,0.0,1.0,501,1,B00417_501_1
31694,264916,5/6/2015 16:50,Washington Blvd & 7th St N,5/6/2015 16:54,Clarendon Metro / Wilson Blvd & N Highland St,Member,B00681,16:54:30,38.880901,-77.091164,78.370003,0,120,0,0.0,1.0,506,1,B00681_506_1
32209,296056,5/8/2015 22:12,Ballston Metro / N Stuart & 9th St N,5/8/2015 22:17,Fairfax Dr & Wilson Blvd,Member,B00681,22:16:42,38.884621,-77.09935,171.87,13068,59,0,1196.2,1.0,508,3,B00681_508_3
52926,240023,4/23/2015 10:15,34th St & Wisconsin Ave NW,4/23/2015 10:19,37th & O St NW / Georgetown University,Member,B01003,10:19:45,38.916454,-77.067871,-79.709999,792,53,0,0.0,1.0,423,1,B01003_423_1
58950,270087,4/29/2015 5:52,King St Metro,4/29/2015 5:57,King St & Patrick St,Member,B01192,05:57:14,38.806305,-77.061043,-37.779999,0,305,0,0.0,1.0,429,1,B01192_429_1


In [None]:
data

In [105]:
data['day'] = data.apply(lambda x: '2015-0{}-{}'.format(str(x.bikedate)[0], str(x.bikedate)[1:]), axis=1)

In [107]:
data['timestamp'] = data.apply(lambda x: '{}T{}Z'.format(x.day, x.time), axis=1)

In [108]:
timestamp = data.groupby('key').apply(lambda x: x.timestamp.tolist()).to_frame()
lon = data.groupby('key').apply(lambda x: x.longitude.tolist()).to_frame()
lat = data.groupby('key').apply(lambda x: x.latitude.tolist()).to_frame()
sub_type = data.groupby('key').apply(lambda x: list(set(x.subscriptiontype.tolist()))).to_frame()
durationms = data.groupby('key').apply(lambda x: list(set(x.durationms.tolist()))).to_frame()
print(sub_type.apply(len))
print(durationms.apply(len))

0    3598
dtype: int64
0    3598
dtype: int64


In [123]:
trips = lon.merge(lat, left_index=True, right_index=True)

trips.columns = ['lon', 'lat']

trips['trace'] = trips.apply(lambda x: list(zip(x.lon, x.lat)), axis=1)

trips.drop(['lon', 'lat'], axis=1, inplace=True)

In [124]:
trips = trips.merge(sub_type.apply(lambda x: x[0], axis=1), left_index=True, right_index=True)\
.merge(durationms.apply(lambda x: x[0], axis=1), left_index=True, right_index=True)\
.merge(timestamp, left_index=True, right_index=True)

In [125]:
trips.columns = ['trace', 'subscription', 'durationms', 'timestamp']

In [126]:
trips['tranceLen'] = trips.trace.apply(len)
trips['timestampLen'] = trips.timestamp.apply(len)

In [127]:
(trips.timestampLen==trips.tranceLen).value_counts()

True    3598
dtype: int64

In [137]:
trips = trips[trips.tranceLen>1]

In [185]:
from importlib import reload
import snap2road; reload(snap2road)
from snap2road import snap2road as snap

In [157]:
from json import JSONDecodeError

In [158]:
snap_res = []
for i, (key, row) in enumerate(trips.iterrows()):
    if (i+1)%100==0:
        print('handled ', i+1, ' traces')
    try:
        res = snap(row.trace, row.timestamp, pause=True)
    except JSONDecodeError as e:
        print(e)
        res={}
    res['key'] = key
    snap_res.append(res)

handled  100  traces
handled  200  traces
handled  300  traces
handled  400  traces
handled  500  traces
handled  600  traces
handled  700  traces
handled  800  traces
handled  900  traces
handled  1000  traces
handled  1100  traces
handled  1200  traces
handled  1300  traces
handled  1400  traces
handled  1500  traces
handled  1600  traces
handled  1700  traces
handled  1800  traces
handled  1900  traces
handled  2000  traces
handled  2100  traces
handled  2200  traces
handled  2300  traces
handled  2400  traces
handled  2500  traces
handled  2600  traces
handled  2700  traces
handled  2800  traces
handled  2900  traces
handled  3000  traces
handled  3100  traces
handled  3200  traces
handled  3300  traces
handled  3400  traces
handled  3500  traces


In [159]:
len(snap_res)

3547

In [164]:
snap_res_df = pd.DataFrame(snap_res).set_index('key')

In [166]:
new_trips = trips.merge(snap_res_df, left_index=True, right_index=True)

In [167]:
new_trips.to_json('data/clean_snap.json')

# find segment

In [169]:
import geopandas as gp

In [179]:
from itertools import chain
import trace2segs; reload(trace2segs)
from trace2segs import trace2segs as t2s

In [170]:
segs = gp.read_file('data/segments_dc.geojson')

In [205]:
seg_res = []
for i, (key, row) in enumerate(new_trips.iterrows()):
    if len(row['snapped'])==0:
        print(key)
        continue
    snapped_df = pd.DataFrame.from_dict(row['snapped'])  
    if not 'snapped' in row:
        print(key)
        continue
    snapped_trace = list(chain(*snapped_df.snapped.values)) 
    res = t2s(segs, snapped_trace, need_snap=False, length_col='SHAPE_Length') 
    seg_this_trace = res['segs']
    seg_this_trace['key'] = key
    seg_this_trace['#pts_no_segs'] = res['#pts_no_segs']
    seg_res.append(seg_this_trace)
    if (i+1)%100==0:
        print('handled ', i+1, ' traces')
seg_res_df = pd.concat(seg_res, ignore_index=True)

  outputs = ufunc(*inputs)


B00292_426_1
handled  100  traces
B00339_430_1
handled  200  traces
handled  300  traces
B00681_430_1
B00681_506_2
B00708_502_3
B00708_502_4
B00708_512_1
handled  400  traces
B00708_513_6
handled  500  traces
B01003_423_3
handled  600  traces
handled  700  traces
B01275_422_6
handled  800  traces
handled  900  traces
B20415_423_1
handled  1000  traces
B20427_424_2
handled  1100  traces
B20522_425_1
B20567_504_4
handled  1200  traces
B20657_424_4
handled  1300  traces
B20686_425_2
B20696_422_5
handled  1500  traces
B20800_426_2
B20844_427_5
handled  1600  traces
B20905_422_2
handled  1700  traces
B20936_503_2
B21001_425_1
handled  1800  traces
handled  1900  traces
B21143_428_3
handled  2000  traces
B21163_430_3
handled  2100  traces
handled  2200  traces
handled  2300  traces
handled  2400  traces
handled  2500  traces
handled  2600  traces
B21753_428_2
handled  2700  traces
B21856_422_11
handled  2800  traces
handled  2900  traces
handled  3000  traces
handled  3100  traces
B22088_506

In [207]:
seg_res_df.to_csv('data/seg_for_bikeshare.csv')