In [1]:
from datetime import datetime
import pandas as pd
from geopy import distance
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

from sklearn.ensemble import RandomForestRegressor

from sklearn.preprocessing import normalize

import statsmodels.api as sm


import pymongo
from pymongo import MongoClient

import matplotlib.pyplot as plt

plt.style.use("ggplot")

%matplotlib inline

  from pandas.core import datetools


In [13]:
client = MongoClient('localhost', 27017)

db = client['avl_pipeline_test']
raw_coll = db['avl_raw']
trip_coll = db['trips']

# SPEED

In [14]:
boopin = list(raw_coll.find({},{'_id':0, 'SPEED':1}))

In [16]:
speeds = [float(item['SPEED']) for item in boopin]

In [19]:
spds = np.array(speeds)

In [20]:
spds.mean()

4.225976030751669

In [21]:
spds.max()

27.222

# DURATION

In [24]:
sucka = list(trip_coll.find({}, {'_id':0, 'trip_duration':1}))
durs = [float(item['trip_duration']) for item in sucka]

In [26]:
durdur = np.array(durs)

In [27]:
durdur.mean()

2938.0595710881653

In [28]:
durdur.max()

4390.0

# LENGTH

In [36]:
shapes = pd.read_csv('sfmta_2017-02-10/shapes.txt')
shapes.head()

Unnamed: 0,shape_id,shape_pt_lon,shape_pt_lat,shape_pt_sequence,shape_dist_traveled
0,141009,-122.446805,37.787266,1,0
1,141009,-122.448481,37.787054,2,149
2,141009,-122.450131,37.786842,3,296
3,141009,-122.450238,37.786822,4,306
4,141009,-122.451771,37.786624,5,443


In [31]:
sched = pd.read_csv('sfmta_2017-02-10/stop_times.txt')

In [32]:
sched.head()

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled
0,7225088,26:00:00,26:00:00,4015,1,,,,
1,7225088,26:00:45,26:00:45,6294,2,,,,
2,7225088,26:01:35,26:01:35,6290,3,,,,
3,7225088,26:02:00,26:02:00,6314,4,,,,
4,7225088,26:02:35,26:02:35,6307,5,,,,


In [35]:
trips = pd.read_csv('sfmta_2017-02-10/trips.txt')
trips.head()

Unnamed: 0,route_id,service_id,trip_id,trip_headsign,direction_id,block_id,shape_id
0,11639,1,7311979,Geary + 33rd Avenue,0,108,141009
1,11639,1,7311978,Geary + 33rd Avenue,0,107,141009
2,11639,1,7311977,Geary + 33rd Avenue,0,106,141009
3,11639,1,7311976,Geary + 33rd Avenue,0,105,141009
4,11639,1,7311975,Geary + 33rd Avenue,0,104,141009


In [41]:
sample_id = trip_coll.find_one({},{'_id':0, 'trip_id':1})['trip_id']

In [44]:
shape_id = trips[trips['trip_id'] == sample_id]['shape_id'].values[0]

In [48]:
trip_shape = shapes[shapes['shape_id'] == shape_id]

In [63]:
trip_shape.head()

Unnamed: 0,shape_id,shape_pt_lon,shape_pt_lat,shape_pt_sequence,shape_dist_traveled
19204,141187,-122.456377,37.786997,1,0
19205,141187,-122.455249,37.787142,2,101
19206,141187,-122.455062,37.786213,3,206
19207,141187,-122.455878,37.786109,4,279
19208,141187,-122.456698,37.786005,5,352


In [49]:
trip_shape['shape_dist_traveled'].max()

10494

In [61]:
point_1 = tuple(trip_shape.iloc[0][['shape_pt_lat', 'shape_pt_lon']].values)
point_2 = tuple(trip_shape.iloc[1][['shape_pt_lat', 'shape_pt_lon']].values)

In [62]:
distance.distance(point_1, point_2).m

100.65575826151817

# Stop/Shape intersection?

In [65]:
trip_sched = sched[sched['trip_id'] ==  sample_id]

In [67]:
trip_sched.head()

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled
479144,7253717,24:30:00,24:30:00,6293,1,,,,
479145,7253717,24:30:56,24:30:56,3879,2,,,,
479146,7253717,24:31:32,24:31:32,3852,3,,,,
479147,7253717,24:32:28,24:32:28,3644,4,,,,
479148,7253717,24:33:39,24:33:39,3645,5,,,,


In [110]:
# Let's check for the 2nd stop

stops = pd.read_csv('sfmta_2017-02-10/stops.txt')
scnd_trp_stps = stops[stops['stop_id'] == 3879]

In [111]:
jeez = tuple(scnd_trp_stps[['stop_lat', 'stop_lon']].values[0])

In [112]:
jeez

(37.786248, -122.455207)

In [113]:
lat, lon = jeez

In [114]:
shape_mask = (trip_shape['shape_pt_lat'] == lat) & (trip_shape['shape_pt_lat'] == lon)
trip_shape[shape_mask]

Unnamed: 0,shape_id,shape_pt_lon,shape_pt_lat,shape_pt_sequence,shape_dist_traveled,stop_dist


Okay, so shapes and stops don't completely overlap

Let's go by distance

In [115]:
def get_dist(row, stop_tuple):
    shape_lat = row['shape_pt_lat']
    shape_lon = row['shape_pt_lon']
    return distance.distance((shape_lat, shape_lon), stop_tuple).m

In [116]:
trip_shape['stop_dist'] = trip_shape.apply(lambda row: get_dist(row, jeez), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [117]:
trip_shape['stop_dist'].min()

13.350286585201182

### Cool. One more, with the 5th stop...

In [1]:
scnd_trp_stps = stops[stops['stop_id'] == 3645]
jeez = tuple(scnd_trp_stps[['stop_lat', 'stop_lon']].values[0])

shape_mask = (trip_shape['shape_pt_lat'] == lat) & (trip_shape['shape_pt_lat'] == lon)
trip_shape[shape_mask]

trip_shape['stop_dist'] = trip_shape.apply(lambda row: get_dist(row, jeez), axis=1)

trip_shape['stop_dist'].min()

NameError: name 'stops' is not defined