In [87]:
from datetime import datetime
import pandas as pd
from geopy import distance
import numpy as np

import random

import pymongo
from pymongo import MongoClient

In [88]:
# Connect to the MongoDB server on this machine and grab handles to the
# database and the three collections used throughout the notebook.
client = MongoClient(host='localhost', port=27017)

db = client.get_database('avl_pipeline_test')
start_coll = db.get_collection('redux_starts')        # queried for each trip's starting time_stamp
trip_coll = db.get_collection('redux_labeled_trips')  # per-report vehicle documents, keyed by trip_id_iso
chnk_coll = db.get_collection('chunk_info')           # destination for the chunk summaries built below

In [89]:
# Show one arbitrary document from the labeled-trips collection to inspect its fields.
trip_coll.find_one()

{'_id': ObjectId('5ae4dcb33ad39e1721401dae'),
 'HEADING': '261.0',
 'LATITUDE': '37.78618',
 'LONGITUDE': '-122.4554',
 'PREDICTABLE': '1',
 'REPORT_TIME': '10/28/2016 00:29:25',
 'REV': '1526',
 'SPEED': '3.333',
 'TRAIN_ASSIGNMENT': '3305',
 'VEHICLE_TAG': '5419',
 'time_stamp': 1477639765.0,
 'trip_id_iso': '7253717_2016-10-28_JRMFT'}

In [90]:
# Every distinct `trip_id_iso` value present in the labeled-trips collection.
all_trip_ids = trip_coll.distinct('trip_id_iso')

In [91]:
# Draw a reproducible random sample of trip ids.
# - A dedicated, seeded Random instance makes Restart & Run All pick the same
#   trips every time without touching the global `random` state.
# - The population is sorted first because the order of the ids returned by
#   `distinct` is not something this notebook controls.
# - The sample size is clamped so a small collection yields every trip
#   instead of raising ValueError.
SAMPLE_SIZE = 100
SAMPLE_SEED = 42
rand_100 = random.Random(SAMPLE_SEED).sample(
    sorted(all_trip_ids, key=str),
    min(SAMPLE_SIZE, len(all_trip_ids)),
)

In [92]:
# Scratch values: a total time split into equal blocks.
# presumably seconds for an average trip — TODO confirm how 2935 was derived
avg_time, chunks = 2935, 2

# Length of one block when `avg_time` is divided evenly between `chunks` blocks.
chunk_block = avg_time / chunks

In [93]:
# Load the stop schedule; downstream cells read the stop_id, stop_sequence,
# stop_distance, stop_lat, stop_lon and stop_name columns from it.
# NOTE(review): the file also yields an 'Unnamed: 0' column, which looks like a
# saved row index — confirm before dropping it or passing index_col=0.
sched = pd.read_csv('stop_seq_dist_latlon.csv')

# Preview only the first rows rather than dumping the whole frame.
sched.head(10)

Unnamed: 0,stop_id,stop_sequence,seq_str,stop_distance,stop_lat,stop_lon,stop_name
0,6293,1,1,0.0,37.786908,-122.45656,Sacramento St & Cherry St
1,3879,2,2,206.0,37.786248,-122.455207,California St & Maple St
2,3852,3,3,352.0,37.786042,-122.456835,California St & Cherry St
3,3644,4,4,575.0,37.78556,-122.459254,Arguello Blvd & California St
4,3645,5,5,852.0,37.783071,-122.45907,Arguello Blvd & Clement St
5,3649,6,6,1067.0,37.781429,-122.458955,Arguello Blvd & Geary Blvd
6,3642,7,7,1513.0,37.777075,-122.458645,Arguello Blvd & Balboa St
7,4224,8,8,1842.0,37.774282,-122.458003,Fulton St & Arguello Blvd
8,6479,9,9,2184.0,37.774604,-122.454714,Stanyan St & Fulton St
9,6481,10,10,2381.0,37.772766,-122.454335,Stanyan St & Hayes St


In [94]:
def get_avg_dist(row, location_list):
    """Return the mean distance, in metres, from one stop to a set of locations.

    Parameters
    ----------
    row : mapping
        Must provide 'stop_lat' and 'stop_lon' entries that `float()` accepts
        (e.g. one row of the schedule DataFrame).
    location_list : iterable
        Points in any form `geopy.distance.distance` accepts as its second
        argument; the callers in this notebook pass (latitude, longitude) tuples.

    Returns
    -------
    float
        Mean of the individual stop-to-location distances, or NaN when
        `location_list` is empty.
    """
    stop_tup = (float(row['stop_lat']), float(row['stop_lon']))

    dists = [distance.distance(stop_tup, loc).m for loc in location_list]

    # Averaging an empty array makes numpy emit a "Mean of empty slice"
    # RuntimeWarning before returning NaN; return NaN directly instead.
    if not dists:
        return np.nan

    return np.array(dists).mean()

In [95]:
def locations_at_timestamp(sample_trip_ids, chunk_time):
    """Find where each sampled trip was just after `chunk_time` into the trip.

    For every trip id, the trip's starting `time_stamp` is read from
    `start_coll`, `chunk_time` is added to it, and the earliest document in
    `trip_coll` for that trip with a strictly later `time_stamp` is selected.

    Parameters
    ----------
    sample_trip_ids : iterable
        `trip_id_iso` values to look up.
    chunk_time : number
        Offset added to each trip's starting `time_stamp` (same units as
        `time_stamp`).

    Returns
    -------
    list of tuple
        One (LATITUDE, LONGITUDE) tuple per trip that has a matching report,
        with the values exactly as stored in the document. Trips with no
        start document, or with no report after the cut-off, are omitted.
    """
    locations = []

    for samp_id in sample_trip_ids:

        trip_start = start_coll.find_one({"trip_id_iso": samp_id})

        # find_one() returns None when nothing matches; skip such trips
        # instead of failing on the subscript below.
        if trip_start is None:
            continue

        frst_tm_chnk = trip_start['time_stamp'] + chunk_time

        search = {
            'trip_id_iso': samp_id,
            'time_stamp': {"$gt": frst_tm_chnk}
        }

        # Earliest report strictly after the cut-off (ascending sort, first hit only).
        chnk_end = list(trip_coll.find(search).sort("time_stamp").limit(1))

        if chnk_end:
            report = chnk_end[0]
            locations.append((report['LATITUDE'], report['LONGITUDE']))

    return locations
        
    

In [96]:
# Total time that gets divided into equal chunks; each chunk boundary is added
# to a trip's starting `time_stamp` inside locations_at_timestamp().
# presumably seconds for an average trip — TODO confirm how 2935 was derived
avg_time = 2935

# Chunk counts to evaluate: one summary document is produced per entry.
chunks = [2,6,12]


chunk_docs = []


for chunk_size in chunks:

    # Duration of a single chunk when avg_time is split `chunk_size` ways.
    chunk_block = avg_time/chunk_size

    chunk_summary = {}
    chunk_summary['number_chunks'] = chunk_size

    # Keyed by the 1-based chunk sequence number as a string.
    chunk_summary['chunks'] = {}


    for chunk in range(chunk_size):

        chunk_seq = chunk+1

        if chunk_seq == chunk_size:
            # The final chunk always ends at the last row of the schedule.
            # NOTE(review): assumes `sched` rows are ordered so the last row is
            # the final stop — confirm against the CSV.
            cnk_stp = sched.iloc[-1]
        else:

            time_forward = chunk_block*chunk_seq

            loc_at_chunk = locations_at_timestamp(rand_100, time_forward)

            # With no locations every average distance is NaN and no closest
            # stop exists; fail with a clear message rather than inside idxmin.
            if not loc_at_chunk:
                raise ValueError(
                    'no sampled trip has a report more than {} after its start; '
                    'cannot place chunk {} of {}'.format(time_forward, chunk_seq, chunk_size)
                )

            sched['avg_chnk_dist'] = sched.apply(lambda row: get_avg_dist(row, loc_at_chunk), axis=1)

            # idxmin() returns an index *label*, so it must be paired with the
            # label-based .loc; .iloc is positional and only coincides with
            # labels for a default RangeIndex.
            cnk_stp = sched.loc[sched['avg_chnk_dist'].idxmin()]

        # Cast to built-in int/float so the values are plain Python types
        # rather than numpy scalars.
        chunk_dict = {
            'chunk_stop_id': int(cnk_stp['stop_id']),
            'chunk_dist': int(cnk_stp['stop_distance']),
            'chunk_stop_seq': int(cnk_stp['stop_sequence']),
            'chunk_stop_lat': float(cnk_stp['stop_lat']),
            'chunk_stop_lon': float(cnk_stp['stop_lon']),
            'chunk_stop_name': cnk_stp['stop_name'],
        }

        chunk_summary['chunks'][str(chunk_seq)] = chunk_dict

    chunk_docs.append(chunk_summary)

In [97]:
# Display the generated summaries: a list with one dict per entry in `chunks`.
chunk_docs

[{'number_chunks': 2,
  'chunks': {'1': {'chunk_stop_id': 3329,
    'chunk_dist': 5837,
    'chunk_stop_seq': 24,
    'chunk_stop_lat': 37.760702,
    'chunk_stop_lon': -122.437294,
    'chunk_stop_name': '18th St & Diamond St'},
   '2': {'chunk_stop_id': 3511,
    'chunk_dist': 10494,
    'chunk_stop_seq': 45,
    'chunk_stop_lat': 37.751407,
    'chunk_stop_lon': -122.40668,
    'chunk_stop_name': '25th St & Potrero Ave'}}},
 {'number_chunks': 6,
  'chunks': {'1': {'chunk_stop_id': 4224,
    'chunk_dist': 1842,
    'chunk_stop_seq': 8,
    'chunk_stop_lat': 37.774282,
    'chunk_stop_lon': -122.458003,
    'chunk_stop_name': 'Fulton St & Arguello Blvd'},
   '2': {'chunk_stop_id': 7295,
    'chunk_dist': 3703,
    'chunk_stop_seq': 14,
    'chunk_stop_lat': 37.767305,
    'chunk_stop_lon': -122.446473,
    'chunk_stop_name': 'Ashbury St & Frederick St'},
   '3': {'chunk_stop_id': 3329,
    'chunk_dist': 5837,
    'chunk_stop_seq': 24,
    'chunk_stop_lat': 37.760702,
    'chunk_stop_l

In [98]:
# NOTE(review): this import sits mid-notebook; consider moving it to the imports cell.
import folium

# Pick the summary to plot by its chunk count instead of by list position, so
# the cell does not silently plot something else if `chunks` is reordered.
MAP_NUMBER_CHUNKS = 12
map_summary = next(
    doc for doc in chunk_docs if doc['number_chunks'] == MAP_NUMBER_CHUNKS
)

# NOTE(review): newer folium releases may no longer ship the 'Stamen Terrain'
# tileset — confirm against the installed version.
map_2 = folium.Map(location=[37.770373, -122.436064],
                   tiles='Stamen Terrain',
                   zoom_start=13)

# One marker per chunk-boundary stop.
for key, value in map_summary['chunks'].items():

    lat = value['chunk_stop_lat']
    lon = value['chunk_stop_lon']

    folium.Marker([lat, lon]).add_to(map_2)

map_2
    

In [99]:
# Store one document per chunk count. An upsert keyed on `number_chunks` keeps
# this cell idempotent: re-running it replaces the stored summary instead of
# adding duplicates (or failing on a duplicate key).
for chunk_data in chunk_docs:
    # Drop any '_id' a previous insert may have added to the dict, so the
    # replacement never tries to change the stored document's identifier.
    payload = {key: value for key, value in chunk_data.items() if key != '_id'}
    chnk_coll.replace_one(
        {'number_chunks': chunk_data['number_chunks']},
        payload,
        upsert=True,
    )

In [100]:
# Read one document back from the chunk collection as a quick check of what was stored.
chnk_coll.find_one()

{'_id': ObjectId('5aeb76b23ad39e3335a55b08'),
 'number_chunks': 2,
 'chunks': {'1': {'chunk_stop_id': 3329,
   'chunk_dist': 5837,
   'chunk_stop_seq': 24,
   'chunk_stop_lat': 37.760702,
   'chunk_stop_lon': -122.437294,
   'chunk_stop_name': '18th St & Diamond St'},
  '2': {'chunk_stop_id': 3511,
   'chunk_dist': 10494,
   'chunk_stop_seq': 45,
   'chunk_stop_lat': 37.751407,
   'chunk_stop_lon': -122.40668,
   'chunk_stop_name': '25th St & Potrero Ave'}}}