In [306]:
import pandas as pd
from ftplib import FTP
from datetime import datetime
import time
import numpy as np
import pymongo
from pymongo import MongoClient
from geopy import distance

import get_recent_days as gtdys
import muni_etl
import labelling as lblng

Build a DataFrame with one row for each day of data that we will load and label

In [307]:
# How many recent days to request from the helper; named so the window is easy
# to tune in one place instead of being a bare literal in the call.
# NOTE(review): exact semantics of x_recent_days live in get_recent_days - confirm.
N_RECENT_DAYS = 30
file_df = gtdys.x_recent_days(N_RECENT_DAYS)

In [308]:
# Preview only the first rows rather than rendering the entire frame.
file_df.head(10)

Unnamed: 0,ftp_filename,iso_string,time_stamp,gtfs_directory
0,sfmtaAVLRawData12052016.csv,2016-12-05,1480925000.0,sfmta_2017-02-10
1,sfmtaAVLRawData12042016.csv,2016-12-04,1480838000.0,sfmta_2017-02-10
2,sfmtaAVLRawData12032016.csv,2016-12-03,1480752000.0,sfmta_2017-02-10
3,sfmtaAVLRawData11232016.csv,2016-11-23,1479888000.0,sfmta_2017-02-10
4,sfmtaAVLRawData11222016.csv,2016-11-22,1479802000.0,sfmta_2017-02-10
5,sfmtaAVLRawData11212016.csv,2016-11-21,1479715000.0,sfmta_2017-02-10
6,sfmtaAVLRawData11202016.csv,2016-11-20,1479629000.0,sfmta_2017-02-10
7,sfmtaAVLRawData11192016.csv,2016-11-19,1479542000.0,sfmta_2017-02-10
8,sfmtaAVLRawData11182016.csv,2016-11-18,1479456000.0,sfmta_2017-02-10
9,sfmtaAVLRawData11172016.csv,2016-11-17,1479370000.0,sfmta_2017-02-10


Load in the Block and Sign reference dataframes

This gives us the appropriate blocks for each day

In [309]:
# Block lookup table, read with pandas' default parsing.
blockref = pd.read_csv('data/lookUpBlockIDToBlockNumNam.csv')

# Sign-up period lookup table; the columns at positions 2 and 3 are parsed
# as datetimes.
signref = pd.read_csv(
    'data/lookUpSignUpPeriods.csv',
    parse_dates=[2, 3],
)

Connect to MongoDB and get handles to our database and its two collections (MongoDB creates them on first write)

In [310]:
# Connect to the MongoDB server on localhost, port 27017.
client = MongoClient('localhost', 27017)

# Indexing the client / database only returns handles; MongoDB creates the
# database and collections lazily on the first write.
db = client['avl_pipeline_test']
in_collection = db['avl_raw']
out_collection = db['labeled_trips']

# Optional - Clean the collections.
# WARNING: this permanently deletes every document in both collections.
# Set RESET_COLLECTIONS to False to keep whatever is already stored.
RESET_COLLECTIONS = True
if RESET_COLLECTIONS:
    in_collection.delete_many({})
    out_collection.delete_many({})

In [311]:
# Delete every document in the labeled-output collection only (the raw
# collection is left untouched). The trailing ';' suppresses the DeleteResult
# repr in the cell output.
# NOTE(review): presumably used to re-run labelling without reloading raw data - confirm.
out_collection.delete_many({});

For each day, filter and load in the data from the FTP server

In [312]:
# Read the database / collection names from the existing handle instead of
# repeating the string literals, so the names passed to MuniETL always match
# the collection this notebook inspects afterwards.
target_db_name = in_collection.database.name
target_collection_name = in_collection.name

# iterrows() yields (index, Series) pairs; the index is not needed here.
for _, day_row in file_df.iterrows():
    print("Loading Data from: " + day_row['iso_string'])
    # MuniETL is handed a one-row DataFrame: Series -> single-column frame -> transpose.
    day_frame = day_row.to_frame().T
    etl = muni_etl.MuniETL(day_frame, blockref, signref,
                           target_db_name, target_collection_name)
    etl.run_everything()

Loading Data from: 2016-12-05
Loading Data from: 2016-12-04
Loading Data from: 2016-12-03
Loading Data from: 2016-11-23
Loading Data from: 2016-11-22
Loading Data from: 2016-11-21
Loading Data from: 2016-11-20
Loading Data from: 2016-11-19
Loading Data from: 2016-11-18
Loading Data from: 2016-11-17
Loading Data from: 2016-11-16
Loading Data from: 2016-11-15
Loading Data from: 2016-11-14
Loading Data from: 2016-11-13
Loading Data from: 2016-11-12
Loading Data from: 2016-11-11
Loading Data from: 2016-11-10
Loading Data from: 2016-11-09
Loading Data from: 2016-11-08
Loading Data from: 2016-11-07
Loading Data from: 2016-11-06
Loading Data from: 2016-11-05
Loading Data from: 2016-11-04
Loading Data from: 2016-11-03
Loading Data from: 2016-11-02
Loading Data from: 2016-11-01
Loading Data from: 2016-10-31
Loading Data from: 2016-10-30
Loading Data from: 2016-10-29
Loading Data from: 2016-10-28


Verify the amount of data in our collection

In [313]:
# Count every document in the raw collection. count_documents({}) replaces
# Cursor.count(), which was deprecated in PyMongo 3.7 and removed in 4.0.
in_collection.count_documents({})

283822

Great! Now let's label everything

Let's create a class instance

In [314]:
# Build the labeller, passing the raw collection first and the labeled-output
# collection second.
labeler = lblng.Labeling(in_collection, out_collection)

Let's find all the trip starts

In [315]:
# Run the trip-start labelling step; it prints its own summary counts.
labeler.label_single_starts()

Total Start Intersection Count:  13999


Start Count:  1617


Duplicate ID Count:  0




Finally, let's use all those starts to label the rest of the data!

In [316]:
# Run the trip-labelling step; it prints a total for each trip category.
labeler.label_trips()

Total Good Trips:  1474


Total Emtpy Trips:  1


Total Sparse Trips:  106


Total Dense Trips:  22


Total 'Endless' Trips:  36


Sweet!! How many documents in our labeled collection?

In [320]:
# Count every document in the labeled collection. count_documents({}) replaces
# Cursor.count(), which was deprecated in PyMongo 3.7 and removed in 4.0.
out_collection.count_documents({})

111798

In [321]:
# Fetch a single document from the labeled collection to inspect its fields.
out_collection.find_one()

{'_id': ObjectId('5ae4dcb33ad39e1721401dad'),
 'HEADING': '82.0',
 'LATITUDE': '37.78693',
 'LONGITUDE': '-122.4565',
 'PREDICTABLE': '1',
 'REPORT_TIME': '10/28/2016 00:27:57',
 'REV': '1526',
 'SPEED': '0.0',
 'TRAIN_ASSIGNMENT': '3305',
 'VEHICLE_TAG': '5419',
 'sched_time_diff_seconds': 123,
 'service_id': 1,
 'time_stamp': 1477639677.0,
 'trip_id': 7253717,
 'trip_id_iso': '7253717_2016-10-28_HKH3O',
 'trip_start': 1}

In [319]:
# Display the labeller's gtfs_directory attribute.
# NOTE(review): presumably the GTFS feed the labeller matched against - confirm in labelling.py.
labeler.gtfs_directory

'sfmta_2017-02-10'

In [15]:
# Distinct stop_id values among the labeller's schedule rows whose
# stop_sequence is 1, selected in a single .loc call (row mask + column).
labeler.sched_trps.loc[
    lambda frame: frame['stop_sequence'] == 1,
    'stop_id',
].unique()

array([6293])

In [20]:
# Single place to change the GTFS feed being inspected; previously the same
# directory was repeated in each of the three paths below.
GTFS_FEED_DIR = 'data/gtfs/sfmta_2017-02-10'

# GTFS text files are comma-separated, so read_csv handles them directly.
routes = pd.read_csv(GTFS_FEED_DIR + '/routes.txt')
trips = pd.read_csv(GTFS_FEED_DIR + '/trips.txt')
sched = pd.read_csv(GTFS_FEED_DIR + '/stop_times.txt')

In [23]:
# Show the routes row(s) whose short name is the string '33'.
routes.loc[routes['route_short_name'].eq('33')]

Unnamed: 0,route_id,agency_id,route_short_name,route_long_name,route_desc,route_type,route_url,route_color,route_text_color
39,11668,SFMTA,33,ASHBURY-18TH ST,,3,,,


In [29]:
# Look the route_id(s) up from the routes table instead of hard-coding 11668,
# so this keeps working if a different feed assigns route 33 another id.
route_33_ids = routes.loc[routes['route_short_name'] == '33', 'route_id']

# Trips on route 33 with direction_id 0.
# NOTE(review): the variable name implies direction_id 0 is eastbound - confirm against the feed.
east_33 = trips[trips['route_id'].isin(route_33_ids) & (trips['direction_id'] == 0)]

In [36]:
# Distinct trip_id values among the selected route-33 trips.
trip_ids = east_33['trip_id'].unique()

In [43]:
# Keep the stop_times rows that belong to the selected trips and have
# stop_sequence 1, then list the distinct stop ids found there.
trip_sched = sched.loc[
    sched['trip_id'].isin(trip_ids) & sched['stop_sequence'].eq(1)
]
trip_sched['stop_id'].unique()

array([6293])