In [68]:
import pandas as pd
from ftplib import FTP
from datetime import datetime
import time
import numpy as np
import pymongo
from pymongo import MongoClient
from geopy import distance

import get_recent_days as gtdys
import muni_etl
import labelling as lblng

Build a DataFrame that describes each day we will be loading and labeling

In [77]:
# How many days to ask the helper for (presumably the most recent ones,
# going by the helper's name -- confirm in get_recent_days.py).
N_RECENT_DAYS = 20
file_df = gtdys.x_recent_days(N_RECENT_DAYS)

In [78]:
# Keep a single randomly chosen day for this test run.  The fixed seed makes
# the choice reproducible across "Restart Kernel -> Run All"; change the seed
# (or pass random_state=None) to draw a different day.
SAMPLE_SEED = 0
file_df = file_df.sample(1, random_state=SAMPLE_SEED)

In [79]:
# Display the row that was kept by the sampling step.
file_df

Unnamed: 0,ftp_filename,iso_string,time_stamp,gtfs_directory
1,sfmtaAVLRawData12042016.csv,2016-12-04,1480838000.0,sfmta_2017-02-10


Load in the Block and Sign reference dataframes

This gives us the appropriate blocks for each day

In [80]:
# Sign-up period lookup; the columns at positions 2 and 3 are parsed as dates.
signref = pd.read_csv(
    'data/lookUpSignUpPeriods.csv',
    parse_dates=[2, 3],
)

# Block lookup table (block ID to block number/name, going by the file name).
blockref = pd.read_csv('data/lookUpBlockIDToBlockNumNam.csv')

Connect to MongoDB, create our database and two collections

In [81]:
# Connect to the MongoDB server on localhost:27017.
client = MongoClient('localhost', 27017)

# Handles for the test database and its raw-input / labeled-output collections.
db = client['avl_pipeline_test']
in_collection = db['avl_raw']
out_collection = db['labeled_trips']

# Optional - Clean the collections
# An empty filter matches every document, so both collections end up empty.
for _stale_collection in (in_collection, out_collection):
    _stale_collection.delete_many({})

In [82]:
# Empty the output collection again (the connection cell above already clears
# it, so this is only needed when re-running the labeling steps on their own).
# The trailing ';' suppresses the result display.
out_collection.delete_many({});

For each day, filter and load in the data from the FTP server

In [83]:
# Run the ETL once for every row (day) in file_df.
# The target database and collection names are read from the handles created
# in the connection cell instead of being typed out a second time, so the
# loader always writes to the collection this notebook inspects afterwards.
for _row_label, day in file_df.iterrows():
    print("Loading Data from: " + day['iso_string'])
    # iterrows() yields each row as a Series; transpose it back into a
    # one-row DataFrame before handing it to MuniETL.
    day_frame = day.to_frame().T
    etl = muni_etl.MuniETL(day_frame, blockref, signref, db.name, in_collection.name)
    etl.run_everything()

Loading Data from: 2016-12-04


Verify the amount of data in our collection

In [84]:
# Count every document in the raw collection.
# Cursor.count() was deprecated in PyMongo 3.7 and removed in 4.0;
# count_documents({}) is the supported equivalent (requires PyMongo >= 3.7).
in_collection.count_documents({})

9515

Great! Now let's label everything

Let's create a class instance

In [85]:
# Build the labeler from the two collection handles.
# NOTE(review): presumably it reads raw documents from in_collection and writes
# labeled ones to out_collection -- confirm in labelling.py.
labeler = lblng.Labeling(in_collection, out_collection)

Let's find all the trip starts

In [86]:
# Trip-start detection step; the recorded output shows it printing a start
# count and a duplicate-ID count.
labeler.label_single_starts()

Start Count:  53


Duplicate ID Count:  0




Finally, let's use all those starts to label the rest of the data!

In [87]:
# Trip labeling step; the recorded output shows it printing totals for good,
# empty, sparse, dense and 'endless' trips.
labeler.label_trips()

Total Good Trips:  52


Total Emtpy Trips:  0


Total Sparse Trips:  0


Total Dense Trips:  0


Total 'Endless' Trips:  1


Sweet!! How many documents are in our labeled collection?

In [10]:
# Count every document in the labeled collection.
# Cursor.count() was deprecated in PyMongo 3.7 and removed in 4.0;
# count_documents({}) is the supported equivalent (requires PyMongo >= 3.7).
out_collection.count_documents({})

10554

In [16]:
# Show the labeler's gtfs_directory attribute (a GTFS feed directory name).
labeler.gtfs_directory

'sfmta_2017-02-10'

In [15]:
# Which stop_id values appear at stop_sequence 1 in the labeler's schedule?
first_stop_mask = labeler.sched_trps['stop_sequence'] == 1
labeler.sched_trps.loc[first_stop_mask, 'stop_id'].unique()

array([6293])

In [20]:
# Load the routes, trips and stop_times tables of the 2017-02-10 SFMTA GTFS feed.
routes, trips, sched = (
    pd.read_csv(gtfs_path)
    for gtfs_path in (
        'data/gtfs/sfmta_2017-02-10/routes.txt',
        'data/gtfs/sfmta_2017-02-10/trips.txt',
        'data/gtfs/sfmta_2017-02-10/stop_times.txt',
    )
)

In [23]:
# Look up the routes.txt row(s) whose short name is '33' to find its route_id.
is_route_33 = routes['route_short_name'] == '33'
routes.loc[is_route_33]

Unnamed: 0,route_id,agency_id,route_short_name,route_long_name,route_desc,route_type,route_url,route_color,route_text_color
39,11668,SFMTA,33,ASHBURY-18TH ST,,3,,,


In [29]:
# Trips on route_id 11668 (the id shown for route 33 in routes.txt) with
# direction_id 0 -- presumably the eastbound direction, going by the name.
on_route_11668 = trips['route_id'] == 11668
in_direction_0 = trips['direction_id'] == 0
east_33 = trips[on_route_11668 & in_direction_0]

In [36]:
# Distinct trip_id values among the selected route-33 trips.
trip_ids = east_33['trip_id'].unique()

In [43]:
# stop_times rows that belong to the selected trips and are the first stop
# (stop_sequence 1), then the distinct stop_id values among them.
belongs_to_selected_trip = sched['trip_id'].isin(trip_ids)
is_first_stop = sched['stop_sequence'] == 1
trip_sched = sched[belongs_to_selected_trip & is_first_stop]
trip_sched['stop_id'].unique()

array([6293])