In [1]:
import pandas as pd
from ftplib import FTP
from datetime import datetime
import time
import numpy as np
import pymongo
from pymongo import MongoClient
from geopy import distance

import get_recent_days as gtdys
import muni_etl
import labelling as lblng

Build a DataFrame with one row for each day we will be loading and labeling

In [2]:
# How many recent days to request from the helper module.
N_RECENT_DAYS = 30

# One row per day to process; the loading loop further down reads each row's 'iso_string'.
file_df = gtdys.x_recent_days(N_RECENT_DAYS)

Load in the Block and Sign reference dataframes

This gives us the appropriate blocks for each day

In [3]:
# Block lookup table (judging by the file name, maps block IDs to block
# numbers/names -- confirm against the CSV header).
blockref = pd.read_csv(
    'data/lookUpBlockIDToBlockNumNam.csv',
)

# Sign-up period lookup table; the columns at positions 2 and 3 are parsed as
# datetimes (presumably the period start/end -- confirm against the CSV header).
signref = pd.read_csv(
    'data/lookUpSignUpPeriods.csv',
    parse_dates=[2, 3],
)

Connect to MongoDB and create our database and two collections

In [4]:
# Connect to the MongoDB server on localhost, port 27017.
client = MongoClient(host='localhost', port=27017)

# Handles for the pipeline database and its two collections.
db = client.get_database('avl_pipeline_test')
in_collection = db.get_collection('avl_raw')         # input side of the labeling step
out_collection = db.get_collection('labeled_trips')  # output side of the labeling step

# Optional - empty both collections so this run starts from a clean slate
for collection in (in_collection, out_collection):
    collection.delete_many({})

For each day, filter and load in the data from the FTP server

In [5]:
# Run the ETL once per day listed in file_df.
# The database and collection names are read from the handles opened in the
# MongoDB setup cell instead of being repeated as string literals, so the names
# passed to the ETL always match the collection this notebook cleans and counts.
for _, day_row in file_df.iterrows():
    print("Loading Data from: " + day_row['iso_string'])
    # MuniETL is handed a one-row DataFrame rather than a Series.
    single_day_df = day_row.to_frame().T
    etl = muni_etl.MuniETL(single_day_df, blockref, signref, db.name, in_collection.name)
    etl.run_everything()

Loading Data from: 2016-12-05
Loading Data from: 2016-12-04
Loading Data from: 2016-12-03
Loading Data from: 2016-11-23
Loading Data from: 2016-11-22
Loading Data from: 2016-11-21
Loading Data from: 2016-11-20
Loading Data from: 2016-11-19
Loading Data from: 2016-11-18
Loading Data from: 2016-11-17
Loading Data from: 2016-11-16
Loading Data from: 2016-11-15
Loading Data from: 2016-11-14
Loading Data from: 2016-11-13
Loading Data from: 2016-11-12
Loading Data from: 2016-11-11
Loading Data from: 2016-11-10
Loading Data from: 2016-11-09
Loading Data from: 2016-11-08
Loading Data from: 2016-11-07
Loading Data from: 2016-11-06
Loading Data from: 2016-11-05
Loading Data from: 2016-11-04
Loading Data from: 2016-11-03
Loading Data from: 2016-11-02
Loading Data from: 2016-11-01
Loading Data from: 2016-10-31
Loading Data from: 2016-10-30
Loading Data from: 2016-10-29
Loading Data from: 2016-10-28


Verify the amount of data in our collection

In [6]:
# Cursor.count() was deprecated in PyMongo 3.7 and removed in 4.0;
# count_documents({}) counts every document in the collection.
in_collection.count_documents({})

283822

Great! Now let's label everything

Let's create a class instance

In [7]:
# Labeling is defined in the project-local `labelling` module (not shown here).
# NOTE(review): presumably reads from in_collection and writes to out_collection -- confirm.
labeler = lblng.Labeling(in_collection, out_collection)

Let's find all the trip starts

In [8]:
# Per the recorded output, this prints a start count and a duplicate-ID count.
labeler.label_single_starts()

Start Count:  301


Duplicate ID Count:  0




Finally, let's use all those starts to label the rest of the data!

In [9]:
# Per the recorded output, this prints totals for good, empty, sparse, dense and
# 'endless' trips. NOTE(review): presumably writes the labeled documents to
# out_collection -- confirm in the labelling module.
labeler.label_trips()

Total Good Trips:  285


Total Emtpy Trips:  0


Total Sparse Trips:  11


Total Dense Trips:  4


Total 'Endless' Trips:  5


Sweet!! How many documents are in our labeled collection?

In [10]:
# Cursor.count() was deprecated in PyMongo 3.7 and removed in 4.0;
# count_documents({}) counts every document in the collection.
out_collection.count_documents({})

22232