In [14]:
import pandas as pd
import numpy as np
import pickle
from os import listdir

In [15]:
df_files = [f for f in listdir() if f[-2:] == '.p'] # list all file names of pickled dataframes

In [16]:
bus_line = 203 # bus line number of interest

In [17]:
journey_cols = ['timestamp', 'event', 'line', 'vehicle_id', 'longitude', 'latitude', 'journey_number', 'segment_number']

In [24]:
df_buslines = pd.DataFrame(columns=journey_cols).astype(dtype={
                    'timestamp': 'object', 
                    'event': 'object',
                    'line': 'int64',
                    'vehicle_id': 'int64', 
                    'longitude': 'float64', 
                    'latitude': 'float64',
                    'journey_number': 'int64',
                    'segment_number': 'int64'
})

In [19]:
one_file = ['20180216-9.p']

In [18]:
df_files 

['20180216-9.p',
 '20180216-3.p',
 '20180216-10.p',
 '20180216-11.p',
 '20180216-8.p',
 '20180216-12.p',
 '20180216-4.p',
 '20180216-6.p',
 '20180216-2.p',
 '20180216-7.p',
 '20180216-5.p',
 '20180216-1.p']

In [25]:
journey_number = 1
# iterate over all pickled dataframes
for file in df_files:
    print(f'** Starting on new file: {file} **')
    df = pickle.load(open(file, 'rb'))
    # get the unique vehicle ids that drove bus line of interest
    vehicle_ids = df[(df['line'] == bus_line)]['vehicle_id'].unique()
    # iterate over all vehicle ids
    for vid in vehicle_ids:
        print(f'Startng on new vehicle with id: {vid}')
        vals = list()
        started = False
        # extract slice of df for this vehicle id and sort by timestamp
        df_ = df[df['vehicle_id'] == vid].copy().sort_values('timestamp')
        for row in df_.itertuples():
            # start collecting data at when JourneyStartedEvent fires
            if not started and row[2] == 'JourneyStartedEvent' and row[3] == bus_line:
                started = True
                segment_number = 1
                vals.append([x for x in row[1:]] + [journey_number, segment_number])
                continue
            # as long as the bus line is the same, we are on the same route, 
            # since we are only looking at a single vehicle and events are sorted by timestamp
            # keep collecting data until we hit a JourneyCompletedEvent
            if started and (row[3] == bus_line or row[3] == 0):
                if row[2] == 'EnteredEvent':
                    segment_number += 1
                vals.append([x for x in row[1:]] + [journey_number, segment_number])
                # when the JourneyCompletedEvent is hit, we save this journey to our df and start over
                if row[2] == 'JourneyCompletedEvent':
                    df2 = df2.append(pd.DataFrame(data=vals, columns=journey_cols)).astype(dtype={
                            'timestamp': 'object', 
                            'event': 'object',
                            'line': 'int64',
                            'vehicle_id': 'int64', 
                            'longitude': 'float64', 
                            'latitude': 'float64',
                            'journey_number': 'int64',
                            'segment_number': 'int64'
                    })
                    print(f'Successfully collected {journey_number} journeys!')
                    started = False
                    vals = list()
                    journey_number += 1
            # if bus line changed anywhere in the middle of this sequence, something went wrong
            # we then scrap the collected data and start over
            else:
                started = False
                vals = list()

** Starting on new file: 20180216-9.p **
Startng on new vehicle with id: 5479
Startng on new vehicle with id: 5486
Startng on new vehicle with id: 5431
Successfully collected 1 journeys!
Startng on new vehicle with id: 5484
Startng on new vehicle with id: 5422
Startng on new vehicle with id: 5490
Successfully collected 2 journeys!
Startng on new vehicle with id: 5441
Successfully collected 3 journeys!
Successfully collected 4 journeys!
Startng on new vehicle with id: 5478
Successfully collected 5 journeys!
Startng on new vehicle with id: 5457
Successfully collected 6 journeys!
Startng on new vehicle with id: 5485
Successfully collected 7 journeys!
Startng on new vehicle with id: 5418
Startng on new vehicle with id: 5428
Successfully collected 8 journeys!
** Starting on new file: 20180216-3.p **
Startng on new vehicle with id: 5490
Startng on new vehicle with id: 5436
Successfully collected 9 journeys!
Startng on new vehicle with id: 5453
Successfully collected 10 journeys!
Startng on n

Startng on new vehicle with id: 5488


In [26]:
 pickle.dump(df2, open('buslines/bus203.p', 'wb'))