# Imports

In [7]:
import gzip
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from IPython.display import display

%matplotlib inline

# Global plotting/display configuration for the whole notebook.
# Seaborn "white" style; the axes face colour is the RGBA tuple (0, 0, 0, 0), i.e. alpha 0.
sns.set(style="white", rc={"axes.facecolor": (0, 0, 0, 0)})
# "talk" context and the 'Set2' palette with 10 colours.
sns.set_context("talk")
sns.set_palette('Set2', 10)
# None removes the cap on how many DataFrame columns pandas displays.
pd.set_option('display.max_columns', None)

In [8]:
# Default size applied to every figure created after this cell: (width, height) = (20, 10),
# in matplotlib's figsize units (inches).
plt.rcParams['figure.figsize'] = (20, 10) # set default size of plots

# Getting SIRI data
## aggregate_route - aggregates the data of the given routes for a single date, using SIRI files that already exist in a local folder

## process_dates - downloads any missing SIRI files from the cloud and aggregates the given routes over a span of consecutive days

In [47]:
import os

def aggregate_route(folder, date, route_ids):
    """Aggregate one day of SIRI real-time records into one row per trip.

    Every gzip-compressed CSV in ``folder`` whose file name contains ``date`` is
    read, rows are restricted to ``route_ids`` and then collapsed to one row per
    ``(route_id, planned_start_time)``.

    Parameters
    ----------
    folder : str
        Directory holding the SIRI ``*gz`` CSV files.
    date : str
        Day to process, formatted ``YYYY-MM-DD``. It selects the files (by name)
        and is the date attached to the ``predicted_end_time`` clock strings.
    route_ids : list-like
        Route ids to keep.

    Returns
    -------
    pandas.DataFrame
        Columns ``route_id``, ``planned_start_time``, ``predicted_end_time``
        (earliest prediction seen), ``timestamp`` (latest record seen), ``file``,
        ``start_hour``, ``time_delta`` (minutes between the last record and the
        predicted end; positive means late) and ``route_completed``.
        An empty frame with these columns is returned when there are no files
        for ``date`` or no rows for ``route_ids``.
    """
    # A trip first recorded within this many seconds after midnight may really
    # have departed the previous evening.
    midnight_grace_seconds = 180
    # Trips whose last record is this close to the end of the day are treated as
    # cut off by the day boundary rather than completed.
    end_of_day_margin_seconds = 120
    # Trips whose last record is more than this many minutes before the predicted
    # end are treated as not completed.
    max_minutes_early = 20

    result_columns = ['route_id', 'planned_start_time', 'predicted_end_time', 'timestamp',
                      'file', 'start_hour', 'time_delta', 'route_completed']

    def get_merged_df(folder, date, route_ids):
        # Creates one dataframe from all of the date's files in the folder
        file_list = sorted(x for x in os.listdir(folder) if date in x and x.endswith('gz'))
        frames = []
        for file in file_list:
            # read_csv decompresses the file itself; no separate gzip handle is needed
            data = pd.read_csv(os.path.join(folder, file), compression='gzip')
            # .copy() so that adding the 'file' column does not write into a slice
            data = data[data.route_id.isin(route_ids)].copy()
            data['file'] = file
            frames.append(data)
        # A single concat replaces the quadratic (and removed) DataFrame.append loop
        return pd.concat(frames) if frames else pd.DataFrame()

    def create_start_date_timestamp(df):
        # Turns planned_start_time into a full timestamp carrying the real departure date
        trip_keys = ['bus_id', 'planned_start_time', 'date']
        df_mask = df.groupby(trip_keys)['time_recorded'].min().reset_index()

        def actual_date(row):
            # Returns the actual date the bus started the route
            date = row['date']
            first_seen = pd.Timestamp(date + ' ' + row['time_recorded'])
            day_start = pd.Timestamp(f'{date} 00:00:00')
            seen_right_after_midnight = (
                (first_seen - day_start).total_seconds() < midnight_grace_seconds)
            if seen_right_after_midnight and pd.Timestamp(row['planned_start_time']).hour > 12:
                # First record is just after 00:00 but the planned start is in the
                # afternoon/evening, so the trip departed the day before.
                return (pd.Timestamp(date) - pd.DateOffset(1)).strftime('%Y-%m-%d')
            return date

        df_mask['departure_date'] = df_mask.apply(actual_date, axis=1)
        df_mask = df_mask.drop(['time_recorded'], axis=1)
        # Merge on the same keys the mask was grouped by; merging on fewer keys
        # would pair every row with every date the trip appears under.
        df = df.merge(df_mask, on=trip_keys)
        start_strings = df['departure_date'] + ' ' + df['planned_start_time']
        df['planned_start_time'] = pd.to_datetime(start_strings.map(pd.Timestamp))
        return df.drop(['departure_date'], axis=1)

    data = get_merged_df(folder, date, route_ids)
    if data.empty:
        # Nothing to aggregate: return the normal schema instead of failing downstream
        return pd.DataFrame(columns=result_columns).astype({'route_completed': bool})

    # converting time objects (element-wise parsing, as the per-row version did)
    data['timestamp'] = pd.to_datetime(data['timestamp'].map(pd.Timestamp))
    data = create_start_date_timestamp(data)
    end_strings = date + ' ' + data['predicted_end_time']
    data['predicted_end_time'] = pd.to_datetime(end_strings.map(pd.Timestamp))

    # One row per trip: earliest predicted end, latest record, last file name.
    # Columns are selected with a list; tuple selection is rejected by current pandas.
    data = (
        data.groupby(['route_id', 'planned_start_time'])[['predicted_end_time', 'timestamp', 'file']]
        .agg({'predicted_end_time': 'min', 'timestamp': 'max', 'file': 'max'})
        .reset_index()
    )
    data['start_hour'] = data.planned_start_time.dt.time

    # Signed delay in minutes at whole-second resolution: like Timedelta.seconds,
    # the sub-second part and any whole days of the difference are dropped.
    delta = data['timestamp'] - data['predicted_end_time']
    minutes = delta.abs().dt.seconds / 60
    data['time_delta'] = minutes.where(delta >= pd.Timedelta(0), -minutes)

    # filter results from next day
    next_day = pd.Timestamp(date) + pd.offsets.Day(1)
    cut_off_by_midnight = (
        (next_day - data['timestamp']).dt.total_seconds() < end_of_day_margin_seconds)
    stopped_too_early = data['time_delta'] < -max_minutes_early
    data['route_completed'] = ~(cut_off_by_midnight | stopped_too_early)
    return data

In [10]:
from gtfs.gtfs_utils.gtfs_utils.configuration import configuration
from gtfs.gtfs_utils.gtfs_utils import environment
from gtfs.gtfs_utils.gtfs_utils import s3_wrapper

# Initialise the gtfs_utils environment first (presumably this is what populates
# `configuration` - confirm against gtfs_utils), then build the S3 client from the
# `s3` section. The resulting `crud` is the module-level client that process_dates
# uses to list and download SIRI files.
environment.init_conf()
crud = s3_wrapper.S3Crud.from_configuration(configuration.s3)

In [11]:
def process_dates(download_folder, start_date, days, route_ids, remove=False):
    """Download and aggregate SIRI data for a run of consecutive days.

    For each day, the files listed in the cloud for that date are downloaded into
    ``download_folder`` unless already present, then the day is aggregated with
    ``aggregate_route``. Relies on the module-level ``crud`` client and
    ``s3_wrapper`` module.

    Parameters
    ----------
    download_folder : str
        Local directory used as the file cache.
    start_date : datetime.datetime or datetime.date
        First day to process.
    days : int
        Number of consecutive days, starting at ``start_date``.
    route_ids : list-like
        Route ids to keep.
    remove : bool, default False
        When True, delete each day's cloud-listed files from ``download_folder``
        once the day has been processed. Other content of the folder is left alone.

    Returns
    -------
    pandas.DataFrame
        The completed trips of all days (rows with a true ``route_completed``),
        without the ``route_completed`` column. An empty frame when ``days`` is 0.
    """
    daily_frames = []

    for offset in range(days):
        date = (start_date + datetime.timedelta(days=offset)).strftime('%Y-%m-%d')

        # Download the dates files if they don't exist
        local_files = []
        for file in s3_wrapper.list_content(crud, regex_argument=date):
            file_name = file['Key'].split('/')[-1]
            local_file = os.path.join(download_folder, file_name)
            local_files.append(local_file)
            if not os.path.exists(local_file):
                print('Downloading', file_name)
                s3_wrapper.download(crud, local_file, file['Key'])

        # Process the files
        print('Processing date', date)
        daily_frames.append(aggregate_route(download_folder, date, route_ids))

        if remove:
            # Remove only this date's cloud-listed files: they can be fetched again,
            # whereas wiping the whole folder could destroy unrelated data.
            print('Removing files')
            for local_file in local_files:
                if os.path.isfile(local_file):
                    os.remove(local_file)

    if not daily_frames:
        return pd.DataFrame()

    # One concat at the end instead of growing a frame with DataFrame.append
    df = pd.concat(daily_frames)
    df = df[df.route_completed.astype(bool)]
    return df.drop('route_completed', axis=1)

In [50]:
# Aggregate a single day (2019-04-01) from the files already present in the local folder.
# os.path.join yields the same 'testing\\siri' on Windows and a valid path elsewhere.
other = aggregate_route(os.path.join('testing', 'siri'), '2019-04-01', [17906, 17905, 23821, 23822, 2482, 2484])

In [51]:
# Show the single-day aggregate built by aggregate_route for 2019-04-01.
other

Unnamed: 0,route_id,planned_start_time,predicted_end_time,timestamp,file,start_hour,time_delta,route_completed
0,2482,2019-04-01 05:40:00,2019-04-01 06:17:00,2019-04-01 06:22:14.454,siri_rt_data.2019-04-01.0.csv.gz,05:40:00,5.233333,True
1,2482,2019-04-01 05:55:00,2019-04-01 06:32:00,2019-04-01 06:32:17.402,siri_rt_data.2019-04-01.0.csv.gz,05:55:00,0.283333,True
2,2482,2019-04-01 06:10:00,2019-04-01 06:49:00,2019-04-01 06:52:21.133,siri_rt_data.2019-04-01.0.csv.gz,06:10:00,3.350000,True
3,2482,2019-04-01 06:20:00,2019-04-01 06:59:00,2019-04-01 07:05:25.100,siri_rt_data.2019-04-01.0.csv.gz,06:20:00,6.416667,True
4,2482,2019-04-01 06:30:00,2019-04-01 07:13:00,2019-04-01 07:15:25.733,siri_rt_data.2019-04-01.0.csv.gz,06:30:00,2.416667,True
...,...,...,...,...,...,...,...,...
343,23822,2019-04-01 21:40:00,2019-04-01 22:36:00,2019-04-01 22:38:14.843,siri_rt_data.2019-04-01.9.csv.gz,21:40:00,2.233333,True
344,23822,2019-04-01 22:00:00,2019-04-01 22:54:00,2019-04-01 22:56:14.805,siri_rt_data.2019-04-01.9.csv.gz,22:00:00,2.233333,True
345,23822,2019-04-01 22:20:00,2019-04-01 23:12:00,2019-04-01 23:17:14.828,siri_rt_data.2019-04-01.9.csv.gz,22:20:00,5.233333,True
346,23822,2019-04-01 22:40:00,2019-04-01 23:24:00,2019-04-01 23:23:14.782,siri_rt_data.2019-04-01.9.csv.gz,22:40:00,-0.750000,True


In [52]:
# Ten consecutive days starting 2019-04-01, for the same six routes as the single-day check.
start_date = datetime.datetime.strptime('2019-04-01', '%Y-%m-%d')
days = 10
# Relative to the notebook's working directory, like the other data paths in this
# notebook, instead of an absolute path that exists on one machine only.
download_folder = os.path.join('testing', 'siri')
df = process_dates(download_folder, start_date, days, [17906, 17905, 23821, 23822, 2482, 2484])

Processing date 2019-04-01
Processing date 2019-04-02
Processing date 2019-04-03
Processing date 2019-04-04
Processing date 2019-04-05
Processing date 2019-04-06
Processing date 2019-04-07
Processing date 2019-04-08
Processing date 2019-04-09
Processing date 2019-04-10


# Getting GTFS data

In [54]:
# GTFS statistics files for 2019-04-01: gzip-compressed pickles of route and trip stats.
# os.path.join keeps the relative paths usable on any OS.
routes_file = os.path.join('gtfs_stats_hack', '2019-04-01_route_stats.pkl.gz')
trips_file = os.path.join('gtfs_stats_hack', '2019-04-01_trip_stats.pkl.gz')
# NOTE(security): pickle.load can execute arbitrary code embedded in the file.
# Only load stats files that were produced by a trusted source.
with gzip.open(routes_file, 'rb') as f:
    routes = pickle.load(f)
with gzip.open(trips_file, 'rb') as f:
    trips = pickle.load(f)

def get_route(name, agency_id):
    """Return the rows of the global ``routes`` frame for one line of one agency.

    ``name`` and ``agency_id`` are converted with ``str`` before being compared
    with the ``route_short_name`` and ``agency_id`` columns.
    """
    same_name = routes.route_short_name == str(name)
    same_agency = routes.agency_id == str(agency_id)
    return routes[same_name & same_agency]

def plot_route(line):
    """Plot the mean ``time_delta`` per ``start_hour`` for one route.

    ``line`` is matched against the ``route_id`` column of the global ``df``.
    The plot is drawn with a grid; nothing is returned.
    """
    route_rows = df[df.route_id == line]
    mean_delta_by_start = route_rows.groupby(['start_hour'])['time_delta'].mean()
    mean_delta_by_start.plot(grid=True)

def merge_siri_gtfs(siri_df, gtfs_df):
    """Attach the GTFS line name and direction to SIRI rows.

    Parameters
    ----------
    siri_df : pandas.DataFrame
        SIRI rows; must contain ``route_id``.
    gtfs_df : pandas.DataFrame
        GTFS route stats with ``route_id``, ``route_short_name``,
        ``start_stop_city`` and ``end_stop_city``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        ``siri_df`` inner-merged with ``route_id`` (converted with ``int``),
        ``route_short_name`` and ``direction`` (``"<start city> -> <end city>"``).
        The merge uses every column the two frames share, so merging a frame that
        already carries ``route_short_name``/``direction`` matches on those too.
    """
    # Build the lookup from just the needed columns. assign() works on its own copy,
    # so there is no write into a slice and no copy of the full GTFS frame.
    route_info = gtfs_df[['route_id', 'route_short_name']].assign(
        route_id=lambda frame: frame['route_id'].apply(int),
        direction=gtfs_df['start_stop_city'] + ' -> ' + gtfs_df['end_stop_city'],
    )
    return siri_df.merge(route_info)

In [59]:
# Add route_short_name and direction from the GTFS route stats to the SIRI rows.
# NOTE(review): this rebinds `df`, so the frame returned by process_dates is no
# longer reachable afterwards - re-run the process_dates cell to get it back.
df = merge_siri_gtfs(df, routes)

In [60]:
# Show the SIRI rows after the GTFS columns were merged in.
df

Unnamed: 0,route_id,planned_start_time,predicted_end_time,timestamp,file,start_hour,time_delta,route_short_name,direction
0,2482,2019-04-01 05:40:00,2019-04-01 06:17:00,2019-04-01 06:22:14.454,siri_rt_data.2019-04-01.0.csv.gz,05:40:00,5.233333,49,פתח תקווה -> תל אביב יפו
1,2482,2019-04-01 05:55:00,2019-04-01 06:32:00,2019-04-01 06:32:17.402,siri_rt_data.2019-04-01.0.csv.gz,05:55:00,0.283333,49,פתח תקווה -> תל אביב יפו
2,2482,2019-04-01 06:10:00,2019-04-01 06:49:00,2019-04-01 06:52:21.133,siri_rt_data.2019-04-01.0.csv.gz,06:10:00,3.350000,49,פתח תקווה -> תל אביב יפו
3,2482,2019-04-01 06:20:00,2019-04-01 06:59:00,2019-04-01 07:05:25.100,siri_rt_data.2019-04-01.0.csv.gz,06:20:00,6.416667,49,פתח תקווה -> תל אביב יפו
4,2482,2019-04-01 06:30:00,2019-04-01 07:13:00,2019-04-01 07:15:25.733,siri_rt_data.2019-04-01.0.csv.gz,06:30:00,2.416667,49,פתח תקווה -> תל אביב יפו
...,...,...,...,...,...,...,...,...,...
2770,23822,2019-04-10 21:40:00,2019-04-10 22:32:00,2019-04-10 22:38:41.592,siri_rt_data.2019-04-10.12.csv.gz,21:40:00,6.683333,20,תל אביב יפו -> פתח תקווה
2771,23822,2019-04-10 22:00:00,2019-04-10 22:52:00,2019-04-10 22:58:41.540,siri_rt_data.2019-04-10.12.csv.gz,22:00:00,6.683333,20,תל אביב יפו -> פתח תקווה
2772,23822,2019-04-10 22:20:00,2019-04-10 23:09:00,2019-04-10 23:12:41.486,siri_rt_data.2019-04-10.12.csv.gz,22:20:00,3.683333,20,תל אביב יפו -> פתח תקווה
2773,23822,2019-04-10 22:40:00,2019-04-10 23:24:00,2019-04-10 23:28:41.448,siri_rt_data.2019-04-10.12.csv.gz,22:40:00,4.683333,20,תל אביב יפו -> פתח תקווה


In [58]:
# Distinct line names (route_short_name) present in the merged frame.
df['route_short_name'].unique()

array(['49', '64', '20'], dtype=object)

In [64]:

# All merged rows that belong to route_id 23821.
df.loc[df['route_id'] == 23821]

Unnamed: 0,route_id,planned_start_time,predicted_end_time,timestamp,file,start_hour,time_delta,route_short_name,direction
1899,23821,2019-03-31 23:35:00,2019-04-01 00:16:00,2019-04-01 00:13:59.797,siri_rt_data.2019-04-01.0.csv.gz,23:35:00,-2.000000,20,פתח תקווה -> תל אביב יפו
1900,23821,2019-04-01 05:15:00,2019-04-01 06:10:00,2019-04-01 06:13:20.158,siri_rt_data.2019-04-01.0.csv.gz,05:15:00,3.333333,20,פתח תקווה -> תל אביב יפו
1901,23821,2019-04-01 05:45:00,2019-04-01 06:41:00,2019-04-01 06:41:25.881,siri_rt_data.2019-04-01.0.csv.gz,05:45:00,0.416667,20,פתח תקווה -> תל אביב יפו
1902,23821,2019-04-01 06:00:00,2019-04-01 06:59:00,2019-04-01 07:01:31.007,siri_rt_data.2019-04-01.0.csv.gz,06:00:00,2.516667,20,פתח תקווה -> תל אביב יפו
1903,23821,2019-04-01 06:15:00,2019-04-01 07:19:00,2019-04-01 07:23:33.066,siri_rt_data.2019-04-01.1.csv.gz,06:15:00,4.550000,20,פתח תקווה -> תל אביב יפו
...,...,...,...,...,...,...,...,...,...
2321,23821,2019-04-10 20:55:00,2019-04-10 21:47:00,2019-04-10 21:46:45.225,siri_rt_data.2019-04-10.12.csv.gz,20:55:00,-0.233333,20,פתח תקווה -> תל אביב יפו
2322,23821,2019-04-10 21:15:00,2019-04-10 22:05:00,2019-04-10 22:06:45.163,siri_rt_data.2019-04-10.12.csv.gz,21:15:00,1.750000,20,פתח תקווה -> תל אביב יפו
2323,23821,2019-04-10 21:35:00,2019-04-10 22:24:00,2019-04-10 22:31:45.112,siri_rt_data.2019-04-10.12.csv.gz,21:35:00,7.750000,20,פתח תקווה -> תל אביב יפו
2324,23821,2019-04-10 22:15:00,2019-04-10 23:06:00,2019-04-10 23:08:44.976,siri_rt_data.2019-04-10.12.csv.gz,22:15:00,2.733333,20,פתח תקווה -> תל אביב יפו
