# Import packages

In [1]:
import numpy as np
import altair as alt
import pandas as pd
import geopandas as gpd
from datetime import timedelta
from scipy import stats
import statsmodels.formula.api as sm
from matplotlib import pyplot as plt


from helper_functions import open_delays, open_processed_gtfs, create_timesteps

# alt.data_transformers.disable_max_rows()

# Notebook settings

In [2]:
# Input locations, relative to the notebook's working directory.
# Forward slashes are used because they are accepted on Windows as well as on
# POSIX systems; the previous raw strings contained doubled backslashes and
# could only be resolved on Windows.
DELAYS_FILEPATH = 'Intermediates/delays.csv'
LINKS_FILEPATH = 'Intermediates/GTFS_20190914/gtfs_links_corrected.geojson'
STATIONS_FILEPATH = 'Intermediates/GTFS_20190914/gtfs_stations_corrected.geojson'

### MODEL SETTINGS
timestep_size = 60  # width of one timestep, in minutes
timeperiod_start = 5  # hour of day (0-24) at which the analysed period starts
timeperiod_end = 23  # hour of day (0-24) at which the analysed period ends

# scipy.stats distribution family whose .fit() is applied to the pooled delays
distribution = stats.exponnorm

# Load data

In [3]:
# Load the delay records through the project helper and preview the first rows.
# NOTE(review): the "Amount of delay values" line in the cell output is not
# printed here, so it presumably comes from open_delays itself - confirm.
delays = open_delays(DELAYS_FILEPATH)
delays.head(3)

Amount of delay values: 3_873_811


Unnamed: 0,line_name,trip_id,shape_id,stop,arrival_time,previous_stop,next_stop,real_arrival_time,delay
0,BLUE,3121350_18154_2019-09-14,46,G05,2019-09-14 06:54:00,,G04,2019-09-14 06:46:04,-476.0
1,BLUE,3121350_18154_2019-09-14,46,G04,2019-09-14 06:57:00,G05,G03,2019-09-14 06:56:12,-48.0
2,GREEN,3120258_18154_2019-09-14,117,F11,2019-09-14 07:00:00,,F10,2019-09-14 06:45:42,-858.0


In [4]:
# Read the network links GeoJSON into a GeoDataFrame and report its row count.
links = gpd.read_file(LINKS_FILEPATH)
# The ':_' format spec groups the digits with underscores (e.g. 3_873_811).
print(f"Amount of network connections: {len(links):_}")
links.head(3)

Amount of network connections: 189


Unnamed: 0,link_id,from_stop,to_stop,travel_time,geometry
0,A15_to_A14,A15,A14,240,"LINESTRING (-77.16476 39.11999, -77.14669 39.0..."
1,A14_to_A13,A14,A13,180,"LINESTRING (-77.14669 39.08545, -77.12079 39.0..."
2,A13_to_A12,A13,A12,180,"LINESTRING (-77.12079 39.06239, -77.11278 39.0..."


In [5]:
# Read the stations GeoJSON and, in the same step, rename 'stop_id' to 'stop'
# so the station key carries the same column name as in the delay records.
stations = (
    gpd.read_file(STATIONS_FILEPATH)
    .rename(columns={'stop_id': 'stop'})
)

print(f"Amount of network stations: {len(stations):_}")
stations.head(3)

Amount of network stations: 91


Unnamed: 0,stop,stop_name,connections,geometry
0,E02,SHAW METRO STATION,"[ [ ""E03"", ""E01"" ], [ ""E01"", ""E03"" ] ]",POINT (-77.02193 38.91455)
1,D11,CHEVERLY METRO STATION,"[ [ ""D10"", ""D12"" ], [ ""D12"", ""D10"" ] ]",POINT (-76.9151 38.91655)
2,G02,CAPITOL HEIGHTS METRO STATION,"[ [ ""G01"", ""G03"" ], [ ""G03"", ""G01"" ] ]",POINT (-76.91331 38.88957)


# Assign delays to timestep

In [6]:
# Window boundaries for the analysed period; each pair of consecutive entries
# delimits one timestep. Every entry is indexed below as (hour, minute).
timesteps = create_timesteps(timestep_size, timeperiod_start, timeperiod_end)

# rename() already returns a new frame, so `delays` itself stays untouched.
delays_assigned = delays.rename(columns={'arrival_time': 'sched_arrival_time'})

# Scheduled arrival expressed as seconds since midnight, computed once for the
# whole column instead of once per row and per window. The seconds component
# of the timestamp is intentionally ignored, exactly as before.
sched_arrival = pd.to_datetime(delays_assigned['sched_arrival_time'])
sched_seconds = sched_arrival.dt.hour * 3600 + sched_arrival.dt.minute * 60

# NaN marks rows that fall outside every window; they are dropped further down.
delays_assigned['timestep'] = np.nan
for time_step_begin, time_step_end in zip(timesteps[:-1], timesteps[1:]):
    begin_seconds = time_step_begin[0] * 3600 + time_step_begin[1] * 60
    end_seconds = time_step_end[0] * 3600 + time_step_end[1] * 60

    # Both bounds are inclusive, so an arrival exactly on a shared boundary is
    # matched by two consecutive windows and keeps the label written last.
    # Assuming create_timesteps returns the boundaries in ascending order, that
    # is the later window; only the final boundary stays in the last window.
    in_window = (sched_seconds >= begin_seconds) & (sched_seconds <= end_seconds)

    # The timestep label is the window start expressed in fractional hours.
    delays_assigned.loc[in_window.to_numpy(), 'timestep'] = time_step_begin[0] + time_step_begin[1] / 60

delays_assigned = delays_assigned[delays_assigned['timestep'].notna()]
print(f"Amount of values allocated to timesteps: {len(delays_assigned):_}")
delays_assigned.head(3)

Amount of values allocated to timesteps: 3_802_649


Unnamed: 0,line_name,trip_id,shape_id,stop,sched_arrival_time,previous_stop,next_stop,real_arrival_time,delay,timestep
0,BLUE,3121350_18154_2019-09-14,46,G05,2019-09-14 06:54:00,,G04,2019-09-14 06:46:04,-476.0,6.0
1,BLUE,3121350_18154_2019-09-14,46,G04,2019-09-14 06:57:00,G05,G03,2019-09-14 06:56:12,-48.0,6.0
2,GREEN,3120258_18154_2019-09-14,117,F11,2019-09-14 07:00:00,,F10,2019-09-14 06:45:42,-858.0,7.0


# Assign delays to dates

In [7]:
# Work on an independent copy: delays_assigned was produced by a boolean filter,
# and adding a column to such a selection can trigger SettingWithCopyWarning.
delays_assigned = delays_assigned.copy()

# Calendar date (datetime.date objects) of the scheduled arrival, extracted with
# the vectorized .dt accessor instead of a Python-level loop over every row.
delays_assigned['date'] = pd.to_datetime(delays_assigned['sched_arrival_time']).dt.date
delays_assigned.head(3)

Unnamed: 0,line_name,trip_id,shape_id,stop,sched_arrival_time,previous_stop,next_stop,real_arrival_time,delay,timestep,date
0,BLUE,3121350_18154_2019-09-14,46,G05,2019-09-14 06:54:00,,G04,2019-09-14 06:46:04,-476.0,6.0,2019-09-14
1,BLUE,3121350_18154_2019-09-14,46,G04,2019-09-14 06:57:00,G05,G03,2019-09-14 06:56:12,-48.0,6.0,2019-09-14
2,GREEN,3120258_18154_2019-09-14,117,F11,2019-09-14 07:00:00,,F10,2019-09-14 06:45:42,-858.0,7.0,2019-09-14


# Assign delays to delay range

In [8]:
# Bucket every delay (seconds) into whole minutes: floor division maps
# 0-59 s -> 0.0, 60-119 s -> 1.0, and early arrivals to negative buckets.
delay_minutes = delays_assigned['delay'] // 60

# Everything above 6 minutes is pooled into one open-ended bucket labelled 7.0.
# Floor division only produces whole numbers, so capping at 7.0 is equivalent
# to "keep if <= 6.0, otherwise 7.0" while staying vectorized. Unlike the
# element-wise comparison it replaces, clip() leaves a missing delay as NaN
# instead of silently placing it in the top bucket.
delays_assigned['delay_range'] = delay_minutes.clip(upper=7.0)
print(f"{len(delays_assigned):_}")
delays_assigned.head(3)

3_802_649


Unnamed: 0,line_name,trip_id,shape_id,stop,sched_arrival_time,previous_stop,next_stop,real_arrival_time,delay,timestep,date,delay_range
0,BLUE,3121350_18154_2019-09-14,46,G05,2019-09-14 06:54:00,,G04,2019-09-14 06:46:04,-476.0,6.0,2019-09-14,-8.0
1,BLUE,3121350_18154_2019-09-14,46,G04,2019-09-14 06:57:00,G05,G03,2019-09-14 06:56:12,-48.0,6.0,2019-09-14,-1.0
2,GREEN,3120258_18154_2019-09-14,117,F11,2019-09-14 07:00:00,,F10,2019-09-14 06:45:42,-858.0,7.0,2019-09-14,-15.0


# Select analysis window

In [9]:
# Keep only the rows whose delay is zero or positive; .copy() detaches the
# selection so later edits do not write back into delays_assigned.
significant_delays = delays_assigned.loc[delays_assigned['delay'].ge(0)].copy()

# Gather function specific data

In [10]:
# One sub-frame per connection, keyed by the (stop, next_stop) pair.
significant_delays_connection_split = dict(iter(significant_delays.groupby(['stop', 'next_stop'])))
all_delays_connection_split = dict(iter(delays_assigned.groupby(['stop', 'next_stop'])))

# Lookup table: connection -> {(date, timestep): [delay, ...]}.
# The timestep part of the key is forced to float and every delay is converted
# with int(), so the stored lists contain plain Python integers.
reaction_delays_connection_datetime_split = {
    connection: {
        (day, float(step)): [int(delay) for delay in group['delay'].to_numpy()]
        for (day, step), group in connection_data.groupby(['date', 'timestep'])
    }
    for connection, connection_data in all_delays_connection_split.items()
}

In [11]:
# If delay at 9 am and between 0-60 seconds, what are the delays at other hours at other stations across network?
#
# For each (delay range, timestep) bucket of a source connection, pool the
# delays recorded on every connection on the same dates, at the same or a later
# timestep, and fit `distribution` to each pool that is large enough.

# Minimum number of pooled delay values required before a fit is attempted.
MIN_SAMPLE_SIZE = 40

timesteps = sorted(set(significant_delays['timestep'].to_list()))

for connection, connection_data in significant_delays_connection_split.items():
    # Only this source connection is processed; every other one is skipped.
    if connection != ('D05', 'D06'):
        continue

    print(connection)
    fit_records = []

    # Grouping by a plain column name (not a one-element list) yields scalar
    # group keys on every pandas version, so no key[0] unpacking is needed.
    for delay_range, delay_range_data in connection_data.groupby('delay_range'):
        print('range ', delay_range)

        for timestep, timestep_data in delay_range_data.groupby('timestep'):
            print('timestep ', timestep)

            # Both values depend only on the current bucket, so they are
            # computed once here rather than once per other connection.
            observations_per_date = [(date, len(date_data))
                                     for date, date_data in timestep_data.groupby('date')]
            future_timesteps = [step_i for step_i in timesteps if step_i >= timestep]

            for other_connection in all_delays_connection_split.keys():
                other_connection_values = reaction_delays_connection_datetime_split[other_connection]

                for future_timestep in future_timesteps:
                    total_values_list = []
                    for date, observation_count in observations_per_date:
                        related_delay_values = other_connection_values.get((date, future_timestep))
                        if related_delay_values is not None:
                            # The related delays are added once per source
                            # observation on that date, which weights the date
                            # by how often the source bucket occurred.
                            total_values_list.extend(related_delay_values * int(observation_count))

                    if len(total_values_list) >= MIN_SAMPLE_SIZE:
                        params = distribution.fit(total_values_list)
                        fit_records.append({
                            'connection': connection,
                            'delay_range': delay_range,
                            'timestep': timestep,
                            'other_connection': other_connection,
                            'future_timestep': future_timestep,
                            'dist_params': [float(param) for param in params],
                        })

    # Explicit column list keeps the CSV layout stable even when nothing was fitted.
    result_i = pd.DataFrame(fit_records,
                            columns=['connection', 'delay_range', 'timestep',
                                     'other_connection', 'future_timestep', 'dist_params'])

    # Stop ids may contain '/', which cannot appear inside a file name.
    from_connection = connection[0].replace('/', '-')
    to_connection = connection[1].replace('/', '-')
    # Forward slash works on Windows and POSIX alike; the target directory must already exist.
    output_path = f"First model results/{from_connection} to {to_connection} {timestep_size}.csv"
    result_i.to_csv(output_path, index=False)
    # Report the path that was actually written.
    print(f"Saved to {output_path}")


('D05', 'D06')
range  0.0
timestep  5.0
timestep  6.0
timestep  7.0
timestep  8.0
timestep  9.0
timestep  10.0
timestep  11.0
timestep  12.0
timestep  13.0
timestep  14.0
timestep  15.0
timestep  16.0
timestep  17.0
timestep  18.0
timestep  19.0
timestep  20.0
timestep  21.0
timestep  22.0
range  1.0
timestep  5.0
timestep  6.0
timestep  7.0
timestep  8.0
timestep  9.0
timestep  10.0
timestep  11.0
timestep  12.0
timestep  13.0
timestep  14.0
timestep  15.0
timestep  16.0
timestep  17.0
timestep  18.0
timestep  19.0
timestep  20.0
timestep  21.0
timestep  22.0
range  2.0
timestep  5.0
timestep  6.0
timestep  7.0
timestep  8.0
timestep  9.0
timestep  10.0
timestep  11.0
timestep  12.0
timestep  13.0
timestep  14.0
timestep  15.0
timestep  16.0
timestep  17.0
timestep  18.0
timestep  19.0
timestep  20.0
timestep  21.0
timestep  22.0
range  3.0
timestep  5.0
timestep  6.0
timestep  7.0
timestep  8.0
timestep  9.0
timestep  10.0
timestep  11.0
timestep  12.0
timestep  13.0
timestep  14.0
t