# Import packages

In [1]:
import numpy as np
import altair as alt
import pandas as pd
import geopandas as gpd
from datetime import timedelta
from scipy import stats
import statsmodels.formula.api as sm
from matplotlib import pyplot as plt

from helper_functions import open_delays

alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

# Notebook settings

In [2]:
DELAYS_FILEPATH = r'Intermediates\\delays_assigned.csv'
# LINKS_FILEPATH = r'Intermediates\\GTFS_20190914\\gtfs_links_corrected.geojson'
# STATIONS_FILEPATH = r'Intermediates\\GTFS_20190914\\gtfs_stations_corrected.geojson'

# Load data

In [3]:
delays = open_delays(DELAYS_FILEPATH)
delays.head(3)

Amount of delay values: 3_802_649


Unnamed: 0,line_name,trip_id,shape_id,stop,sched_arrival_time,previous_stop,next_stop,real_arrival_time,delay,timestep
0,BLUE,3121350_18154_2019-09-14,46,G05,2019-09-14 06:54:00,,G04,2019-09-14 06:46:04,-476.0,6.0
1,BLUE,3121350_18154_2019-09-14,46,G04,2019-09-14 06:57:00,G05,G03,2019-09-14 06:56:12,-48.0,6.0
2,GREEN,3120258_18154_2019-09-14,117,F11,2019-09-14 07:00:00,,F10,2019-09-14 06:45:42,-858.0,7.0


# Find related values

In [4]:
# Find related values next stop same train
delays_sorted = delays.sort_values(by=['trip_id', 'sched_arrival_time']).copy()

stop_values = delays_sorted['stop'].values
delay_values = delays_sorted['delay'].values
trip_ids = delays_sorted['trip_id'].values
sched_arrivals = delays_sorted['sched_arrival_time'].values
real_arrivals = delays_sorted['real_arrival_time'].values
timesteps = delays_sorted['timestep'].values

related_forward = pd.DataFrame({'line': delays_sorted['line_name'].values,
                               'stop': stop_values,
                               'prev_stop': [stop_values[-1], *stop_values[:-1]],
                               'next_stop': [*stop_values[1:], stop_values[0]],
                               'delay': delay_values,
                               'delay_prev_stop': [delay_values[-1], *delay_values[:-1]],
                               'trip_id': trip_ids,
                               'sched_arrival': sched_arrivals,
                               'real_arrival': real_arrivals,
                               'timestep': timesteps,
                               'prev_trip_id': [trip_ids[-1], *trip_ids[:-1]],
                               'next_trip_id': [*trip_ids[1:], trip_ids[0]]})

print(f"Total: {len(related_forward):_}")
related_forward = related_forward[(related_forward['trip_id'] == related_forward['prev_trip_id']) | (related_forward['trip_id'] == related_forward['next_trip_id'])]
related_forward.loc[related_forward['trip_id'] != related_forward['prev_trip_id'], ['prev_stop', 'delay_prev_stop']] = None
related_forward = related_forward.drop(columns=['trip_id', 'next_trip_id', 'prev_trip_id'])

print(f"Filtered to only viable: {len(related_forward):_}")
related_forward.head(3)

Total: 3_802_649
Filtered to only viable: 3_802_596


Unnamed: 0,line,stop,prev_stop,next_stop,delay,delay_prev_stop,sched_arrival,real_arrival,timestep
0,SILVER,G05,,G04,-569.0,,2019-10-14 13:08:00,2019-10-14 12:58:31,13.0
1,SILVER,G04,G05,G03,-88.0,-569.0,2019-10-14 13:11:00,2019-10-14 13:09:32,13.0
2,SILVER,G03,G04,G02,-106.0,-88.0,2019-10-14 13:14:00,2019-10-14 13:12:14,13.0


In [5]:
# Find related values for next train same stop
related_forward = related_forward.sort_values(by=['sched_arrival']).copy()
station_split = {trip_key: data for trip_key, data in related_forward.groupby(['stop', 'next_stop'])}

updated_dfs = []
for key_pair, station_stops in station_split.items():

    station_stops['delay_ptss'] = [None, *station_stops['delay'][:-1]]
    station_stops['real_arrival_prev_train'] = [None, *station_stops['real_arrival'].values[:-1]]
    station_stops['sched_arrival_prev_train'] = [None, *station_stops['sched_arrival'].values[:-1]]

    updated_dfs.append(station_stops)

related_forward = pd.concat(updated_dfs)
print(len(related_forward))
related_forward.head(3)


  related_forward = pd.concat(updated_dfs)
  related_forward = pd.concat(updated_dfs)
  related_forward = pd.concat(updated_dfs)


3802596


Unnamed: 0,line,stop,prev_stop,next_stop,delay,delay_prev_stop,sched_arrival,real_arrival,timestep,delay_ptss,real_arrival_prev_train,sched_arrival_prev_train
1618603,RED,A01/C01,B01/F01,A02,58.0,102.0,2019-09-14 07:31:00,2019-09-14 07:31:58,7.0,,NaT,NaT
1618623,RED,A01/C01,B01/F01,A02,6.0,48.0,2019-09-14 07:42:00,2019-09-14 07:42:06,7.0,58.0,2019-09-14 07:31:58,2019-09-14 07:31:00
1618643,RED,A01/C01,B01/F01,A02,-54.0,-25.0,2019-09-14 07:50:00,2019-09-14 07:49:06,7.0,6.0,2019-09-14 07:42:06,2019-09-14 07:42:00


# Filter out inplace occurences

In [6]:
inplace_occcurences = related_forward[(related_forward['real_arrival_prev_train'] >= related_forward['sched_arrival']) & (related_forward['real_arrival'] > related_forward['real_arrival_prev_train'])].copy()
print(f"Amount of found inplace ocurrences: {len(inplace_occcurences):_}")

related_forward_assigned = related_forward[~related_forward.index.isin(inplace_occcurences.index)]
print(f"New amount of forwardly related delays: {len(related_forward_assigned):_}")
related_forward_assigned.head(3)


Amount of found inplace ocurrences: 79_169
New amount of forwardly related delays: 3_723_427


Unnamed: 0,line,stop,prev_stop,next_stop,delay,delay_prev_stop,sched_arrival,real_arrival,timestep,delay_ptss,real_arrival_prev_train,sched_arrival_prev_train
1618603,RED,A01/C01,B01/F01,A02,58.0,102.0,2019-09-14 07:31:00,2019-09-14 07:31:58,7.0,,NaT,NaT
1618623,RED,A01/C01,B01/F01,A02,6.0,48.0,2019-09-14 07:42:00,2019-09-14 07:42:06,7.0,58.0,2019-09-14 07:31:58,2019-09-14 07:31:00
1618643,RED,A01/C01,B01/F01,A02,-54.0,-25.0,2019-09-14 07:50:00,2019-09-14 07:49:06,7.0,6.0,2019-09-14 07:42:06,2019-09-14 07:42:00


# Calculate time diff

In [7]:
related_forward_with_diff = related_forward_assigned.copy()
related_forward_with_diff['delay_diff'] = related_forward_with_diff['delay'] - related_forward_with_diff['delay_prev_stop']
related_forward_with_diff.head(3)

Unnamed: 0,line,stop,prev_stop,next_stop,delay,delay_prev_stop,sched_arrival,real_arrival,timestep,delay_ptss,real_arrival_prev_train,sched_arrival_prev_train,delay_diff
1618603,RED,A01/C01,B01/F01,A02,58.0,102.0,2019-09-14 07:31:00,2019-09-14 07:31:58,7.0,,NaT,NaT,-44.0
1618623,RED,A01/C01,B01/F01,A02,6.0,48.0,2019-09-14 07:42:00,2019-09-14 07:42:06,7.0,58.0,2019-09-14 07:31:58,2019-09-14 07:31:00,-42.0
1618643,RED,A01/C01,B01/F01,A02,-54.0,-25.0,2019-09-14 07:50:00,2019-09-14 07:49:06,7.0,6.0,2019-09-14 07:42:06,2019-09-14 07:42:00,-29.0


# Find related data per connection

In [8]:
data_connection_timestep_split = {connection_timestep: data for connection_timestep, data in related_forward_with_diff.groupby(['prev_stop', 'stop', 'timestep'])}

timestep_list = []
from_list = []
to_list = []
data_lists = []

timesteps = related_forward_with_diff['timestep'].unique()
timesteps.sort()

for connection_timestep, data in data_connection_timestep_split.items():
    from_stop = connection_timestep[0]
    to_stop = connection_timestep[1]
    timestep = connection_timestep[2]

    data_list = [int(delay) for delay in data['delay_diff'].values]

    timestep_list.append(timestep)
    to_list.append(to_stop)
    from_list.append(from_stop)
    data_lists.append(data_list)

forward_func_data = pd.DataFrame({'timestep': timestep_list,
                                    'from_stop': from_list,
                                    'to_stop': to_list,
                                    'data': data_lists})
forward_func_data.head(5)

Unnamed: 0,timestep,from_stop,to_stop,data
0,5.0,A01/C01,A02,"[-29, -54, -23, -40, -38, -53, -43, -56, -27, ..."
1,6.0,A01/C01,A02,"[-34, -31, -25, -22, -44, -50, -45, -42, -51, ..."
2,7.0,A01/C01,A02,"[-28, -45, 51, -48, -10, -41, -27, -38, -24, -..."
3,8.0,A01/C01,A02,"[-20, -18, 31, -44, -19, -44, 0, -45, -40, -4,..."
4,9.0,A01/C01,A02,"[-18, -43, -40, -39, -42, 2, -33, -36, -42, -2..."


In [9]:
forward_func_data.to_csv(r'Intermediates\\second_model_dist_test_data.csv')