In [1]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import datetime
import warnings
warnings.filterwarnings('ignore')

In [26]:
# Now we read the estimated times
line_estimated = pd.read_csv('arrival_time_data_schedules_final_schedule_65_rightTime_update.csv')
# only for the 2019-09-07
line_estimated = line_estimated[line_estimated.date == 20210907]
# only for punctuality
line_estimated = line_estimated[line_estimated['class'] == 'punctuality']
# and only in direction 0
line_estimated = line_estimated[line_estimated.direction_id == 0]
# we only need the columns:
# - stop_id
# - departure_time
# - punctuality_index
line_estimated = line_estimated[['stop_id', 'departure_time', 'punctuality_index']]
print(line_estimated)

       stop_id departure_time punctuality_index
8610      1283       04:29:00                85
8611      1283       04:49:00                48
8612      1283       05:01:00               625
8613      1283       05:13:00               602
8638      1283       08:11:00               246
...        ...            ...               ...
12601     9060       23:23:00               322
12602     9060       23:38:00               152
12603     9060       23:53:00               287
12604     9060       24:08:00               183
12605     9060       24:23:00                62

[808 rows x 3 columns]


In [36]:
# Let's produce the sequences
# We are going to group in 30 minutes intervals.
# At a given interval:
# For every stop, if the avg different between the departure 
# time and the estimated departure time is bigger than 5 minutes,
# we will consider that the bus is late. It will be added to the
# group of late buses.

# We convert the time to two columns: hour and minute
line_estimated['hour'] = line_estimated['departure_time'].apply(lambda x: int(x.split(':')[0]))
line_estimated['minute'] = line_estimated['departure_time'].apply(lambda x: int(x.split(':')[1]))

# We divide the departure_dates of all data into 30 minute intervals
times = line_estimated[['hour', 'minute']].drop_duplicates()
times['time'] = times['hour'] * 60 + times['minute']
times['interval'] = times['time'].apply(lambda x: int(x / 30))
times = times.drop_duplicates()
times = times.sort_values(by=['interval']).reset_index(drop=True)
print(times)

# We separate each stop
stops = line_estimated.stop_id.unique()
# We create a dictionary to store the results
stop_times = {}
first = True
for stop in stops:
    stop_times[stop] = line_estimated[line_estimated['stop_id'] == stop][['hour', 'minute', 'punctuality_index']]
    # And we identify the interval for each record
    stop_times[stop]['time'] = stop_times[stop]['hour'] * 60 + stop_times[stop]['minute']
    stop_times[stop]['interval'] = stop_times[stop]['time'].apply(lambda x: int(x / 30))
    stop_times[stop] = stop_times[stop][['interval','punctuality_index']]
    stop_times[stop] = stop_times[stop].drop_duplicates()
    stop_times[stop] = stop_times[stop].sort_values(by=['interval']).reset_index(drop=True)
    # We calculate the average difference between the departure time and the estimated departure time
    # for each interval
    stop_times[stop] = stop_times[stop].groupby(['interval']).mean().reset_index()


     hour  minute  time  interval
0       4      29   269         8
1       4      51   291         9
2       4      31   271         9
3       4      41   281         9
4       4      53   293         9
..    ...     ...   ...       ...
443    24      34  1474        49
444    24      44  1484        49
445    24      33  1473        49
446    24      32  1472        49
447    25       3  1503        50

[448 rows x 4 columns]
    interval  punctuality_index
0          8               85.0
1          9               48.0
2         10           312801.0
3         16              246.0
4         41           434134.0
5         42           581691.5
6         43            22598.0
7         44           860536.0
8         45            30512.5
9         46            62628.5
10        47           546096.0
11        48            26642.0
12        49         37306620.0


In [49]:
# Now, we create a dictionary, in which, for each interval, we store
# all records from stop_times in that interval
intervals_times = {}
for interval in times['interval'].unique():
    intervals_times[interval] = {}
    for stop in stops:
        intervals_times[interval][stop] = stop_times[stop][stop_times[stop]['interval'] == interval]
        intervals_times[interval][stop] = intervals_times[interval][stop].dropna()
        intervals_times[interval][stop] = intervals_times[interval][stop].reset_index(drop=True)
        # We calculate the average difference between the departure time and the estimated departure time
        # for each interval
        intervals_times[interval][stop] = intervals_times[interval][stop].groupby(['interval']).mean().reset_index()
print(intervals_times[9])

{1283:    interval  punctuality_index
0         9               48.0, 2067: Empty DataFrame
Columns: [interval, punctuality_index]
Index: [], 2100: Empty DataFrame
Columns: [interval, punctuality_index]
Index: [], 2102: Empty DataFrame
Columns: [interval, punctuality_index]
Index: [], 2294:    interval  punctuality_index
0         9            33359.5, 2796: Empty DataFrame
Columns: [interval, punctuality_index]
Index: [], 2839: Empty DataFrame
Columns: [interval, punctuality_index]
Index: [], 2975: Empty DataFrame
Columns: [interval, punctuality_index]
Index: [], 2976: Empty DataFrame
Columns: [interval, punctuality_index]
Index: [], 2977: Empty DataFrame
Columns: [interval, punctuality_index]
Index: [], 2980: Empty DataFrame
Columns: [interval, punctuality_index]
Index: [], 2986:    interval  punctuality_index
0         9              458.5, 2987:    interval  punctuality_index
0         9            24878.0, 2990: Empty DataFrame
Columns: [interval, punctuality_index]
Index: [], 299

In [88]:
# now, we create a dictionary, in which, for each interval, we store
# those stops such that:
# - interval_times[interval][stop] is not empty
# - interval_times[interval][stop]['punctuality_index'] is bigger than 300
print('A' in intervals_times[9][3042].columns)
intervals_late = {}
for interval in times['interval'].unique():
    intervals_late[interval] = []
    for stop in stops:
        if 'punctuality_index' not in intervals_times[interval][stop].columns:
            continue
        if intervals_times[interval][stop].empty == False:
            if intervals_times[interval][stop]['punctuality_index'][0] > 300:
                intervals_late[interval].append(stop)
print(intervals_late)

False
{8: [], 9: [2294, 2986, 2987, 2991], 10: [1283, 2294, 2986, 2987, 2991], 11: [2839, 2980, 3361], 12: [2796, 2839, 3072, 3355, 6359, 6364, 6410, 6435, 6445, 9060], 13: [2067, 2100, 2102, 2796, 2839, 2975, 2976, 2977, 2980, 2990, 2997, 3072, 3353, 3354, 3355, 3361, 3362, 3414, 5361, 6081, 6210, 6359, 6364, 6410, 6435, 6445, 9060], 14: [2100, 2102, 2839, 2975, 2976, 2977, 2980, 2990, 2997, 3361, 3362, 3414, 5361, 6081, 6210, 6364], 15: [2975, 2976, 2977, 2980, 3361, 3362, 5361, 6081], 16: [2987], 31: [2975, 2976, 2977, 2990], 32: [], 39: [3072, 3353, 6445, 9060], 40: [2100, 2102, 2796, 2839, 3354, 3355, 6359, 6364, 6410, 6435, 6445, 9060], 41: [1283, 2294, 2975, 2986, 2991, 3355, 3361, 3362, 5361, 6081, 9060], 42: [1283, 2067, 2100, 2102, 2294, 2796, 2839, 2975, 2976, 2977, 2986, 2987, 2990, 2991, 3072, 3353, 3354, 3355, 3414, 6210, 6359, 6364, 6410, 6435, 6445, 9060], 43: [1283, 2067, 2100, 2102, 2294, 2796, 2839, 2975, 2976, 2977, 2980, 2986, 2987, 2990, 2991, 2997, 3072, 3353, 33

In [52]:
# now we get the sequences
# first we detect consecutive intervals with late buses
# then we get the stops in those intervals

# from intervals_late.keys, we get consecutive intervals with non empty values
# we store them in intervals_late_non_empty
intervals_late_non_empty = []
for interval in intervals_late.keys():
    if len(intervals_late[interval]) > 0:
        intervals_late_non_empty.append(interval)
# we get the consecutive intervals
consecutive_intervals = []
for i in range(len(intervals_late_non_empty)):
    if i == 0:
        consecutive_intervals.append([intervals_late_non_empty[i]])
    else:
        if intervals_late_non_empty[i] == consecutive_intervals[-1][-1] + 1:
            consecutive_intervals[-1].append(intervals_late_non_empty[i])
        else:
            consecutive_intervals.append([intervals_late_non_empty[i]])
print(consecutive_intervals)

[[9, 10, 11, 12, 13, 14, 15, 16], [31], [39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]]


In [54]:
# now we get the stops in those intervals
sequences = []
for interval in consecutive_intervals:
    sequence = []
    for i in interval:
        subset = intervals_late[i]
        sequence.append(subset)
    sequences.append(sequence)

print(sequences)

[[[2294, 2986, 2987, 2991], [1283, 2294, 2986, 2987, 2991], [2839, 2980, 3361], [2796, 2839, 3072, 3355, 6359, 6364, 6410, 6435, 6445, 9060], [2067, 2100, 2102, 2796, 2839, 2975, 2976, 2977, 2980, 2990, 2997, 3072, 3353, 3354, 3355, 3361, 3362, 3414, 5361, 6081, 6210, 6359, 6364, 6410, 6435, 6445, 9060], [2100, 2102, 2839, 2975, 2976, 2977, 2980, 2990, 2997, 3361, 3362, 3414, 5361, 6081, 6210, 6364], [2975, 2976, 2977, 2980, 3361, 3362, 5361, 6081], [2987]], [[2975, 2976, 2977, 2990]], [[3072, 3353, 6445, 9060], [2100, 2102, 2796, 2839, 3354, 3355, 6359, 6364, 6410, 6435, 6445, 9060], [1283, 2294, 2975, 2986, 2991, 3355, 3361, 3362, 5361, 6081, 9060], [1283, 2067, 2100, 2102, 2294, 2796, 2839, 2975, 2976, 2977, 2986, 2987, 2990, 2991, 3072, 3353, 3354, 3355, 3414, 6210, 6359, 6364, 6410, 6435, 6445, 9060], [1283, 2067, 2100, 2102, 2294, 2796, 2839, 2975, 2976, 2977, 2980, 2986, 2987, 2990, 2991, 2997, 3072, 3353, 3354, 3355, 3361, 3362, 3414, 5361, 6081, 6210, 6359, 6364, 6410, 6435, 6

In [58]:
# Finally, we export it in the format required by the algorithm
# we want a txt file with the following format:
# each line is a sequence
# the subsets are separated by -1
# the stops are separated by spaces
# the sequences are separated by -2
output = ''
for sequence in sequences:
    for subset in sequence:
        for stop in subset:
            output += str(stop) + ' '
        output += '-1 '
    output += '-2\n'

with open('sequences.txt', 'w') as f:
    f.write(output)

In [119]:
def get_sequences(schedule_estimated, date, length=30, threshold=300):
    # only for the date
    schedule_estimated = schedule_estimated[schedule_estimated.date == date]
    # only for punctuality
    schedule_estimated = schedule_estimated[schedule_estimated['class'] == 'punctuality']
    # and only in direction 0
    schedule_estimated = schedule_estimated[schedule_estimated.direction_id == 0]
    # we only need the columns:
    # - stop_id
    # - departure_time
    # - punctuality_index
    schedule_estimated = schedule_estimated[['stop_id', 'departure_time', 'punctuality_index']]
    print(schedule_estimated)
    # We are going to group in 30 minutes intervals.
    # At a given interval:
    # For every stop, if the avg different between the departure 
    # time and the estimated departure time is bigger than 5 minutes,
    # we will consider that the bus is late. It will be added to the
    # group of late buses.

    # We convert the time to two columns: hour and minute
    schedule_estimated['hour'] = schedule_estimated['departure_time'].apply(lambda x: int(x.split(':')[0]))
    schedule_estimated['minute'] = schedule_estimated['departure_time'].apply(lambda x: int(x.split(':')[1]))

    # We divide the departure_dates of all data into 30 minute intervals
    times = schedule_estimated[['hour', 'minute']].drop_duplicates()
    times['time'] = times['hour'] * 60 + times['minute']
    times['interval'] = times['time'].apply(lambda x: int(x / length))
    times = times.drop_duplicates()
    times = times.sort_values(by=['interval']).reset_index(drop=True)

    # We separate each stop
    stops = schedule_estimated.stop_id.unique()
    # We create a dictionary to store the results
    stop_times = {}
    first = True
    for stop in stops:
        stop_times[stop] = schedule_estimated[schedule_estimated['stop_id'] == stop][['hour', 'minute', 'punctuality_index']]
        # And we identify the interval for each record
        stop_times[stop]['time'] = stop_times[stop]['hour'] * 60 + stop_times[stop]['minute']
        stop_times[stop]['interval'] = stop_times[stop]['time'].apply(lambda x: int(x / length))
        stop_times[stop] = stop_times[stop][['interval','punctuality_index']]
        stop_times[stop] = stop_times[stop].drop_duplicates()
        stop_times[stop] = stop_times[stop].sort_values(by=['interval']).reset_index(drop=True)
        # We calculate the average difference between the departure time and the estimated departure time
        # for each interval
        stop_times[stop] = stop_times[stop].groupby(['interval']).mean().reset_index()
    # Now, we create a dictionary, in which, for each interval, we store
    # all records from stop_times in that interval
    intervals_times = {}
    for interval in times['interval'].unique():
        intervals_times[interval] = {}
        for stop in stops:
            intervals_times[interval][stop] = stop_times[stop][stop_times[stop]['interval'] == interval]
            intervals_times[interval][stop] = intervals_times[interval][stop].dropna()
            intervals_times[interval][stop] = intervals_times[interval][stop].reset_index(drop=True)
            # We calculate the average difference between the departure time and the estimated departure time
            # for each interval
            intervals_times[interval][stop] = intervals_times[interval][stop].groupby(['interval']).mean().reset_index()
    # now, we create a dictionary, in which, for each interval, we store
    # those stops such that:
    # - interval_times[interval][stop] is not empty
    # - interval_times[interval][stop]['punctuality_index'] is bigger than threshold

    intervals_late = {}
    for interval in times['interval'].unique():
        intervals_late[interval] = []
        for stop in stops:
            if 'punctuality_index' not in intervals_times[interval][stop].columns:
                continue
            if intervals_times[interval][stop].empty == False:
                if intervals_times[interval][stop]['punctuality_index'][0] > threshold:
                    intervals_late[interval].append(stop)
    # now we get the sequences
    # first we detect consecutive intervals with late buses
    # then we get the stops in those intervals

    # from intervals_late.keys, we get consecutive intervals with non empty values
    # we store them in intervals_late_non_empty
    intervals_late_non_empty = []
    for interval in intervals_late.keys():
        if len(intervals_late[interval]) > 0:
            intervals_late_non_empty.append(interval)
    # we get the consecutive intervals
    consecutive_intervals = []
    for i in range(len(intervals_late_non_empty)):
        if i == 0:
            consecutive_intervals.append([intervals_late_non_empty[i]])
        else:
            if intervals_late_non_empty[i] == consecutive_intervals[-1][-1] + 1:
                consecutive_intervals[-1].append(intervals_late_non_empty[i])
            else:
                consecutive_intervals.append([intervals_late_non_empty[i]])
    
    # now we get the stops in those intervals
    sequences = []
    for interval in consecutive_intervals:
        sequence = []
        for i in interval:
            subset = intervals_late[i]
            sequence.append(subset)
        sequences.append(sequence)
    return sequences
    

In [130]:
line_estimated = pd.read_csv('arrival_time_data_schedules_final_schedule_65_rightTime_update.csv')
line_estimated = line_estimated[line_estimated['class'] == 'punctuality']
line_estimated = line_estimated[line_estimated['punctuality_index'] != 'None']
line_estimated['punctuality_index'] = [int(x) for x in line_estimated['punctuality_index']]
line_estimated = line_estimated[line_estimated['punctuality_index'] < 20*60]
sequences_7 = get_sequences(line_estimated, 20210907, 30, 10*60)
print(sequences_7)

       stop_id departure_time  punctuality_index
8610      1283       04:29:00                 85
8611      1283       04:49:00                 48
8612      1283       05:01:00                625
8613      1283       05:13:00                602
8638      1283       08:11:00                246
...        ...            ...                ...
12601     9060       23:23:00                322
12602     9060       23:38:00                152
12603     9060       23:53:00                287
12604     9060       24:08:00                183
12605     9060       24:23:00                 62

[746 rows x 3 columns]
[[[1283, 2991]], [[2839], [2975, 2976, 2977, 2980, 2990, 2997, 3361, 5361, 6081], [2980, 2997, 3361, 3362, 5361]], [[9060], [6359, 6435, 6445], [2294, 2975, 2986, 3361, 3362, 5361], [1283, 2975, 2991, 3354, 6210], [2986, 2997, 3361, 3414], [1283, 2839, 3353, 3355], [2294, 2839, 2987, 2991, 3072]], [[1283, 2294, 2986]], [[1283, 2294, 2986, 2987, 2991, 3362, 5361]]]


In [131]:
# Now we get the sequences for the other days, from 6 to 21
sequences_day = []
for i in range(20210906, 20210922):
    sequences_day.append(get_sequences(line_estimated, i))


      stop_id departure_time  punctuality_index
117      1283       20:49:00                808
118      1283       20:59:00                208
119      1283       21:13:00               1016
120      1283       21:26:00                236
122      1283       21:52:00                546
...       ...            ...                ...
3991     9060       23:23:00               1028
3992     9060       23:38:00                128
3993     9060       23:53:00               1068
3994     9060       24:08:00                168
3995     9060       24:23:00                 55

[547 rows x 3 columns]
       stop_id departure_time  punctuality_index
8610      1283       04:29:00                 85
8611      1283       04:49:00                 48
8612      1283       05:01:00                625
8613      1283       05:13:00                602
8638      1283       08:11:00                246
...        ...            ...                ...
12601     9060       23:23:00                322
12602   

In [132]:
# We print all the sequences to a txt
# with the format suitable for the algorithm
# that we are going to use
output = ''
for seq_day in sequences_day:
    for seq in seq_day:
        for subset in seq:
            for stop in subset:
                output += str(stop) + ' '
        output += '\n'

In [133]:
# We save the output to a txt file
with open('sequences.txt', 'w') as f:
    f.write(output)