In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import json

In [None]:
# Read the raw stop table and keep five of its columns, chosen by position.
stops = pd.read_csv('stops2.txt').iloc[:, [0, 2, 4, 5, 6]]

# Keep only rows whose zone_id is 'A', 'B' or 'C', then retain the first four
# of the selected columns (presumably this drops zone_id once it has served
# as the filter -- confirm against the file's column order).
in_wanted_zone = stops['zone_id'].isin(['A', 'B', 'C'])
stops = stops[in_wanted_zone].iloc[:, 0:4]

# Shorten the column names for the exported data.
stops = stops.rename(columns={'stop_id':'id', 'stop_name': 'name', 'stop_lat': 'lat', 'stop_lon': 'lon'})
stops

In [None]:
# Use the 'id' column as the row index; with orient='index' the JSON file is
# then written as a mapping of index value -> {column: value}.
stops = stops.set_index(keys='id')
stops.to_json(path_or_buf='processed_stops.json', orient='index')
stops

In [None]:
# Load the stop times, keep only the rows whose trip_id contains
# '20221102_Ti' (the original note describes these as the Tuesday 02/11/2022
# times; swap the pattern to select another day), and order the rows by trip
# and then by position within the trip. Later cells rely on this ordering.
stop_times = (
    pd.read_csv('stop_times.txt')
    .loc[lambda frame: frame['trip_id'].str.contains('20221102_Ti')]
    .sort_values(by=['trip_id', 'stop_sequence'])
)

stop_times

In [None]:
def parse_time(time):
    """Convert a colon-separated clock string to seconds since midnight.

    Args:
        time: A string of the form 'HH:MM:SS' or 'H:MM:SS'. The hour field
            may exceed 23 (e.g. '25:10:00'); it is not wrapped.

    Returns:
        int: hours * 3600 + minutes * 60 + seconds.

    Raises:
        ValueError: If the string does not have exactly three
            colon-separated fields, or a field is not an integer.
    """
    # Split on ':' instead of slicing fixed columns so that a single-digit
    # hour ('7:05:09') is parsed correctly rather than raising.
    hours, minutes, seconds = time.split(':')
    return int(hours) * 3600 + int(minutes) * 60 + int(seconds)

# Build a plain dict {trip_id: [[arrival_s, departure_s, stop_id], ...]} so
# later lookups are fast; slicing the DataFrame per trip is far too slow.
# Relies on stop_times being ordered by trip_id (rows of one trip contiguous).
trips = {}

current_trip = []
# No trip has been started yet; the first row always opens a new one.
current_id = None

for _, data in tqdm(stop_times.iterrows(), total=len(stop_times)):

    if data['trip_id'] != current_id:
        # Trip boundary: store the finished trip (if any) and start a new one.
        if current_trip:
            trips[current_id] = current_trip
        current_id = data['trip_id']
        current_trip = []

    current_trip.append([parse_time(data['arrival_time']), parse_time(data['departure_time']), data['stop_id']])

# A trip is only stored when the next one begins, so the final trip has to be
# stored explicitly here -- otherwise it would be missing from the result.
if current_trip:
    trips[current_id] = current_trip

In [None]:
# Collect, for every stop, the departures that serve it:
# {stop_id: [[departure_s, trip_id, stop_sequence], ...]}
stop_data = {}

for _, row in tqdm(stop_times.iterrows(), total=len(stop_times)):
    # setdefault creates the per-stop list the first time a stop is seen.
    stop_data.setdefault(row['stop_id'], []).append(
        [parse_time(row['departure_time']), row['trip_id'], row['stop_sequence']]
    )

# Order each stop's departures chronologically (by departure time in seconds).
for departures in stop_data.values():
    departures.sort(key=lambda entry: entry[0])

# Pretty confident it's correct, since it matches the service statistics from HSL.

In [None]:
# Write both lookup tables to disk as JSON.
# Mode 'x' is exclusive creation: open() raises FileExistsError if the target
# already exists, so existing output files are never overwritten.
for out_path, payload in (
    ('processed_stop_data.json', stop_data),
    ('processed_trips.json', trips),
):
    with open(out_path, 'x') as out_file:
        json.dump(payload, out_file)