In [1]:
# pip install folium

In [None]:
import os
import pandas as pd
import zipfile
import shutil
import random
from datetime import datetime
import numpy as np
# Path to the folder holding one zipped GTFS feed per agency.
gtfs_folders_path = 'inputs/GTFS_Baseline10may2023/'
# gtfs_folders_path = 'inputs/GTFS_test/'

import os
# Scenario definitions: position i of `freq_scales` is paired with position i
# of `speed_scales`, and every pair is run as one scenario.
#   increase freq, same speed  -> vehicles must be added
#   decrease freq, same speed  -> vehicles can be removed
#   increase speed and freq    -> same vehicles (BRT)
#   decrease speed and freq    -> same vehicles (BRT)
freq_scales =  [1, 0.5, 0.75, 1.5, 2,  0.5, 0.75, 1.5, 2,]
speed_scales = [1, 1,   1,    1,   1,  0.5, 0.75, 1.5, 2, ] # should be the same as freq or 1
# Route/headsign groups with this many trips or fewer are left untouched.
n_min_trips = 5
day_of_week = 'friday'  # calendar.txt column name: 'monday', 'tuesday', etc.
specific_date = 20170922  # integer in YYYYMMDD form, compared against calendar start/end dates

# Fail fast on a misconfigured scenario list: zip() would otherwise silently
# drop the unpaired tail of the longer list.
if len(freq_scales) != len(speed_scales):
    raise ValueError(
        f"freq_scales has {len(freq_scales)} entries but speed_scales has {len(speed_scales)}"
    )
if any(_freq <= 0 or _speed <= 0 for _freq, _speed in zip(freq_scales, speed_scales)):
    raise ValueError("freq_scales and speed_scales must contain only positive factors")
if any(_speed != 1 and _speed != _freq for _freq, _speed in zip(freq_scales, speed_scales)):
    raise ValueError("each speed scale must be 1 or equal to its paired frequency scale")
if day_of_week not in ('monday', 'tuesday', 'wednesday', 'thursday',
                       'friday', 'saturday', 'sunday'):
    raise ValueError(f"day_of_week must be a lowercase weekday name, got {day_of_week!r}")

def change_speed(time_series, speed_factor):
    """Rescale the running time of one trip by ``speed_factor``.

    The first time is kept unchanged; every later time is moved so that its
    offset from the first time is divided by ``speed_factor`` (a factor of 2
    halves all inter-stop intervals, a factor of 0.5 doubles them).

    Parameters
    ----------
    time_series : pandas.Series or list-like
        Timedelta-like values (timedelta objects or strings that
        ``pd.to_timedelta`` understands), in stop order.
    speed_factor : float
        Positive speed multiplier.

    Returns
    -------
    pandas.Series
        Strings in pandas' timedelta text form (e.g. ``'0 days 08:05:00'``)
        with a fresh 0..n-1 index. Times that land past midnight keep their
        day count (``'1 days 00:10:00'``) instead of wrapping back to day 0.

    Raises
    ------
    ValueError
        If ``speed_factor`` is not strictly positive.
    """
    if speed_factor <= 0:
        raise ValueError(f"speed_factor must be positive, got {speed_factor!r}")

    # Wrapping in pd.Series lets plain lists work as well as Series input.
    time_deltas = pd.Series(pd.to_timedelta(time_series)).reset_index(drop=True)
    if time_deltas.empty:
        return pd.Series([], dtype=object)

    # Scaling every interval by 1/speed_factor and re-accumulating is the same
    # as scaling each offset from the first time, so no Python loop is needed.
    first_time = time_deltas.iloc[0]
    adjusted_times = first_time + (time_deltas - first_time) / speed_factor

    return pd.Series([str(adjusted_time) for adjusted_time in adjusted_times])


def _record_wrong_value():
    """Increment the module-level ``wrong_value_count`` tally, creating it if absent."""
    global wrong_value_count
    wrong_value_count = globals().get('wrong_value_count', 0) + 1


def standardize_dep_time(time_value):
    """Normalise one ``departure_time`` cell to a zero-padded ``'HH:MM:SS'`` string.

    Accepted inputs:
      * strings of the form ``H:MM:SS`` / ``HH:MM:SS``; surrounding whitespace
        is ignored and the hour may be 24 or more, since GTFS expresses
        after-midnight service that way;
      * non-negative numbers, interpreted as seconds since the start of the
        service day.

    Returns
    -------
    str or None
        The normalised time, or ``None`` for blank strings and for values that
        cannot be interpreted. Every rejected value except a blank string also
        increments the module-level ``wrong_value_count`` counter.
    """
    if isinstance(time_value, str):
        time_value = time_value.strip()
        if not time_value:
            # Blank cell: treated as missing, not as a malformed value.
            return None
        parts = time_value.split(':')
        if len(parts) == 3 and all(part.isdigit() for part in parts):
            hours, minutes, seconds = (int(part) for part in parts)
            if minutes < 60 and seconds < 60:
                # Zero-pad so that later string comparisons order times correctly.
                return f"{hours:02d}:{minutes:02d}:{seconds:02d}"
        _record_wrong_value()
        return None

    if isinstance(time_value, (int, float, np.integer, np.floating)):
        # NaN (e.g. an empty CSV cell read as float), infinities and negative
        # values cannot be expressed as a time of day.
        if pd.isna(time_value) or np.isinf(time_value) or time_value < 0:
            _record_wrong_value()
            return None
        total_seconds = int(time_value)
        hours, remainder = divmod(total_seconds, 3600)
        minutes, seconds = divmod(remainder, 60)
        return f"{hours:02d}:{minutes:02d}:{seconds:02d}"

    # Any other type (None, datetime objects, ...) is unsupported.
    _record_wrong_value()
    return None


def haversine(lon1, lat1, lon2, lat2):
    """Great-circle distance between two points given in decimal degrees.

    Uses the haversine formula on a sphere of radius 6,371,000 and returns the
    distance in the same unit as that radius. Inputs may be scalars or
    array-likes that NumPy can broadcast (the result then has the same shape).
    """
    earth_radius = 6371000.0

    # Work in radians throughout.
    lam1, phi1, lam2, phi2 = (np.radians(angle) for angle in (lon1, lat1, lon2, lat2))

    half_dphi = (phi2 - phi1) / 2
    half_dlam = (lam2 - lam1) / 2

    # Haversine of the central angle between the two points.
    hav = np.sin(half_dphi) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(half_dlam) ** 2
    central_angle = 2 * np.arctan2(np.sqrt(hav), np.sqrt(1 - hav))

    return earth_radius * central_angle

# Function to calculate shape_dist_traveled
def calculate_shape_dist_traveled(shapes):
    """Compute ``shape_dist_traveled`` for each shape from its point coordinates.

    Points are ordered by ``shape_id`` then ``shape_pt_sequence``; the distance
    of each point is the running sum of haversine distances from the first
    point of the *same* shape, so every shape starts again at 0 and no distance
    is counted between the last point of one shape and the first of the next.

    Parameters
    ----------
    shapes : pandas.DataFrame
        Needs ``shape_id``, ``shape_pt_sequence``, ``shape_pt_lon`` and
        ``shape_pt_lat`` columns.

    Returns
    -------
    pandas.DataFrame
        A sorted copy of ``shapes`` with ``shape_dist_traveled`` (same unit as
        ``haversine`` returns) added or overwritten. The input is not modified.
    """
    shapes = shapes.sort_values(by=['shape_id', 'shape_pt_sequence'])

    # Previous point *within the same shape*; the first point of each shape
    # gets NaN, which becomes a zero-length first segment below.
    per_shape = shapes.groupby('shape_id', sort=False)
    prev_lon = per_shape['shape_pt_lon'].shift()
    prev_lat = per_shape['shape_pt_lat'].shift()

    distances = haversine(prev_lon, prev_lat,
                          shapes['shape_pt_lon'], shapes['shape_pt_lat'])
    distances = distances.fillna(0)

    # Accumulate separately for every shape.
    shapes['shape_dist_traveled'] = distances.groupby(shapes['shape_id'], sort=False).cumsum()

    return shapes

def get_vmt(group, trips, shapes):
    """Total shape length travelled by all trips of one route/headsign group.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of the group; only ``route_id`` and ``trip_headsign`` are read.
    trips : pandas.DataFrame
        All trips (``route_id``, ``trip_headsign``, ``shape_id``). Every trip
        whose route and headsign both occur in ``group`` is counted.
    shapes : pandas.DataFrame or empty list
        Shape points; an empty value means the feed has no shapes.

    Returns
    -------
    number
        Sum over the matching trips of the length of each trip's shape, or 0
        when there are no shapes. The length is the largest recorded
        ``shape_dist_traveled`` of the shape; when that column is missing or
        blank for the shape it is computed from the coordinates instead.
        A trip whose shape has no points at all contributes NaN, so the total
        is NaN and the caller can flag the group as unknown.

    NOTE(review): recorded ``shape_dist_traveled`` values are in whatever unit
    the feed publisher chose, while the computed fallback uses the unit of
    ``haversine`` -- confirm that the feeds in use record metres.
    """
    if len(shapes) == 0:
        return 0

    in_group = (trips['route_id'].isin(group['route_id'])
                & trips['trip_headsign'].isin(group['trip_headsign']))
    shape_ids = trips.loc[in_group, 'shape_id']
    has_recorded_distance = 'shape_dist_traveled' in shapes.columns

    # Many trips share one shape; measure every shape only once.
    length_by_shape = {}
    tot_length = 0
    for shape_id in shape_ids:
        if shape_id not in length_by_shape:
            relevant_shapes = shapes[shapes['shape_id'] == shape_id]
            if has_recorded_distance:
                length = relevant_shapes['shape_dist_traveled'].max()
            else:
                length = float('nan')
            # Column absent, or present but blank for this shape: derive the
            # length from the point coordinates.
            if pd.isna(length) and len(relevant_shapes) > 0:
                shapes_with_distance = calculate_shape_dist_traveled(relevant_shapes)
                length = shapes_with_distance['shape_dist_traveled'].max()
            length_by_shape[shape_id] = length
        tot_length += length_by_shape[shape_id]
    return tot_length



def get_vht(group, trips, stop_times):
    """Total in-vehicle time, in seconds, of all trips of one route/headsign group.

    For every trip whose route and headsign both occur in ``group``, the gaps
    between consecutive ``departure_time`` values are summed. Stops are taken
    in ``stop_sequence`` order when that column exists, otherwise in row order.
    A gap is skipped when either of its two times is missing or cannot be
    parsed, so a single bad value no longer turns the whole total into NaN.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of the group; only ``route_id`` and ``trip_headsign`` are read.
    trips : pandas.DataFrame
        All trips (``trip_id``, ``route_id``, ``trip_headsign``).
    stop_times : pandas.DataFrame
        Stop times (``trip_id``, ``departure_time``); times may be
        ``'HH:MM:SS'`` strings or any value ``pd.to_timedelta`` accepts.

    Returns
    -------
    number
        Summed duration in seconds (0 when no trip matches).
    """
    in_group = (trips['route_id'].isin(group['route_id'])
                & trips['trip_headsign'].isin(group['trip_headsign']))
    trip_ids = trips.loc[in_group, 'trip_id']

    # One pass over stop_times instead of one full scan per trip.
    relevant_stop_times = stop_times[stop_times['trip_id'].isin(trip_ids)]
    if 'stop_sequence' in relevant_stop_times.columns:
        # Stable sort keeps the original order of rows with equal sequence.
        relevant_stop_times = relevant_stop_times.sort_values('stop_sequence', kind='mergesort')

    dep_seconds = pd.to_timedelta(relevant_stop_times['departure_time'],
                                  errors='coerce').dt.total_seconds()
    n_invalid = int(dep_seconds.isna().sum())
    if n_invalid:
        print('WARNING:dep_time', n_invalid, 'missing or unparseable departure times ignored')

    # Gap to the previous stop of the same trip; NaN gaps (first stop of a
    # trip, or a gap touching an invalid time) are skipped by sum().
    trip_key = relevant_stop_times['trip_id']
    gaps = dep_seconds.groupby(trip_key, sort=False).diff()
    duration_by_trip = gaps.groupby(trip_key, sort=False).sum()

    tot_dur = 0
    for trip_id in trip_ids:
        tot_dur += duration_by_trip.get(trip_id, 0.0)

    return tot_dur
        
        
def save_dataframe_to_csv(df, temp_dir, filename):
    """Write ``df`` as CSV, without the index column, to ``temp_dir/filename``."""
    df.to_csv(os.path.join(temp_dir, filename), index=False)

def zip_directory(input_dir, output_zip):
    """Pack every file under ``input_dir`` into a deflate-compressed ZIP at ``output_zip``.

    Archive member names are the file paths relative to ``input_dir``, so the
    archive unpacks to the directory's contents rather than to the directory.
    """
    with zipfile.ZipFile(output_zip, 'w', zipfile.ZIP_DEFLATED) as archive:
        for folder, _subfolders, filenames in os.walk(input_dir):
            for filename in filenames:
                source_path = os.path.join(folder, filename)
                member_name = os.path.relpath(source_path, input_dir)
                archive.write(source_path, member_name)

def cleanup_temp_dir(temp_dir):
    """Delete the directory ``temp_dir`` together with everything inside it."""
    shutil.rmtree(path=temp_dir)

def has_time_greater_than_24(group):
    """Tell whether any departure time in ``group`` has an hour field of 24 or more.

    ``group['departure_time']`` must yield strings of the form ``'HH:MM:SS'``;
    entries that are empty after stripping whitespace are ignored.
    """
    for raw_time in group['departure_time']:
        cleaned = raw_time.strip()
        if not cleaned:
            continue
        hour_field = cleaned.split(':')[0]
        if int(hour_field) >= 24:
            return True
    return False

def get_dep_times(group , stop_times):
    """Return the first departure of every trip in ``group``.

    Parameters
    ----------
    group : pandas.DataFrame
        Trips of one route/headsign group; only ``trip_id`` is read.
    stop_times : pandas.DataFrame
        Stop times with ``trip_id`` and ``departure_time`` given as
        ``'HH:MM:SS'`` strings (hours below 24).

    Returns
    -------
    list of datetime.datetime
        One value per row of ``group``, in row order: the earliest departure of
        that trip, parsed with ``'%H:%M:%S'`` (so dated 1900-01-01).

    Raises
    ------
    ValueError
        If a trip has no stop times or a time does not match ``'%H:%M:%S'``.
    """
    departing_times = []
    for trip_id in group['trip_id']:
        trip_departures = stop_times.loc[stop_times['trip_id'] == trip_id, 'departure_time']
        # Parse before taking the minimum: comparing the raw strings would
        # rank '10:30:00' before '9:00:00' when hours are not zero-padded.
        parsed_departures = [datetime.strptime(time, '%H:%M:%S') for time in trip_departures]
        departing_times.append(min(parsed_departures))
    return departing_times

    
def get_data(zip_file, day_of_week, specific_date):
    """Extract one zipped GTFS feed and load the tables needed by the scenarios.

    The archive ``gtfs_folders_path/zip_file`` is unpacked into a sibling
    folder named after the part of ``zip_file`` before its first dot. Stop
    times are cleaned with ``standardize_dep_time``; rows without a usable
    departure time are dropped, and every trip that still has a departure at
    or after 24:00:00 is removed entirely.

    Parameters
    ----------
    zip_file : str
        File name inside ``gtfs_folders_path``; anything not ending in
        ``.zip`` is ignored.
    day_of_week : str
        Name of the ``calendar.txt`` weekday column to filter on.
    specific_date : int
        Date as a YYYYMMDD integer; a service is active when this date lies
        within its ``start_date``..``end_date`` range, both ends included.

    Returns
    -------
    tuple
        ``(trips, stop_times, shapes, routes, grouped_filtered_trips,
        temp_dir)``. ``shapes`` is an empty list when ``shapes.txt`` cannot be
        read. ``grouped_filtered_trips`` groups the trips active on the
        requested day by ``(route_id, trip_headsign)``. For a non-zip file
        name all six items are empty lists.
    """
    global wrong_value_count
    if not zip_file.endswith('.zip'):
        return [],[],[], [],[],[]

    zip_path = os.path.join(gtfs_folders_path, zip_file)
    # Temporary directory for the extracted files, next to the archive.
    temp_dir = os.path.join(gtfs_folders_path, zip_file.split('.')[0])
    os.makedirs(temp_dir, exist_ok=True)
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(temp_dir)

    # Get GTFS files
    trips = pd.read_csv(os.path.join(temp_dir, 'trips.txt'))
    stop_times = pd.read_csv(os.path.join(temp_dir, 'stop_times.txt'))
    routes = pd.read_csv(os.path.join(temp_dir, 'routes.txt'))
    try:
        shapes = pd.read_csv(os.path.join(temp_dir, 'shapes.txt'))
    except (OSError, ValueError):
        # shapes.txt is optional: a missing, empty or unparseable file means
        # "no shapes" (pandas' EmptyDataError/ParserError derive from ValueError).
        shapes = []
        print('WARNING: no shapes for', temp_dir)
    print(len(trips), 'len trips')
    print(len(stop_times), 'len stop_times')
    stop_times['departure_time'] = stop_times['departure_time'].apply(standardize_dep_time)

    print('wrong_value_count',wrong_value_count)
    stop_times = stop_times[stop_times['departure_time'] != '']
    stop_times = stop_times.dropna(subset=['departure_time'])

    # Delete trips with at least a stop_time after 24:00:00
    stop_times = stop_times.groupby('trip_id').filter(lambda x: not has_time_greater_than_24(x))
    trips = trips[trips.trip_id.isin(list(stop_times['trip_id']))]
    print(len(trips), 'len trips')
    print(len(stop_times), 'len stop_times')
    calendar = pd.read_csv(os.path.join(temp_dir, 'calendar.txt'))

    # Service IDs that run on the requested weekday and whose validity period
    # contains the requested date. GTFS start_date and end_date both belong to
    # the service interval, hence the inclusive comparisons.
    active_services = calendar[(calendar[day_of_week] == 1)&
                               (calendar['start_date'].astype(int)<=specific_date)&
                               (calendar['end_date'].astype(int)>=specific_date)]['service_id']
    # Filter the trips by the active service IDs
    filtered_trips = trips[trips['service_id'].isin(active_services)]
    grouped_filtered_trips = filtered_trips.groupby(['route_id','trip_headsign'])

    return trips, stop_times, shapes, routes, grouped_filtered_trips, temp_dir
    
def change_frequency_speed(trips, stop_times, departing_times, min_dep_time, max_dep_time,
                           n_current_trips, n_new_trips, group=None, speed_scale=None):
    """Re-space (and optionally re-time) the trips of one route/headsign group.

    The group's trips are spread evenly over ``min_dep_time``..``max_dep_time``
    with a headway of ``(max_dep_time - min_dep_time) / n_new_trips``.

    * ``n_new_trips >= n_current_trips``: randomly chosen trips are duplicated
      (ids ``<trip_id>_dupli_<j>``) until the target count is reached.
    * ``n_new_trips < n_current_trips``: a random sample of ``n_new_trips``
      trips is kept; the others are removed from ``trips`` and their rows are
      removed from ``stop_times``.

    Every kept or duplicated trip is shifted so that it starts at its new
    slot. When ``speed_scale`` is not 1 its inter-stop times are also rescaled
    with ``change_speed``. ``arrival_time`` is set equal to ``departure_time``.

    Parameters
    ----------
    trips, stop_times : pandas.DataFrame
        Full GTFS tables. ``stop_times`` may be modified in place; always use
        the returned frames.
    departing_times : list of datetime.datetime
        Current first departure of each trip, in the row order of ``group``.
    min_dep_time, max_dep_time : datetime.datetime
        Start and end of the service window.
    n_current_trips, n_new_trips : int
        Current and target number of trips; ``n_new_trips`` must be positive.
    group : pandas.DataFrame, optional
        Trips of the group (needs ``trip_id``). When omitted, the module-level
        variable ``group`` is used, as the notebook's main loop relies on.
    speed_scale : float, optional
        Speed multiplier. When omitted, the module-level ``speed_scale`` is used.

    Returns
    -------
    tuple
        ``(trips, stop_times, new_frequency)`` where ``new_frequency`` is the
        new headway as a ``datetime.timedelta``. Re-timed departure and arrival
        values are stored as pandas timedelta text such as ``'0 days 08:30:00'``.

    Raises
    ------
    ValueError
        If ``n_new_trips`` is not positive.
    """
    # Backward compatibility: the original implementation read these two
    # values from notebook globals instead of taking them as arguments.
    if group is None:
        group = globals()['group']
    if speed_scale is None:
        speed_scale = globals()['speed_scale']
    if n_new_trips <= 0:
        raise ValueError(f"n_new_trips must be positive, got {n_new_trips!r}")

    def retime(departures, time_diff):
        """Shift ``departures`` by ``time_diff`` and apply the speed scale; returns strings."""
        # Explicit conversion instead of relying on str + Timedelta coercion.
        shifted = pd.to_timedelta(departures) + time_diff
        if speed_scale != 1:
            return list(change_speed(shifted, speed_scale))
        return [str(value) for value in shifted]

    n_trips_to_add = n_new_trips - n_current_trips
    duplicates_at = {}      # position in group -> number of copies to create
    kept_positions = None   # set of positions to keep (only when thinning out)
    if n_trips_to_add >= 0:
        # range(n_current_trips) so that the last trip can be drawn as well.
        for position in random.choices(range(n_current_trips), k=n_trips_to_add):
            duplicates_at[position] = duplicates_at.get(position, 0) + 1
    else:
        kept_positions = set(random.sample(range(n_current_trips), n_new_trips))

    new_frequency = (max_dep_time - min_dep_time) / n_new_trips

    dep_time = min_dep_time     # start time of the next slot to hand out
    added_trips = []            # new rows for trips, as plain dicts
    added_stop_times = []       # new stop_times frames, one per duplicate
    removed_trip_ids = []

    for position, ((_, row), pre_dep_time) in enumerate(zip(group.iterrows(), departing_times)):
        trip_id = row['trip_id']

        if kept_positions is not None and position not in kept_positions:
            removed_trip_ids.append(trip_id)
            continue

        trip_rows = stop_times['trip_id'] == trip_id

        # Duplicates are built from the trip's not-yet-shifted stop times and
        # take the slots immediately before the original trip's new slot.
        for j in range(duplicates_at.get(position, 0)):
            duplicate_id = str(trip_id) + '_dupli_' + str(j)

            trip_to_duplicate = trips.loc[trips['trip_id'] == trip_id].iloc[0].to_dict()
            trip_to_duplicate['trip_id'] = duplicate_id
            added_trips.append(trip_to_duplicate)

            stop_times_to_duplicate = stop_times[trip_rows].copy()
            stop_times_to_duplicate['trip_id'] = duplicate_id
            stop_times_to_duplicate['departure_time'] = retime(
                stop_times_to_duplicate['departure_time'], dep_time - pre_dep_time)
            stop_times_to_duplicate['arrival_time'] = stop_times_to_duplicate['departure_time']
            added_stop_times.append(stop_times_to_duplicate)
            dep_time += new_frequency

        # Shift the original trip to its own slot.
        if trip_rows.any():
            stop_times.loc[trip_rows, 'departure_time'] = retime(
                stop_times.loc[trip_rows, 'departure_time'], dep_time - pre_dep_time)
            stop_times.loc[trip_rows, 'arrival_time'] = stop_times.loc[trip_rows, 'departure_time']
        dep_time += new_frequency

    # Grow the tables once at the end instead of once per duplicate.
    if added_trips:
        trips = pd.concat([trips, pd.DataFrame(added_trips, columns=trips.columns)],
                          ignore_index=True)
    if added_stop_times:
        stop_times = pd.concat([stop_times] + added_stop_times, ignore_index=True)
    if removed_trip_ids:
        trips = trips[~trips['trip_id'].isin(removed_trip_ids)]
        # Keep stop_times consistent with trips: no rows for deleted trips.
        stop_times = stop_times[~stop_times['trip_id'].isin(removed_trip_ids)]

    return trips, stop_times, new_frequency
    
def create_freq_directory(gtfs_folders_path, freq_scale, speed_scale):
    """Create (if needed) and return the output folder of one scenario.

    The folder is ``GTFS_freq_<freq_scale>_speed_<speed_scale>`` inside
    ``gtfs_folders_path``; an already existing folder is reused.
    """
    scenario_dir = os.path.join(gtfs_folders_path, f"GTFS_freq_{freq_scale}_speed_{speed_scale}")
    os.makedirs(scenario_dir, exist_ok=True)
    return scenario_dir
    
    
    




def _first_route_value(routes, route_id, column):
    """Return ``column`` of the first ``routes`` row matching ``route_id``, or '' if unavailable."""
    if 'route_id' not in routes.columns or column not in routes.columns:
        return ''
    matches = routes.loc[routes['route_id'] == route_id, column]
    return matches.iloc[0] if len(matches) > 0 else ''


def _to_gtfs_time_strings(times):
    """Format timedelta-like values as GTFS ``'HH:MM:SS'`` text.

    Hours are counted from the start of the service day and may therefore be
    24 or more; missing values become an empty string.
    """
    def format_one(value):
        if pd.isna(value):
            return ''
        total_seconds = int(value.total_seconds())
        hours, remainder = divmod(total_seconds, 3600)
        minutes, seconds = divmod(remainder, 60)
        return f"{hours:02d}:{minutes:02d}:{seconds:02d}"

    return pd.to_timedelta(times).map(format_one)


# The per-scenario summary tables are written here; make sure the folder exists.
os.makedirs('outputs', exist_ok=True)

# Run every (frequency scale, speed scale) scenario over every zipped feed.
# NOTE: `group` and `speed_scale` must stay module-level names, because
# change_frequency_speed reads them when they are not passed explicitly.
for freq_scale, speed_scale in zip(freq_scales, speed_scales):

    freq_dir_path = create_freq_directory(gtfs_folders_path, freq_scale, speed_scale)

    Summary_table = pd.DataFrame()

    # One entry per (route, headsign) group, accumulated over all agencies.
    VMT_ST = []
    VHT_ST = []
    freqency_ST = []
    min_depart_ST = []
    max_depart_ST = []
    VMT_ST_after = []
    VHT_ST_after = []
    freqency_ST_after = []
    route_ids_ST = []
    trip_headsign_ST = []
    agencies = []
    n_trips_ST = []
    n_trips_ST_after = []   # change in trip count (after - before), 0 for untouched groups
    route_names_ST=[]
    route_names_long_ST=[]
    route_descr_ST=[]

    for zip_file in os.listdir(gtfs_folders_path):

        # The input folder also holds the scenario output folders; only
        # zipped feeds are processed.
        if not zip_file.endswith('.zip'):
            continue

        agency = zip_file[:2]
        #GET DATA
        wrong_value_count = 0
        trips, stop_times, shapes, routes, grouped_filtered_trips, temp_dir = get_data(zip_file,day_of_week, specific_date)
        print('wrong_value_count',wrong_value_count)
        #########

        if len(trips) == 0:
            # Nothing usable in this feed: do not leave the extracted files behind.
            cleanup_temp_dir(temp_dir)
            continue

        print(zip_file,len(grouped_filtered_trips))
        for name, group in grouped_filtered_trips:

            #Save data (name is the (route_id, trip_headsign) key of the group)
            route_ids_ST.append(name[0])
            route_names_ST.append(_first_route_value(routes, name[0], 'route_short_name'))
            route_names_long_ST.append(_first_route_value(routes, name[0], 'route_long_name'))
            route_descr_ST.append(_first_route_value(routes, name[0], 'route_desc'))

            trip_headsign_ST.append(name[1])
            n_trips_ST.append(len(group))
            agencies.append(agency)
            VMT_ST.append(get_vmt(group, trips, shapes))
            VHT_ST.append(get_vht(group, trips, stop_times))
            print('name', name)

            #GET DEP TIMES
            departing_times = get_dep_times(group , stop_times)
            ##############
            min_dep_time = min(departing_times)
            max_dep_time = max(departing_times)
            duration = max_dep_time - min_dep_time
            duration_in_seconds = duration.total_seconds()
            min_depart_ST.append(min_dep_time)
            max_depart_ST.append(max_dep_time)

            # Despite the name this is a headway: seconds of service window per trip.
            freqency_ST.append((duration_in_seconds)/len(group))

            if len(group)>n_min_trips:

                n_current_trips = len(departing_times)
                n_new_trips = int(freq_scale*n_current_trips)
                print('n_new_trips', n_new_trips)

                #ADD / REMOVE TRIPS
                len_trips_before = len(trips)
                trips, stop_times, new_frequency = change_frequency_speed(trips, stop_times,departing_times, min_dep_time, max_dep_time, n_current_trips, n_new_trips)
                ##########
                n_trips_ST_after.append(len(trips)-len_trips_before)
                VMT_ST_after.append(get_vmt(group, trips, shapes))
                VHT_ST_after.append(get_vht(group, trips, stop_times))
                freqency_ST_after.append(new_frequency.total_seconds())
            else:
                # Groups with too few trips are reported unchanged.
                n_trips_ST_after.append(0)
                VMT_ST_after.append(get_vmt(group, trips, shapes))
                VHT_ST_after.append(get_vht(group, trips, stop_times))
                freqency_ST_after.append(freqency_ST[-1])

        # Back to GTFS text. Hours keep counting past 23 so that times after
        # midnight stay later than the times before them.
        stop_times['departure_time'] = _to_gtfs_time_strings(stop_times['departure_time'])
        stop_times['arrival_time'] = _to_gtfs_time_strings(stop_times['arrival_time'])

        #Save new GTFS Files
        save_dataframe_to_csv(trips, temp_dir, 'trips.txt')
        save_dataframe_to_csv(stop_times, temp_dir, 'stop_times.txt')

        output_zip = os.path.join(freq_dir_path, zip_file)
        zip_directory(temp_dir, output_zip)

        cleanup_temp_dir(temp_dir)

    # Build the summary table of this scenario.
    Summary_table['Agency'] = agencies
    Summary_table['Route'] = route_ids_ST
    Summary_table['Route Name'] = route_names_ST
    Summary_table['Route Long Name'] = route_names_long_ST
    Summary_table['Route Descr'] = route_descr_ST
    Summary_table['Trip Head Sign'] = trip_headsign_ST
    Summary_table['Number of Trips'] = n_trips_ST
    Summary_table['New Trips'] = n_trips_ST_after
    # Unknown totals (None / NaN) are marked with -1 before the integer casts.
    VMT_ST = [-1 if x is None or pd.isna(x) else x for x in VMT_ST]
    VHT_ST = [-1 if x is None or pd.isna(x) else x for x in VHT_ST]
    VMT_ST_after = [-1 if x is None or pd.isna(x) else x for x in VMT_ST_after]
    VHT_ST_after = [-1 if x is None or pd.isna(x) else x for x in VHT_ST_after]
    Summary_table['VMT [km]'] = pd.Series(VMT_ST).astype(int)/1000
    Summary_table['VMT per Trip [km]'] = Summary_table['VMT [km]']/Summary_table['Number of Trips']
    Summary_table['VMT per Trip [km]'] = Summary_table['VMT per Trip [km]'].astype(int)
    Summary_table['VMT - After [km]'] = pd.Series(VMT_ST_after).astype(int)/1000
    Summary_table['VMT per Trip - After [km]'] = Summary_table['VMT - After [km]']/(Summary_table['Number of Trips']+Summary_table['New Trips'])
    Summary_table['VMT per Trip - After [km]'] = Summary_table['VMT per Trip - After [km]'].astype(int)
    Summary_table['VHT [minutes]'] = pd.Series(VHT_ST).astype(int)/60
    Summary_table['VHT per Trip [minutes]'] = Summary_table['VHT [minutes]']/Summary_table['Number of Trips']
    Summary_table['VHT per Trip [minutes]'] = Summary_table['VHT per Trip [minutes]'].astype(int)
    Summary_table['VHT - After [minutes]'] = pd.Series(VHT_ST_after).astype(int)/60
    Summary_table['VHT per Trip - After [minutes]'] = Summary_table['VHT - After [minutes]']/(Summary_table['Number of Trips']+Summary_table['New Trips'])
    Summary_table['VHT per Trip - After [minutes]'] = Summary_table['VHT per Trip - After [minutes]'].astype(int)
    Summary_table['AV Speed [km/h]'] = Summary_table['VMT [km]']/Summary_table['VHT [minutes]']*60
    Summary_table['AV Speed - After [km/h]'] = Summary_table['VMT - After [km]']/Summary_table['VHT - After [minutes]']*60
    # pd.to_datetime keeps the .dt accessor usable even when no group was found.
    Summary_table['Min Depart'] = pd.to_datetime(pd.Series(min_depart_ST, dtype=object)).dt.time
    Summary_table['Max Depart'] = pd.to_datetime(pd.Series(max_depart_ST, dtype=object)).dt.time
    Summary_table['Frequency [minutes]'] = pd.Series(np.array(freqency_ST)/60).astype(int)
    Summary_table['Frequency - After [minutes]'] = pd.Series(np.array(freqency_ST_after)/60).astype(int)

    Summary_table.to_csv(f'outputs/ST_GTFS_freq_{freq_scale}_speed_{speed_scale}.csv')


# TODO: add the estimated number of additional vehicles required
# TODO: add ridership
# TODO: add the personal-vehicle trips captured by transit

734 len trips
10267 len stop_times
wrong_value_count 7059
734 len trips
3208 len stop_times
wrong_value_count 7059
FF.zip 32
name (190, 'Davis Sacramento')
name (190, 'Davis/Fairfield')
name (190, 'Dixon')
name (190, 'Downtown Sacramento')
name (190, 'Fairfield')
name (190, 'Vacaville Davis')
name (190, 'Vacaville Fairfield')
name (193, 'Fairfield Vacaville')
n_new_trips 13
name (196, 'FTC')
name (196, 'Vacaville')
n_new_trips 8
name (196, 'WC PH BART')
n_new_trips 13
name (259, 'El Cerrito Del Norte BART Station AC Transit')
n_new_trips 33
name (259, 'FTC')
n_new_trips 17
name (259, 'Suisun City Amtrak')
n_new_trips 14
name (973, 'FTC')
n_new_trips 27
name (973, 'WALMART')
n_new_trips 27
name (974, 'HUNTINGTON CT')
n_new_trips 27
name (974, 'SOLANO TOWN CTR')
n_new_trips 27
name (975, 'FTC')
n_new_trips 28
name (975, 'WALMART')
n_new_trips 27
name (976, 'DAVID GRANT')
n_new_trips 14
name (976, 'SMART AND FINAL')
n_new_trips 14
name (977, 'FTC')
n_new_trips 14
name (977, 'SUISUN SENIOR

In [None]:
# Inspect the per-group VMT values left over from the last scenario run
# (requires the scenario cell above to have been executed).
VMT_ST

In [None]:
# Number of route/headsign groups recorded by the last scenario run.
len(n_trips_ST_after)

In [None]:
# Summary table of the last scenario that was processed.
Summary_table

In [None]:
# Last route/headsign group visited by the scenario loop.
group





In [None]:
# trips table of the last feed processed by the scenario loop.
trips





In [None]:

# Parse the departure times (strings or timedelta objects) as timedeltas.
departure_deltas = pd.to_timedelta(stop_times['departure_time'])

# Then, convert all timedelta objects to strings in the format 'HH:MM:SS'.
# Hours are derived from the total number of seconds so that times at or past
# midnight (24:00:00 and later) are kept instead of wrapping back to 00.
stop_times['departure_time'] = [
    '' if pd.isna(seconds)
    else f"{int(seconds) // 3600:02d}:{int(seconds) % 3600 // 60:02d}:{int(seconds) % 60:02d}"
    for seconds in departure_deltas.dt.total_seconds()
]

# Now all the times should be strings in 'HH:MM:SS' format
print(stop_times['departure_time'])




In [None]:
# Scratch cell: replace the departure_time column of the last processed feed
# with its pd.to_timedelta conversion (overwrites the column in place).
stop_times['departure_time'] = pd.to_timedelta(stop_times['departure_time'])


In [None]:
# Display the departure_time column of the last processed feed.
stop_times['departure_time']