In [1]:
# Import libraries
import pandas as pd
import datetime
import pandas_geojson as pd_geo

In [2]:
# Locations of the two input tables, relative to the notebook
STOP_TIMES_PATH = 'data/google_transit/stop_times.txt'
STOPS_PATH = 'data/google_transit/stops.txt'

# Ferry stop times (one row per stop visit of a trip)
df = pd.read_csv(STOP_TIMES_PATH)

# Stop locations (longitude / latitude of every stop)
df_stops = pd.read_csv(STOPS_PATH)

In [3]:
# Build the lookup table of every possible time slot and its running id
start_time, end_time = '4:45:00', '23:00:00'  # first and last slot of the day

# One timestamp every 5 minutes between the two bounds (both included)
time_str_lst = pd.date_range(start = start_time, end = end_time, freq = '5min')

# Keep only the clock time, as 'HH:MM:SS' strings
time_lst = time_str_lst.strftime('%H:%M:%S')

# Column 'time' holds the slot, 'time_id' numbers the slots from 0 upwards
df_times_reference = pd.DataFrame({'time': time_lst}).assign(
    time_id = range(len(time_lst))
)

In [4]:
# Merge ferry stop times and stop long lat, grouping the rows of each trip together
df = pd.merge(df, df_stops, on = 'stop_id').sort_values('trip_id')

# Change name of time variable
df = df.rename(columns = {'arrival_time' : 'time'})

# Change "time" parameter from string to timestamp format.
# A parse failure is deliberately allowed to surface here: silently keeping the
# raw strings would only make the later steps fail with a less helpful error.
time_parsed = pd.to_datetime(df['time'])

# Round data to closest 5 min increment so the values line up with the slots
# in df_times_reference ('5min' is the same alias used to build that table)
time_rounded = time_parsed.dt.round('5min')

# Remove the date and change the format back to 'HH:MM:SS' strings in a single
# vectorized step. This does not depend on the index labels, and a missing
# time stays missing instead of raising.
df['time'] = time_rounded.dt.strftime('%H:%M:%S')

# Add time references (rows whose time is not a known slot get a missing time_id)
df = pd.merge(df, df_times_reference, on = 'time', how = 'left')

## TODO: get routes to work

In [5]:
def subset_data(data,trip_id):
    '''Return the rows of a single trip, ordered by stop sequence.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing at least the columns 'trip_id', 'time', 'stop_id',
        'stop_sequence', 'stop_lat', 'stop_lon' and 'stop_name'.
    trip_id
        Value of 'trip_id' to select.

    Returns
    -------
    pandas.DataFrame
        The rows of `data` whose 'trip_id' equals `trip_id`, sorted by
        'stop_sequence' and reduced to the seven columns listed above.
        Empty when no row matches.
    '''

    # Build the filter from the frame that was passed in, not from a notebook
    # global, so the function works on any frame it is given
    trip_rows = data.loc[data['trip_id'] == trip_id].sort_values('stop_sequence')

    # Remove all non-essential data
    return trip_rows[['trip_id','time','stop_id','stop_sequence','stop_lat','stop_lon', 'stop_name']]

def inter_time(data, trip_id, time_ref):
    '''Fill in the longitude and latitude of one trip for the time slots between its stops.

    Parameters
    ----------
    data : pandas.DataFrame
        Stop table; passed unchanged to subset_data.
    trip_id
        Trip to process.
    time_ref : pandas.DataFrame
        Table of time slots with a 'time' column to join on. Its row order
        defines the order of the result, and its other columns are carried along.

    Returns
    -------
    pandas.DataFrame
        One row per time slot, from the first to the last slot that matched a
        stop of the trip. 'trip_id' is set on every row; 'stop_lon' and
        'stop_lat' are filled by interpolation on the rows between stops.
        Empty (same columns, no rows) when no stop of the trip matches a slot.
    '''

    # Get data with a single trip_id
    trip = subset_data(data,trip_id)

    # Merge with the time slots and keep all of them, in time_ref order
    trip = pd.merge(trip,time_ref,on = 'time', how = 'right')

    # Slots that actually correspond to a stop of this trip
    first_stop = trip['stop_id'].first_valid_index()
    last_stop = trip['stop_id'].last_valid_index()

    # No stop matched any slot: slicing with None:None would keep every slot
    # and produce rows without coordinates, so return an empty frame instead
    if first_stop is None:
        return trip.iloc[0:0].copy()

    # Remove the slots before the first stop and after the last stop.
    # Copy so that the assignments below modify an independent frame.
    trip = trip.loc[first_stop:last_stop,:].copy()

    # Make all the trip_ids the same (rows added by the merge have none)
    trip['trip_id'] = trip_id

    # Interpolate longitude data
    trip['stop_lon'] = trip['stop_lon'].interpolate()

    # Interpolate latitude data
    trip['stop_lat'] = trip['stop_lat'].interpolate()

    return trip

In [6]:
# Interpolate every trip separately and collect the results in a list.
# Concatenating once at the end avoids re-copying the growing frame on
# every iteration.
trip_frames = [
    inter_time(df, trip_id, df_times_reference)
    for trip_id in df['trip_id'].unique()
]

# Stack all trips into one frame; fall back to an empty frame when there is
# no trip at all (pd.concat refuses an empty list)
df_export = pd.concat(trip_frames, ignore_index = True) if trip_frames else pd.DataFrame()

# Kept so that anything still reading df_old sees the same final result
df_old = df_export

In [7]:
# Columns of df_export that are attached to each exported feature
export_properties = ['trip_id','time','time_id']

# Build the GeoJSON object from df_export, reading the coordinates from the
# stop_lat / stop_lon columns
geo_json = pd_geo.to_geojson(
    df = df_export,
    lat = 'stop_lat',
    lon = 'stop_lon',
    properties = export_properties,
)

# Write the result to disk
pd_geo.write_geojson(geo_json, filename = 'stop-times.geojson', indent = 4)