# Import packages

In [1]:
import pandas as pd
import geopandas as gpd

from helper_functions import open_processed_gtfs, open_wmata_station_data

# Notebook settings

In [2]:
GTFS_FOLDER = 'GTFS_20190914'

# Load in Data

In [3]:
# WMATA stations
wmata_stations = open_wmata_station_data(r'Data\GIS_RAIL_STN_PNT.csv')
wmata_stations.head(3)

Unnamed: 0,NAME,geometry,STATION_ID
0,SPRING HILL,POINT (-77.24189 38.9292),N04
1,WIEHLE-RESTON EAST,POINT (-77.34031 38.94782),N06
2,MCLEAN,POINT (-77.21053 38.92433),N01


In [4]:
# Processed GTFS stations
gtfs_stations = open_processed_gtfs(f"Intermediates\\{GTFS_FOLDER}\\gtfs_stations.geojson")
print(len(gtfs_stations))
gtfs_stations.head(3)

Skipping field from_stops: unsupported OGR type: 1
Skipping field to_stops: unsupported OGR type: 1


91


Unnamed: 0,stop_id,stop_name,connections,geometry
0,308,SHAW METRO STATION,"[[1418, 10344], [10344, 1418]]",POINT (-77.02193 38.91455)
1,999,CHEVERLY METRO STATION,"[[5030, 2124], [2124, 5030]]",POINT (-76.9151 38.91655)
2,1305,CAPITOL HEIGHTS METRO STATION,"[[4613, 13107], [13107, 4613]]",POINT (-76.91331 38.88957)


# Find WMATA station for each GTFS station and correct station names

In [5]:
# Set both gdf's to a local XY crs
wmata_stations = wmata_stations.to_crs('EPSG:2248')
gtfs_stations = gtfs_stations.to_crs('EPSG:2248')

corrected_gtfs_stations = gtfs_stations.copy()
# Save correction to save time correcting other files
correction_dict = {}
# First find matching stations and correct stop_id and stop_name
for index, stop in gtfs_stations.iterrows():
    wmata_stations['distance'] = wmata_stations.distance(stop['geometry'])
    wmata_stations = wmata_stations.sort_values(by='distance')
    
    matching_station = wmata_stations['STATION_ID'].values[0]
    corrected_gtfs_stations.at[index, 'stop_id'] = matching_station
    correction_dict[stop['stop_id']] = matching_station

# Then correct from-to stop lists
for index, stop in corrected_gtfs_stations.iterrows():

    connections = []
    for connection in stop['connections']:
        connection_1 = connection[0]
        connection_2 = connection[1]
        if connection_1 != 'None':
            connection_1 = correction_dict[connection_1]
        if connection_2 != 'None':
            connection_2 = correction_dict[connection_2]

        connections.append([connection_1, connection_2])
    corrected_gtfs_stations.at[index, 'connections'] = connections

corrected_gtfs_stations = corrected_gtfs_stations.to_crs('EPSG:4326')  # All gdf are saved in geodesic, so we convert back
corrected_gtfs_stations.to_file(f"Intermediates\\{GTFS_FOLDER}\\gtfs_stations_corrected.geojson", driver="GeoJSON")
corrected_gtfs_stations.head(5)

  corrected_gtfs_stations.at[index, 'stop_id'] = matching_station


Unnamed: 0,stop_id,stop_name,connections,geometry
0,E02,SHAW METRO STATION,"[[E03, E01], [E01, E03]]",POINT (-77.02193 38.91455)
1,D11,CHEVERLY METRO STATION,"[[D10, D12], [D12, D10]]",POINT (-76.9151 38.91655)
2,G02,CAPITOL HEIGHTS METRO STATION,"[[G01, G03], [G03, G01]]",POINT (-76.91331 38.88957)
3,E03,U STREET METRO STATION,"[[E04, E02], [E02, E04]]",POINT (-77.02917 38.91702)
4,D12,LANDOVER METRO STATION,"[[D11, D13], [D13, D11]]",POINT (-76.89 38.93399)


# Correct links

In [None]:
gtfs_shapes = gpd.read_file(f"Intermediates\\{GTFS_FOLDER}\\gtfs_links.geojson")

gtfs_shapes_corrected = gtfs_shapes.copy()
for index, shape in gtfs_shapes.iterrows():
    gtfs_shapes_corrected.at[index, 'from_stop'] = correction_dict[shape['from_stop']]
    gtfs_shapes_corrected.at[index, 'to_stop'] = correction_dict[shape['to_stop']]
    gtfs_shapes_corrected.at[index, 'link_id'] = f"{gtfs_shapes_corrected.at[index, 'from_stop']}_to_{gtfs_shapes_corrected.at[index, 'to_stop']}"

gtfs_shapes_corrected.to_file(f"Intermediates\\{GTFS_FOLDER}\\gtfs_links_corrected.geojson", driver="GeoJSON", index=False)
gtfs_shapes_corrected.head(3)

  gtfs_shapes_corrected.at[index, 'from_stop'] = correction_dict[shape['from_stop']]
  gtfs_shapes_corrected.at[index, 'to_stop'] = correction_dict[shape['to_stop']]


Unnamed: 0,link_id,from_stop,to_stop,travel_time,geometry
0,A15_to_A14,A15,A14,240,"LINESTRING (-77.16476 39.11999, -77.14669 39.0..."
1,A14_to_A13,A14,A13,180,"LINESTRING (-77.14669 39.08545, -77.12079 39.0..."
2,A13_to_A12,A13,A12,180,"LINESTRING (-77.12079 39.06239, -77.11278 39.0..."


# Correct stop_datetimes

In [None]:
gtfs_stoptimes = gpd.read_file(f"Intermediates\\{GTFS_FOLDER}\\gtfs_stop_datetimes.csv", index=False)
gtfs_stoptimes.head(3)

Unnamed: 0,line_name,trip_id,shape_id,stop,arrival_time,previous_stop,next_stop
0,BLUE,3121350_18154_2019-09-14,46,4697,2019-09-14 06:54:00,,4664
1,BLUE,3121350_18154_2019-09-14,46,4664,2019-09-14 06:57:00,4697.0,13107
2,GREEN,3120258_18154_2019-09-14,117,21110,2019-09-14 07:00:00,,10142


In [8]:
correction_dict_stings = {str(key): value for key, value in correction_dict.items()}

gtfs_stoptimes_corrected = gtfs_stoptimes.copy()
gtfs_stoptimes_corrected['stop'] = gtfs_stoptimes_corrected['stop'].replace(correction_dict_stings)
gtfs_stoptimes_corrected['previous_stop'] = gtfs_stoptimes_corrected['previous_stop'].replace(correction_dict_stings)
gtfs_stoptimes_corrected['next_stop'] = gtfs_stoptimes_corrected['next_stop'].replace(correction_dict_stings)

gtfs_stoptimes_corrected.to_csv(f"Intermediates\\{GTFS_FOLDER}\\gtfs_stop_datetimes_corrected.csv", index=False)
gtfs_stoptimes_corrected.head(3)

Unnamed: 0,line_name,trip_id,shape_id,stop,arrival_time,previous_stop,next_stop
0,BLUE,3121350_18154_2019-09-14,46,G05,2019-09-14 06:54:00,,G04
1,BLUE,3121350_18154_2019-09-14,46,G04,2019-09-14 06:57:00,G05,G03
2,GREEN,3120258_18154_2019-09-14,117,F11,2019-09-14 07:00:00,,F10


In [11]:
print(f"{len(gtfs_stoptimes_corrected):_}")
# Get first and last dates to filter wmata stop datetimes by in next notebook
print(gtfs_stoptimes_corrected['arrival_time'].min(), gtfs_stoptimes_corrected['arrival_time'].max())

4_809_781
2019-09-14 06:54:00 2020-03-12 00:15:00
