In [1]:
import osmnx as ox
import geopandas as gpd

# Download the road network for the whole state as a graph, requesting the
# 'drive' network type.
# NOTE(review): a state-wide query is presumably slow and memory-heavy —
# consider caching the result so Restart & Run All stays feasible.
place_name = "Pennsylvania, USA"
G = ox.graph_from_place(place_name, network_type='drive')

# Convert the graph to a GeoDataFrame of edges only (nodes=False), then
# preview the first rows.
gdf_edges = ox.graph_to_gdfs(G, nodes=False)
print(gdf_edges.head())

  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)


                                                   osmid            name  \
u        v        key                                                      
53448467 53584047 0                            549672418        Lay Road   
53452578 53582089 0                              6756166  Lake View Road   
53496150 53612694 0                              6733147    Orchard Road   
         53608642 0                            833253852    Orchard Road   
         53496211 0    [6672939, 1245422676, 1245422677]     Cooper Road   

                           highway oneway reversed   length  \
u        v        key                                         
53448467 53584047 0    residential  False     True  188.310   
53452578 53582089 0    residential  False     True  183.839   
53496150 53612694 0    residential  False     True  803.282   
         53608642 0    residential  False    False  408.870   
         53496211 0    residential  False    False  863.941   

                         

In [None]:
# List the dtype of every edge column before any type clean-up.
print(gdf_edges.dtypes)

osmid         object
name          object
highway       object
oneway        object
reversed      object
length       float64
geometry    geometry
bridge        object
maxspeed      object
lanes         object
ref           object
tunnel        object
access        object
service       object
junction      object
width         object
area          object
dtype: object


In [4]:
# Stringify every object column while keeping missing values missing.
# A plain astype(str) would turn NaN into the literal string 'nan', so a later
# groupby on 'name' would pool every unnamed road into one fake road "nan".
# Non-scalar values (e.g. a list of several osmids on one edge) are still
# converted to their string representation.
for column in gdf_edges.select_dtypes(include=['object']).columns:
    values = gdf_edges[column]
    # where() keeps the stringified value only where the original was present.
    gdf_edges[column] = values.astype(str).where(values.notna())

In [5]:
# Save edges to a Shapefile (for ArcGIS)
# gdf_edges.to_file("output/pennsylvania_roads.shp")

In [25]:
from shapely.geometry import LineString, MultiLineString

# Extract the vertex coordinates of each road segment. Shapely stores them as
# (x, y) tuples, i.e. (longitude, latitude) for this data.
road_segments = []

# Iterate over the plain columns instead of iterrows(): no per-row Series is
# built, which matters for a state-wide edge table.
for osmid, name, geometry in zip(gdf_edges['osmid'], gdf_edges['name'], gdf_edges['geometry']):
    if isinstance(geometry, LineString):
        coords = list(geometry.coords)
    elif isinstance(geometry, MultiLineString):
        # Multi-part geometries are not directly iterable in Shapely 2.x;
        # .geoms yields the individual LineString parts.
        coords = [list(line.coords) for line in geometry.geoms]
    else:
        # Missing or unexpected geometry: skip the row rather than reuse the
        # previous row's coords (or hit NameError on the very first row).
        continue
    road_segments.append({'id': osmid, 'name': name, 'coords': coords})

# Preview the first few extracted records.
for segment in road_segments[:5]:
    print(segment)

{'id': '549672418', 'name': 'Lay Road', 'coords': [(-76.2733911, 39.7665376), (-76.2737825, 39.7666525), (-76.2741398, 39.7666703), (-76.2744317, 39.7666349), (-76.2747249, 39.7665602), (-76.2754428, 39.7662528)]}
{'id': '6756166', 'name': 'Lake View Road', 'coords': [(-76.2790655, 39.7659438), (-76.2797657, 39.7659885), (-76.2799825, 39.7659865), (-76.2803705, 39.7658867), (-76.2807163, 39.7657747), (-76.2811049, 39.7656048)]}
{'id': '6733147', 'name': 'Orchard Road', 'coords': [(-76.2727801, 39.7313838), (-76.273476, 39.731046), (-76.274404, 39.730693), (-76.2749359, 39.7304721), (-76.2751323, 39.7303906), (-76.275181, 39.730369), (-76.2759303, 39.7300465), (-76.276079, 39.7299747), (-76.2768931, 39.729547), (-76.2772635, 39.7293635), (-76.2778234, 39.7290533), (-76.2783629, 39.7287211), (-76.279117, 39.7281207), (-76.2802862, 39.7271465)]}
{'id': '833253852', 'name': 'Orchard Road', 'coords': [(-76.2727801, 39.7313838), (-76.2726394, 39.7315704), (-76.272398, 39.7319995), (-76.27191

In [26]:
import pandas as pd
# Tabulate the per-segment records (keys: 'id', 'name', 'coords') so they can be grouped.
df_road_segments = pd.DataFrame(road_segments)

# Combine coordinates with the same road name
def combine_coords(coord_list):
    """Flatten the coordinate entries of several segments into one list.

    Args:
        coord_list: Iterable of per-segment entries. Each entry is either a
            flat list of coordinate tuples (single-part geometry) or a list
            of such lists (multi-part geometry).

    Returns:
        One flat list containing every coordinate tuple, in input order.
        Empty entries contribute nothing.
    """
    combined = []
    for coords in coord_list:
        if len(coords) == 0:
            # Nothing to add; also avoids IndexError from coords[0] below.
            continue
        if isinstance(coords[0], list):  # Multi-part: list of coordinate lists
            for part in coords:
                combined.extend(part)
        else:
            combined.extend(coords)
    return combined
# Group the segment rows by road name, flatten each group's coords into one
# list, and turn the result back into a two-column frame (name, coords).
# NOTE(review): grouping is by name only, so separate roads that share a name
# end up in the same row — confirm this is intended.
coords_by_name = df_road_segments.groupby('name')['coords']
combined_segments = coords_by_name.apply(combine_coords).reset_index()

In [29]:
# Preview the first few per-name rows of combined coordinates.
print(combined_segments.head())

                  name                                             coords
0         Hycrest Lane  [(-80.2028818, 40.7950576), (-80.2029258, 40.7...
1   10 Acres Wood Lane  [(-79.3113291, 40.1509223), (-79.3114132, 40.1...
2        10 Point Lane  [(-80.1286767, 40.6889168), (-80.1282986, 40.6...
3  10 Point Lodge Road  [(-75.8003135, 40.9816957), (-75.8016558, 40.9...
4       10 School Road  [(-79.3630866, 40.4248068), (-79.3612869, 40.4...


In [30]:
# Load the crash records and summarise their columns, dtypes and non-null counts.
crash_csv_path = 'output/crash_flag_cycle_roadway.csv'
crash = pd.read_csv(crash_csv_path)
crash.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11438 entries, 0 to 11437
Data columns (total 92 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   CRN                             11438 non-null  int64  
 1   BELTED_DEATH_COUNT              11438 non-null  int64  
 2   BELTED_SUSP_SERIOUS_INJ_COUNT   11438 non-null  int64  
 3   COUNTY                          11438 non-null  int64  
 4   CRASH_MONTH                     11438 non-null  int64  
 5   CRASH_YEAR                      11438 non-null  int64  
 6   DAY_OF_WEEK                     11438 non-null  int64  
 7   DEC_LAT                         11438 non-null  float64
 8   DEC_LONG                        11438 non-null  float64
 9   FATAL_COUNT                     11438 non-null  int64  
 10  HOUR_OF_DAY                     11438 non-null  float64
 11  ILLUMINATION                    11438 non-null  int64  
 12  INJURY_COUNT                    

In [32]:
# Left-join so every crash row is kept; the road table contributes its 'name'
# and 'coords' columns wherever STREET_NAME equals a road name.
# NOTE(review): this is an exact string match — confirm both sources spell and
# case street names the same way, otherwise most rows will get no coords.
merged_data = pd.merge(
    crash,
    combined_segments,
    how='left',
    left_on='STREET_NAME',
    right_on='name',
)
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11438 entries, 0 to 11437
Data columns (total 94 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   CRN                             11438 non-null  int64  
 1   BELTED_DEATH_COUNT              11438 non-null  int64  
 2   BELTED_SUSP_SERIOUS_INJ_COUNT   11438 non-null  int64  
 3   COUNTY                          11438 non-null  int64  
 4   CRASH_MONTH                     11438 non-null  int64  
 5   CRASH_YEAR                      11438 non-null  int64  
 6   DAY_OF_WEEK                     11438 non-null  int64  
 7   DEC_LAT                         11438 non-null  float64
 8   DEC_LONG                        11438 non-null  float64
 9   FATAL_COUNT                     11438 non-null  int64  
 10  HOUR_OF_DAY                     11438 non-null  float64
 11  ILLUMINATION                    11438 non-null  int64  
 12  INJURY_COUNT                    