In [1]:
# Import required libraries
import geopandas as gpd
import pandas as pd
from shapely.ops import nearest_points
import networkx as nx
from shapely.geometry import Point, LineString
import requests
import zipfile
import os
import io
from io import StringIO
from io import BytesIO
from matplotlib import pyplot as plt
from shapely.ops import snap, nearest_points

In [2]:
# User defined variables
my_training_state = 'MN' # State USPS abbreviation
state_name = 'Minnesota'
my_nas_id = 5
# IA = 19; ID = 16; IL = 17; MN = 27; MO = 29; MT = 30; OR = 41;  WA = 53; WI = 55
state_fips = '27' # Replace last 2 digits with your state's FIP code
my_path = 'data/' + my_training_state + '/' # leave this alone   

In [None]:
# Download road shapefile
road_url = f'https://www2.census.gov/geo/tiger/TIGER2022/PRISECROADS/tl_2022_{state_fips}_prisecroads.zip'
local_path = my_path
print('Downloading shapefile...')
r = requests.get(road_url)
z = zipfile.ZipFile(io.BytesIO(r.content))
print("Done")
z.extractall(path=local_path) # extract to folder
filenames = [y for y in sorted(z.namelist()) for ending in ['dbf', 'prj', 'shp', 'shx'] if y.endswith(ending)] 
print(filenames)
#Download USGS boat access data 
# URL of the compressed file containing multiple files
URL_BASE = "https://www.sciencebase.gov/catalog/file/get/63b81b50d34e92aad3cc004d?facet=Boatramps_United_States_final_20230104"

# Define the desired extensions
desired_extensions = ['.dbf', '.prj', '.shp', '.shx']

# Download the file
response = requests.get(URL_BASE)

# Check if the request was successful
if response.status_code == 200:
    # Save the content to a temporary location (e.g., in memory)
    zip_file = BytesIO(response.content)

    # Extract the ZIP file contents to a folder
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        # Create a directory to store the extracted files
        os.makedirs(my_path, exist_ok=True)
        
        # Loop through the files in the zip and extract only the desired ones
        for file_name in zip_ref.namelist():
            if any(file_name.endswith(ext) for ext in desired_extensions):
                # Extract the file
                zip_ref.extract(file_name, my_path)
                print(f"Extracted: {file_name}")

    print("Files extracted successfully to my_path directory.")
else:
    print(f"Failed to download file. Status code: {response.status_code}")


In [3]:
def nas_api_call(nas_id, state):
    URL_BASE = 'http://nas.er.usgs.gov/api/v2/'
    url_request = f"{URL_BASE}/occurrence/search?species_ID={nas_id}&state={my_training_state}"
    response = requests.get(url_request, timeout=None).json()
    results = pd.json_normalize(response, 'results')
    return results

# Function to get the endpoints of a line geometry
def get_endpoints(geometry):
    # Ensure the geometry is a LineString
    if geometry.geom_type == 'LineString':
        # Get the first and last coordinate of the line
        return [geometry.coords[0], geometry.coords[-1]]
    return []

def snap_points_to_nearest_poly(poly: gpd.GeoDataFrame, point: gpd.GeoDataFrame, snapdist: float) -> gpd.GeoDataFrame:
    """
    Snaps points to the nearest polygon within a given distance.
    
    Parameters:
        poly (GeoDataFrame): GeoDataFrame containing polygon geometries.
        point (GeoDataFrame): GeoDataFrame containing point geometries.
        snapdist (float): Maximum search distance to find the nearest polygon.
    
    Returns:
        GeoDataFrame: Updated point GeoDataFrame with snapped geometries.
    """
    # Ensure both datasets use the same projected CRS (Sweden EPSG:3006 in original, changed to 26915)
    poly = poly.to_crs(3857)
    point = point.set_crs(3857)
    
    # Create unique IDs
    poly["polyid"] = range(poly.shape[0])
    point["pointid"] = range(point.shape[0])
    
    # Store original polygon geometry
    poly["polygeom"] = poly.geometry
    
    # Perform spatial join to find nearest polygons within snap distance
    sj = gpd.sjoin_nearest(left_df=point, right_df=poly, how="left", max_distance=snapdist)
    
    # Measure distances (set to None if no polygon within snapdistance)
    sj["distance"] = sj.apply(lambda x: x.geometry.distance(x.polygeom) if x.polygeom is not None else None, axis=1)
    
    # Sort by distance and drop duplicates (keeping closest polygon match)
    sj = sj.sort_values(by=["pointid", "distance"], ascending=True, na_position="last")
    sj = sj.drop_duplicates(subset="pointid", keep="first")
    
    # Find the nearest point on the polygon
    sj["nearestpoint"] = sj.apply(
        lambda x: nearest_points(x.geometry, x.polygeom)[1] if (x.polygeom is not None and x.distance is not None) else None,
        axis=1
    )
    
    # Snap points to nearest point on polygon if applicable
    sj["geometry"] = sj.apply(
        lambda x: snap(x.geometry, x.nearestpoint, snapdist) if x.nearestpoint is not None else x.geometry,
        axis=1
    )
    
    return sj

def sjoin_nearest_replace_geom(left_gdf, right_gdf, **kwargs):
    """
    Performs a spatial join (nearest) and replaces the geometry of left_gdf 
    with the geometry of right_gdf while retaining left_gdf attributes 
    and carrying over the 'epointID' column from right_gdf.

    Parameters:
    - left_gdf (GeoDataFrame): The GeoDataFrame with attributes to keep.
    - right_gdf (GeoDataFrame): The GeoDataFrame whose geometry will replace the left_gdf geometry.
    - **kwargs: Additional arguments for gpd.sjoin_nearest (e.g., max_distance).

    Returns:
    - GeoDataFrame: Resulting GeoDataFrame with left_gdf attributes, right_gdf geometry, and epointID.
    """
    # Perform spatial join (nearest)
    joined = gpd.sjoin_nearest(left_gdf, right_gdf, how="left", **kwargs)

    # Ensure 'geometry_right' exists (GeoPandas renames conflicting geometry columns)
    if "geometry_right" not in joined.columns:
        joined = joined.rename(columns={"geometry": "geometry_right"})

    # Replace the left geometry with the nearest right geometry
    joined["geometry"] = joined["geometry_right"]

    # Keep original left_gdf columns + 'epointID' from right_gdf
    cols_to_keep = list(left_gdf.columns) + ["epointID"]
    joined = joined[cols_to_keep]

    return joined

In [4]:
# Join ramps to water to get waterbodyID
ramps = gpd.read_file(my_path + 'Boatramps_United_States_final_20230104.shp').set_crs(3857, allow_override=True)

In [5]:
my_ramps = ramps.loc[ramps['State'] == state_name]
ramp_geo = my_ramps[['geometry']]
ramp_geo['ramp_ID'] = range(1, len(ramp_geo) + 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [6]:
# Import buffered water
buffered_water = gpd.read_file(my_path + my_training_state + "_buffered_water.shp").set_crs(3857, allow_override = True)

In [7]:
# Get locations of your AIS.  
#If your AIS is not present in your state of interest you could identify the points where the road layer intersects your state border and use those
pos_data = nas_api_call(my_nas_id, my_training_state)
my_data = pos_data[["decimalLatitude", "decimalLongitude"]]
pos_data_gdf = gpd.GeoDataFrame(
    my_data, geometry=gpd.points_from_xy(my_data.decimalLongitude, my_data.decimalLatitude)).dropna().set_crs(3857)#.to_crs(5070)

In [None]:
# Identify positive and negative water
pos_water_check = buffered_water.sjoin(pos_data_gdf, how="left", predicate="contains")
pos_water_check = pos_water_check.drop_duplicates(subset="waterID", keep="first")
neg_water = pos_water_check[pos_water_check['index_right'].isna()].drop(columns=["index_right", "decimalLatitude", "decimalLongitude"], errors="ignore")
pos_water = pos_water_check.dropna(subset=["index_right"]).drop(columns=["index_right", "decimalLatitude", "decimalLongitude"], errors="ignore")
pos_water["Present"], neg_water["Present"] = 1.0, 0.0
water_w_present = pd.concat([pos_water, neg_water])

In [8]:
# Identify ramps that are within waterbodies and snap unjoinable ramps to the nearest waterbody within maximum distance.
ramps_in_water = ramp_geo.sjoin(water_w_present, how="left", predicate="within")
ramps_not_in_water = ramps_in_water[ramps_in_water['index_right'].isna()].drop(columns=["index_right"], errors="ignore").copy()
ramps_in_water = ramps_in_water.dropna(subset=["index_right"]).drop(columns=["index_right"], errors="ignore").set_crs(3857, allow_override = True)
ramps_in_water["waterID"] = ramps_in_water["waterID"].astype("int64")
snapped_ramps = snap_points_to_nearest_poly(water_w_present, ramps_not_in_water, 1000)
ramps_to_add = snapped_ramps[['ramp_ID', 'waterID_right', 'nearestpoint']].rename(columns={'waterID_right': 'waterID', 'nearestpoint': 'geometry'}).set_crs(3857, allow_override = True)
my_ramps = pd.concat([ramps_in_water, ramps_to_add])

In [9]:
# Identify water without ramps; identify point on polygon perimeter closest to a road endpoint
water_w_ramps_list = my_ramps['waterID'].tolist()
water_no_ramps = water_w_present[~water_w_present['waterID'].isin(water_w_ramps_list)]

In [10]:
# Identify road endpoints; remove duplicates
my_roads = gpd.read_file(my_path + 'tl_2022_27_prisecroads.shp').set_crs(5070, allow_override = True)
# Extract endpoints for all lines
endpoints = my_roads['geometry'].apply(get_endpoints).explode()

# Convert the list of endpoints to a DataFrame
endpoints_df = pd.DataFrame(endpoints.tolist(), columns=['x', 'y'])

# Remove duplicate points
endpoints_df = endpoints_df.drop_duplicates()

# Optionally, you can create a GeoDataFrame for the endpoints
endpoints_gdf = gpd.GeoDataFrame(endpoints_df, geometry=gpd.points_from_xy(endpoints_df['x'], endpoints_df['y'])).set_crs(3857, allow_override = True).drop(columns = ['x', 'y'])
endpoints_gdf['epointID'] = range(1, len(endpoints_gdf) + 1)

In [13]:
ramps_in_water_sj = sjoin_nearest_replace_geom(my_ramps, endpoints_gdf)
lakes_no_ramp_sj = sjoin_nearest_replace_geom(water_no_ramps, endpoints_gdf)
my_endpoints = pd.concat([ramps_in_water_sj, lakes_no_ramp_sj])
pos_endpoints = my_endpoints.loc[my_endpoints['Present'] == 1.0]
neg_endpoints = my_endpoints.loc[my_endpoints['Present'] == 0.0]
joined = gpd.sjoin_nearest(neg_endpoints, pos_endpoints, how="left")

In [None]:
# Calculate road distance
source_gdf = neg_endpoints.set_crs(3857, allow_override = True)
target_gdf = pos_endpoints.set_crs(3857, allow_override = True)
network_gdf = my_roads.set_crs(3857, allow_override = True)

# Ensure all GeoDataFrames use the same CRS
if not (source_gdf.crs == target_gdf.crs == network_gdf.crs):
    target_gdf = target_gdf.to_crs(source_gdf.crs)
    network_gdf = network_gdf.to_crs(source_gdf.crs)

# Build a graph from the polyline network
def build_network(gdf):
    G = nx.Graph()
    for _, row in gdf.iterrows():
        line = row.geometry
        if isinstance(line, LineString):
            coords = list(line.coords)
            for i in range(len(coords) - 1):
                G.add_edge(
                    coords[i],
                    coords[i + 1],
                    weight=Point(coords[i]).distance(Point(coords[i + 1]))
                )
    return G

network_graph = build_network(network_gdf)

# Helper function to find the nearest node in the network to a given point
def find_nearest_node(graph, point):
    nodes = list(graph.nodes)
    distances = [Point(node).distance(point) for node in nodes]
    return nodes[distances.index(min(distances))]

# Calculate shortest distances for an array of precomputed nearest neighbor pairs
def calculate_shortest_distances_with_pairs(point_pairs, source_gdf, target_gdf, network_graph):
    results = []
    
    for pair in point_pairs:
        source_id, target_id = pair
        # Get the source and target geometries
        source_row = source_gdf[source_gdf["epointID"] == source_id].iloc[0]
        target_row = target_gdf[target_gdf["epointID"] == target_id].iloc[0]
        
        source_geom = source_row.geometry
        target_geom = target_row.geometry
        
        # Find nearest network nodes for source and target
        source_node = find_nearest_node(network_graph, source_geom)
        target_node = find_nearest_node(network_graph, target_geom)
        
        # Compute the shortest path distance along the network
        try:
            path_length = nx.shortest_path_length(
                network_graph, source_node, target_node, weight="weight"
            )
        except nx.NetworkXNoPath:
            path_length = float("inf")  # No path found
        
        # Record the result
        results.append({
            "source_point_id": source_id,
            "target_point_id": target_id,
            "network_distance": path_length
        })
    
    return results

point_pairs = joined[['epointID_left', 'epointID_right']].to_numpy()
#point_pairs = point_pairs[:5] # Test with subset first
# Calculate distances for the point pairs
shortest_distances = calculate_shortest_distances_with_pairs(point_pairs, source_gdf, target_gdf, network_graph)

# Convert results to a DataFrame
results_df = pd.DataFrame(shortest_distances)

In [20]:
results_df

Unnamed: 0,source_point_id,target_point_id,network_distance
0,1122,1122,0.073961
1,4002,4002,0.001037
2,4002,4002,0.001037
3,3006,3001,0.102876
4,3717,3717,0.0096


In [None]:
results_df.to_csv('road_distance_dres_1.csv')