In [1]:
# Import required libraries
import geopandas as gpd
import pandas as pd
from shapely.ops import nearest_points
import networkx as nx
from shapely.geometry import Point, LineString
import requests
import zipfile
import os
import io
from io import StringIO
from io import BytesIO
from matplotlib import pyplot as plt
from shapely.ops import snap, nearest_points
import glob

In [19]:
def nas_api_call(nas_id, state, timeout=None):
    """Query the USGS NAS occurrence-search endpoint for one species in one state.

    Parameters:
        nas_id: Value sent as the ``species_ID`` query parameter.
        state: Value sent as the ``state`` query parameter.
        timeout: Forwarded to ``requests.get``. The default ``None`` waits
            indefinitely for the server.

    Returns:
        pandas.DataFrame: The ``results`` array of the JSON response, flattened
        with ``pd.json_normalize`` (one row per entry).

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # No trailing slash here, so joining with "/occurrence/search" does not
    # produce a double slash in the request URL.
    url_base = 'http://nas.er.usgs.gov/api/v2'
    url_request = f"{url_base}/occurrence/search"

    # Use the `state` argument rather than a notebook-level global, and let
    # requests encode the query string.
    response = requests.get(
        url_request,
        params={"species_ID": nas_id, "state": state},
        timeout=timeout,
    )
    # Fail with a clear HTTP error instead of a confusing JSON/KeyError later.
    response.raise_for_status()

    results = pd.json_normalize(response.json(), 'results')
    return results

# Collect the vertices of a line geometry as one flat list of coordinate tuples
def extract_vertices(geometry):
    """Return every vertex of a LineString or MultiLineString.

    For a MultiLineString the vertices of all parts are concatenated in part
    order. Any other geometry type yields an empty list.
    """
    kind = geometry.geom_type

    if kind == "LineString":
        return list(geometry.coords)

    if kind == "MultiLineString":
        vertices = []
        for part in geometry.geoms:
            vertices.extend(part.coords)
        return vertices

    return []

def snap_points_to_nearest_line(line: gpd.GeoDataFrame, point: gpd.GeoDataFrame, snapdist: float, crs=4269) -> gpd.GeoDataFrame:
    """
    Snaps points to the nearest line within a given distance.

    Parameters:
        line (GeoDataFrame): GeoDataFrame containing line geometries.
        point (GeoDataFrame): GeoDataFrame containing point geometries. A layer
            without a CRS is assumed to already be in `crs`; a layer with a
            different CRS is reprojected to it.
        snapdist (float): Maximum search distance to find the nearest line,
            expressed in the units of `crs`.
        crs: CRS both layers are brought into before matching. Defaults to
            EPSG:4269, a geographic CRS, in which case `snapdist` is measured
            in degrees. Pass a projected CRS to work in linear units.

    Returns:
        GeoDataFrame: One row per input point (in `crs`) carrying the columns
        of the matched line plus `pointid`, `lineid`, `linegeom`, `distance`
        and `nearestpoint`. A point with a line within `snapdist` has its
        geometry moved onto that line; other points keep their geometry.
    """
    # Bring both layers into the same CRS. to_crs/set_crs return new frames, so
    # the caller's GeoDataFrames are not modified by the column additions below.
    # set_crs alone would raise for points that already carry a different CRS,
    # so those are reprojected instead.
    line = line.to_crs(crs)
    point = point.set_crs(crs) if point.crs is None else point.to_crs(crs)

    # Create unique IDs
    line["lineid"] = range(line.shape[0])
    point["pointid"] = range(point.shape[0])

    # Keep the line geometry as an ordinary column so it is still available
    # after the join (the join result keeps only the point geometry as active).
    line["linegeom"] = line.geometry

    # Spatial join: nearest line(s) for each point within the snap distance
    sj = gpd.sjoin_nearest(left_df=point, right_df=line, how="left", max_distance=snapdist)

    # Distance to the matched line (missing when no line is within snapdist)
    sj["distance"] = sj.apply(
        lambda x: x.geometry.distance(x.linegeom) if x.linegeom is not None else None,
        axis=1
    )

    # Ties produce several rows per point: keep the closest match for each point
    sj = sj.sort_values(by=["pointid", "distance"], ascending=True, na_position="last")
    sj = sj.drop_duplicates(subset="pointid", keep="first")

    # Location on the matched line that is closest to the point
    sj["nearestpoint"] = sj.apply(
        lambda x: nearest_points(x.geometry, x.linegeom)[1] if x.linegeom is not None else None,
        axis=1
    )

    # Move the point onto the line when a match exists, otherwise leave it alone
    sj["geometry"] = sj.apply(
        lambda x: snap(x.geometry, x.nearestpoint, snapdist) if x.nearestpoint is not None else x.geometry,
        axis=1
    )

    return sj

In [20]:
# --- User-defined settings: edit these for your state and species ---
my_training_state = "MN"    # USPS two-letter state abbreviation
state_name = "Minnesota"    # full state name
my_nas_id = 5               # species ID sent to the NAS API

# State FIPS codes for reference:
# IA = 19; ID = 16; IL = 17; MN = 27; MO = 29; MT = 30; OR = 41; WA = 53; WI = 55
state_fips = "27"           # set this to the FIPS code of your state

# Data directory for the selected state (derived; leave this alone)
my_path = f"data/{my_training_state}/"

In [21]:
# Download the occurrence records for the configured species and state
pos_data = nas_api_call(my_nas_id, my_training_state)

# Only the two coordinate columns are needed to build the point layer
my_data = pos_data[["decimalLatitude", "decimalLongitude"]]

# Build points (x = longitude, y = latitude), drop rows with missing values,
# then label the layer as EPSG:4269.
# Optional reprojection left disabled: .to_crs(5070)
pos_data_gdf = (
    gpd.GeoDataFrame(
        my_data,
        geometry=gpd.points_from_xy(
            x=my_data["decimalLongitude"],
            y=my_data["decimalLatitude"],
        ),
    )
    .dropna()
    .set_crs(4269)
)

In [5]:
# Import the flowline (stream) shapefiles
# Collect every shapefile whose name contains "NHDFlowline_" plus a plain
# "NHDFlowline.shp", all located in the "shape" folder of the state directory.
# Sorting makes the order of the merged rows reproducible across platforms.
shapefiles = sorted(
    glob.glob(os.path.join(my_path, "shape", "*NHDFlowline_*.shp"))
    + glob.glob(os.path.join(my_path, "shape", "NHDFlowline.shp"))
)

# Ensure shapefiles were found
if not shapefiles:
    print("No shapefiles found matching the pattern.")

# Load all shapefiles into a list of GeoDataFrames
gdfs = [gpd.read_file(shp) for shp in shapefiles]

# Merge everything into a single GeoDataFrame. pandas is already imported as
# `pd`; reaching it through `gpd.pd` relied on an undocumented attribute.
if gdfs:  # Only concatenate if the list is not empty
    stream_gdf = pd.concat(gdfs, ignore_index=True)
    print("Successfully merged shapefiles into a single GeoDataFrame.")
else:
    # Later cells need a real GeoDataFrame and will fail on None.
    stream_gdf = None
    print("No valid shapefiles to merge.")

# Print summary
print(f"Imported {len(gdfs)} shapefiles")

Successfully merged shapefiles into a single GeoDataFrame.
Imported 1 shapefiles


In [22]:
# Attach the vertex list of every flowline as a new column on the stream layer
stream_gdf["vertices"] = stream_gdf["geometry"].apply(extract_vertices)

# One row per vertex: explode the list column, repeating the line attributes
vertices_gdf = stream_gdf.explode("vertices", ignore_index=True)

# Replace the line geometry with a Point built from each coordinate tuple
vertices_gdf["geometry"] = vertices_gdf["vertices"].apply(Point)

# The raw tuples are no longer needed once the Points exist
vertices_gdf = vertices_gdf.drop(columns=["vertices"])

# Re-wrap as a GeoDataFrame that carries the CRS of the stream layer
vertices_gdf = gpd.GeoDataFrame(vertices_gdf, geometry="geometry", crs=stream_gdf.crs)

# Flag every vertex whose geometry occurs more than once (all copies are kept);
# presumably these are the locations where lines meet — verify.
duplicate_mask = vertices_gdf.duplicated(subset=["geometry"], keep=False)
my_vertices = vertices_gdf[duplicate_mask]

In [None]:
# Summarise the duplicated-vertex layer: column names, dtypes and non-null counts
my_vertices.info()

In [23]:
# Snap occurrences to lines
# NOTE(review): snap_points_to_nearest_line puts both layers in EPSG:4269, a
# geographic CRS, so the search distance of 10000 is in degrees, not metres —
# confirm this is the intended tolerance.
snapped_occ = snap_points_to_nearest_line(stream_gdf, pos_data_gdf,  10000)




In [None]:
# Draw streams, duplicated vertices and snapped occurrences on one map
fig, ax = plt.subplots(figsize=(10, 8))

# Streams: thin blue lines
stream_gdf.plot(
    ax=ax,
    color="blue",
    linewidth=1,
    label="Streams",
)

# Duplicated vertices: small red markers
my_vertices.plot(
    ax=ax,
    color="red",
    markersize=10,
    label="Vertices",
)

# Snapped occurrences: larger green stars so they stand out
snapped_occ.plot(
    ax=ax,
    color="green",
    markersize=20,
    marker="*",
    label="Snapped Occ",
)

# Legend built from the labels given above
ax.legend()

# Render the figure
plt.show()

In [24]:
# For every duplicated vertex, attach the attributes of its nearest snapped
# occurrence. The "index_right" column left over from the join that produced
# snapped_occ is dropped first (ignored if it is absent).
joined = gpd.sjoin_nearest(
    left_df=my_vertices,
    right_df=snapped_occ.drop(columns=["index_right"], errors="ignore"),
    how="left",
)




In [25]:
# Inspect the columns produced by the vertex-to-occurrence join
joined.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 683491 entries, 0 to 14019335
Data columns (total 42 columns):
 #   Column            Non-Null Count   Dtype   
---  ------            --------------   -----   
 0   permanent__left   683491 non-null  object  
 1   fdate_left        683491 non-null  object  
 2   resolution_left   683491 non-null  int64   
 3   gnis_id_left      155659 non-null  object  
 4   gnis_name_left    155659 non-null  object  
 5   lengthkm_left     683491 non-null  float64 
 6   reachcode_left    683491 non-null  object  
 7   flowdir_left      683491 non-null  int64   
 8   wbarea_per_left   219153 non-null  object  
 9   ftype_left        683491 non-null  int64   
 10  fcode_left        683491 non-null  int64   
 11  mainpath_left     683491 non-null  int64   
 12  innetwork_left    683491 non-null  int64   
 13  visibility_left   683491 non-null  int64   
 14  SHAPE_Leng_left   0 non-null       object  
 15  ObjectID_left     683491 non-null  int64   
 1

In [None]:
# Reproject the three in-memory layers to EPSG:3857 so that all later distance
# calculations share one planar coordinate system.
# NOTE(review): EPSG:3857 (Web Mercator) stretches distances away from the
# equator; a local projected CRS may give truer network lengths — confirm.
source_gdf = my_vertices.to_crs(3857)
target_gdf = snapped_occ.to_crs(3857)
network_gdf = stream_gdf.to_crs(3857)

# All three layers were reprojected to the same CRS just above, so the former
# "reproject on mismatch" branch could never run; a cheap sanity check remains.
assert source_gdf.crs == target_gdf.crs == network_gdf.crs

# Build a graph from the polyline network
def build_network(gdf):
    """Turn line geometries into an undirected, length-weighted graph.

    Every consecutive pair of vertices of each LineString, and of each part of
    a MultiLineString, becomes an edge whose ``weight`` is the planar distance
    between the two vertices. Nodes are the raw coordinate tuples, so two lines
    are connected wherever they share an identical vertex.

    Parameters:
        gdf (GeoDataFrame): Layer whose geometry column holds the network lines.

    Returns:
        networkx.Graph: The assembled graph. Missing geometries and geometries
        that are not lines are skipped.
    """
    G = nx.Graph()

    # Only the geometry is needed, so iterate it directly instead of iterrows()
    for geom in gdf.geometry:
        if geom is None:
            continue

        if isinstance(geom, LineString):
            parts = [geom]
        elif geom.geom_type == "MultiLineString":
            # Previously dropped silently; handled like extract_vertices does
            parts = list(geom.geoms)
        else:
            continue

        for part in parts:
            coords = list(part.coords)
            # Pair each vertex with its successor along the line
            for start, end in zip(coords, coords[1:]):
                G.add_edge(
                    start,
                    end,
                    weight=Point(start).distance(Point(end))
                )
    return G

# Build the graph once from the reprojected stream layer; the snapping helper
# defined below adds extra nodes and edges to this same graph object.
network_graph = build_network(network_gdf)

# Snap a point to the nearest point on the network
def snap_point_to_network(point, network_gdf, graph):
    """Project `point` onto its nearest network line and wire it into `graph`.

    The nearest line is found through the layer's spatial index. The projected
    location is then connected to both end vertices of the line segment it
    falls on, with edge weights equal to the planar distance to each vertex.

    Parameters:
        point: Shapely point in the same CRS as `network_gdf`.
        network_gdf (GeoDataFrame): Line layer the graph was built from.
        graph (networkx.Graph): Graph to extend; it is modified in place.

    Returns:
        tuple: ``(x, y)`` of the projected location, which is guaranteed to be
        a node of `graph` when the function returns.
    """
    # Positions of the nearest line(s). Ties are all returned; the lowest
    # position is taken, matching the first-minimum rule of a
    # `distance(...).idxmin()` scan without measuring every line.
    nearest_positions = network_gdf.sindex.nearest(point, return_all=True)[1]
    nearest_geom = network_gdf.geometry.iloc[nearest_positions.min()]

    snapped_point = nearest_points(point, nearest_geom)[1]
    snapped_coords = (snapped_point.x, snapped_point.y)

    # A MultiLineString has no `.coords`; walk its parts instead
    if nearest_geom.geom_type == "MultiLineString":
        parts = list(nearest_geom.geoms)
    else:
        parts = [nearest_geom]

    for part in parts:
        coords = list(part.coords)
        for start, end in zip(coords, coords[1:]):
            segment = LineString([start, end])
            if segment.distance(snapped_point) < 1e-6:
                # The graph is undirected, so one edge per end vertex suffices.
                # Skip a vertex identical to the snapped node to avoid a
                # zero-length self-loop.
                for vertex in (start, end):
                    if vertex != snapped_coords:
                        graph.add_edge(
                            vertex,
                            snapped_coords,
                            weight=Point(vertex).distance(snapped_point)
                        )
                graph.add_node(snapped_coords)
                return snapped_coords

    # No segment matched within tolerance: still register the node so that
    # path queries on it do not fail with "node not found".
    graph.add_node(snapped_coords)
    return snapped_coords

# Identify connected and disconnected point pairs
def _first_geometry_by_id(gdf, id_column="ObjectID"):
    """Map each id to the geometry of the first row that carries it."""
    lookup = {}
    for key, geom in zip(gdf[id_column], gdf["geometry"]):
        lookup.setdefault(key, geom)
    return lookup


def identify_connected_disconnected_pairs(point_pairs, source_gdf, target_gdf, network_gdf, network_graph):
    """Split id pairs by whether a route exists between them on the network.

    Each id is resolved to the geometry of the first row with that
    ``ObjectID``, snapped onto the network (which adds nodes and edges to
    `network_graph`), and tested for connectivity.

    NOTE(review): the ``joined.info()`` output lists ``ObjectID`` among the
    flowline attributes, so several points may share one id and only the
    first is ever used — confirm whether a per-point id should be used.

    Parameters:
        point_pairs: Iterable of ``(source_id, target_id)`` pairs.
        source_gdf (GeoDataFrame): Source points with an ``ObjectID`` column.
        target_gdf (GeoDataFrame): Target points with an ``ObjectID`` column.
        network_gdf (GeoDataFrame): Line layer used for snapping.
        network_graph (networkx.Graph): Graph of the network; modified in place.

    Returns:
        tuple: ``(connected_pairs, disconnected_pairs)`` where connected entries
        are ``(pair, source_node, target_node)`` and disconnected entries are
        the bare ``pair``.

    Raises:
        IndexError: If an id in `point_pairs` has no matching row.
    """
    # Build the id -> geometry lookups once instead of filtering the whole
    # layer with a boolean mask for every pair.
    geometry_by_id = {
        "source": _first_geometry_by_id(source_gdf),
        "target": _first_geometry_by_id(target_gdf),
    }
    # An id always resolves to the same geometry, so its snapped node can be
    # reused; snapping it again would only re-add identical edges.
    node_by_id = {"source": {}, "target": {}}

    def _node_for(role, point_id):
        cache = node_by_id[role]
        if point_id not in cache:
            geometries = geometry_by_id[role]
            if point_id not in geometries:
                raise IndexError(f"No {role} row with ObjectID == {point_id!r}")
            cache[point_id] = snap_point_to_network(geometries[point_id], network_gdf, network_graph)
        return cache[point_id]

    connected_pairs = []
    disconnected_pairs = []

    for pair in point_pairs:
        source_id, target_id = pair

        source_node = _node_for("source", source_id)
        target_node = _node_for("target", target_id)

        if nx.has_path(network_graph, source_node, target_node):
            connected_pairs.append((pair, source_node, target_node))
        else:
            disconnected_pairs.append(pair)

    return connected_pairs, disconnected_pairs

# Collect the (source id, target id) pairs produced by the nearest join
point_pairs = joined.loc[:, ['ObjectID_left', 'ObjectID_right']].to_numpy()
# point_pairs = point_pairs[:5]  # uncomment to try the workflow on a few pairs

# Split the pairs by whether a route exists between them on the network
connected_pairs, disconnected_pairs = identify_connected_disconnected_pairs(
    point_pairs=point_pairs,
    source_gdf=source_gdf,
    target_gdf=target_gdf,
    network_gdf=network_gdf,
    network_graph=network_graph,
)


In [27]:
# Weighted network distance for every connected pair
def calculate_shortest_distances(connected_pairs, network_graph):
    """Return one distance record per connected pair.

    Parameters:
        connected_pairs: Iterable of ``(pair, source_node, target_node)`` where
            ``pair`` unpacks to ``(source_id, target_id)``.
        network_graph (networkx.Graph): Graph whose edges carry a ``weight``.

    Returns:
        list[dict]: Records with ``source_point_id``, ``target_point_id`` and
        ``network_distance`` (the weighted shortest-path length).
    """
    def _distance_record(pair, source_node, target_node):
        source_id, target_id = pair
        return {
            "source_point_id": source_id,
            "target_point_id": target_id,
            "network_distance": nx.shortest_path_length(
                network_graph, source=source_node, target=target_node, weight="weight"
            ),
        }

    return [
        _distance_record(pair, source_node, target_node)
        for pair, source_node, target_node in connected_pairs
    ]

# Disconnected pairs are reported with an infinite network distance
def add_disconnected_pairs_to_results(disconnected_pairs):
    """Return one record per disconnected pair with ``network_distance`` = inf.

    Parameters:
        disconnected_pairs: Iterable of indexable pairs; element 0 is the
            source id and element 1 is the target id.

    Returns:
        list[dict]: Records with ``source_point_id``, ``target_point_id`` and
        ``network_distance``.
    """
    records = []
    for pair in disconnected_pairs:
        record = {
            "source_point_id": pair[0],
            "target_point_id": pair[1],
            "network_distance": float("inf"),
        }
        records.append(record)
    return records

# 1) Network distances for the pairs that are connected
connected_distances = calculate_shortest_distances(connected_pairs, network_graph)

# 2) Infinite-distance records for the pairs that are not
disconnected_distances = add_disconnected_pairs_to_results(disconnected_pairs)

# 3) One combined list, connected pairs first
all_distances = [*connected_distances, *disconnected_distances]

# Tabulate the records
results_df = pd.DataFrame(data=all_distances)

# Show the table; the CSV export is kept in its own cell
print(results_df)
# results_df.to_csv("carp_distances.csv", index=False)


   source_point_id  target_point_id  network_distance
0                1           152949      82402.308297
1                1           152949      82402.308297
2                2           220601      39474.120888
3                2           220601      39474.120888
4                3             7944       9863.589331


In [None]:
# Write the distance table to carp_distances.csv in the working directory,
# without the DataFrame index column
results_df.to_csv("carp_distances.csv", index=False)

In [None]:
# pt_pairs_gpd = gpd.read_file("point_pairs_to_filter.shp")
# shortest_pt_pairs = pt_pairs_gpd.loc[pt_pairs_gpd.groupby("sourceID")["distance"].idxmin()]
# shortest_pt_pairs.to_file('point_pairs_filtered.shp')

In [None]:
# pt_pairs_w_lakeID = gpd.read_file("pairs_w_LakeID_to_filter.shp")
# lake_pt_pairs = shortest_pt_pairs.loc[shortest_pt_pairs.groupby("lakeID")["distance"].idxmin()]
# distance_removed = lake_pt_pairs.drop(columns = 'distance')
# distance_removed.to_file('pairs_w_LakeID_filtered.shp')