# Geometry Validation Notebook

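This notebook lets you interactively run geometry validation checks on the processed road-network dataset for a single district. The district is selected by the `district` variable in the **Load Data** cell (currently `Burnaby`).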

In [1]:
import pandas as pd
from shapely import wkt
from shapely.geometry import Point
import os
import ast

# Show every column when a DataFrame is rendered (no column truncation)
pd.options.display.max_columns = None

## 1. Load Data

In [4]:
# Select the district and the directory holding its processed outputs
district = "Burnaby"
base_dir = f"../data/output/{district}"

# Every input file is named "<district>_<suffix>" inside base_dir
edges_path, nodes_path, id_map_path = (
    os.path.join(base_dir, f"{district}_{suffix}")
    for suffix in (
        "driving_simplified_edges_with_h3.csv",
        "driving_simplified_nodes.csv",
        "driving_edge_id.csv",
    )
)

# Read the three tables, announcing each file before it is read
print(f"Loading edges from {edges_path}...")
edges_df = pd.read_csv(edges_path)

print(f"Loading nodes from {nodes_path}...")
nodes_df = pd.read_csv(nodes_path)

print(f"Loading ID mapping from {id_map_path}...")
id_map_df = pd.read_csv(id_map_path)

print("Done.")

Loading edges from ../data/output/Burnaby/Burnaby_driving_simplified_edges_with_h3.csv...
Loading nodes from ../data/output/Burnaby/Burnaby_driving_simplified_nodes.csv...
Loading ID mapping from ../data/output/Burnaby/Burnaby_driving_edge_id.csv...
Done.


## 2. Parse Geometries and Map IDs

In [5]:
# Parse WKT geometries.
# This cell is written to be safely re-runnable: geometries that a previous run
# already converted to shapely objects are left untouched, and the node index is
# only set while 'id' is still a regular column.
def _ensure_geometry(value):
    """Return `value` unchanged if it is already a shapely geometry, else parse it as WKT.

    Anything that is not a shapely geometry is handed to `wkt.loads`, so invalid
    or missing WKT still raises exactly as it would on the first run.
    """
    if hasattr(value, 'geom_type'):  # shapely geometries expose `geom_type`
        return value
    return wkt.loads(value)


print("Parsing geometries...")
edges_df['geometry'] = edges_df['geometry'].apply(_ensure_geometry)
nodes_df['geometry'] = nodes_df['geometry'].apply(_ensure_geometry)

# Index nodes by ID (on a re-run 'id' is already the index, so there is nothing to do)
if 'id' in nodes_df.columns:
    nodes_df = nodes_df.set_index('id')

# Parse edge ID mapping to get source/target.
# id_map_df['id'] holds the textual form of a (source, target) pair and
# id_map_df['index'] holds the edge index it belongs to.
print("Mapping edge IDs to source/target...")
edge_connectivity = {}
for edge_idx, node_pair in zip(id_map_df['index'], id_map_df['id']):
    u, v = ast.literal_eval(node_pair)  # unpacking enforces exactly two node IDs
    edge_connectivity[edge_idx] = (u, v)

# Map to edges_df (fall back to the 'id' column when 'edge_index' is absent)
if 'edge_index' not in edges_df.columns and 'id' in edges_df.columns:
    edges_df['edge_index'] = edges_df['id']

edges_df['source'] = edges_df['edge_index'].apply(lambda x: edge_connectivity.get(x, (None, None))[0])
edges_df['target'] = edges_df['edge_index'].apply(lambda x: edge_connectivity.get(x, (None, None))[1])

# Edges absent from the ID mapping end up with missing endpoints; report them
# instead of letting later checks skip them silently.
unmapped_count = int(edges_df['source'].isna().sum())
if unmapped_count:
    print(f"WARNING: {unmapped_count} edges have no entry in the ID mapping; their source/target are missing.")

print(f"Loaded {len(edges_df)} edges and {len(nodes_df)} nodes.")
edges_df.head()

Parsing geometries...
Mapping edge IDs to source/target...
Loaded 35217 edges and 14408 nodes.


Unnamed: 0,edge_index,length,maxspeed,geometry,highway,cost,to_cell,from_cell,lca_res,source,target
0,1,98.503,30.0,LINESTRING (-122.92154693603516 49.27764129638...,tertiary_link,11.82036,644733695069033029,644733695069283890,8,250385795,4901855621
1,2,53.933,30.0,LINESTRING (-122.92154693603516 49.27764129638...,tertiary,6.47196,644733695069345412,644733695069283890,9,250385795,3793579800
2,3,684.05,50.0,LINESTRING (-122.92154693603516 49.27764129638...,tertiary,49.2516,644733695063774494,644733695069283890,7,250385795,431382055
3,4,35.488,30.0,LINESTRING (-122.92096710205078 49.27793121337...,tertiary,4.25856,644733695069337920,644733695069345412,10,3793579800,262835717
4,5,53.933,30.0,LINESTRING (-122.92096710205078 49.27793121337...,tertiary,6.47196,644733695069283890,644733695069345412,9,3793579800,250385795


In [14]:
# Spot check: list any edge whose edge_index is 0
edges_df.loc[edges_df["edge_index"].eq(0)]

Unnamed: 0,edge_index,length,maxspeed,geometry,highway,cost,to_cell,from_cell,lca_res,source,target


## 3. Check for Self-Loops
Ensures that no edge starts and ends at the same node (`source == target`).

In [6]:
# An edge is a self-loop when its source node equals its target node
is_self_loop = edges_df['source'].eq(edges_df['target'])
self_loops = edges_df.loc[is_self_loop]

if self_loops.empty:
    print("PASS: No self-loop edges found.")
else:
    print(f"FAIL: Found {len(self_loops)} self-loop edges.")
    display(self_loops)

PASS: No self-loop edges found.


## 4. Check for Parallel Edges (Repeated Source/Target Pairs)
Parallel edges (in the graph-theory sense: multiple edges between the same ordered pair of nodes) are allowed in a MultiDiGraph, so this check is informational only: it reports whether the dataset contains any.

In [7]:
# Mark every row whose (source, target) pair occurs more than once;
# keep=False flags all members of a repeated pair, not just the later ones.
shares_endpoints = edges_df.duplicated(subset=['source', 'target'], keep=False)
duplicates = edges_df.loc[shares_endpoints]

if duplicates.empty:
    print("INFO: No parallel edges found (simple graph structure).")
else:
    print(f"INFO: Found {len(duplicates)} edges sharing the same source/target (parallel edges).")
    # Uncomment to inspect them grouped by endpoint pair:
    # display(duplicates.sort_values(by=['source', 'target']))

INFO: No parallel edges found (simple graph structure).


## 5. Check Node Geometry Uniqueness
Ensures no two nodes share the exact same coordinates.

In [8]:
# Reduce each node geometry to a plain (x, y) tuple so positions can be compared
node_geoms = nodes_df['geometry'].map(lambda point: (point.x, point.y))

# duplicated() flags the second and later occurrences of a coordinate pair
repeated_position = node_geoms.duplicated()
duplicate_nodes = node_geoms[repeated_position]

if duplicate_nodes.empty:
    print("PASS: All nodes have distinct coordinates.")
else:
    print(f"FAIL: Found {len(duplicate_nodes)} duplicate node geometries.")
    display(nodes_df.loc[duplicate_nodes.index])

PASS: All nodes have distinct coordinates.


## 6. Check Geometry Consistency
Verifies that for every edge, the start point of its geometry matches the source node's location, and the end point matches the target node's location.

In [9]:
# Compare each edge's first/last coordinate with the coordinates of its
# source/target node. Coordinates are compared for exact equality.

# Look up each node's (x, y) once, so the loop below does a dict lookup per
# endpoint instead of a DataFrame .loc access per edge.
node_xy = {node_id: (pt.x, pt.y) for node_id, pt in nodes_df['geometry'].items()}

mismatches = []
skipped_edges = 0  # edges whose source or target node is not present in nodes_df

for edge_index, u, v, edge_geom in zip(
    edges_df['edge_index'], edges_df['source'], edges_df['target'], edges_df['geometry']
):
    # Skip checks if node lookup fails (e.g. pruned nodes), but keep count so
    # the summary below does not overstate what was verified.
    if u not in node_xy or v not in node_xy:
        skipped_edges += 1
        continue

    source_xy = node_xy[u]
    target_xy = node_xy[v]
    coords = edge_geom.coords
    edge_start = coords[0]
    edge_end = coords[-1]

    start_match = (edge_start == source_xy)
    end_match = (edge_end == target_xy)

    if not (start_match and end_match):
        mismatches.append({
            'edge_index': edge_index,
            'source': u,
            'target': v,
            'start_match': start_match,
            'end_match': end_match,
            'edge_start': edge_start,
            'node_source': source_xy,
            'edge_end': edge_end,
            'node_target': target_xy
        })

if skipped_edges:
    print(f"INFO: Skipped {skipped_edges} of {len(edges_df)} edges whose source or target node is not in nodes_df.")

if mismatches:
    print(f"FAIL: Found {len(mismatches)} geometry mismatches.")
    mismatch_df = pd.DataFrame(mismatches)
    display(mismatch_df.head())
elif skipped_edges and skipped_edges == len(edges_df):
    # Nothing was actually compared, so a PASS would be vacuous.
    print("WARNING: No edges could be checked; every edge references a node missing from nodes_df.")
else:
    print("PASS: All checked edges match their node endpoints.")

PASS: All checked edges match their node endpoints.


## 7. Validate Edge-based Graph Connectivity
The routing engine uses an edge-based graph where nodes are edges from the original graph.
We verify that for every transition `from_edge -> to_edge` in `edge_graph`, the **target node** of `from_edge` is indeed the **source node** of `to_edge`.

In [11]:
edge_graph_path = os.path.join(base_dir, f"{district}_driving_edge_graph.csv")
print(f"Loading edge graph from {edge_graph_path}...")
edge_graph_df = pd.read_csv(edge_graph_path)

print(f"Loaded {len(edge_graph_df)} transitions.")
# Only the 'from_edge' and 'to_edge' columns of edge_graph_df are read below.

# We reuse the 'edge_connectivity' dict from Step 2, which maps:
#   edge_index -> (source_node, target_node)

connectivity_mismatches = []
unresolved_transitions = []  # transitions that reference an edge index missing from edge_connectivity

# Iterate over the two needed columns directly rather than building a Series per row.
for idx, from_raw, to_raw in zip(
    edge_graph_df.index, edge_graph_df['from_edge'], edge_graph_df['to_edge']
):
    # edge_connectivity uses int keys, so make sure the lookup keys are ints too
    from_idx = int(from_raw)
    to_idx = int(to_raw)

    # Get node tuples (u, v) for both edges
    from_nodes = edge_connectivity.get(from_idx)
    to_nodes = edge_connectivity.get(to_idx)

    if from_nodes is None or to_nodes is None:
        # Should not happen if data is consistent; collected and reported once
        # after the loop instead of printing one line per transition.
        unresolved_transitions.append({
            'row_idx': idx,
            'from_edge': from_idx,
            'to_edge': to_idx,
            'from_edge_found': from_nodes is not None,
            'to_edge_found': to_nodes is not None
        })
        continue

    # from_edge connects (u1 -> v1)
    # to_edge connects (u2 -> v2)
    # Connectivity requires v1 == u2
    target_of_first = from_nodes[1]
    source_of_second = to_nodes[0]

    if target_of_first != source_of_second:
        connectivity_mismatches.append({
            'row_idx': idx,
            'from_edge': from_idx,
            'to_edge': to_idx,
            'from_edge_def': from_nodes,
            'to_edge_def': to_nodes,
            'expected_connection': target_of_first,
            'actual_connection': source_of_second
        })

if unresolved_transitions:
    print(f"WARNING: Could not find edge definition for {len(unresolved_transitions)} transitions (first rows shown).")
    display(pd.DataFrame(unresolved_transitions).head())

checked_transitions = len(edge_graph_df) - len(unresolved_transitions)

if connectivity_mismatches:
    print(f"FAIL: Found {len(connectivity_mismatches)} connectivity errors in edge graph.")
    display(pd.DataFrame(connectivity_mismatches).head())
elif unresolved_transitions:
    # Do not claim that every transition is valid when some could not be checked.
    print(f"PASS: All {checked_transitions} checked transitions in edge_graph represent valid node connections "
          f"({len(unresolved_transitions)} skipped, see WARNING above).")
else:
    print(f"PASS: All {len(edge_graph_df)} transitions in edge_graph represent valid node connections.")

Loading edge graph from ../data/output/Burnaby/Burnaby_driving_edge_graph.csv...
Loaded 99497 transitions.
PASS: All 99497 transitions in edge_graph represent valid node connections.
