In [None]:
import pandas as pd
from pathlib import Path
import numpy as np
import networkx as nx
import json
import sys
sys.path.append('..')
from functions.functions_LLM import *

## Understand how the symbols are connected via a network graph

In [None]:
# `outputs_dir` is not assigned anywhere in the visible notebook before this line.
# NOTE(review): presumably it is provided by the star import from
# functions.functions_LLM — confirm, otherwise this cell fails on a fresh kernel.
outputs_dir = Path(outputs_dir)

# --- Load detections from Notebook 1 ---
det_csv = outputs_dir / "detections" / "test_pid_detections.csv"

df = pd.read_csv(det_csv)

# Fail early, with a readable message, if a column used by this notebook is absent
# (the code below would otherwise raise a less specific KeyError further down).
required_cols = ("cx", "cy", "label", "name_ocr_clean")
missing_cols = [col for col in required_cols if col not in df.columns]
if missing_cols:
    raise KeyError(f"{det_csv} is missing required column(s): {missing_cols}")

# Sort by descending cy, then descending cx, and renumber rows 0..n-1.
# Row 0 becomes the start node of the chain and row order is the tie-break priority.
# (Bottom-right → top-left, assuming image coordinates where y grows downward.)
df_sorted = df.sort_values(by=['cy', 'cx'], ascending=[False, False]).reset_index(drop=True)

# Assign node IDs of the form "<label>_<row position>".
# Walk the label column directly instead of iterrows(), which builds a Series per
# row and may upcast values to a common dtype.
df_sorted['node_id'] = [f"{label}_{i}" for i, label in enumerate(df_sorted['label'])]

# --- Greedy nearest-neighbor chaining with tie-break by df_sorted order ---
# Starting from row 0 of df_sorted, repeatedly jump to the closest not-yet-visited
# detection centre. The result is `order`, where order[i] is the position of
# df_sorted row i in the chain.
coords = df_sorted[['cx', 'cy']].to_numpy(dtype=float)
n = len(coords)

# With two or more rows, a NaN coordinate makes every distance comparison fail and
# the loop below would die with an opaque "argmin of an empty sequence" ValueError.
# Raise the same exception type up front with an actionable message instead.
if n > 1 and np.isnan(coords).any():
    raise ValueError("Detections contain NaN in 'cx'/'cy'; cannot build the nearest-neighbor chain.")

order = -np.ones(n, dtype=int)      # -1 marks "not placed in the chain yet"
visited = np.zeros(n, dtype=bool)
priority = np.arange(n)  # tie-break priority: earlier in df_sorted wins
tol = 1e-12              # numeric tolerance for (squared) distance ties

# Start at the first row after the sort. Guarded so an empty detections table
# yields an empty chain instead of an IndexError.
if n > 0:
    current = 0
    visited[current] = True
    order[current] = 0

for k in range(1, n):
    remaining_mask = ~visited
    remaining_idx = np.flatnonzero(remaining_mask)

    # Squared Euclidean distances from the current node to every unvisited node
    # (squared is enough for picking the minimum, so no sqrt is taken).
    d2 = np.sum((coords[remaining_mask] - coords[current])**2, axis=1)
    min_d2 = d2.min()

    # All unvisited nodes whose distance equals the minimum within `tol`
    candidate_pos = np.where(np.isclose(d2, min_d2, rtol=0, atol=tol))[0]
    candidate_global_idx = remaining_idx[candidate_pos]

    # Tie-break by original order: lowest df_sorted row index wins
    next_idx = candidate_global_idx[np.argmin(priority[candidate_global_idx])]

    current = next_idx
    visited[current] = True
    order[current] = k

# Attach each row's chain position, then reorder so that row i is the i-th
# detection visited by the greedy chain.
df_ordered = (
    df_sorted
    .assign(order=order)
    .sort_values('order')
    .reset_index(drop=True)
)

# Consecutive entries of the chain become directed edges (i-th name → (i+1)-th name).
# Nodes are keyed by the 'name_ocr_clean' value of each row.
chain_names = df_ordered['name_ocr_clean']
path_edges = list(zip(chain_names.iloc[:-1], chain_names.iloc[1:]))

G = nx.DiGraph()
G.add_nodes_from(chain_names)
G.add_edges_from(path_edges)

# Rename the 'pressure' label. This runs after 'node_id' and the graph were built,
# so it only changes the 'label' column of df_ordered.
label_renames = {'pressure': 'pressure indicator'}
df_ordered['label'] = df_ordered['label'].replace(label_renames)


In [None]:
# Save as JSON
# 1) The chain edges, each written as a two-element [source, target] list.
edges_path = outputs_dir / "path_edges.json"
with edges_path.open("w", encoding="utf-8") as edges_file:
    json.dump(list(map(list, path_edges)), edges_file, indent=4)

# 2) Component names grouped by label: {label: [name_ocr_clean, ...]}.
names_by_label = df_ordered.groupby("label")["name_ocr_clean"]
json_data = names_by_label.apply(list).to_dict()

components_path = outputs_dir / "components.json"
with components_path.open("w", encoding="utf-8") as components_file:
    # ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes
    json.dump(json_data, components_file, ensure_ascii=False, indent=2)


  with open('..\outputs\path_edges.json', 'w') as f:
  with open("..\outputs\components.json", "w", encoding="utf-8") as f:
