# Script to convert Paliwal synthetic P&ID dataset .npy annotation files
# to a unified JSON format (COCO-like or custom) for model training.
# Place the output JSON files in data/processed/annotations/

In [34]:
import os
import numpy as np
import json

In [35]:
# Absolute, machine-specific locations of the raw dataset and of the JSON output.
# NOTE: "&" has no special meaning inside a Python string literal, so the
# r-prefix is not needed for it; raw strings only matter when a path contains
# backslashes. The prefix is harmless here and is kept as-is.
RAW_DATA_DIR = r"/Users/mokshdutt/developer/P&ID/data/raw/paliwal_dataset"
PROCESSED_ANN_DIR = r"/Users/mokshdutt/developer/P&ID/data/processed/annotations"

In [36]:
# Abort early when the raw dataset is missing; make sure the output folder exists.
if not os.path.exists(RAW_DATA_DIR):
    print(f"ERROR: RAW_DATA_DIR does not exist: {RAW_DATA_DIR}")
    # Raising SystemExit is exactly what sys.exit() does, but it does not rely
    # on `sys` being imported (the import cell only brings in os, numpy, json).
    raise SystemExit(1)
# exist_ok=True removes the check-then-create race of "if not exists: makedirs".
os.makedirs(PROCESSED_ANN_DIR, exist_ok=True)

In [37]:
def _json_default(value):
    """Translate NumPy values that the json module cannot encode into built-ins."""
    if isinstance(value, np.generic):
        # NumPy scalar (e.g. np.int64, np.bool_) -> equivalent Python scalar.
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    raise TypeError(
        f"Object of type {type(value).__name__} is not JSON serializable"
    )


def convert_symbols_to_json(image_id):
    """
    Convert the symbol annotations of one image to a simple JSON file.

    Reads ``<RAW_DATA_DIR>/<image_id>/<image_id>_symbols.npy`` and writes
    ``<PROCESSED_ANN_DIR>/<image_id>.json`` containing the image file name and
    one ``{"bbox", "category_id", "symbol_id"}`` entry per row.

    Args:
        image_id: Identifier of the image; it is used (via ``str``) both as the
            folder name and as the file-name prefix.

    Returns:
        True if the JSON file was written, False if the ``.npy`` file is missing.
    """
    ann_folder = os.path.join(RAW_DATA_DIR, str(image_id))
    symbol_path = os.path.join(ann_folder, f"{image_id}_symbols.npy")
    if not os.path.exists(symbol_path):
        print(f"Missing: {symbol_path}")
        return False

    # NOTE(review): allow_pickle=True unpickles arbitrary objects; only load
    # .npy files that come from a trusted source.
    symbols = np.load(symbol_path, allow_pickle=True)

    # Each row is expected to be: [symbol_id, [x1, y1, x2, y2], class_id]
    annotations = [
        {
            "bbox": list(row[1]),
            "category_id": int(row[2]),
            "symbol_id": row[0],
        }
        for row in symbols
    ]

    out = {
        "image": f"{image_id}.jpg",
        "annotations": annotations,
    }

    # Serialise before opening the file so a failure cannot leave a truncated
    # JSON behind. The `default` hook only kicks in for values json cannot
    # encode natively (NumPy integers/bools/arrays), so output for data that
    # already serialised is unchanged.
    payload = json.dumps(out, indent=2, default=_json_default)

    out_path = os.path.join(PROCESSED_ANN_DIR, f"{image_id}.json")
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(payload)
    print(f"Saved: {out_path}")
    return True


In [38]:
def batch_process_all():
    """
    Convert the symbol annotations of every numerically named entry in RAW_DATA_DIR.

    Entries are processed in ascending numeric order and a summary line with
    the number of successful conversions is printed at the end.
    """
    # str.isdecimal() matches exactly what int() can parse; str.isdigit() also
    # accepts characters such as "²" that would make int() raise ValueError.
    folder_names = [name for name in os.listdir(RAW_DATA_DIR) if name.isdecimal()]

    # Sort numerically but keep the on-disk spelling: converting to int and
    # back would turn a zero-padded folder like "007" into "7", which would
    # then be reported as missing.
    folder_names.sort(key=int)

    processed_count = 0
    for folder_name in folder_names:
        if convert_symbols_to_json(folder_name):
            processed_count += 1

    print(f"Total processed: {processed_count} out of {len(folder_names)}")

In [39]:
# Run the full batch conversion only when this code is executed as the main
# module, not when it is imported.
if __name__ == "__main__":
    batch_process_all()

Saved: /Users/mokshdutt/developer/P&ID/data/processed/annotations/0.json
Saved: /Users/mokshdutt/developer/P&ID/data/processed/annotations/1.json
Saved: /Users/mokshdutt/developer/P&ID/data/processed/annotations/2.json
Saved: /Users/mokshdutt/developer/P&ID/data/processed/annotations/3.json
Saved: /Users/mokshdutt/developer/P&ID/data/processed/annotations/4.json
Saved: /Users/mokshdutt/developer/P&ID/data/processed/annotations/5.json
Saved: /Users/mokshdutt/developer/P&ID/data/processed/annotations/6.json
Saved: /Users/mokshdutt/developer/P&ID/data/processed/annotations/7.json
Saved: /Users/mokshdutt/developer/P&ID/data/processed/annotations/8.json
Saved: /Users/mokshdutt/developer/P&ID/data/processed/annotations/9.json
Saved: /Users/mokshdutt/developer/P&ID/data/processed/annotations/10.json
Saved: /Users/mokshdutt/developer/P&ID/data/processed/annotations/11.json
Saved: /Users/mokshdutt/developer/P&ID/data/processed/annotations/12.json
Saved: /Users/mokshdutt/developer/P&ID/data/proc

# The code below checks the conversion of a single sample image to JSON, rather than batch-processing the whole dataset.

def inspect_sample(image_id=99):
    """
    Print every ``.npy`` annotation array stored for one image.

    Looks in ``<RAW_DATA_DIR>/<image_id>/``, lists the ``.npy`` files found
    there and prints the contents of each one.

    Raises:
        FileNotFoundError: if the folder for ``image_id`` does not exist.
    """
    sample_dir = os.path.join(RAW_DATA_DIR, str(image_id))
    if not os.path.exists(sample_dir):
        raise FileNotFoundError(f"Folder not found: {sample_dir}")

    # Keep only the NumPy files, in the order the directory listing yields them.
    npy_names = []
    for entry in os.listdir(sample_dir):
        if entry.endswith('.npy'):
            npy_names.append(entry)
    print(f"Annotation files for image {image_id}: {npy_names}")

    for npy_name in npy_names:
        npy_path = os.path.join(sample_dir, npy_name)
        contents = np.load(npy_path, allow_pickle=True)
        print(f"\n{npy_name}:")
        print(contents)

def _json_default(value):
    """Translate NumPy values that the json module cannot encode into built-ins."""
    if isinstance(value, np.generic):
        # NumPy scalar (e.g. np.int64, np.bool_) -> equivalent Python scalar.
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    raise TypeError(
        f"Object of type {type(value).__name__} is not JSON serializable"
    )


def convert_symbols_to_json(image_id=99):
    """
    Convert the symbol annotations of one image to a simple JSON file.

    Reads ``<RAW_DATA_DIR>/<image_id>/<image_id>_symbols.npy`` and writes
    ``<PROCESSED_ANN_DIR>/<image_id>.json`` containing the image file name and
    one ``{"bbox", "category_id", "symbol_id"}`` entry per row.

    Args:
        image_id: Identifier of the image (defaults to the sample image 99); it
            is used (via ``str``) as the folder name and file-name prefix.

    Returns:
        True if the JSON file was written, False if the ``.npy`` file is
        missing. This matches the batch-oriented definition earlier in the
        file, so the success counter in ``batch_process_all`` keeps working
        even though this later definition replaces the earlier one.
    """
    ann_folder = os.path.join(RAW_DATA_DIR, str(image_id))
    symbol_path = os.path.join(ann_folder, f"{image_id}_symbols.npy")
    if not os.path.exists(symbol_path):
        print(f"Missing: {symbol_path}")
        return False

    # NOTE(review): allow_pickle=True unpickles arbitrary objects; only load
    # .npy files that come from a trusted source.
    symbols = np.load(symbol_path, allow_pickle=True)

    # Each row is expected to be: [symbol_id, [x1, y1, x2, y2], class_id]
    annotations = [
        {
            "bbox": list(row[1]),
            "category_id": int(row[2]),
            "symbol_id": row[0],
        }
        for row in symbols
    ]

    out = {
        "image": f"{image_id}.jpg",
        "annotations": annotations,
    }

    # Serialise before opening the file so a failure cannot leave a truncated
    # JSON behind. The `default` hook only kicks in for values json cannot
    # encode natively (NumPy integers/bools/arrays), so output for data that
    # already serialised is unchanged.
    payload = json.dumps(out, indent=2, default=_json_default)

    out_path = os.path.join(PROCESSED_ANN_DIR, f"{image_id}.json")
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(payload)
    print(f"Saved: {out_path}")
    return True

if __name__ == "__main__":
    # Create the processed directory if missing. exist_ok=True avoids the
    # check-then-create race, and one guard replaces the two back-to-back
    # guards that each repeated this setup.
    os.makedirs(PROCESSED_ANN_DIR, exist_ok=True)

    # Single-sample check with image ID=99 (adjust if needed): first dump the
    # raw annotation arrays, then convert the symbols to JSON.
    inspect_sample(image_id=99)
    convert_symbols_to_json(image_id=99)