In [1]:
import os
import json
import cv2
import numpy as np

In [2]:
# Folder that holds the annotation file. The script reads "Label.txt" from
# directly inside this folder.
ANNOTATION_DIR = "../Birth_Certificate/" 
# Base folder for the original, full-size images. Each image path listed in
# Label.txt is joined onto this folder, so it must be the directory those
# paths are relative to (not necessarily the folder the images sit in).
IMAGE_DIR = "../" 
# Where to save the final cropped snippets. "printed/" and "handwritten/"
# subfolders are created underneath it if they do not exist yet.
OUTPUT_DIR = "../data/classifier_data/"


In [3]:

# --- Main Script ---

def crop_and_save_snippets(annotation_dir, image_dir, output_dir):
    """Crop every labelled text region out of the annotated images.

    Reads ``Label.txt`` from ``annotation_dir``. Each non-blank line is
    expected to hold an image path, a tab, and a JSON list of annotation
    objects. For each annotation whose ``'transcription'`` value is
    ``'printed'`` or ``'handwritten'``, the axis-aligned bounding box of its
    ``'points'`` polygon is cropped from the image and written to
    ``<output_dir>/<label>/snippet_<n>.jpg``.

    Problems with a single line, image or annotation are reported (or
    silently skipped when the annotation simply is not one of the two
    classes) and processing continues with the next item.

    Args:
        annotation_dir: Folder containing ``Label.txt``.
        image_dir: Base folder that the image paths in ``Label.txt`` are
            joined onto.
        output_dir: Destination folder; the ``printed`` and ``handwritten``
            subfolders are created if missing.

    Returns:
        The number of snippet files successfully written (0 if ``Label.txt``
        is missing).

    Note:
        Numbering restarts at 0 on every call, so a re-run overwrites
        snippet files of the same name left by a previous run.
    """
    print("Starting snippet creation...")

    # The only two classes that are exported; each gets its own subfolder.
    wanted_labels = ("printed", "handwritten")
    for wanted in wanted_labels:
        os.makedirs(os.path.join(output_dir, wanted), exist_ok=True)

    snippet_count = 0

    # Define the path to the main annotation file
    label_file_path = os.path.join(annotation_dir, "Label.txt")

    if not os.path.exists(label_file_path):
        print(f"Error: Label.txt not found in '{annotation_dir}'")
        return snippet_count

    print(f"Processing annotation file: {label_file_path}")

    with open(label_file_path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) are not errors.
                continue

            # The line contains the image's relative path and a JSON list of
            # annotations, separated by a tab.
            try:
                img_path, annotations_str = line.split('\t', 1)
                annotations = json.loads(annotations_str)
            except ValueError:
                # Covers both a missing tab and invalid JSON
                # (json.JSONDecodeError subclasses ValueError).
                print(f"  - Could not parse line: {line}")
                continue
            if not isinstance(annotations, list):
                print(f"  - Could not parse line: {line}")
                continue

            # Load the original image
            full_image_path = os.path.join(image_dir, img_path)
            if not os.path.exists(full_image_path):
                print(f"  - Image not found: {full_image_path}")
                continue
            image = cv2.imread(full_image_path)
            if image is None:
                # imread signals an unreadable/corrupt file by returning None
                # rather than raising.
                print(f"  - Could not read image: {full_image_path}")
                continue
            img_h, img_w = image.shape[:2]

            # Process each annotation (each box that was drawn)
            for ann in annotations:
                if not isinstance(ann, dict):
                    continue

                # The class name is stored under the 'transcription' key,
                # not under a 'label' key.
                label = ann.get('transcription')
                if label not in wanted_labels:
                    continue

                points = ann.get('points')
                if not points:
                    continue

                # Get the bounding box coordinates from the polygon points
                try:
                    x, y, w, h = cv2.boundingRect(
                        np.array(points, dtype=np.int32)
                    )
                except (ValueError, TypeError, cv2.error):
                    print(f"  - Skipping malformed points in: {img_path}")
                    continue

                # Clamp the box to the image. A negative start index would
                # otherwise wrap around in NumPy slicing and yield a wrong
                # or empty crop.
                x0, y0 = max(x, 0), max(y, 0)
                x1, y1 = min(x + w, img_w), min(y + h, img_h)
                if x1 <= x0 or y1 <= y0:
                    continue

                # Crop the region from the original image
                snippet = image[y0:y1, x0:x1]
                if snippet.size == 0:
                    continue

                save_path = os.path.join(
                    output_dir, label, f"snippet_{snippet_count}.jpg"
                )

                # Only count snippets that actually reached the disk.
                if not cv2.imwrite(save_path, snippet):
                    print(f"  - Failed to write snippet: {save_path}")
                    continue
                snippet_count += 1

    print(f"\nDone! Created {snippet_count} snippets in '{output_dir}'")
    return snippet_count

# --- Run the script ---
# Runs the extraction with the paths configured above. When this cell is
# executed in a notebook kernel (or the code is run as a script) __name__ is
# "__main__", so the guard passes; it only prevents the run if this code is
# imported as a module.
if __name__ == "__main__":
    crop_and_save_snippets(ANNOTATION_DIR, IMAGE_DIR, OUTPUT_DIR)

Starting snippet creation...
Processing annotation file: ../Birth_Certificate/Label.txt



Done! Created 1677 snippets in '../data/classifier_data/'
