In [1]:
import cv2
import os
import json
from tqdm import tqdm

In [2]:
# Directory that is scanned for input .jpg images.
data_dir = '/data'
# JSONL file that receives the automatically generated annotations.
output_file_sq= 'valid/annotations.jsonl'

In [3]:
# find the largest light spot in image, write to a jsonl file
def find_light_region(image_path, threshold=200):
    """Locate the largest bright region of an image.

    The image is read as grayscale and binarised with ``cv2.threshold``;
    among the external contours of the resulting mask, the one with the
    largest area is selected and its axis-aligned bounding rectangle is
    reported.

    Args:
        image_path: Path of the image file to analyse.
        threshold: Grayscale cut-off handed to ``cv2.threshold`` with
            ``THRESH_BINARY``; pixels brighter than this value are treated
            as "light". Defaults to 200, the value previously hard-coded.

    Returns:
        A pair ``(bbox, center)`` where ``bbox`` is
        ``(x_min, y_min, x_max, y_max)`` and ``center`` is
        ``(center_x, center_y)``, both in pixel coordinates. ``x_max`` and
        ``y_max`` are ``x + w`` / ``y + h``, i.e. one past the last covered
        column/row. ``(None, None)`` is returned when the image cannot be
        read or when no contour is found.
    """
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        # cv2.imread signals an unreadable/missing file by returning None.
        return None, None

    _, mask = cv2.threshold(image, threshold, 255, cv2.THRESH_BINARY)
    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if len(contours) == 0:
        return None, None

    largest_contour = max(contours, key=cv2.contourArea)
    x, y, w, h = cv2.boundingRect(largest_contour)
    center = (x + w / 2, y + h / 2)
    return (x, y, x + w, y + h), center

def convert_to_jsonl(data_dir, output_file):
    """Auto-annotate every ``.jpg`` in ``data_dir`` and save the result as JSONL.

    Each image is passed to ``find_light_region``; images for which no
    region is found are skipped. Every remaining image yields one JSON
    object per line with the keys ``image`` (file name), ``prefix``
    (``"<OD>"``) and ``suffix``
    (``"light area<loc_x><loc_y><loc_x2><loc_y2>"`` in pixel coordinates).

    Args:
        data_dir: Directory containing the images.
        output_file: Path of the JSONL file to write. Missing parent
            directories are created.
    """
    records = []
    # os.listdir returns entries in arbitrary order; sort so that the output
    # file is reproducible from run to run.
    for image_file in tqdm(sorted(os.listdir(data_dir))):
        if not image_file.endswith('.jpg'):
            continue
        bbox, center = find_light_region(os.path.join(data_dir, image_file))
        if not (bbox and center):
            # Unreadable image or no bright region: nothing to annotate.
            continue
        x, y, x2, y2 = bbox
        records.append({
            "image": image_file,
            "prefix": "<OD>",
            "suffix": f"light area<loc_{x}><loc_{y}><loc_{x2}><loc_{y2}>",
        })

    # Create the destination folder up front so the write cannot fail on a
    # missing directory after all images have already been processed.
    parent_dir = os.path.dirname(output_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_file, 'w') as f:
        for entry in records:
            json.dump(entry, f)
            f.write('\n')

# Annotate the .jpg images under data_dir and write the records to output_file_sq.
convert_to_jsonl(data_dir, output_file_sq)

100%|██████████| 81/81 [00:05<00:00, 14.98it/s]


In [4]:
#convert the annotation to xml files
import os
import json
import xml.etree.ElementTree as ET
from xml.dom import minidom
from PIL import Image
from tqdm import tqdm

def _add_text_child(parent, tag, text):
    """Append a ``<tag>`` child containing ``text`` to ``parent`` and return it."""
    child = ET.SubElement(parent, tag)
    child.text = text
    return child


def create_voc_xml(annotation, img_path, output_dir):
    """Write a Pascal VOC style XML annotation holding one bounding box.

    Args:
        annotation: Indexable with four entries
            ``(xmin, ymin, xmax, ymax)``; each is written via ``str()``.
        img_path: Path of the annotated image. It is opened only to read
            its width and height.
        output_dir: Existing directory that receives the XML file. The file
            is named after the image with its extension replaced by
            ``.xml``.
    """
    folder = os.path.basename(os.path.dirname(img_path))
    filename = os.path.basename(img_path)
    # Context manager so the image file handle is released immediately.
    with Image.open(img_path) as img:
        width, height = img.size

    annotation_xml = ET.Element("annotation")
    _add_text_child(annotation_xml, "folder", folder)
    _add_text_child(annotation_xml, "filename", filename)
    _add_text_child(annotation_xml, "path", img_path)

    source_xml = ET.SubElement(annotation_xml, "source")
    _add_text_child(source_xml, "database", "Unknown")

    size_xml = ET.SubElement(annotation_xml, "size")
    _add_text_child(size_xml, "width", str(width))
    _add_text_child(size_xml, "height", str(height))
    # NOTE(review): depth is fixed at 3 regardless of the image's real
    # channel count.
    _add_text_child(size_xml, "depth", "3")

    _add_text_child(annotation_xml, "segmented", "0")

    obj_xml = ET.SubElement(annotation_xml, "object")
    _add_text_child(obj_xml, "name", "light_region")
    _add_text_child(obj_xml, "pose", "Unspecified")
    _add_text_child(obj_xml, "truncated", "0")
    _add_text_child(obj_xml, "difficult", "0")

    bndbox_xml = ET.SubElement(obj_xml, "bndbox")
    for tag, value in zip(("xmin", "ymin", "xmax", "ymax"), annotation[:4]):
        _add_text_child(bndbox_xml, tag, str(value))

    # Pretty-print the tree before saving it.
    xml_str = minidom.parseString(ET.tostring(annotation_xml)).toprettyxml(indent="   ")
    # splitext swaps only the final extension; str.replace('.jpg', '.xml')
    # would touch every '.jpg' in the name and leave other extensions as-is.
    stem, _ext = os.path.splitext(filename)
    xml_path = os.path.join(output_dir, stem + '.xml')
    with open(xml_path, "w", encoding="utf-8") as f:
        f.write(xml_str)

# Load every annotation record from the JSONL file (one JSON object per line).
jsonl_file = '/path/to/your/annotations.jsonl'
annotations = []
with open(jsonl_file, 'r') as f:
    annotations.extend(json.loads(line) for line in f)

# Image folder the records refer to, and the folder receiving the XML files.
data_dir = '/path/to/your/cryoEM/images'
output_dir = '/path/to/output/xml_annotations'
os.makedirs(output_dir, exist_ok=True)

# Emit one Pascal VOC format XML file per annotation record.
for annotation in tqdm(annotations):
    image_file = annotation['image']
    # The suffix has the form "label<loc_x><loc_y><loc_x2><loc_y2>": drop the
    # label part and keep the integer found in front of each '>'.
    loc_tokens = annotation['suffix'].split('<loc_')[1:]
    x, y, x2, y2 = [int(token.split('>')[0]) for token in loc_tokens]

    image_path = os.path.join(data_dir, image_file)
    create_voc_xml((x, y, x2, y2), image_path, output_dir)


In [None]:
# Now adjust the boxes manually with LabelImg if needed

In [None]:
#After fixing bad boxes, convert back to jsonl for training
import os
import xml.etree.ElementTree as ET
import json
from tqdm import tqdm

def parse_voc_xml(xml_file):
    """Turn one Pascal VOC XML file into an annotation record.

    Only the first ``<object>`` element of the file is read.

    Args:
        xml_file: Path (or file object) accepted by ``ET.parse``.

    Returns:
        A dict with ``image`` (text of ``<filename>``), ``prefix``
        (``"<OD>"``) and ``suffix``
        (``"2D<loc_xmin><loc_ymin><loc_xmax><loc_ymax>"``).

    NOTE(review): the label written here is "2D", whereas other cells of
    this notebook emit "light area" -- confirm which one is intended.
    """
    root = ET.parse(xml_file).getroot()
    image_name = root.find('filename').text
    box = root.find('object').find('bndbox')
    # Corner values are read in the order xmin, ymin, xmax, ymax.
    x, y, x2, y2 = (
        int(box.find(tag).text) for tag in ('xmin', 'ymin', 'xmax', 'ymax')
    )
    record = {"image": image_name, "prefix": "<OD>"}
    record["suffix"] = f"2D<loc_{x}><loc_{y}><loc_{x2}><loc_{y2}>"
    return record

def convert_xml_to_jsonl(xml_dir, output_file):
    """Collect every ``.xml`` annotation in ``xml_dir`` into one JSONL file.

    Each XML file is converted with ``parse_voc_xml`` and written as one
    JSON object per line.

    Args:
        xml_dir: Directory containing the Pascal VOC XML files.
        output_file: Path of the JSONL file to write. Missing parent
            directories are created.
    """
    # os.listdir order is arbitrary; sorting keeps the output reproducible.
    xml_names = [name for name in sorted(os.listdir(xml_dir)) if name.endswith('.xml')]
    jsonl_data = [
        parse_voc_xml(os.path.join(xml_dir, name)) for name in tqdm(xml_names)
    ]

    # Make sure the destination folder exists before opening the file.
    parent_dir = os.path.dirname(output_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_file, 'w') as f:
        for entry in jsonl_data:
            json.dump(entry, f)
            f.write('\n')


# Folder holding the XML annotations after manual correction.
xml_dir = '/path/to/modified/xml_annotations'
# JSONL file that receives the records rebuilt from those XML files.
jsonl_output_file = '/path/to/modified/annotations.jsonl'
convert_xml_to_jsonl(xml_dir, jsonl_output_file)


In [None]:
#normalize the jsonl file above and write to a new jsonl file
def normalize_coordinates(bbox, original_size, target_size=(1000, 1000)):
    """Rescale a pixel bounding box onto a ``target_size`` coordinate grid.

    Args:
        bbox: Four values ``(x1, y1, x2, y2)``.
        original_size: ``(width, height)`` the box coordinates refer to.
        target_size: ``(width, height)`` of the grid to map onto;
            defaults to ``(1000, 1000)``.

    Returns:
        ``[x1, y1, x2, y2]`` where x values are divided by the original
        width and multiplied by the target width, and y values likewise
        with the heights. Values are not rounded.
    """
    src_w, src_h = original_size
    dst_w, dst_h = target_size
    left, top, right, bottom = bbox

    def _rescale(value, src_extent, dst_extent):
        # Fraction of the original extent, expressed on the target extent.
        return (value / src_extent) * dst_extent

    return [
        _rescale(left, src_w, dst_w),
        _rescale(top, src_h, dst_h),
        _rescale(right, src_w, dst_w),
        _rescale(bottom, src_h, dst_h),
    ]



def normalize_annotations(jsonl_file_path, output_file_path,image_dir, verbose=True):
    """Rewrite a pixel-coordinate JSONL annotation file with normalised boxes.

    For every record the referenced image is opened to obtain its size, the
    four ``<loc_N>`` values of the suffix are rescaled with
    ``normalize_coordinates`` (default 1000x1000 grid), truncated to
    integers and written back under the label ``"light area"``. The label
    found in the input suffix is discarded.

    Args:
        jsonl_file_path: Input JSONL file; blank lines are ignored.
        output_file_path: Output JSONL file. Missing parent directories are
            created.
        image_dir: Directory containing the images named by each record's
            ``image`` field.
        verbose: When true (the default, matching the previous behaviour),
            print the parsed coordinates and each normalised record.
    """
    # Highest location bin written out. A box touching the right or bottom
    # image edge scales to exactly 1000, which would yield "<loc_1000>".
    # NOTE(review): assumes the consumer uses 1000 location tokens
    # (<loc_0> .. <loc_999>), as Florence-2 does -- confirm for other models.
    max_bin = 999

    normalized_entries = []
    with open(jsonl_file_path, 'r') as file:
        for line in file:
            if not line.strip():
                continue
            data = json.loads(line)

            image_path = os.path.join(image_dir, data['image'])
            # Only the size is needed; close the file handle right away.
            with Image.open(image_path) as image:
                original_size = image.size

            # Keep the integer in front of each '>' so trailing text or
            # whitespace after the last token does not break the parse.
            coords = [
                int(token.split('>')[0])
                for token in data['suffix'].split('<loc_')[1:]
            ]
            if verbose:
                print(coords)

            bbox = normalize_coordinates(coords, original_size)
            bins = [min(max(int(value), 0), max_bin) for value in bbox]

            new_suffix = f"light area<loc_{bins[0]}><loc_{bins[1]}><loc_{bins[2]}><loc_{bins[3]}>"
            normalized_entry = {
                "image": data["image"],
                "prefix": data["prefix"],
                "suffix": new_suffix
            }
            if verbose:
                print(normalized_entry)
            normalized_entries.append(normalized_entry)

    parent_dir = os.path.dirname(output_file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_file_path, 'w') as outfile:
        for entry in normalized_entries:
            json.dump(entry, outfile)
            outfile.write('\n')

#valid_image_dir="/mnt/NCEP-CryoEM/active/merkaa/florence/20210114/"
#normalize_annotations('valid/annotations.jsonl', 'valid/normalized_annotations.jsonl',valid_image_dir)