# Convert PASCAL VOC Objects from Roboflow to ESRI

### Organise Roboflow Files

In [None]:
# steps:
    #1: rename the train folder to images
    #2: move the xml files from the images folder into another folder at the root directory under the name labels
    #3: delete the other Roboflow files/folders (manual step; the code below does not do this)

import os
import shutil

# put your Roboflow PASCAL VOC training data folder in the same directory as this notebook

labels_folder_name = ## PATH TO DATA FOLDER ##

# Roboflow puts the images and their PASCAL VOC XML files together in `train`.
# This cell renames `train` to `images` and moves the XML files into a sibling
# `labels` folder.
input_dir = os.path.join(labels_folder_name, 'train')
image_dir = os.path.join(labels_folder_name, 'images')
label_dir = os.path.join(labels_folder_name, 'labels')

# Step 1: rename train -> images (skipped when `train` is already gone).
if os.path.exists(input_dir):
    os.rename(input_dir, image_dir)

# Step 2: move the XML files out of the image folder.
# This runs whenever `images` exists, not only right after the rename, so a
# run that was interrupted after the rename can simply be re-executed.
if os.path.isdir(image_dir):
    # exist_ok=True: a `labels` folder left over from an earlier run is fine.
    os.makedirs(label_dir, exist_ok=True)

    for file_name in os.listdir(image_dir):
        if not file_name.endswith('.xml'):
            continue
        # Moving into the directory (rather than onto a full file path) makes
        # shutil.move raise instead of overwriting a label with the same name.
        shutil.move(os.path.join(image_dir, file_name), label_dir)


#### Rename the files

Rename the images and labels to matching sequential numbers (`00001.jpg` / `00001.xml`) to clean up the Roboflow naming convention.

In [None]:
# Define the paths to the image and label folders
image_dir = ## PATH TO IMAGE FOLDER ##
label_dir = ## PATH LABELS FOLDER

# Collect the image and label file names in a stable (sorted) order.
image_files = sorted(f for f in os.listdir(image_dir) if f.endswith('.jpg'))
label_files_1 = sorted(f for f in os.listdir(label_dir) if f.endswith('.xml'))

# Every image needs exactly one label. Raise instead of calling exit(),
# which would shut down the notebook kernel.
if len(image_files) != len(label_files_1):
    raise RuntimeError("Error: The number of files in the folders doesn't match.")

# Pair each image with the label that has the same base name. Pairing by
# sorted position alone can mismatch files, because the extension takes part
# in the sort (e.g. "a.jpg" < "a.k.jpg" but "a.k.xml" < "a.xml").
labels_by_stem = {os.path.splitext(f)[0]: f for f in label_files_1}
rename_plan = []
for i, image_file in enumerate(image_files, start=1):
    stem = os.path.splitext(image_file)[0]
    if stem not in labels_by_stem:
        raise RuntimeError(f"Error: No label file found for image {image_file}")
    rename_plan.append((image_file, f"{i:05}.jpg", labels_by_stem[stem], f"{i:05}.xml"))

# Validate the whole plan before touching the disk: a target name that already
# belongs to a *different* file would be overwritten by os.rename on POSIX.
existing_images = set(image_files)
existing_labels = set(label_files_1)
for image_file, new_image_name, label_file_1, new_label_name_1 in rename_plan:
    if new_image_name != image_file and new_image_name in existing_images:
        raise FileExistsError(f"Error: {new_image_name} already exists in {image_dir}")
    if new_label_name_1 != label_file_1 and new_label_name_1 in existing_labels:
        raise FileExistsError(f"Error: {new_label_name_1} already exists in {label_dir}")

# Rename the files in each folder
for image_file, new_image_name, label_file_1, new_label_name_1 in rename_plan:
    os.rename(os.path.join(image_dir, image_file), os.path.join(image_dir, new_image_name))
    os.rename(os.path.join(label_dir, label_file_1), os.path.join(label_dir, new_label_name_1))

    print(f"Renamed: {image_file} to {new_image_name}")
    print(f"Renamed: {label_file_1} to {new_label_name_1}")

### Convert XML labels to ESRI format

This currently only works for a single class in the labels.

In [None]:
import fileinput
import sys

#set your class name and values
class_name = ## CLASS NAME 
class_value = ## CLASS VALUE 

# Rewrite every .xml label in label_dir in place, one line at a time.

# Any line containing one of these tags is removed from the file.
tags_to_drop = ('<path>', '<pose>', '<truncated>', '<difficult>', '<occluded>', '<segmented>')
xml_declaration = '<?xml version="1.0"?>\n'
source_name_tag = '<name>{}</name>'.format(class_name)
target_name_tag = '<name>{}</name>'.format(class_value)

for filename in os.listdir(label_dir):
    if not filename.endswith('.xml'):
        continue

    # The <filename> entry must point at the image that shares this label's base name.
    new_filename = os.path.splitext(filename)[0] + '.jpg'

    # inplace=True redirects sys.stdout into the file being read, so every
    # sys.stdout.write below becomes a line of the rewritten label file.
    with fileinput.input(os.path.join(label_dir, filename), inplace=True) as f:
        for line in f:
            # Add the XML declaration once; skip it when the file already
            # starts with one so re-running the cell keeps the XML valid.
            if f.isfirstline() and not line.lstrip().startswith('<?xml'):
                sys.stdout.write(xml_declaration)

            if any(tag in line for tag in tags_to_drop):
                continue

            if '<database>' in line:
                # replace the Roboflow source entry with the ESRI one
                sys.stdout.write(line.replace('<database>roboflow.ai</database>', '<annotation>ESRI ArcGIS Pro</annotation>'))
            elif source_name_tag in line:
                # replace the class name with the class value
                sys.stdout.write(line.replace(source_name_tag, target_name_tag))
            elif '<filename>' in line:
                sys.stdout.write('\t<filename>{}</filename>\n'.format(new_filename))
            else:
                # write the line as is
                sys.stdout.write(line)


### Create map.txt

In [None]:
# set output map.txt directory
out_map = os.path.join(labels_folder_name, 'map.txt')

# Only the folder names go into map.txt. normpath strips a trailing path
# separator first, which would otherwise make the folder name an empty string.
image_dir_split = os.path.split(os.path.normpath(image_dir))
label_dir_split = os.path.split(os.path.normpath(label_dir))

# List only real image / label files so stray entries (e.g. .DS_Store,
# Thumbs.db) cannot shift the image-to-label pairing.
image_files = sorted(f for f in os.listdir(image_dir) if f.endswith(('.jpg', '.png')))
label_files = sorted(f for f in os.listdir(label_dir) if f.endswith('.xml'))

# Pair every image with the label that shares its base name.
labels_by_stem = {os.path.splitext(f)[0]: f for f in label_files}
file_list = []
for image_file in image_files:
    label_file = labels_by_stem.get(os.path.splitext(image_file)[0])
    if label_file is None:
        print(f"Warning: no label file found for {image_file}; not written to map.txt")
        continue
    file_list.append((image_file, label_file))

# create a text file and write one "<image path>  <label path>" line per pair
with open(out_map, 'w') as f:
    for image_file, label_file in file_list:
        image_path = os.path.join(image_dir_split[-1], image_file)
        label_path = os.path.join(label_dir_split[-1], label_file)
        line = f"{image_path}  {label_path}\n"
        f.write(line)

### Calculate stats.txt file

### Next: add the .emd file and the ESRI accumulated stats

Maybe create a download link to a template, or keep it linked to the notebook.

In [None]:
import xml.etree.ElementTree as ET

# Folder that is scanned for .xml annotation files; reuses label_dir set in an earlier cell.
ANNOTATION_FOLDER = label_dir

def calculate_area(box):
    """Return the area of a bounding box given as [xmin, ymin, xmax, ymax]."""
    width = box[2] - box[0]
    height = box[3] - box[1]
    return width * height

# Running statistics, updated while the annotation files are read.
min_area = float('inf')
max_area = 0
total_area = 0
count = 0  # Total number of bounding boxes seen
annotated_files = 0  # Files that contain at least one bounding box
total_annotation_files = 0  # All XML files read
min_annotations = float('inf')  # Minimum number of annotations in a file
max_annotations = 0  # Maximum number of annotations in a file
total_annotations = 0  # Total number of annotations
# Stays None when no file contains a bounding box, so the final print cannot
# fail with a NameError.
xml_with_max_annotations = None

class_labels = set()  # Set to store unique class labels

xml_iteration = 0
# Loop through each annotation file
for filename in os.listdir(ANNOTATION_FOLDER):
    if not filename.endswith('.xml'):
        continue
    tree = ET.parse(os.path.join(ANNOTATION_FOLDER, filename))
    root = tree.getroot()

    # Look the bounding boxes up once and reuse the list below.
    boxes = root.findall('.//bndbox')
    num_annotations = len(boxes)

    total_annotation_files += 1
    if num_annotations > 0:
        annotated_files += 1

    # Update min_annotations and max_annotations
    if num_annotations < min_annotations:
        min_annotations = num_annotations
    if num_annotations > max_annotations:
        max_annotations = num_annotations
        xml_with_max_annotations = filename

    # In PASCAL VOC the <name> tag is a child of <object> (a sibling of
    # <bndbox>), so class labels are read from the objects, not the boxes.
    for obj in root.findall('.//object'):
        name_text = obj.findtext('name')
        if name_text is None or not name_text.strip():
            continue
        label = name_text.strip()
        # Numeric class values are stored as int; any other name is kept as text.
        class_labels.add(int(label) if label.isdigit() else label)

    # Loop through each bounding box in the annotation file
    for box in boxes:
        xmin = int(box.find('xmin').text)
        ymin = int(box.find('ymin').text)
        xmax = int(box.find('xmax').text)
        ymax = int(box.find('ymax').text)

        area = calculate_area([xmin, ymin, xmax, ymax])

        # Update stats
        min_area = min(min_area, area)
        max_area = max(max_area, area)
        total_area += area
        count += 1

    total_annotations += num_annotations
    print("\rReading Annotation File: %d" % xml_iteration, end="")
    xml_iteration += 1
print('   Read all files')

# With no boxes / no files the minimums were never updated; report 0, not inf.
if count == 0:
    min_area = 0
if total_annotation_files == 0:
    min_annotations = 0

# Calculate mean area
mean_area = total_area / count if count != 0 else 0

# Calculate mean number of annotations per file
mean_annotations = total_annotations / total_annotation_files if total_annotation_files != 0 else 0

# Print stats
print("Class Stats:")
print(" ")
print("Area: ")
print(f"    Min area: {min_area}")
print(f"    Max area: {max_area}")
print(f"    Mean area: {mean_area}")
print(f"    Total area: {total_area}")
print(" ")
print("Annotations: ")
print(f"    Number of bounding boxes: {count}")
print(f"    Number of files with annotations: {annotated_files}")
print(f"    Total number of annotation files: {total_annotation_files}")
print(f"    Min annotations per file: {min_annotations}")
print(f"    Max annotations per file: {max_annotations}")
print(f"    Mean annotations per file: {mean_annotations}")
print(f"    Class labels found: {sorted(class_labels, key=str)}")
print(" ")
print(f"XML file with the highest number of annotations: {xml_with_max_annotations}")


#### Print Tile stats

In [None]:
import os
import cv2
import numpy as np
import random

image_folder = "./images"

# Upper bound on the number of images sampled for the statistics.
max_samples = 500

# Get a list of all the image file names in the folder
image_filenames = [file_name for file_name in os.listdir(image_folder)
                   if file_name.endswith((".png", ".jpg"))]

# Randomly select up to max_samples images. random.sample raises ValueError
# when k is larger than the list, so k is capped at the number of images.
image_filenames = random.sample(image_filenames, k=min(max_samples, len(image_filenames)))

# Per-band running totals, indexed in OpenCV channel order (blue, green, red).
# Accumulating totals instead of concatenating every pixel into one float
# array keeps memory use flat no matter how many images are sampled.
band_min = np.full(3, np.inf)
band_max = np.full(3, -np.inf)
band_sum = np.zeros(3)
band_sq_sum = np.zeros(3)
pixel_count = 0

# Iterate through the selected images and accumulate the totals for each band
for i, file_name in enumerate(image_filenames):
    image_path = os.path.join(image_folder, file_name)
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread returns None instead of raising when a file cannot be read.
        print("\nSkipping unreadable image: {}".format(image_path))
        continue

    # One row per pixel, one column per band (b, g, r).
    pixels = image.reshape(-1, 3).astype(np.float64)
    band_min = np.minimum(band_min, pixels.min(axis=0))
    band_max = np.maximum(band_max, pixels.max(axis=0))
    band_sum += pixels.sum(axis=0)
    band_sq_sum += np.square(pixels).sum(axis=0)
    pixel_count += pixels.shape[0]
    print("\rProcessing image: {}/{}".format(i+1, len(image_filenames)), end="")

if pixel_count == 0:
    raise ValueError("No readable .png/.jpg images found in {}".format(image_folder))

# Calculate statistics for each band.
# Population standard deviation (the np.std default): sqrt(E[x^2] - E[x]^2),
# clamped at 0 to absorb floating-point round-off.
band_mean = band_sum / pixel_count
band_std = np.sqrt(np.maximum(band_sq_sum / pixel_count - band_mean ** 2, 0.0))

b_min, g_min, r_min = band_min
b_max, g_max, r_max = band_max
b_mean, g_mean, r_mean = band_mean
b_std, g_std, r_std = band_std

# Print out the band statistics
print("\n\nTile Statistics")
print("Blue band statistics: min = {:.2f}, max = {:.2f}, mean = {:.2f}, std = {:.2f}".format(b_min, b_max, b_mean, b_std))
print("Green band statistics: min = {:.2f}, max = {:.2f}, mean = {:.2f}, std = {:.2f}".format(g_min, g_max, g_mean, g_std))
print("Red band statistics: min = {:.2f}, max = {:.2f}, mean = {:.2f}, std = {:.2f}".format(r_min, r_max, r_mean, r_std))