# Image & label formatting

The All_Plant_Data folder must be in the same directory as this file to work.
</br>Only change the **end of output_root** or the **log_level** parameters to ensure the notebook runs.
</br>This structure is based on the YOLO structure from [Ultralytics](https://docs.ultralytics.com/datasets/detect/), and will allow the "Plant Identification" notebook to run</br>
**You may have to install some packages for the notebook to run properly**

In [35]:
import shutil
import os
import random

In [36]:
def create_output_dest(dest,sub_folders):
    # Check if dest folder/path exists
    if not(os.path.exists('./TrainModels')):os.mkdir('./TrainModels')
    if not(os.path.exists('./TrainModels/datasets')):os.mkdir('./TrainModels/datasets')  
    if not(os.path.exists(dest)): os.mkdir(dest)    
    # List of folders to create, in order of depth
    folders = [f'{dest}/images/',f'{dest}/labels/']+[f'{dest}/images/{x}' for x in sub_folders] + [f'{dest}/labels/{x}' for x in sub_folders]
    # for each path check if it exists, create the folder if it doesn't
    for path in folders:
        if not(os.path.exists(path)): os.mkdir(path)               

In [37]:
# Input rooot with all folders
input_root = './All_Plant_Data'
all_plants = ['Canada Buffaloberry','Labrador Tea','Prickly Rose','Red-osier Dogwood','Velvet Leaved Blueberry']

# Assign output location
# This format is to ensure the data is usable by YOLOv8
output_root = './TrainModels/datasets/Plant_Dataset_Updated'

# Sets folder structure
images_root = f'{output_root}/images'
labels_root = f'{output_root}/labels'
sub_folders = ['test','train','valid']

# Create output folder(s)
create_output_dest(output_root, sub_folders)

In [38]:
# Read label file to get order/numbers of plants
label_file = open("label.txt", "r")
label_dict = {x.replace('\n',''):i for i,x in enumerate(label_file)}
label_dict

{'Canada Buffaloberry': 0,
 'Labrador Tea': 1,
 'Prickly Rose': 2,
 'Red-osier Dogwood': 3,
 'Velvet Leaved Blueberry': 4}

In [39]:
# Check label # to match with the provided file
def check_label(label_file, label_id, dest_lbl,log_level=0):    
    # Open label file to check id number
    f = open(label_file,'r')    
    if log_level > 2:print(label_file,sep='')
    update_text = ''
    original_text = ''
    updated_id = False
    # Read through each line of the txt file
    for line in f:
        # Store the original text
        original_text += line
        
        # If the first character does not match the label id
        if int(line[0]) != int(label_id):#           
            # Update the first instance of the incorrect label per line
            line = line.replace(line[0],str(label_id),1)
            updated_id = True
        # Store in an updated text variable, incase changes are only made to some cols
        update_text += line        
    f.close()
    # If the text was updated, overwrite the csv with the proper label value
    if len(original_text) == 0: print(f'{label_file} does not contain any data!')
    if updated_id:
        if log_level >= 1:print(f'Updating labels in dataset {dest_lbl}')        
        new = open(dest_lbl,'w')
        new.write(update_text)
        new.close()
        if log_level >= 2: print('Original IDs:\n', original_text,'\n','='*10,sep='')
        if log_level >= 2: print('Updated IDs:\n', update_text,sep='')        
    return updated_id

## Single folder (Image and txt)

In [40]:
# Function for taking combined image/label from a single directory into YOLO folder format
# Assumes all images are .jpg files
def process_image_data(current_plant,log_level=0):
    input_folder = input_root + f'/{current_plant}/'
    print(f'Processing {current_plant} files in {input_folder}')
    
    
    # Get file names, shuffle, remove duplicates
    input_files = os.listdir(input_folder)
    file_names = [i[:-4] for i in input_files] 
    file_names = list(set(file_names)) # Remove duplicates by converting list > set > list
    random.seed(0)
    random.shuffle(file_names)
#     print(input_files)
    
    # Split into test(15%), train(75%), validation(10%)
    num_images = len(file_names)
    train_set = file_names[:int(num_images*0.75)]
    test_set = file_names[int(num_images*0.75):int(num_images*0.90)]
    val_set = file_names[int(num_images*0.90):]

    file_dict = {'train':train_set,'test':test_set,'valid':val_set}

    # Validate splits are not overlapping
    print('Last train file:',train_set[-1],f'| Samples:{len(train_set)}')
    print(f'First test file: {test_set[0]} | Last test file: {test_set[-1]} | Samples:{len(test_set)}')
    print(f'First valid file: {val_set[0]} | Samples:{len(val_set)}\n')

    # Copy each set to respective folders
    for folder in file_dict:    
#         print(f'Processing {folder}ing files...')    
        for file in file_dict[folder]:
            img = f'{file}.jpg'
            label = f'{file}.txt'
            
            # Check if both source files exist otherwise skip the item
            src_lbl = input_folder + label
            dest_lbl = f'{labels_root}/{folder}/{label}'
            src_img = input_folder + img  
            if not(os.path.isfile(src_lbl)) or not(os.path.isfile(src_img)): 
                print(f'Label or Image file not found for {file}, skipping item...')
                print('label:',src_lbl,'| img:',src_img)
                continue
            
            # Copy Images to destination folder                       
            shutil.copyfile(src_img, f'{images_root}/{folder}/{img}')

            # Copy Labels to destination folder
            if check_label(src_lbl, label_dict[current_plant], dest_lbl,log_level): continue
            shutil.copyfile(src_lbl, dest_lbl)
            
    print(f'Completed processing {current_plant} files.')

## Canadian Buffalo Berry

In [41]:
def process_Buffaloberry():
    # Canadian Buffalo Berry was formatted more discretely
    # Place images and labels into correct folders
    current_plant = 'Canada Buffaloberry'

    for folder in sub_folders:
        # ../Canadian_Buffaloberry/test
        current_folder = f'{input_root}/{current_plant}/{folder}'

        # Get all the image files within the subfolder & copy image file to final directory
        img_folder =  f'{current_folder}/images/' 
        for img in os.listdir(img_folder):
            src_img = img_folder + img        
            shutil.copyfile(src_img, f'{images_root}/{folder}/{img}')

        # Get all the label files within the subfolder & copy label file to final directory
        label_folder =  f'{current_folder}/labels/'
        for label in os.listdir(label_folder):        
            src_lbl = label_folder+label
            dest_lbl = f'{labels_root}/{folder}/{label}'
            # Function to check/adjust mislabeled txt files before adding to dataset
            # If label is adjusted, new file is created at destination, copy does not run
            if check_label(src_lbl, label_dict[current_plant], dest_lbl): continue
            shutil.copyfile(src_lbl, dest_lbl)

## Run through all src files

In [42]:
# Cycle through all source folders
for plant in all_plants:
    process_image_data(plant,log_level=1)
    print('====='*10)

Processing Canada Buffaloberry files in ./All_Plant_Data/Canada Buffaloberry/
Last train file: CanadianBuffaloBerry_64699456 | Samples:83
First test file: CanadianBuffaloBerry_186101743 | Last test file: CanadianBuffaloBerry_64125049 | Samples:16
First valid file: CanadianBuffaloBerry_35767192 | Samples:12

Completed processing Canada Buffaloberry files.
Processing Labrador Tea files in ./All_Plant_Data/Labrador Tea/
Last train file: Labroder_tea_087 | Samples:75
First test file: Labroder_tea_056 | Last test file: Labroder_tea_084 | Samples:15
First valid file: Labroder_tea_009 (2) | Samples:11

Completed processing Labrador Tea files.
Processing Prickly Rose files in ./All_Plant_Data/Prickly Rose/
Last train file: Prickly-rose082 | Samples:75
First test file: Prickly-Rose026 | Last test file: Prickly-Rose045 | Samples:15
First valid file: Prickly-rose092 | Samples:10

Updating labels in dataset ./TrainModels/datasets/Plant_Dataset_Updated/labels/train/Prickly-rose091.txt
Updating labe

Updating labels in dataset ./TrainModels/datasets/Plant_Dataset_Updated/labels/train/images (42).txt
Updating labels in dataset ./TrainModels/datasets/Plant_Dataset_Updated/labels/train/images - 2024-02-14T134224.773.txt
Updating labels in dataset ./TrainModels/datasets/Plant_Dataset_Updated/labels/train/images (46).txt
Updating labels in dataset ./TrainModels/datasets/Plant_Dataset_Updated/labels/train/images (15).txt
Updating labels in dataset ./TrainModels/datasets/Plant_Dataset_Updated/labels/train/images (60).txt
Updating labels in dataset ./TrainModels/datasets/Plant_Dataset_Updated/labels/train/images (92).txt
Updating labels in dataset ./TrainModels/datasets/Plant_Dataset_Updated/labels/train/images (24).txt
Updating labels in dataset ./TrainModels/datasets/Plant_Dataset_Updated/labels/train/images (17).txt
Updating labels in dataset ./TrainModels/datasets/Plant_Dataset_Updated/labels/train/images - 2024-02-14T134125.240.txt
Updating labels in dataset ./TrainModels/datasets/Pla

Completed processing Velvet Leaved Blueberry files.
