# Libraries

In [None]:
import glob
import os
import re

import numpy as np
import pandas as pd
from PIL import Image

# Utils functions

### Generate metadata .csv file

In [None]:
def read_txt_folder(folder_path, image_width=960.0, image_height=960.0):
  """Read all .txt annotation files of a folder into a list of rows.

  Every non-blank line of every file becomes one row of the form
  [image_id, image_width, image_height, *fields], where image_id is the file
  name without its extension and fields are the whitespace-separated tokens
  of the line (kept as strings).

  Input:
    folder_path : (str) path of the folder containing the .txt files
    image_width : (float) width stored in every row (default 960.0)
    image_height : (float) height stored in every row (default 960.0)
  Output:
    txt_csv: (list of lists) one row per annotation line, suitable as the
      data argument of pd.DataFrame
  """
  rows = []
  # os.path.join only adds a separator when one is missing; sorting makes the
  # row order independent of the order in which glob lists the files
  for txt_file in sorted(glob.glob(os.path.join(folder_path, "*.txt"))):
    image_id = os.path.splitext(os.path.basename(txt_file))[0]
    prefix = [image_id, image_width, image_height]
    with open(txt_file, 'rt') as fd:
      for line in fd:
        fields = line.split()
        # a blank line carries no box and would otherwise produce a short row
        if fields:
          rows.append(prefix + fields)

  return rows


# train_csv = read_txt_folder('./dataset/labels/train')
# anno_train = pd.DataFrame(train_csv, columns=['image_id', 'width', 'height', 'label', 'x', 'y', 'w', 'h'])
# anno_train[['width', 'height', 'label', 'x', 'y', 'w', 'h']] = anno_train[['width', 'height', 'label', 'x', 'y', 'w', 'h']].astype(float)

# anno_train.to_csv('./dataset/train_csv.csv', index=False)

### Delete files (.jpg or .txt) from a folder based on a list of file names

In [None]:
# DELETE all the added images & annotation files from train_val folder
def delete_files_list(folder_path, indx_lst, ext=".jpg"):
  """Delete the files of a folder whose names (without extension) are listed.

  Inputs:
    folder_path: (str) path of the folder
    indx_lst: (list) names of the files to delete. An int entry is compared
      with the numeric value of the file name (7 matches both "7.jpg" and
      "007.jpg"); a str entry is compared with the file name as is
    ext: (str) extension, dot included, of the files to consider. Defaults
      to ".jpg"; pass ".txt" to delete annotation files instead
  """
  # build the lookup once instead of scanning the list for every file
  wanted = set(indx_lst)

  for path in glob.glob(os.path.join(folder_path, "*" + ext)):
    stem = os.path.splitext(os.path.basename(path))[0]
    try:
      key = int(stem)
    except ValueError:
      # a non-numeric name (e.g. "img_mosaic_3") can only match a str entry
      key = None

    if stem in wanted or (key is not None and key in wanted):
      # remove the path glob returned rather than a rebuilt one
      os.remove(path)


# delete_files_list('./dataset/images/train', train_lst)
# delete_files_list('./dataset/labels/train', train_lst)

### Delete all files (.jpg or .txt) from a folder whose names match a pattern

In [None]:
def delete_files_base_pattern(folder_path, pattern, ext=".jpg"):
  """Delete the files of a folder whose names match a regular expression.

  A file is deleted when the pattern is found anywhere in its name (extension
  excluded); anchor the pattern with ^ and $ to require a full match.

  Inputs:
    folder_path: (str) path of the folder
    pattern: (str) regular expression searched in the file names
    ext: (str) extension, dot included, of the files to consider. Defaults
      to ".jpg"; pass ".txt" to delete annotation files instead
  """
  matcher = re.compile(pattern)

  for path in glob.glob(os.path.join(folder_path, "*" + ext)):
    stem = os.path.splitext(os.path.basename(path))[0]
    if matcher.search(stem):
      # delete the file that was tested, not a name rebuilt from the matched
      # text: the two differ when the pattern matches only part of the name
      # or contains capture groups
      os.remove(path)


# delete_files_base_pattern("./dataset_aug/labels/train/", r"img_mosaic_.*")
# delete_files_base_pattern("./dataset_aug/images/train/", r"img_mosaic_.*")

### Bounding boxes types conversion
Types of bounding boxes: https://albumentations.ai/docs/getting_started/bounding_boxes_augmentation/

In [None]:
def pascal_to_yolo(xmin, ymin, xmax, ymax, image_width=640, image_height=640):
  """Convert a pascal box (corner coordinates) into a yolo box.

  Returns the tuple (x_center, y_center, width, height), where each value is
  divided by the matching image dimension.
  """
  center_x = (xmin + xmax) / 2
  center_y = (ymin + ymax) / 2
  box_w = xmax - xmin
  box_h = ymax - ymin
  return (
    center_x / image_width,
    center_y / image_height,
    box_w / image_width,
    box_h / image_height,
  )


def yolo_to_pascal(x, y, w, h, width, height):
  """Convert a yolo box (centre and size as fractions) into a pascal box.

  Returns the tuple (xmin, ymin, xmax, ymax); every value goes through int(),
  so fractional parts are truncated.
  """
  center_x, half_w = x * width, (w * width) / 2.0
  center_y, half_h = y * height, (h * height) / 2.0
  corners = (
    center_x - half_w,
    center_y - half_h,
    center_x + half_w,
    center_y + half_h,
  )
  return tuple(int(value) for value in corners)

def yolo_to_coco(bbox, orig_w, orig_h):
  """Convert yolo boxes into coco boxes, in place.

  bbox is indexed as a 2-D array with one box per row; presumably the columns
  are (x_center, y_center, width, height) as fractions of the image size.
  Columns 2 and 3 are scaled by orig_w and orig_h, then columns 0 and 1 are
  scaled the same way and shifted back by half of the scaled size.
  The array passed in is modified and returned.
  """
  col_x, col_y, col_w, col_h = 0, 1, 2, 3

  # sizes first: the corner computation below reads the scaled sizes back
  bbox[:, col_w] = bbox[:, col_w] * orig_w
  bbox[:, col_h] = bbox[:, col_h] * orig_h

  top = bbox[:, col_y] * orig_h - (bbox[:, col_h] / 2)
  bbox[:, col_y] = top
  left = bbox[:, col_x] * orig_w - (bbox[:, col_w] / 2)
  bbox[:, col_x] = left
  return bbox

### Count number of .jpg & .txt files in the folder

In [None]:
def count_files_img(folder_path):
  """Return how many .jpg files the folder contains
  Input:
    folder_path: (str) path of the folder holding the .jpg files
  Outputs:
    jpg_count: (int) total # of .jpg files"""
  # the glob pattern below needs the folder path to end with a slash
  if folder_path[-1] != "/":
    folder_path = folder_path + "/"
  return len(glob.glob(folder_path + "*.jpg"))

def count_files_txt(folder_path):
  """Return how many .txt files the folder contains
  Input:
    folder_path: (str) path of the folder holding the .txt files
  Outputs:
    txt_count: (int) total # of .txt files"""
  # the glob pattern below needs the folder path to end with a slash
  if folder_path[-1] != "/":
    folder_path = folder_path + "/"
  return len(glob.glob(folder_path + "*.txt"))


# print(f"Number of train's images (jpg) in the folder: {count_files_img('./dataset/images/train')}")
# print(f"Number of train's labels (txt) in the folder: {count_files_txt('./dataset/labels/train')}")

### Load & Copy .jpg images or .txt files from a folder to another folder

In [None]:
# DONE
def load_copy_images(start_folder, end_folder):
  """Load all the .jpg images from start_folder & save them into end_folder

  Each image is read with cv2 as a colour image, converted from BGR to RGB and
  written through PIL under its original name, i.e. it is re-encoded rather
  than copied byte for byte.

  Inputs:
    start_folder: (str) path of the folder the images are read from
    end_folder: (str) path of the folder the images are saved to
  Raises:
    FileNotFoundError: when cv2 cannot read one of the listed images
  """
  # NOTE(review): cv2 is used below but is not imported in the visible part of
  # this file -- confirm it is imported before this function runs
  for img_path in glob.glob(os.path.join(start_folder, "*.jpg")):
    # Load the image from start_folder (dataset)
    img_arr = cv2.imread(img_path, cv2.IMREAD_COLOR)
    if img_arr is None:
      # cv2.imread signals a failed read by returning None
      raise FileNotFoundError('Image Not Found ' + img_path)
    img_arr = cv2.cvtColor(img_arr, cv2.COLOR_BGR2RGB)  # BGR to RGB

    # Name (id) of the image, without folder and extension
    img_id = os.path.splitext(os.path.basename(img_path))[0]

    # Save the image to end_folder (dataset_aug)
    im = Image.fromarray(img_arr, "RGB")
    im.save(os.path.join(end_folder, img_id + ".jpg"))

# load_copy_images('./dataset/images/train', './dataset_aug/images/train')

In [None]:
# DONE
def load_copy_anno_txt(start_folder, end_folder):
  """Load & copy annotation .txt files from start_folder to end_folder

  Every .txt file of start_folder is rewritten line by line into a file of
  the same name in end_folder; an existing file of that name is overwritten.

  Inputs:
    start_folder: (str) path of the folder the files are read from
    end_folder: (str) path of the folder the files are written to
  """
  for src_path in glob.glob(os.path.join(start_folder, "*.txt")):
    # Name (id) of the annotation file, without folder and extension
    txt_id = os.path.splitext(os.path.basename(src_path))[0]
    dst_path = os.path.join(end_folder, txt_id + ".txt")

    # both handles sit in one with-statement so the destination is closed
    # even when a read or write fails half-way
    with open(src_path, 'r') as src, open(dst_path, 'w') as dst:
      dst.writelines(src)

# public_test txt folder
# load_copy_anno_txt('./dataset/labels/train', './dataset_aug/labels/train')

## Load images & .txt label files based on a name pattern

In [None]:
# DONE
def pattern_copy_images(start_folder, end_folder, pattern):
  """Load the .jpg images of start_folder whose names match a pattern & save them into end_folder

  An image is selected when the pattern is found anywhere in its name
  (extension excluded). Selected images are read with cv2, converted from BGR
  to RGB and written through PIL under their original name; the PIL image and
  the destination path are printed for each of them.

  Inputs:
    start_folder: (str) path of the folder the images are read from
    end_folder: (str) path of the folder the images are saved to
    pattern: (str) regular expression searched in the image names
  Raises:
    FileNotFoundError: when cv2 cannot read one of the selected images
  """
  matcher = re.compile(pattern)

  # NOTE(review): cv2 is used below but is not imported in the visible part of
  # this file -- confirm it is imported before this function runs
  for img_path in glob.glob(os.path.join(start_folder, "*.jpg")):
    # Name (id) of the image, without folder and extension
    img_id = os.path.splitext(os.path.basename(img_path))[0]
    # Filter on the name first so images that do not match are never decoded
    if not matcher.search(img_id):
      continue

    # Load the image from start_folder (dataset)
    img_arr = cv2.imread(img_path, cv2.IMREAD_COLOR)
    if img_arr is None:
      # cv2.imread signals a failed read by returning None
      raise FileNotFoundError('Image Not Found ' + img_path)
    img_arr = cv2.cvtColor(img_arr, cv2.COLOR_BGR2RGB)  # BGR to RGB

    # Save the image to end_folder (dataset_aug) under its own name; the
    # matched text is not used because it can be a fragment of the name
    im = Image.fromarray(img_arr, "RGB")
    out_path = os.path.join(end_folder, img_id + ".jpg")
    print(im, out_path)
    im.save(out_path)


## Convert all images in a folder from .jpeg to .jpg

In [None]:
def jpeg_to_jpg(start_path, end_path):
  """Convert all .jpeg images of start_path (folder) to .jpg files saved in end_path (folder)

  Each image is opened with PIL, converted to RGB and saved under the same
  name with the .jpg extension.

  Inputs:
    start_path: (str) path of the folder holding the .jpeg images
    end_path: (str) path of the folder the .jpg images are saved to, with or
      without a trailing slash
  """
  for img_f in glob.glob(os.path.join(start_path, "*.jpeg")):
    # Name (id) of the image, without folder and extension
    img_id = os.path.splitext(os.path.basename(img_f))[0]
    # os.path.join keeps the file inside end_path even when the caller left
    # out the trailing slash
    out_path = os.path.join(end_path, img_id + ".jpg")

    # the with-block closes the source file once the converted copy is saved
    with Image.open(img_f) as im:
      im.convert("RGB").save(out_path)


# jpeg_to_jpg('./dataset/images/train_jpeg', './dataset/images/train_jpg')
# jpeg_to_jpg('./dataset/images/val_jpeg', './dataset/images/val_jpg/')