# Access to Google drive folder

Allow access to our drive

In [None]:
# Mount Google Drive inside the Colab runtime so the dataset folder is reachable as a normal path
from google.colab import drive
drive.mount('/content/drive', force_remount=True) # Mount point is /content/drive; force_remount=True is passed so the mount is redone on every run

Mounted at /content/drive


## Load libraries

In [None]:
# General libraries
import os # to copy, move, etc directories 
import json
import shutil # copiar las carpetas

from pycocotools.coco import COCO
import pandas as pd # data processing
from collections import defaultdict # Dictionaries to handle the annotations

from sklearn.model_selection import train_test_split

# PREPROCESS DATA

The TACO dataset is labeled in COCO format. To train with YOLOv7, the image labels need to be converted to the YOLO labeling format, and the images and their labels need to be arranged in a specific folder structure.

Folders structure:

    images
        - train
            - img1.jpg
            ...
        - val
        - test
    labels
        - train
            - img1.txt
            ...
        - val
        - test
     
* *images*: Each folder within the 'images' folder holds the labeled images of the corresponding subset (train, val and test).

* *labels*: Each folder within the 'labels' folder holds one txt file per image of the corresponding subset, with the same name as the image.

* *txt structure*:

    Example: 0 0.69568 0.31823 0.075 0.5362

    * The first number is the class label.
    * The rest are the bounding box coordinates in *x_center*, *y_center*, *width*, *height* format, each divided by the image width or height.
    

## Create directories for the folders


Go to folder that stores the dataset


In [None]:
# List the files and directories of the current working directory (shell command run through IPython's `!`)
!ls

drive  sample_data


In [None]:
# Change from the current working directory to the project directory on the mounted Drive.
# The relative paths used by the shell cells below (data, split_dataset) are resolved from here.
%cd /content/drive/MyDrive/Portafolio/ComputerVision/Object_Detection/Waste_in_the wild/taco-trash-dataset

/content/drive/MyDrive/Portafolio/ComputerVision/Object_Detection/Waste_in_the wild/taco-trash-dataset


In [None]:
# Show the full path of the current directory, then list its content
!pwd
!ls

/content/drive/MyDrive/Portafolio/ComputerVision/Object_Detection/Waste_in_the wild/taco-trash-dataset
data


Create the 'images' folder structure (inside split_dataset folder)

In [None]:
# mkdir creates directories
# -p also creates any missing parent directory (here split_dataset and split_dataset/images)
# and does not fail when a directory already exists, so this cell can be re-run
!mkdir -p split_dataset/images/train split_dataset/images/val split_dataset/images/test 

Create the 'labels' folder structure (inside split_dataset folder)

In [None]:
# Create the train/val/test folders under split_dataset/labels (-p creates missing parents)
!mkdir -p split_dataset/labels/train split_dataset/labels/val split_dataset/labels/test 

In [None]:
# Show the folder structure created: project root, split_dataset, then its images and labels folders
!ls
!ls 'split_dataset'
!ls 'split_dataset/images'
!ls 'split_dataset/labels'

data  split_dataset
images	labels
test  train  val
test  train  val


## Prepare Dataset

In [None]:
# PATHS

# Root of the dataset on the mounted Drive and the names of its two sub-folders
DATASET_PATH = '/content/drive/MyDrive/Portafolio/ComputerVision/Object_Detection/Waste_in_the wild/taco-trash-dataset'
DATA_DIR = 'data'
SPLIT_DATA_DIR = 'split_dataset'

# Annotations file inside the raw data folder, and root folder of the split dataset
ANNOTATIONS_FILE = os.path.join(DATASET_PATH, DATA_DIR, 'annotations.json')
SPLIT_DATA_PATH = os.path.join(DATASET_PATH, SPLIT_DATA_DIR)

# split_dataset/images and split_dataset/labels
IMAGES_PATH, LABELS_PATH = (
    os.path.join(SPLIT_DATA_PATH, kind) for kind in ('images', 'labels')
)

# One folder per subset under images/ ...
TRAIN_IMAGES_PATH, VALID_IMAGES_PATH, TEST_IMAGES_PATH = (
    os.path.join(IMAGES_PATH, subset) for subset in ('train', 'val', 'test')
)

# ... and the matching folder per subset under labels/
TRAIN_LABELS_PATH, VALID_LABELS_PATH, TEST_LABELS_PATH = (
    os.path.join(LABELS_PATH, subset) for subset in ('train', 'val', 'test')
)

In [None]:
# Read annotations.json file into a Python dictionary.
# The encoding is given explicitly so decoding does not depend on the runtime's locale default.
with open(ANNOTATIONS_FILE, 'r', encoding='utf-8') as json_file:
    data_json = json.load(json_file)

In [None]:
# data_json stores a dictionary; show its top-level keys
data_json.keys()

dict_keys(['info', 'images', 'annotations', 'scene_annotations', 'licenses', 'categories', 'scene_categories'])

In [None]:
# Store in a list the full path of every image listed in the annotations file.
# Each 'file_name' is joined onto the raw data folder (DATASET_PATH/DATA_DIR).
path_imgs = [os.path.join(DATASET_PATH, DATA_DIR, img['file_name']) for img in data_json['images']]

# Preview the first five paths
path_imgs[:5]

['/content/drive/MyDrive/Portafolio/ComputerVision/Object_Detection/Waste_in_the wild/taco-trash-dataset/data/batch_1/000006.jpg',
 '/content/drive/MyDrive/Portafolio/ComputerVision/Object_Detection/Waste_in_the wild/taco-trash-dataset/data/batch_1/000008.jpg',
 '/content/drive/MyDrive/Portafolio/ComputerVision/Object_Detection/Waste_in_the wild/taco-trash-dataset/data/batch_1/000010.jpg',
 '/content/drive/MyDrive/Portafolio/ComputerVision/Object_Detection/Waste_in_the wild/taco-trash-dataset/data/batch_1/000019.jpg',
 '/content/drive/MyDrive/Portafolio/ComputerVision/Object_Detection/Waste_in_the wild/taco-trash-dataset/data/batch_1/000026.jpg']

## Split dataset

Break off validation set and test set from training data 

In [None]:
# Split the list of image paths into a train subset and a held-out subset
# 80% train subset
# 20% held-out subset (named val here; the next cell splits it again into val and test)
train_imgs, val_imgs = train_test_split(path_imgs, train_size= 0.8, random_state= 1) # train_test_split from scikit-learn; random_state is fixed at 1

Break off test set from validation set

In [None]:
# test_size=0.05 takes 5% of the held-out 20%, so relative to the full dataset:
# 19% val
# 1% test
# NOTE(review): val_imgs is overwritten with its own subset, so re-running this cell alone shrinks it again.
val_imgs, test_imgs = train_test_split(val_imgs, test_size=0.05, random_state=1)

In [None]:
# Show the number of image paths in each subset (train, val, test), one per line
print(len(train_imgs), len(val_imgs), len(test_imgs), sep='\n')

1200
285
15


Copy the images to the corresponding folder (images/train, images/val, images/test) and rename files to unique names in the destination directory

In [None]:
# Copy a list of images to another directory
# Returns a list with the (changed) names of the copied images

def copy_images (list_paths, destination_dir) :
    """Copy image files into ``destination_dir`` under their renamed file names.

    Parameters:
        list_paths: iterable of source image paths.
        destination_dir: existing directory that receives the copies.

    Returns:
        List with the new file name (as produced by ``rename_file``) of every
        image that was actually copied, in input order. Paths that are not
        existing files are skipped silently and failed copies are reported
        with a printed message, so the result can be shorter than
        ``list_paths``.
    """
    list_names = []
    for path in list_paths:
        try:
            if not os.path.isfile(path):
                continue
            new_name = rename_file(path)
            shutil.copy(path, os.path.join(destination_dir, new_name)) # copy and rename image file
        except Exception as error:
            # Best effort: report the failure and go on with the next image.
            # Catching Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit stop a long-running copy.
            print('COULD NOT COPY THE FILE {}: {}'.format(path, error))
        else:
            list_names.append(new_name)
    return list_names

# Rename file
def rename_file (original_name):
    """Return '<parent folder>-<file name>' built from the last two '/'-separated parts of the path."""
    parts = original_name.split('/')
    parent_dir, base_name = parts[-2], parts[-1]
    return parent_dir + '-' + base_name # example .../batch_x/000001.jpg -> batch_x-000001.jpg

In [None]:
# Copy the images of each subset into its folder and keep the list of new file names that were copied
copied_train_img = copy_images(train_imgs, TRAIN_IMAGES_PATH) # Copy train images to images/train
copied_valid_img = copy_images(val_imgs, VALID_IMAGES_PATH) # Copy validation images to images/val
copied_test_img = copy_images(test_imgs, TEST_IMAGES_PATH) # Copy test images to images/test

In [None]:
# Show how many file names each copy returned (train, val, test), one per line
print(len(copied_train_img), len(copied_valid_img), len(copied_test_img), sep='\n')

1200
285
15


In [None]:
# Display the new file names of the images copied into images/test
copied_test_img

['batch_15-000021.jpg',
 'batch_6-000094.JPG',
 'batch_14-000030.jpg',
 'batch_4-000085.JPG',
 'batch_15-000035.jpg',
 'batch_3-IMG_5064.JPG',
 'batch_3-IMG_4860.JPG',
 'batch_14-000046.jpg',
 'batch_5-000023.JPG',
 'batch_14-000024.jpg',
 'batch_4-000008.JPG',
 'batch_10-000003.jpg',
 'batch_13-000007.jpg',
 'batch_13-000020.jpg',
 'batch_4-000004.JPG']

#### Collect labels

Get the annotations of the images

Get image id and create a list of dictionaries with only the necessary annotation data (id_category, bbox)

In [None]:
# Load the annotations file with pycocotools; it is used below to look up image records by id
data_source = COCO(annotation_file= ANNOTATIONS_FILE)


img_id_with_anns = defaultdict(list) # maps image_id -> list of annotation dictionaries (missing ids start as an empty list)

# For each image id it stores its annotations (category_id and normalized bbox) as dictionaries
for ann in data_json['annotations']: 
    image_id = ann['image_id']
    img_info = data_source.loadImgs(image_id)[0] # image record (not the annotations) for the corresponding image_id
    
    # get width and height of image
    width_img = img_info['width']
    height_img = img_info['height']
    # get bbox coordinates (top-left x, top-left y, width, height)
    (x,y,w,h) = ann['bbox']

    # Filter out degenerate boxes before normalizing
    if w < 1 or h < 1: # skip annotations whose width/height is smaller than 1 (presumably pixels)
      continue
    # Divide by the image size so the values are relative to the image, and convert
    # top_x,top_y,width,height ==> cen_x,cen_y,width,height
    # NOTE(review): values are not clamped, so a box reaching outside the image yields values above 1
    coor_x = round((x + w / 2) / width_img, 6)
    coor_y = round((y + h / 2) / height_img, 6)
    width = round(w / width_img, 6)
    height = round(h / height_img, 6) 
    
    # Get only the necessary annotation data
    temp = {
        'category_id' : ann['category_id'],
        'coor_x' :  coor_x,
        'coor_y' :  coor_y,
        'width' :  width,
        'height' :  height
    }
    img_id_with_anns[image_id].append(temp)

loading annotations into memory...
Done (t=0.07s)
creating index...
index created!


In [None]:
# Lookup table from an image's 'file_name' (as written in the annotations file) to its image id
info_img = {img['file_name']: img['id'] for img in data_json['images']}

In [None]:
# Get annotations from a list of images

# Parameter: a list with the changed names of the images
# Returns a list with the annotations corresponding to the images of the list it received

def get_img_anns(list_imgs_names) :
    """Collect the annotation list of each renamed image, in input order.

    Parameters:
        list_imgs_names: list of file names in the '<folder>-<file name>'
            form produced by ``rename_file``.

    Returns:
        A list with one entry per image: the list of annotation dictionaries
        stored in ``img_id_with_anns`` for that image (empty when the image
        has no stored annotations).

    Raises:
        KeyError: if a rebuilt file name is not a key of ``info_img``.
    """
    temp_anns = []

    for img_name in list_imgs_names:
        # Undo the renaming: only the FIRST '-' is the folder/file separator, so a
        # dash inside the file name itself must be left untouched.
        # Assumes folder names contain no '-' (true for the batch_N folders seen here).
        file_name_img = img_name.replace('-', '/', 1) # Get file name
        img_id = info_img[file_name_img] # Get image id
        img_anns = img_id_with_anns[img_id] # Get annotations

        temp_anns.append(img_anns)

    return temp_anns

In [None]:
# Get a list of annotations for each subset (one entry per copied image, in the same order)

anns_train = get_img_anns(copied_train_img)
anns_valid = get_img_anns(copied_valid_img)
anns_test = get_img_anns(copied_test_img)
# Preview the annotations of the first five test images
anns_test[:5]

[[{'category_id': 57,
   'coor_x': 0.61556,
   'coor_y': 0.142822,
   'width': 0.064453,
   'height': 0.057129},
  {'category_id': 57,
   'coor_x': 0.563151,
   'coor_y': 0.444824,
   'width': 0.115885,
   'height': 0.05957},
  {'category_id': 39,
   'coor_x': 0.339193,
   'coor_y': 0.529053,
   'width': 0.226562,
   'height': 0.174316},
  {'category_id': 39,
   'coor_x': 0.421875,
   'coor_y': 0.671387,
   'width': 0.302083,
   'height': 0.199219}],
 [{'category_id': 36,
   'coor_x': 0.525327,
   'coor_y': 0.629749,
   'width': 0.181373,
   'height': 0.116728}],
 [{'category_id': 7,
   'coor_x': 0.473049,
   'coor_y': 0.562872,
   'width': 0.148479,
   'height': 0.109375}],
 [{'category_id': 12,
   'coor_x': 0.335018,
   'coor_y': 0.538399,
   'width': 0.070772,
   'height': 0.041667},
  {'category_id': 59,
   'coor_x': 0.561275,
   'coor_y': 0.335784,
   'width': 0.006127,
   'height': 0.017974},
  {'category_id': 59,
   'coor_x': 0.367647,
   'coor_y': 0.880719,
   'width': 0.009804

#### Create txt files

In [None]:
# Parameters:
# imgs_filename - list of names of the images for which it will create the txt
# info_labels - list of annotations for each image (same order as imgs_filename)
# destination_path - path where the txt will be saved

# Every image has a separate text file containing the class label and annotations for each object in a new line.
def create_txts (imgs_filename, info_labels, destination_path):
    """Write one label txt file per image into ``destination_path``.

    Parameters:
        imgs_filename: list of image file names; each txt is named after the
            image with its extension replaced by '.txt'.
        info_labels: list where ``info_labels[i]`` is the list of annotation
            dictionaries of ``imgs_filename[i]``; each dictionary has the keys
            'category_id', 'coor_x', 'coor_y', 'width' and 'height'.
        destination_path: existing directory where the txt files are written.

    Each annotation becomes one line:
    ``category_id coor_x coor_y width height``. An image with no annotations
    gets an empty file.
    """
    for index, img_name in enumerate(imgs_filename):
        # Strip only the final extension so names with extra dots keep their full stem
        # (e.g. 'img.v2.jpg' -> 'img.v2.txt', not 'img.txt')
        txt_name = os.path.splitext(img_name)[0] + '.txt'
        path_txt = os.path.join(destination_path, txt_name)

        # One line per annotation dictionary of this image
        lines = [
            '{} {} {} {} {}\n'.format(
                ann['category_id'], ann['coor_x'], ann['coor_y'], ann['width'], ann['height']
            )
            for ann in info_labels[index]
        ]

        # Create txt file
        with open(path_txt, mode='w') as file_txt:
            file_txt.writelines(lines)

In [None]:
# Create the txt label files of each subset, pairing each list of copied image names with its annotations

create_txts(copied_train_img, anns_train, TRAIN_LABELS_PATH) # Create txt for training images into labels/train
create_txts(copied_valid_img, anns_valid, VALID_LABELS_PATH) # Create txt for validation images into labels/val
create_txts(copied_test_img, anns_test, TEST_LABELS_PATH) # Create txt for testing images into labels/test