<a href="https://colab.research.google.com/github/JorgeAnsotegui/TFM/blob/main/Balancear_Dataset_Yolo_y_Coco.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive inside the Colab runtime at /content/drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import json
import random
import shutil
from pathlib import Path

# Paths of the original image folder, the original label folder and the COCO annotation file
images_dir = "/content/drive/MyDrive/TFM/dataset_Jorge_V2/train/images"
labels_dir = "/content/drive/MyDrive/TFM/dataset_Jorge_V2/train/labels"
coco_annotations_path = "/content/drive/MyDrive/TFM/COCO_Segmentado.json"

# Root folder of the final dataset
main_dir = "/content/dataset_Detectron2"

# Per-split subfolders of the final dataset
train_dir = os.path.join(main_dir, "train")
test_dir = os.path.join(main_dir, "test")
val_dir = os.path.join(main_dir, "val")

# Image and YOLO-format label folders for the train, test and val splits
train_images_dir = os.path.join(train_dir, "images")
train_labels_dir = os.path.join(train_dir, "labels")
test_images_dir = os.path.join(test_dir, "images")
test_labels_dir = os.path.join(test_dir, "labels")
val_images_dir = os.path.join(val_dir, "images")
val_labels_dir = os.path.join(val_dir, "labels")

def create_directories_if_not_exist(*directories):
    """Ensure that every directory passed as an argument exists.

    Missing parent directories are created as well; directories that already
    exist are left untouched.

    Args:
        *directories: Any number of directory paths to create.
    """
    for target_path in directories:
        os.makedirs(target_path, exist_ok=True)

# Create the train, test and val folders (and their images/labels subfolders) if missing
create_directories_if_not_exist(train_dir, test_dir, val_dir)
create_directories_if_not_exist(train_images_dir, train_labels_dir, test_images_dir, test_labels_dir, val_images_dir, val_labels_dir)

# Proportions for train, test and val
train_percent = 0.6
test_percent = 0.2
val_percent = 0.2

# val receives whatever is left after train and test, so val_percent is only
# honoured when the three proportions add up to 1.
if abs(train_percent + test_percent + val_percent - 1.0) > 1e-9:
    raise ValueError("train_percent + test_percent + val_percent must add up to 1")

# Fixed seed: re-running this cell must reproduce the same split. The output
# folders are reused and never emptied, so a different shuffle on a second run
# would leave the same image in more than one split.
split_seed = 42

# Keep only the .jpg files. os.listdir returns names in arbitrary order, so
# sort first to give the seeded shuffle a deterministic starting point.
image_files = sorted(file for file in os.listdir(images_dir) if file.endswith(".jpg"))

# Shuffle with a private seeded generator (leaves the global random state alone)
random.Random(split_seed).shuffle(image_files)

# Number of files in each split; val takes the remainder so no image is dropped
total_images = len(image_files)
num_train = int(total_images * train_percent)
num_test = int(total_images * test_percent)
num_val = total_images - num_train - num_test

# Slice the shuffled list into train, test and val
train_images = image_files[:num_train]
test_images = image_files[num_train:num_train+num_test]
val_images = image_files[num_train+num_test:]

def copy_and_rename_labels(files, source_dir, dest_dir):
    """Copy the YOLO label file that belongs to each image into ``dest_dir``.

    The label name is the image name with its extension replaced by ``.txt``.
    Despite the function's name, the label keeps that same name in the
    destination. The name of every copied label is printed.

    Images that have no label file are skipped with a warning. Previously the
    first missing label raised ``FileNotFoundError`` and aborted the whole
    split half-way through.

    Args:
        files: Image file names (for example ``"100H0093.jpg"``).
        source_dir: Directory that holds the original ``.txt`` label files.
        dest_dir: Existing directory that receives the copies.
    """
    for file in files:
        label_file = os.path.splitext(file)[0] + ".txt"
        source_label_path = os.path.join(source_dir, label_file)
        # Check the source explicitly instead of catching FileNotFoundError:
        # a missing dest_dir raises the same exception and must not be hidden.
        if not os.path.isfile(source_label_path):
            print(f"WARNING: label file not found, skipped: {source_label_path}")
            continue
        print(label_file)
        shutil.copy(source_label_path, os.path.join(dest_dir, label_file))

# Copy the YOLO labels of each split into that split's labels folder
for split_files, split_labels_dir in (
    (train_images, train_labels_dir),
    (test_images, test_labels_dir),
    (val_images, val_labels_dir),
):
    copy_and_rename_labels(split_files, labels_dir, split_labels_dir)

def copy_files(files, source_dir, dest_images_dir):
    """Copy the named files from ``source_dir`` into ``dest_images_dir``.

    Args:
        files: File names, relative to ``source_dir``.
        source_dir: Directory the files are read from.
        dest_images_dir: Destination passed to ``shutil.copy`` for every file.
    """
    for filename in files:
        source_path = os.path.join(source_dir, filename)
        shutil.copy(source_path, dest_images_dir)

# Copy the images of each split into that split's images folder
for split_files, split_images_dir in (
    (train_images, train_images_dir),
    (test_images, test_images_dir),
    (val_images, val_images_dir),
):
    copy_files(split_files, images_dir, split_images_dir)

# Load the full COCO annotation file
with open(coco_annotations_path, 'r') as f:
    coco_data = json.load(f)

# One COCO skeleton per split: info, licenses and categories are shared with
# the source file, images and annotations start out empty
split_annotations = {
    split: {
        "info": coco_data["info"],
        "licenses": coco_data["licenses"],
        "images": [],
        "annotations": [],
        "categories": coco_data["categories"],
    }
    for split in ("train", "val", "test")
}
train_annotations = split_annotations["train"]
val_annotations = split_annotations["val"]
test_annotations = split_annotations["test"]

# Map every image file name to the split it was assigned to
image_to_set = {}
for split, names in (("train", train_images), ("test", test_images), ("val", val_images)):
    image_to_set.update(dict.fromkeys(names, split))

# Map every COCO image id to its split and hand each image record to that split
image_id_to_set = {}
for image_record in coco_data["images"]:
    split = image_to_set.get(image_record["file_name"])
    if split is None:
        # COCO image whose file name is not among the split images: ignored
        continue
    image_id_to_set[image_record["id"]] = split
    split_annotations[split]["images"].append(image_record)

# Hand each annotation to the split that owns its image
for annotation in coco_data["annotations"]:
    split = image_id_to_set.get(annotation["image_id"])
    if split is not None:
        split_annotations[split]["annotations"].append(annotation)

# Write one annotations.json per split folder
for split_dir, split_data in (
    (train_dir, train_annotations),
    (val_dir, val_annotations),
    (test_dir, test_annotations),
):
    with open(os.path.join(split_dir, 'annotations.json'), 'w') as f:
        json.dump(split_data, f)

print("División de imágenes, etiquetas en formato YOLO y anotaciones completada.")


100H0093.txt
100H0140.txt
100H0192.txt
100H0006.txt
100H0164.txt
100H0010.txt
100H0143.txt
100H0156.txt
100H0194.txt
100H0198.txt
100H0024.txt
100H0080.txt
100H0005.txt
100H0172.txt
100H0047.txt
100H0033.txt
100H0238.txt
100H0046.txt
100H0190.txt
100H0074.txt
100H0015.txt
100H0034.txt
100H0049.txt
100H0206.txt
100H0173.txt
100H0025.txt
100H0071.txt
100H0136.txt
100H0127.txt
100H0253.txt
100H0153.txt
100H0168.txt
100H0169.txt
100H0091.txt
100H0019.txt
100H0053.txt
100H0039.txt
100H0231.txt
100H0135.txt
100H0044.txt
100H0112.txt
100H0146.txt
100H0180.txt
100H0099.txt
100H0011.txt
100H0256.txt
100H0029.txt
100H0123.txt
100H0064.txt
100H0030.txt
100H0085.txt
100H0201.txt
100H0216.txt
100H0103.txt
100H0106.txt
100H0247.txt
100H0129.txt
100H0196.txt
100H0210.txt
100H0179.txt
100H0023.txt
100H0133.txt
100H0207.txt
100H0069.txt
100H0175.txt
100H0119.txt
100H0159.txt
100H0013.txt
100H0087.txt
100H0100.txt
100H0014.txt
100H0058.txt
100H0189.txt
100H0263.txt
100H0157.txt
100H0050.txt
100H0214.txt

# Comprobación de la división

In [3]:
import json

# Path of the COCO annotation file to inspect
coco_annotations_path = "/content/dataset_Detectron2/train/annotations.json"

# Load the annotation file
with open(coco_annotations_path, 'r') as f:
    coco_data = json.load(f)

# Image file names listed in the file, sorted alphanumerically
image_names = sorted(entry['file_name'] for entry in coco_data['images'])

# Print one name per line
for name in image_names:
    print(name)

# Optional: print the total number of images
print(f'Número total de imágenes: {len(image_names)}')


100H0003.jpg
100H0004.jpg
100H0006.jpg
100H0007.jpg
100H0010.jpg
100H0011.jpg
100H0013.jpg
100H0014.jpg
100H0015.jpg
100H0019.jpg
100H0023.jpg
100H0024.jpg
100H0025.jpg
100H0028.jpg
100H0029.jpg
100H0030.jpg
100H0033.jpg
100H0034.jpg
100H0036.jpg
100H0039.jpg
100H0042.jpg
100H0044.jpg
100H0045.jpg
100H0046.jpg
100H0047.jpg
100H0049.jpg
100H0050.jpg
100H0051.jpg
100H0052.jpg
100H0053.jpg
100H0056.jpg
100H0057.jpg
100H0058.jpg
100H0060.jpg
100H0063.jpg
100H0064.jpg
100H0065.jpg
100H0066.jpg
100H0069.jpg
100H0071.jpg
100H0074.jpg
100H0075.jpg
100H0078.jpg
100H0080.jpg
100H0081.jpg
100H0085.jpg
100H0086.jpg
100H0087.jpg
100H0091.jpg
100H0093.jpg
100H0099.jpg
100H0100.jpg
100H0102.jpg
100H0103.jpg
100H0105.jpg
100H0106.jpg
100H0108.jpg
100H0109.jpg
100H0110.jpg
100H0112.jpg
100H0115.jpg
100H0119.jpg
100H0123.jpg
100H0124.jpg
100H0125.jpg
100H0127.jpg
100H0129.jpg
100H0130.jpg
100H0131.jpg
100H0133.jpg
100H0135.jpg
100H0136.jpg
100H0137.jpg
100H0139.jpg
100H0140.jpg
100H0143.jpg
100H0145.jpg

# Guardamos en Google Drive el nuevo dataset con ambos formatos de etiquetas (YOLO y COCO).


In [4]:
# Source directory
src_dir = "/content/dataset_Detectron2"

# Destination directory
dest_dir = "/content/drive/MyDrive/TFM/dataset_Detectron2"

def copy_directory(src, dest):
    """Recursively copy the directory tree ``src`` into ``dest``.

    ``dest`` and any missing subdirectories are created. Files are copied with
    ``shutil.copy2`` and files that already exist in ``dest`` are overwritten.
    Empty subdirectories are reproduced as well.

    Args:
        src: Existing directory to copy from.
        dest: Directory to copy into; created if it does not exist.

    Raises:
        FileNotFoundError: If ``src`` is not an existing directory. Without
            this check ``os.walk`` yields nothing for a bad path and the copy
            would silently "succeed" with an empty destination.
    """
    if not os.path.isdir(src):
        raise FileNotFoundError(f"Source directory not found: {src}")

    os.makedirs(dest, exist_ok=True)

    for root, dirs, files in os.walk(src):
        # Location under dest that mirrors the directory currently being walked
        rel_root = os.path.relpath(root, src)
        target_root = dest if rel_root == os.curdir else os.path.join(dest, rel_root)

        # Create subdirectories here so that empty ones are reproduced too
        for name in dirs:
            os.makedirs(os.path.join(target_root, name), exist_ok=True)

        # target_root already exists: it is dest itself, or it was created
        # while its parent directory was being visited (top-down walk)
        for name in files:
            shutil.copy2(os.path.join(root, name), os.path.join(target_root, name))

# Copy the source directory into the destination directory
copy_directory(src_dir, dest_dir)

print(f"Directorio copiado de {src_dir} a {dest_dir}")

Directorio copiado de /content/dataset_Detectron2 a /content/drive/MyDrive/TFM/dataset_Detectron2
