<a href="https://colab.research.google.com/github/JorgeAnsotegui/TFM/blob/main/Balancear_Dataset_Yolo_y_Coco.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive inside the Colab runtime; the dataset paths used by the
# following cells live under /content/drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import json
import random
import shutil
from pathlib import Path
from PIL import Image

# Source locations: YOLO images and labels, COCO images and the COCO annotations file
images_dir_yolo = "/content/drive/MyDrive/TFM/dataset_Jorge_V2/train/images"
images_dir_coco = "/content/drive/MyDrive/TFM/Coco_Limpio"  # folder holding the COCO images
labels_dir = "/content/drive/MyDrive/TFM/dataset_Jorge_V2/train/labels"
coco_annotations_path = "/content/drive/MyDrive/TFM/Coco_Limpio/COCO_Segmentado.json"

# Root folder of the dataset that is being assembled
main_dir = "/content/dataset_Detectron2"

# One folder per split under the root
train_dir, test_dir, val_dir = (
    os.path.join(main_dir, split_name) for split_name in ("train", "test", "val")
)

# Inside every split folder: "images" and "labels" (YOLO format) subfolders
train_images_dir, train_labels_dir = (
    os.path.join(train_dir, content) for content in ("images", "labels")
)
test_images_dir, test_labels_dir = (
    os.path.join(test_dir, content) for content in ("images", "labels")
)
val_images_dir, val_labels_dir = (
    os.path.join(val_dir, content) for content in ("images", "labels")
)

# Helper that makes sure a set of folders exists
def create_directories_if_not_exist(*directories):
    """Create each of the given directories, including missing parents.

    Directories that already exist are left as they are (``exist_ok=True``).
    Calling the function with no arguments does nothing.
    """
    for target in directories:
        os.makedirs(target, exist_ok=True)

# Create the train/test/val folders and their images/labels subfolders if missing
create_directories_if_not_exist(train_dir, test_dir, val_dir)
create_directories_if_not_exist(train_images_dir, train_labels_dir, test_images_dir, test_labels_dir, val_images_dir, val_labels_dir)

# Split proportions. val_percent is informational only: the validation split
# receives whatever is left after train and test, so rounding never drops an image.
train_percent = 0.6
test_percent = 0.2
val_percent = 0.2

# Seed for the shuffle so that re-running the notebook yields the same split
split_seed = 42

# List the YOLO images, keeping only .jpg files. os.listdir returns names in
# arbitrary order, so the list is sorted first to give the seeded shuffle a
# deterministic starting point.
image_files = sorted(file for file in os.listdir(images_dir_yolo) if file.endswith(".jpg"))

# Shuffle with a dedicated generator instead of the global one, so the split
# does not depend on (or disturb) any other use of the random module
random.Random(split_seed).shuffle(image_files)

# Number of files per split
total_images = len(image_files)
num_train = int(total_images * train_percent)
num_test = int(total_images * test_percent)
num_val = total_images - num_train - num_test

# Slice the shuffled list into train, test and val
train_images = image_files[:num_train]
test_images = image_files[num_train:num_train+num_test]
val_images = image_files[num_train+num_test:]

# Helper that copies the YOLO label file belonging to each image
def copy_and_rename_labels(files, source_dir, dest_dir):
    """Copy the ``.txt`` label matching every image name from source_dir to dest_dir.

    For each entry of ``files`` the extension is swapped for ``.txt``; that
    label name is printed and the file is copied into ``dest_dir`` under the
    same name (nothing is actually renamed). A missing label file makes
    ``shutil.copy`` raise.
    """
    for image_name in files:
        stem, _extension = os.path.splitext(image_name)
        label_name = stem + ".txt"
        print(label_name)
        shutil.copy(
            os.path.join(source_dir, label_name),
            os.path.join(dest_dir, label_name),
        )

# Copy the YOLO labels of every split into that split's labels folder
for _split_files, _split_labels_dir in (
    (train_images, train_labels_dir),
    (test_images, test_labels_dir),
    (val_images, val_labels_dir),
):
    copy_and_rename_labels(_split_files, labels_dir, _split_labels_dir)

# Helper that copies a list of files from one folder into another
def copy_files(files, source_dir, dest_images_dir):
    """Copy every name in ``files`` from ``source_dir`` into ``dest_images_dir``.

    The destination is passed to ``shutil.copy`` as given, so when it is an
    existing directory each file keeps its original name inside it.
    """
    for name in files:
        origin = os.path.join(source_dir, name)
        shutil.copy(origin, dest_images_dir)

# Copy the images of every split into that split's images folder
for _split_files, _split_images_dir in (
    (train_images, train_images_dir),
    (test_images, test_images_dir),
    (val_images, val_images_dir),
):
    copy_files(_split_files, images_dir_yolo, _split_images_dir)

# Load the COCO annotations file
with open(coco_annotations_path, 'r') as annotations_file:
    coco_data = json.load(annotations_file)

# One COCO container per split: the info/licenses/categories entries of the
# source file are reused, while each split gets its own empty images and
# annotations lists
train_annotations, val_annotations, test_annotations = (
    {
        "info": coco_data["info"],
        "licenses": coco_data["licenses"],
        "images": [],
        "annotations": [],
        "categories": coco_data["categories"],
    }
    for _ in range(3)
)

# Map every image file name to the split it was assigned to
image_to_set = {
    image_name: split_name
    for split_name, split_files in (
        ('train', train_images),
        ('test', test_images),
        ('val', val_images),
    )
    for image_name in split_files
}

# Helper that rescales the COCO annotations of one image to the YOLO image size
def scale_annotations(image_id, yolo_width, yolo_height, coco_width, coco_height, annotations=None):
    """Rescale, in place, every annotation of ``image_id`` from the COCO size to the YOLO size.

    The bounding box, the polygon segmentation and the ``area`` field are all
    multiplied by the width/height ratios so they stay consistent with each
    other after the resize.

    Args:
        image_id: value matched against each annotation's ``"image_id"``.
        yolo_width, yolo_height: size of the resized (YOLO) image in pixels.
        coco_width, coco_height: size of the original (COCO) image in pixels.
        annotations: list of COCO annotation dicts to update. Defaults to
            ``coco_data["annotations"]`` from the notebook namespace, which is
            what the original implementation always used.

    Raises:
        ZeroDivisionError: if ``coco_width`` or ``coco_height`` is 0.
        TypeError: if a matching annotation stores its segmentation in a
            non-polygon form (e.g. an RLE dict), which cannot be rescaled by
            multiplying coordinates.
    """
    if annotations is None:
        annotations = coco_data["annotations"]

    scale_x = yolo_width / coco_width
    scale_y = yolo_height / coco_height

    for ann in annotations:
        if ann["image_id"] != image_id:
            continue

        # Bounding box is [x, y, width, height]: x-like values scale with the
        # width ratio, y-like values with the height ratio
        bbox = ann["bbox"]
        ann["bbox"] = [
            bbox[0] * scale_x,
            bbox[1] * scale_y,
            bbox[2] * scale_x,
            bbox[3] * scale_y,
        ]

        # The stored area is in squared pixels, so it scales with both ratios;
        # leaving it untouched would make it disagree with the new bbox/polygons
        if ann.get("area") is not None:
            ann["area"] = ann["area"] * scale_x * scale_y

        # Annotations without a segmentation (missing key or empty) need no more work
        segmentation = ann.get("segmentation")
        if not segmentation:
            continue
        if not isinstance(segmentation, list):
            raise TypeError(
                f"Cannot rescale non-polygon segmentation of type "
                f"{type(segmentation).__name__} for image_id {image_id!r}"
            )

        # Polygons are flat [x0, y0, x1, y1, ...] lists: even positions are x, odd are y
        ann["segmentation"] = [
            [coord * (scale_x if i % 2 == 0 else scale_y) for i, coord in enumerate(polygon)]
            for polygon in segmentation
        ]

# Lookup from split name to the COCO container that collects that split
annotations_by_set = {
    'train': train_annotations,
    'val': val_annotations,
    'test': test_annotations,
}

# Route every COCO image that belongs to a split, recording image id -> split
image_id_to_set = {}
for img in coco_data["images"]:
    file_name = img["file_name"]
    if file_name not in image_to_set:
        continue
    set_name = image_to_set[file_name]

    # Read the pixel size of the YOLO copy and of the COCO original
    with Image.open(os.path.join(images_dir_yolo, file_name)) as yolo_img, \
            Image.open(os.path.join(images_dir_coco, file_name)) as coco_img:
        yolo_size = yolo_img.size
        coco_size = coco_img.size

        # When the sizes differ, store the YOLO size on the image entry and
        # rescale that image's annotations to match
        if yolo_size != coco_size:
            img["width"], img["height"] = yolo_size
            scale_annotations(img["id"], yolo_size[0], yolo_size[1], coco_size[0], coco_size[1])

    image_id_to_set[img["id"]] = set_name
    target_split = annotations_by_set.get(set_name)
    if target_split is not None:
        target_split["images"].append(img)

# Send every annotation to the split of the image it belongs to
for ann in coco_data["annotations"]:
    target_split = annotations_by_set.get(image_id_to_set.get(ann["image_id"]))
    if target_split is not None:
        target_split["annotations"].append(ann)

# Write one annotations.json per split folder
for split_dir, split_annotations in (
    (train_dir, train_annotations),
    (val_dir, val_annotations),
    (test_dir, test_annotations),
):
    with open(os.path.join(split_dir, 'annotations.json'), 'w') as out_file:
        json.dump(split_annotations, out_file)

print("División de imágenes, etiquetas en formato YOLO y anotaciones completada.")


100H0015.txt
100H0052.txt
100H0176.txt
100H0070.txt
100H0109.txt
100H0260.txt
100H0146.txt
100H0016.txt
100H0212.txt
100H0111.txt
100H0098.txt
100H0125.txt
100H0044.txt
100H0136.txt
100H0041.txt
100H0074.txt
100H0224.txt
100H0045.txt
100H0094.txt
100H0004.txt
100H0020.txt
100H0185.txt
100H0043.txt
100H0195.txt
100H0123.txt
100H0040.txt
100H0203.txt
100H0263.txt
100H0178.txt
100H0190.txt
100H0238.txt
100H0209.txt
100H0239.txt
100H0154.txt
100H0140.txt
100H0102.txt
100H0235.txt
100H0014.txt
100H0223.txt
100H0107.txt
100H0126.txt
100H0244.txt
100H0023.txt
100H0071.txt
100H0067.txt
100H0027.txt
100H0258.txt
100H0265.txt
100H0012.txt
100H0246.txt
100H0058.txt
100H0077.txt
100H0173.txt
100H0214.txt
100H0247.txt
100H0207.txt
100H0174.txt
100H0211.txt
100H0002.txt
100H0080.txt
100H0169.txt
100H0024.txt
100H0034.txt
100H0221.txt
100H0164.txt
100H0188.txt
100H0236.txt
100H0128.txt
100H0149.txt
100H0035.txt
100H0205.txt
100H0225.txt
100H0231.txt
100H0032.txt
100H0072.txt
100H0145.txt
100H0029.txt

# Comprobación de la división

In [5]:
import json
import pandas as pd
from IPython.display import display

# Paths of the per-split COCO annotation files written by the split cell
train_annotations_path = "/content/dataset_Detectron2/train/annotations.json"
val_annotations_path = "/content/dataset_Detectron2/val/annotations.json"
test_annotations_path = "/content/dataset_Detectron2/test/annotations.json"

# Helper that reads the image file names listed in a COCO annotations file
def get_image_names(annotations_path):
    """Return the sorted ``file_name`` values of the ``images`` list in a COCO JSON file."""
    with open(annotations_path, 'r') as annotations_file:
        image_entries = json.load(annotations_file)['images']
    return sorted(entry['file_name'] for entry in image_entries)

# Read the image names stored in each split's annotations file
train_image_names, val_image_names, test_image_names = map(
    get_image_names,
    (train_annotations_path, val_annotations_path, test_annotations_path),
)

# One column per split; Series are used so columns of different length can coexist
data = dict(
    Train=pd.Series(train_image_names),
    Val=pd.Series(val_image_names),
    Test=pd.Series(test_image_names),
)
df = pd.DataFrame(data)

# Lift the pandas row/column display limits
for _option_name in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_option_name, None)

# Show the table through a Styler carrying scroll-related CSS properties
_table_style = {
    'selector': 'table',
    'props': [('max-height', '300px'), ('display', 'inline-block'), ('overflow', 'scroll')]
}
display(df.style.set_table_styles([_table_style]))

# Optional: print the number of images overall and per split
print(f'Número total de imágenes: {len(train_image_names) + len(val_image_names) + len(test_image_names)}')
print(f'Número total de imágenes en Train: {len(train_image_names)}')
print(f'Número total de imágenes en Val: {len(val_image_names)}')
print(f'Número total de imágenes en Test: {len(test_image_names)}')


Unnamed: 0,Train,Val,Test
0,100H0001.jpg,100H0007.jpg,100H0006.jpg
1,100H0002.jpg,100H0008.jpg,100H0033.jpg
2,100H0003.jpg,100H0010.jpg,100H0036.jpg
3,100H0004.jpg,100H0013.jpg,100H0039.jpg
4,100H0009.jpg,100H0026.jpg,100H0042.jpg
5,100H0011.jpg,100H0053.jpg,100H0046.jpg
6,100H0012.jpg,100H0060.jpg,100H0048.jpg
7,100H0014.jpg,100H0065.jpg,100H0049.jpg
8,100H0015.jpg,100H0069.jpg,100H0050.jpg
9,100H0016.jpg,100H0081.jpg,100H0056.jpg


Número total de imágenes: 237
Número total de imágenes en Train: 143
Número total de imágenes en Val: 49
Número total de imágenes en Test: 45


# Guardamos el nuevo dataset con ambas etiquetas.


In [6]:
# Source directory: the dataset assembled on the Colab local disk
src_dir = "/content/dataset_Detectron2"

# Destination directory on Google Drive
dest_dir = "/content/drive/MyDrive/TFM/dataset_Detectron2_V3"

def copy_directory(src, dest):
    """Recursively copy the directory tree rooted at ``src`` into ``dest``.

    ``dest`` and any subdirectory found under ``src`` are created when
    missing; directories that already exist are reused, so the function can
    be run again over a previous copy. Files are copied with
    ``shutil.copy2`` and overwrite same-named files in the destination.

    Args:
        src: path of the directory to copy from.
        dest: path of the directory to copy into.
    """
    # exist_ok avoids the check-then-create race of a separate os.path.exists
    # test and matches how the rest of the notebook creates folders
    os.makedirs(dest, exist_ok=True)

    for root, dirs, files in os.walk(src):
        # Mirror of `root` inside the destination tree
        rel_root = os.path.relpath(root, src)
        target_root = dest if rel_root == os.curdir else os.path.join(dest, rel_root)

        # Create the subdirectories before os.walk descends into them, so the
        # parent folder of every file copied later already exists (and empty
        # directories are reproduced too)
        for name in dirs:
            os.makedirs(os.path.join(target_root, name), exist_ok=True)

        for name in files:
            shutil.copy2(os.path.join(root, name), os.path.join(target_root, name))

# Copy the assembled dataset from the source directory to the destination directory
copy_directory(src_dir, dest_dir)

print(f"Directorio copiado de {src_dir} a {dest_dir}")

Directorio copiado de /content/dataset_Detectron2 a /content/drive/MyDrive/TFM/dataset_Detectron2_V3
