# installations

In [None]:
if os.environ.get('COLAB_GPU') is not None:
    !pip install ultralytics
    !pip install optuna

elif os.path.exists("/kaggle"):  # Kaggle
    !pip install ultralytics
    !pip install optuna


# pipeline_setup.py

In [None]:
import os
import shutil
from sklearn.model_selection import train_test_split
import json
from pprint import pprint
import albumentations as A
import cv2
import yaml
import sys

# === Configuración Inicial ===
def detect_environment():
    """
    Identify the runtime this code is executing in.

    Colab is recognised by ``google.colab`` already being present in
    ``sys.modules``; Kaggle by the existence of a ``/kaggle`` directory.
    Anything else is reported as a local run. The result is also
    pretty-printed.

    Returns:
    - str: ``"colab"``, ``"kaggle"`` or ``"local"``.
    """
    # Start from the fallback and override it when a hosted runtime is found.
    environment = "local"
    if "google.colab" in sys.modules:
        environment = "colab"
    elif os.path.exists("/kaggle"):
        environment = "kaggle"

    pprint({"Detected Environment": environment})
    return environment


def _configure_colab_kaggle_credentials():
    """
    Make Kaggle API credentials available at /root/.kaggle/kaggle.json on Colab.

    A kaggle.json found in the current directory is moved into place;
    otherwise the file is generated from the Colab secrets 'KaggleUser' and
    'KaggleToken'.

    Raises:
    - EnvironmentError: If either Colab secret is missing or empty.
    """
    # Imported lazily because the module only exists inside a Colab runtime.
    from google.colab import userdata

    kaggle_path = "kaggle.json"
    credentials_path = "/root/.kaggle/kaggle.json"
    os.makedirs("/root/.kaggle", exist_ok=True)

    if os.path.exists(kaggle_path):
        shutil.move(kaggle_path, credentials_path)
        print("[INFO] Archivo kaggle.json movido a /root/.kaggle/")
    else:
        kaggle_user = userdata.get('KaggleUser')
        kaggle_token = userdata.get('KaggleToken')
        if not kaggle_user or not kaggle_token:
            raise EnvironmentError("[ERROR] No se encontraron las credenciales de Kaggle en Google Colab.")
        with open(credentials_path, "w") as f:
            json.dump({"username": kaggle_user, "key": kaggle_token}, f)
        print("[INFO] Credenciales de Kaggle configuradas en Google Colab.")

    # Restrict the credentials file to its owner.
    os.chmod(credentials_path, 0o600)


def _download_lego_dataset(target_dir, force=False):
    """
    Download and unzip the LEGO dataset into target_dir with the Kaggle CLI.

    Parameters:
    - target_dir (str): Destination folder (created when missing).
    - force (bool): When False, the download is skipped if target_dir already
      contains anything.

    Raises:
    - EnvironmentError: If the kaggle command exits with a non-zero status.
    """
    os.makedirs(target_dir, exist_ok=True)
    if not force and os.listdir(target_dir):
        return

    exit_status = os.system(
        f"kaggle datasets download -d migueldilalla/spiled-lego-bricks -p {target_dir} --unzip"
    )
    # Fail here instead of letting a later step trip over missing folders.
    if exit_status != 0:
        raise EnvironmentError(
            f"[ERROR] La descarga del dataset desde Kaggle falló (código de salida {exit_status})."
        )


def _dataset_paths(dataset_path, output_path):
    """Build the dictionary of raw image/label folders and output folder."""
    return {
        "raw_images_path": os.path.join(dataset_path, "Images_600x800"),
        "raw_labels_path": os.path.join(dataset_path, "LabelMe_txt_bricks"),
        "output_path": output_path
    }


def setup_environment(base_path="/kaggle/working/output"):
    """
    Prepare the dataset for the detected environment and report where it lives.

    On Kaggle the attached input dataset is verified in place. On Colab the
    Kaggle credentials are configured and the dataset is always downloaded.
    Locally the dataset is downloaded only when its folder is still empty.

    Parameters:
    - base_path (str): Output folder. It is only honoured on Kaggle; on Colab
      and locally the output folder is always "<cwd>/working/output".

    Returns:
    - dict: Keys "raw_images_path", "raw_labels_path" and "output_path".

    Raises:
    - FileNotFoundError: On Kaggle, if a required dataset folder is missing.
    - EnvironmentError: If Kaggle credentials are unavailable or the dataset
      download fails.
    """
    environment = detect_environment()
    print(f"\n[INFO] Entorno detectado: {environment.capitalize()}\n")

    if environment == "kaggle":
        dataset_path = "/kaggle/input/spiled-lego-bricks"
        for folder in ("Images_600x800", "LabelMe_txt_bricks"):
            full_path = os.path.join(dataset_path, folder)
            if not os.path.exists(full_path):
                raise FileNotFoundError(f"[ERROR] Carpeta requerida no encontrada: {full_path}")
            print(f"[INFO] Carpeta verificada: {full_path}")
        return _dataset_paths(dataset_path, base_path)

    if environment == "colab":
        _configure_colab_kaggle_credentials()
        dataset_path = "working/spiled-lego-bricks"
        _download_lego_dataset(dataset_path, force=True)
        # Relative on purpose: it must be the same folder that is returned
        # below (the previous "/working/output" pointed at the filesystem root).
        os.makedirs("working/output", exist_ok=True)
        return _dataset_paths(dataset_path, os.path.join(os.getcwd(), "working", "output"))

    if environment == "local":
        kaggle_json_path = os.path.expanduser("~/.kaggle/kaggle.json")
        if not os.path.exists(kaggle_json_path):
            raise EnvironmentError("[ERROR] Archivo kaggle.json no encontrado en ~/.kaggle/")
        dataset_path = "working/spiled-lego-bricks"
        _download_lego_dataset(dataset_path)
        os.makedirs("working/output", exist_ok=True)
        return _dataset_paths(dataset_path, os.path.join(os.getcwd(), "working", "output"))

    # NOTE(review): detect_environment() currently returns only "kaggle",
    # "colab" or "local", so this interactive fallback is not expected to run.
    while True:
        user_input = input("[PROMPT] No se detectó un entorno. Por favor, escribe 'k' para Kaggle, 'g' para Google Colab, o 'l' para Local: ").strip().lower()
        if user_input in ["k", "g", "l"]:
            return setup_environment_custom(user_input, base_path)
        print("[ERROR] Entrada inválida. Intenta nuevamente.")

def setup_environment_custom(choice, base_path):
    """
    Run the environment setup for a manually chosen environment.

    Parameters:
    - choice (str): 'k' for Kaggle, 'g' for Colab, 'l' for Local.
    - base_path (str): Output base path. Accepted for interface
      compatibility; it is not forwarded to setup_environment.

    Returns:
    - dict: Whatever setup_environment returns.

    Raises:
    - EnvironmentError: If choice is not one of 'k', 'g' or 'l'.
    """
    if choice not in ("k", "g", "l"):
        raise EnvironmentError("[ERROR] Configuración desconocida.")

    # 'k' relies on setup_environment's default output folder; the other two
    # choices ask for the relative "working" folder.
    forwarded = {} if choice == "k" else {"base_path": "working"}
    return setup_environment(**forwarded)

def verify_dataset_structure(raw_images_path, raw_labels_path):
    """
    Check that both raw dataset folders exist and hold at least one file.

    Only regular files directly inside each folder are counted. The per-folder
    counts are pretty-printed once both folders pass.

    Parameters:
    - raw_images_path (str): Folder with the raw images.
    - raw_labels_path (str): Folder with the raw labels.

    Raises:
    - FileNotFoundError: If a folder does not exist.
    - ValueError: If a folder contains no files.
    """
    file_counts = {}
    for directory in (raw_images_path, raw_labels_path):
        if not os.path.exists(directory):
            raise FileNotFoundError(f"[ERROR] Carpeta requerida no encontrada: {directory}")

        file_total = sum(
            1
            for entry in os.listdir(directory)
            if os.path.isfile(os.path.join(directory, entry))
        )
        if not file_total:
            raise ValueError(f"[ERROR] La carpeta {directory} está vacía.")
        file_counts[directory] = file_total

    pprint({"Dataset Estructura": file_counts})

def create_preprocessing_structure(output_dir="/kaggle/working/output"):
    """
    Create the folder tree used by the preprocessing pipeline.

    Under output_dir this creates dataset/images/{train,val,test},
    dataset/labels/{train,val,test} and test_images. Folders that already
    exist are left untouched.

    Parameters:
    - output_dir (str): Base folder for the preprocessing output.
    """
    os.makedirs(output_dir, exist_ok=True)

    relative_dirs = [
        f"dataset/{kind}/{split}"
        for kind in ("images", "labels")
        for split in ("train", "val", "test")
    ]
    relative_dirs.append("test_images")

    for relative in relative_dirs:
        os.makedirs(os.path.join(output_dir, relative), exist_ok=True)
    print(f"[INFO] Estructura de carpetas creada en {output_dir}.")

def copy_and_partition_data(input_images, input_labels, output_dir):
    """
    Split the image/label pairs into train/val/test and copy them in place.

    Images (*.jpg) and labels (*.txt) are paired by file name without
    extension, so every image travels with its own label file. The data is
    split 70/30 first, and the 30% remainder is split again with
    test_size=0.33 into val and test. Both splits use random_state=42.

    Parameters:
    - input_images (str): Folder with the input images.
    - input_labels (str): Folder with the input labels.
    - output_dir (str): Base folder that already contains
      dataset/images/<partition> and dataset/labels/<partition>.

    Raises:
    - ValueError: If the image and label counts differ, if any image or label
      has no counterpart with the same name, or if no images are found.
    """
    images = sorted(f for f in os.listdir(input_images) if f.endswith(".jpg"))
    labels = sorted(f for f in os.listdir(input_labels) if f.endswith(".txt"))

    if len(images) != len(labels):
        raise ValueError("[ERROR] Número de imágenes y etiquetas no coincide.")
    if not images:
        raise ValueError(f"[ERROR] No se encontraron imágenes .jpg en {input_images}.")

    # Equal counts do not guarantee matching names; positional pairing of two
    # sorted lists would silently attach labels to the wrong images.
    image_stems = {os.path.splitext(name)[0] for name in images}
    label_stems = {os.path.splitext(name)[0] for name in labels}
    unmatched = sorted(image_stems ^ label_stems)
    if unmatched:
        preview = ", ".join(unmatched[:5])
        raise ValueError(f"[ERROR] Imágenes y etiquetas sin pareja: {preview}")

    image_paths = [os.path.join(input_images, img) for img in images]
    # Derive each label path from its image so the pairing is by name.
    label_paths = [
        os.path.join(input_labels, os.path.splitext(img)[0] + ".txt")
        for img in images
    ]

    train_imgs, temp_imgs, train_lbls, temp_lbls = train_test_split(image_paths, label_paths, test_size=0.3, random_state=42)
    val_imgs, test_imgs, val_lbls, test_lbls = train_test_split(temp_imgs, temp_lbls, test_size=0.33, random_state=42)

    partitions = {
        "train": (train_imgs, train_lbls),
        "val": (val_imgs, val_lbls),
        "test": (test_imgs, test_lbls)
    }

    for partition, (imgs, lbls) in partitions.items():
        for img, lbl in zip(imgs, lbls):
            shutil.copy(img, os.path.join(output_dir, f"dataset/images/{partition}/"))
            shutil.copy(lbl, os.path.join(output_dir, f"dataset/labels/{partition}/"))

    pprint({"Partición Completada": {partition: len(imgs) for partition, (imgs, _) in partitions.items()}})

def augment_data(input_images, input_labels, output_dir, num_augmentations=2):
    """
    Write augmented copies of every labelled image together with their labels.

    Each *.jpg in input_images that has a label file with the same name is
    passed num_augmentations times through a random flip / brightness-contrast
    / shift-scale-rotate pipeline followed by a resize to 640x640. Results go
    to <output_dir>/augmented_images and <output_dir>/augmented_labels as
    "<name>_aug<i>.jpg" / "<name>_aug<i>.txt".

    Images without a label file, and images OpenCV cannot read, are skipped.

    Parameters:
    - input_images (str): Folder with the original images.
    - input_labels (str): Folder with the YOLO-format labels.
    - output_dir (str): Folder where the augmented data is written.
    - num_augmentations (int): Number of augmented versions per image.
    """
    aug_images_dir = os.path.join(output_dir, "augmented_images")
    aug_labels_dir = os.path.join(output_dir, "augmented_labels")
    os.makedirs(aug_images_dir, exist_ok=True)
    os.makedirs(aug_labels_dir, exist_ok=True)

    transform = A.Compose([
        A.HorizontalFlip(p=0.5),
        A.RandomBrightnessContrast(p=0.2),
        A.ShiftScaleRotate(shift_limit=0.05, scale_limit=0.05, rotate_limit=15, p=0.5),
        A.Resize(height=640, width=640),
    ], bbox_params=A.BboxParams(format='yolo', label_fields=['class_labels']))

    images = sorted(f for f in os.listdir(input_images) if f.endswith(".jpg"))
    for img_file in images:
        # splitext keeps dots inside the name intact ("a.b.jpg" -> "a.b"),
        # unlike split('.')[0] / replace(".jpg", ...), which could truncate the
        # name or touch more than the extension.
        stem = os.path.splitext(img_file)[0]
        img_path = os.path.join(input_images, img_file)
        label_path = os.path.join(input_labels, stem + ".txt")

        if not os.path.exists(label_path):
            continue

        image = cv2.imread(img_path)
        if image is None:
            # cv2.imread signals an unreadable file by returning None.
            print(f"[WARN] No se pudo leer la imagen, se omite: {img_path}")
            continue

        bboxes, class_labels = load_labels(label_path)

        for i in range(num_augmentations):
            augmented = transform(image=image, bboxes=bboxes, class_labels=class_labels)

            aug_image_path = os.path.join(aug_images_dir, f"{stem}_aug{i}.jpg")
            cv2.imwrite(aug_image_path, augmented["image"])

            aug_label_path = os.path.join(aug_labels_dir, f"{stem}_aug{i}.txt")
            save_labels(aug_label_path, augmented["bboxes"], augmented["class_labels"])

    print(f"[INFO] Augmented data saved to {output_dir}.")

def load_labels(label_path):
    """
    Read YOLO-format labels from a .txt file.

    Every non-blank line must hold five whitespace-separated numbers:
    class id, x center, y center, width and height. Blank lines (for example
    a trailing empty line) are ignored.

    Parameters:
    - label_path (str): Path to the YOLO label file.

    Returns:
    - bboxes (list): One [x_center, y_center, width, height] list per line.
    - class_labels (list): Class ids as ints, in the same order as bboxes.

    Raises:
    - ValueError: If a non-blank line does not contain exactly five numbers.
    """
    bboxes, class_labels = [], []
    with open(label_path, "r") as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Skip empty lines instead of failing on the unpack below.
                continue
            class_id, x_center, y_center, width, height = map(float, fields)
            bboxes.append([x_center, y_center, width, height])
            class_labels.append(int(class_id))
    return bboxes, class_labels

def save_labels(output_path, bboxes, class_labels):
    """
    Write YOLO-format labels to a .txt file, one object per line.

    Each line is the class label followed by the bounding-box values,
    separated by single spaces. Boxes and labels are paired positionally;
    surplus items in the longer sequence are ignored.

    Parameters:
    - output_path (str): Destination label file (overwritten if present).
    - bboxes (list): Bounding boxes in YOLO format.
    - class_labels (list): Class labels, aligned with bboxes.
    """
    with open(output_path, "w") as handle:
        handle.writelines(
            "{} {}\n".format(cls, " ".join(str(value) for value in box))
            for box, cls in zip(bboxes, class_labels)
        )


def copy_augmented_to_train(augmented_dir, output_path):
    """
    Copy the augmented images and labels into the train partition.

    Files from <augmented_dir>/augmented_images go to
    <output_path>/dataset/images/train and files from
    <augmented_dir>/augmented_labels go to <output_path>/dataset/labels/train.

    Parameters:
    - augmented_dir (str): Folder holding the augmented images and labels.
    - output_path (str): Base output folder that contains dataset/.
    """
    # (source folder, destination folder) pairs, images first and labels second.
    transfers = (
        (
            os.path.join(augmented_dir, "augmented_images"),
            os.path.join(output_path, "dataset/images/train"),
        ),
        (
            os.path.join(augmented_dir, "augmented_labels"),
            os.path.join(output_path, "dataset/labels/train"),
        ),
    )

    for source_dir, target_dir in transfers:
        for filename in os.listdir(source_dir):
            shutil.copy(os.path.join(source_dir, filename), target_dir)

    print(f"[INFO] Augmented data merged into train set at {output_path}.")

def create_dataset_yaml(output_path, num_classes, class_names):
    """
    Create a dataset.yaml file with absolute paths for YOLO training.

    The file is written to <output_path>/dataset.yaml and lists the train,
    val and test image folders under <output_path>/images, the number of
    classes and an index -> name mapping.

    Parameters:
    - output_path (str): Dataset folder where dataset.yaml is saved (created
      when missing).
    - num_classes (int): Total number of classes.
    - class_names (list): List of class names, one per class.

    Raises:
    - ValueError: If num_classes does not match the number of class names.
    """
    # An inconsistent nc/names pair would only surface later, when the
    # training library loads the file; reject it at the source instead.
    if num_classes != len(class_names):
        raise ValueError(
            f"[ERROR] num_classes ({num_classes}) does not match the number of "
            f"class names ({len(class_names)})."
        )

    # Resolve absolute paths for the split folders
    dataset_dir = os.path.abspath(output_path)
    os.makedirs(dataset_dir, exist_ok=True)

    # Create the dataset configuration dictionary
    dataset_config = {
        "path": dataset_dir,
        "train": os.path.join(dataset_dir, "images/train"),
        "val": os.path.join(dataset_dir, "images/val"),
        # The pipeline also fills images/test; expose it so the test split can
        # be evaluated from the same file.
        "test": os.path.join(dataset_dir, "images/test"),
        "nc": num_classes,
        "names": dict(enumerate(class_names))
    }

    # Save the configuration to the dataset.yaml file
    yaml_path = os.path.join(dataset_dir, "dataset.yaml")
    with open(yaml_path, "w") as f:
        yaml.dump(dataset_config, f, default_flow_style=False)

    print(f"[INFO] dataset.yaml created at: {yaml_path}")

def validate_final_structure(output_dir="/kaggle/working/output"):
    """
    Validate that every partition holds matching image and label files.

    For each of train, val and test the number of files in
    dataset/images/<partition> and dataset/labels/<partition> must be equal,
    and every file must have a counterpart with the same name (ignoring the
    extension) in the other folder. The per-partition counts are
    pretty-printed when everything matches.

    Parameters:
    - output_dir (str): Base output folder that contains dataset/.

    Raises:
    - ValueError: If a partition has differing counts or unpaired files.
    """
    summary = {}

    for partition in ("train", "val", "test"):
        images = sorted(os.listdir(os.path.join(output_dir, f"dataset/images/{partition}/")))
        labels = sorted(os.listdir(os.path.join(output_dir, f"dataset/labels/{partition}/")))

        if len(images) != len(labels):
            raise ValueError(f"[ERROR] Desbalance entre imágenes y etiquetas en {partition}.")

        # Equal counts alone can hide an image whose label belongs to a
        # different file, so compare the names as well.
        image_stems = {os.path.splitext(name)[0] for name in images}
        label_stems = {os.path.splitext(name)[0] for name in labels}
        unmatched = sorted(image_stems ^ label_stems)
        if unmatched:
            preview = ", ".join(unmatched[:5])
            raise ValueError(f"[ERROR] Archivos sin pareja en {partition}: {preview}")

        summary[partition] = len(images)

    pprint({"Validación Final": summary})

def main():
    """
    Run the full dataset preparation pipeline.

    Steps, in order: environment setup, raw dataset verification, output
    folder creation, train/val/test partitioning, augmentation of the train
    split (3 versions per image), merge of the augmented data into train,
    dataset.yaml creation and a final structure validation.
    """
    paths = setup_environment()
    pprint({"Rutas Configuradas": paths})

    raw_images = paths["raw_images_path"]
    raw_labels = paths["raw_labels_path"]
    output_root = paths["output_path"]

    verify_dataset_structure(raw_images, raw_labels)
    create_preprocessing_structure(output_root)
    copy_and_partition_data(raw_images, raw_labels, output_root)

    # Augmented files are staged in their own folder before being merged
    # into the train partition.
    augmented_root = os.path.join(output_root, "augmented_dataset")
    augment_data(
        input_images=os.path.join(output_root, "dataset/images/train"),
        input_labels=os.path.join(output_root, "dataset/labels/train"),
        output_dir=augmented_root,
        num_augmentations=3
    )
    copy_augmented_to_train(augmented_dir=augmented_root, output_path=output_root)

    create_dataset_yaml(
        output_path=os.path.join(output_root, "dataset"),
        num_classes=1,  # Replace with the actual number of classes
        class_names=["brick"]  # Add all class names here
    )

    validate_final_structure(output_root)
    print("\n[INFO] Pipeline setup completed with augmentations and dataset.yaml creation.\n")


if __name__ == "__main__":
    main()


# pipeline_train.py

In [None]:
import os
import optuna
from ultralytics import YOLO
from datetime import datetime
import logging
from tqdm import tqdm

# Number of training epochs; used by each Optuna trial and as the default
# for train_model.
EPOCAS = 4

# === Logger configuration ===
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# === Detección del dispositivo ===
def get_device():
    """
    Pick the device string to pass to the trainer.

    Colab asks for one GPU and Kaggle for two, but the request is clamped to
    the number of CUDA devices PyTorch actually reports, so a runtime with
    fewer (or no) GPUs does not receive an invalid device string. Any other
    environment uses the CPU.

    Returns:
    - str: Device to use ("cpu", "0" or "0,1").
    """
    on_colab = os.environ.get('COLAB_GPU') is not None
    on_kaggle = os.path.exists("/kaggle")
    if not (on_colab or on_kaggle):
        return "cpu"  # Local

    # Colab takes precedence when both markers are present.
    wanted_gpus = 1 if on_colab else 2

    try:
        import torch
        available_gpus = torch.cuda.device_count()
    except ImportError:
        # Without PyTorch the GPU count cannot be checked; keep the
        # environment's nominal device string.
        available_gpus = wanted_gpus

    usable_gpus = min(wanted_gpus, available_gpus)
    if usable_gpus == 0:
        return "cpu"
    return ",".join(str(index) for index in range(usable_gpus))

# === Callback personalizado para barra de progreso ===
class ProgressBarCallback:
    """
    Training callbacks that drive a tqdm progress bar, one tick per epoch.

    The bar is created in on_train_start, advanced in on_epoch_end and closed
    in on_train_end. Every hook tolerates being called while no bar exists
    (for example when on_train_start never fired).
    """

    def __init__(self, total_epochs):
        # total_epochs: number of epochs the bar should span.
        self.total_epochs = total_epochs
        # Created lazily in on_train_start; None while no training is running.
        self.pbar = None

    def on_train_start(self, trainer, **kwargs):
        # Create the progress bar
        self.pbar = tqdm(total=self.total_epochs, desc="Progreso del entrenamiento", unit="época")

    def on_epoch_end(self, trainer, **kwargs):
        # Advance the progress bar at the end of each epoch
        if self.pbar is None:
            return
        self.pbar.update(1)

        # The epoch index may arrive as a keyword or only as an attribute of
        # the trainer; adding 1 to a missing value used to raise TypeError.
        epoch = kwargs.get('epoch')
        if epoch is None:
            epoch = getattr(trainer, "epoch", None)
        if epoch is not None:
            self.pbar.set_postfix({"Última época": epoch + 1})

    def on_train_end(self, trainer, **kwargs):
        # Close the progress bar and forget it so the instance can be reused
        if self.pbar is not None:
            self.pbar.close()
            self.pbar = None

# === Configuración de la Función Objetivo de Optuna ===
def objective(trial, dataset_yaml=None):
    """
    Optuna objective: train YOLO with suggested hyperparameters and score it.

    Parameters:
    - trial (optuna.trial.Trial): Trial used to sample the hyperparameters.
    - dataset_yaml (str): Path to dataset.yaml. Defaults to
      "<cwd>/working/output/dataset/dataset.yaml".

    Returns:
    - mAP50 (float): Mean average precision at IoU 0.5, the value to
      maximise, or NaN when training or validation fails.
    """
    # Hyperparameter search space. suggest_float replaces the deprecated
    # suggest_loguniform / suggest_uniform calls with the same ranges.
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-2, log=True)
    batch_size = trial.suggest_int("batch_size", 8, 32, step=8)
    momentum = trial.suggest_float("momentum", 0.8, 0.99)
    imgsz = trial.suggest_categorical("imgsz", [320, 480, 640, 800])  # Image sizes

    # Initialise the YOLO model
    model = YOLO("yolov8n.pt")

    # Training configuration
    project_name = "optuna_yolo_training"
    dataset_yaml = dataset_yaml or os.path.join(os.getcwd(), "working", "output", "dataset", "dataset.yaml")
    try:
        model.train(
            data=dataset_yaml,
            epochs=EPOCAS,  # Fixed number of epochs for the experiments
            batch=batch_size,
            imgsz=imgsz,
            lr0=learning_rate,
            momentum=momentum,
            project=project_name,
            name=f"trial_{trial.number}_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
            device=get_device()
        )

        # Evaluate the model. The detection metrics object exposes mAP@0.5 as
        # box.map50; it cannot be indexed with "mAP50".
        metrics = model.val()
        return float(metrics.box.map50)
    except Exception as e:
        # Deliberately broad: a failing trial must not abort the whole study.
        logging.exception(f"[ERROR] Error durante el entrenamiento en el trial {trial.number}: {e}")
        return float("nan")

# === Regular training (without Optuna) ===
def train_model(dataset_yaml=None, pretrained_model="yolov8n.pt", epochs=EPOCAS, batch_size=16, learning_rate=0.001, momentum=0.9, imgsz=640):
    """
    Train the YOLO model with manually chosen hyperparameters.

    Results are written under "regular_yolo_training/<timestamp>/train".
    Errors raised during training are logged, not propagated.

    Parameters:
    - dataset_yaml (str): Path to dataset.yaml. Defaults to
      "<cwd>/working/output/dataset/dataset.yaml".
    - pretrained_model (str): Pretrained YOLO weights to start from.
    - epochs (int): Number of training epochs.
    - batch_size (int): Batch size.
    - learning_rate (float): Initial learning rate.
    - momentum (float): Optimizer momentum.
    - imgsz (int): Input image size.
    """
    dataset_yaml = dataset_yaml or os.path.join(os.getcwd(), "working", "output", "dataset", "dataset.yaml")

    if not os.path.exists(dataset_yaml):
        logging.error(f"[ERROR] dataset.yaml no encontrado en {dataset_yaml}. Asegúrate de que el pipeline_setup.py lo haya generado.")
        return

    logging.info(f"[INFO] Usando dataset.yaml en: {dataset_yaml}")

    model = YOLO(pretrained_model)

    output_dir = f"regular_yolo_training/{datetime.now().strftime('%Y%m%d_%H%M%S')}"
    os.makedirs(output_dir, exist_ok=True)

    # Register the progress bar callbacks
    progress_bar = ProgressBarCallback(total_epochs=epochs)

    def _advance_progress(trainer):
        # Callbacks receive only the trainer, so hand the epoch index over
        # explicitly.
        progress_bar.on_epoch_end(trainer, epoch=getattr(trainer, "epoch", 0))

    model.add_callback("on_train_start", progress_bar.on_train_start)
    # "on_train_epoch_end" is the per-epoch event; the former "on_epoch_end"
    # name is not an Ultralytics event, so the bar never advanced.
    model.add_callback("on_train_epoch_end", _advance_progress)
    model.add_callback("on_train_end", progress_bar.on_train_end)

    try:
        logging.info("[INFO] Iniciando entrenamiento regular...")
        model.train(
            data=dataset_yaml,
            epochs=epochs,
            batch=batch_size,
            imgsz=imgsz,
            lr0=learning_rate,
            momentum=momentum,
            project=output_dir,
            name="train",
            device=get_device()
        )
        logging.info(f"[INFO] Entrenamiento completado. Resultados guardados en {output_dir}.")
    except Exception as e:
        logging.error(f"[ERROR] Error durante el entrenamiento: {e}")

# === Optuna integration in the pipeline ===
def run_optuna_study(dataset_yaml=None, n_trials=20):
    """
    Run an Optuna study that tunes the YOLO hyperparameters.

    The trial table is saved to optuna_results.csv and the optimisation
    history plot to optuna_optimization_history.html.

    Parameters:
    - dataset_yaml (str): Path to dataset.yaml. Defaults to
      "<cwd>/working/output/dataset/dataset.yaml".
    - n_trials (int): Number of trials to run.
    """
    dataset_yaml = dataset_yaml or os.path.join(os.getcwd(), "working", "output", "dataset", "dataset.yaml")

    if not os.path.exists(dataset_yaml):
        logging.error(f"[ERROR] dataset.yaml no encontrado en {dataset_yaml}. Asegúrate de que el pipeline_setup.py lo haya generado.")
        return

    logging.info(f"[INFO] Usando dataset.yaml en: {dataset_yaml}")

    logging.info("[INFO] Iniciando optimización con Optuna...")
    study = optuna.create_study(direction="maximize")
    # Forward the validated path so every trial trains on this dataset.
    study.optimize(lambda trial: objective(trial, dataset_yaml=dataset_yaml), n_trials=n_trials)

    # Report results. best_params / best_value raise when no trial finished
    # successfully, so check first.
    completed = [t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE]
    if completed:
        logging.info(f"[INFO] Mejor conjunto de hiperparámetros: {study.best_params}")
        logging.info(f"[INFO] Mejor mAP50 obtenido: {study.best_value}")
    else:
        logging.warning("[WARN] Ningún trial de Optuna finalizó correctamente; no hay mejores hiperparámetros.")

    # Save results
    study.trials_dataframe().to_csv("optuna_results.csv")
    try:
        optuna.visualization.plot_optimization_history(study).write_html("optuna_optimization_history.html")
    except ImportError as e:
        # The plot needs an optional plotting backend; the CSV is still saved.
        logging.warning(f"[WARN] No se pudo generar el gráfico de optimización: {e}")

# === Función Principal ===
def main(optuna_mode=False):
    """
    Run the training, with or without Optuna.

    The dataset.yaml location mirrors where the setup pipeline writes it:
    "<cwd>/output/dataset" on Kaggle and "<cwd>/working/output/dataset" on
    Colab and on local runs.

    Parameters:
    - optuna_mode (bool): If True, use Optuna to tune the hyperparameters;
      otherwise run a single regular training.
    """
    # Imported here because this cell's import list does not include sys.
    import sys

    on_colab = "google.colab" in sys.modules
    if not on_colab and os.path.exists("/kaggle"):
        output_parts = ("output",)
    else:
        # Colab and local runs share the layout; the local branch used to
        # point at "<cwd>/output", where the setup pipeline never writes.
        output_parts = ("working", "output")

    dataset_yaml = os.path.join(os.getcwd(), *output_parts, "dataset", "dataset.yaml")
    print(dataset_yaml)

    if optuna_mode:
        run_optuna_study(dataset_yaml, n_trials=20)
    else:
        train_model(dataset_yaml, imgsz=640)  # Default image size


if __name__ == "__main__":
    main(optuna_mode=False)
