In [1]:
"""Script principal para ejecutar el flujo de trabajo."""

from src.data_cleaning import ParquetDataCleaner
from src.data_processing import ParquetDataProcessor
from src import utils
import config
import os
import numpy as np


def main():
    """Ejecuta el flujo de trabajo principal."""
    data_info = utils.read_csv(config.CSV_FILE)

    # Filtrar el archivo train.csv por la palabra clave 'bird'
    filtered_csv_path = config.DATA_PATH + "filtered_train.csv"
    utils.filter_csv_by_sign(config.CSV_FILE, filtered_csv_path, "time")

    # Leer el CSV filtrado esto se comenta si se quiere procesar todas las palabras
    data_info = utils.read_csv(filtered_csv_path)

    # Crear las carpetas necesarias si no existen
    os.makedirs(config.CLEANED_DATA_PATH, exist_ok=True)
    os.makedirs(config.NPY_DATA_PATH, exist_ok=True)

    data_cleaner = ParquetDataCleaner()
    data_processor = ParquetDataProcessor()

    train_subjects_data = {}
    val_subjects_data = {}
    min_and_max = {}
    for index, row in data_info.iterrows():
        parquet_path = config.RAW_DATA_PATH + row["path"]
        cleaned_data = data_cleaner.clean(parquet_path)

        # Agregar esta línea para imprimir el número máximo de frames en cleaned_data
        print(
            "Max frames in cleaned_data:",
            cleaned_data["frame"].max(),
            "for participant_id:",
            row["participant_id"],
            "and sequence_id:",
            row["sequence_id"],
        )
        # Añadir todos los numeros Cleaned_data, row['participant_id'] y row['sequence_id'] a un diccionario min_and_max
        if row["participant_id"] not in min_and_max:
            min_and_max[row["participant_id"]] = {}
        min_and_max[row["participant_id"]][row["sequence_id"]] = cleaned_data["frame"].max()

        # Dividir y guardar los datos en archivos .npy
        train_data, val_data = data_processor.split_data(
            cleaned_data, config.TRAIN_RATIO
        )
        data_processor.save_npy_file(
            train_data,
            f"{config.NPY_DATA_PATH}{row['participant_id']}_{row['sequence_id']}_train.npy",
        )
        data_processor.save_npy_file(
            val_data,
            f"{config.NPY_DATA_PATH}{row['participant_id']}_{row['sequence_id']}_val.npy",
        )

        # Almacenar la información en los diccionarios
        participant_id = row["participant_id"]
        if participant_id not in train_subjects_data:
            train_subjects_data[participant_id] = {"n_points": 0, "n_frames": set()}

        if participant_id not in val_subjects_data:
            val_subjects_data[participant_id] = {"n_points": 0, "n_frames": set()}

        train_subjects_data[participant_id]["n_points"] += len(train_data)
        train_subjects_data[participant_id]["n_frames"].update(
            np.unique(train_data[:, 1])
        )

        val_subjects_data[participant_id]["n_points"] += len(val_data)
        val_subjects_data[participant_id]["n_frames"].update(np.unique(val_data[:, 1]))

    utils.save_dict_to_csv(train_subjects_data, config.TRAIN_SUBJECTS_DATA_PATH)
    utils.save_dict_to_csv(val_subjects_data, config.VAL_SUBJECTS_DATA_PATH)

if __name__ == "__main__":
    main()
    

Max frames in cleaned_data: 18.0 for participant_id: 29302 and sequence_id: 100039661
Max frames in cleaned_data: 15.0 for participant_id: 28656 and sequence_id: 1001158776
Max frames in cleaned_data: 32.0 for participant_id: 2044 and sequence_id: 1005467898
Max frames in cleaned_data: 31.0 for participant_id: 37779 and sequence_id: 1009663684
Max frames in cleaned_data: 59.0 for participant_id: 28656 and sequence_id: 1054086064
Max frames in cleaned_data: 236.0 for participant_id: 16069 and sequence_id: 1067826961
Max frames in cleaned_data: 15.0 for participant_id: 29302 and sequence_id: 1079165547
Max frames in cleaned_data: 139.0 for participant_id: 4718 and sequence_id: 108406521
Max frames in cleaned_data: 32.0 for participant_id: 28656 and sequence_id: 109378174
Max frames in cleaned_data: 42.0 for participant_id: 27610 and sequence_id: 1100842910
Max frames in cleaned_data: 49.0 for participant_id: 26734 and sequence_id: 1105715380
Max frames in cleaned_data: 223.0 for particip

In [2]:
print(min_and_max)

NameError: name 'min_and_max' is not defined