# División del conjunto de datos en entrenamiento, validación y prueba

Este Jupyter Notebook se utiliza para dividir un conjunto de datos en tres conjuntos separados: entrenamiento, validación y prueba. La división se realizará en una proporción de 80% para entrenamiento, 10% para validación y 10% para prueba. El conjunto de datos original se lee desde un archivo CSV y se guardarán tres archivos CSV separados correspondientes a los conjuntos divididos.


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datetime import datetime, timedelta

In [2]:
# Ruta del archivo CSV de entrada
csv_path = "../../CSVs/dataset_ampliado.csv"

# Ruta del directorio donde se guardarán los CSVs
dataset_path = "../../CSVs/dataset/"

### Proporciones

In [3]:
# Definir las proporciones para el split
train_ratio = 0.8
test_ratio = 0.1
val_ratio = 0.1

### Leer el archivo CSV


In [4]:
# Leer el archivo CSV
df = pd.read_csv(csv_path)

In [5]:
# Obtener las clases únicas
classes = df['class'].unique()

### Dividir el conjunto de datos en entrenamiento, validación y prueba

In [6]:
# Realizar el split por clase manteniendo el equilibrio
train_df = pd.DataFrame(columns=['path', 'class'])
test_df = pd.DataFrame(columns=['path', 'class'])
val_df = pd.DataFrame(columns=['path', 'class'])

In [7]:
# Iterar sobre las clases
for class_name in classes:
    print(class_name)
    class_df = df[df['class'] == class_name]

    # Ordenar el dataframe por tiempo
    class_df = class_df.sort_values(by='date_time')

    class_count = len(class_df)

    # Calcular las cantidades para cada split
    train_count = int(class_count * train_ratio)
    test_count = int((class_count - train_count) / 2)
    val_count = class_count - train_count - test_count
    print("Reparto: ", train_count, test_count, val_count)

    # Dividir el dataframe de la clase en train, test y val
    train_indices = []
    test_indices = []
    val_indices = []
    prev_time = None

    # Recorrer el dataframe de la clase
    for index, row in class_df.iterrows():
        current_time = datetime.strptime(row['date_time'], '%Y:%m:%d %H:%M:%S')

        if prev_time is not None:
            time_diff = current_time - prev_time

            # Verificar la diferencia de tiempo con respecto a la imagen anterior
            if time_diff < timedelta(minutes=2):
                # Misma secuencia de tiempo, asignar al mismo grupo que la imagen anterior
                train_indices.append(index) if len(train_indices) < train_count else (
                    test_indices.append(index) if len(test_indices) < test_count else val_indices.append(index)
                )
            else:
                # Diferente secuencia de tiempo, asignar aleatoriamente
                train_indices.append(index) if len(train_indices) < train_count else (
                    test_indices.append(index) if len(test_indices) < test_count else val_indices.append(index)
                )
        else:
            # Primera imagen de la clase, asignar aleatoriamente
            train_indices.append(index) if len(train_indices) < train_count else (
                test_indices.append(index) if len(test_indices) < test_count else val_indices.append(index)
            )

        prev_time = current_time

    # Agregar los splits al dataframe final
    train_df = train_df.append(class_df.loc[train_indices, ['path', 'class']])
    test_df = test_df.append(class_df.loc[test_indices, ['path', 'class']])
    val_df = val_df.append(class_df.loc[val_indices, ['path', 'class']])

vacia
Reparto:  793 99 100
ciervo
Reparto:  800 100 100
jabali
Reparto:  800 100 100


  train_df = train_df.append(class_df.loc[train_indices, ['path', 'class']])
  test_df = test_df.append(class_df.loc[test_indices, ['path', 'class']])
  val_df = val_df.append(class_df.loc[val_indices, ['path', 'class']])
  train_df = train_df.append(class_df.loc[train_indices, ['path', 'class']])
  test_df = test_df.append(class_df.loc[test_indices, ['path', 'class']])
  val_df = val_df.append(class_df.loc[val_indices, ['path', 'class']])
  train_df = train_df.append(class_df.loc[train_indices, ['path', 'class']])
  test_df = test_df.append(class_df.loc[test_indices, ['path', 'class']])
  val_df = val_df.append(class_df.loc[val_indices, ['path', 'class']])


vaca
Reparto:  800 100 100
caballo
Reparto:  800 100 100
humano
Reparto:  799 100 100


  train_df = train_df.append(class_df.loc[train_indices, ['path', 'class']])
  test_df = test_df.append(class_df.loc[test_indices, ['path', 'class']])
  val_df = val_df.append(class_df.loc[val_indices, ['path', 'class']])
  train_df = train_df.append(class_df.loc[train_indices, ['path', 'class']])
  test_df = test_df.append(class_df.loc[test_indices, ['path', 'class']])
  val_df = val_df.append(class_df.loc[val_indices, ['path', 'class']])
  train_df = train_df.append(class_df.loc[train_indices, ['path', 'class']])
  test_df = test_df.append(class_df.loc[test_indices, ['path', 'class']])
  val_df = val_df.append(class_df.loc[val_indices, ['path', 'class']])


zorro
Reparto:  799 100 100
gamo
Reparto:  800 100 100


  train_df = train_df.append(class_df.loc[train_indices, ['path', 'class']])
  test_df = test_df.append(class_df.loc[test_indices, ['path', 'class']])
  val_df = val_df.append(class_df.loc[val_indices, ['path', 'class']])
  train_df = train_df.append(class_df.loc[train_indices, ['path', 'class']])
  test_df = test_df.append(class_df.loc[test_indices, ['path', 'class']])
  val_df = val_df.append(class_df.loc[val_indices, ['path', 'class']])


### Información del split

In [8]:
# Mostrar información sobre el split
print("Train set:")
print(train_df['class'].value_counts())
print()
print("Test set:")
print(test_df['class'].value_counts())
print()
print("Validation set:")
print(val_df['class'].value_counts())


Train set:
ciervo     800
jabali     800
vaca       800
caballo    800
gamo       800
humano     799
zorro      799
vacia      793
Name: class, dtype: int64

Test set:
ciervo     100
jabali     100
vaca       100
caballo    100
humano     100
zorro      100
gamo       100
vacia       99
Name: class, dtype: int64

Validation set:
vacia      100
ciervo     100
jabali     100
vaca       100
caballo    100
humano     100
zorro      100
gamo       100
Name: class, dtype: int64


### Guardar los conjuntos de datos divididos en archivos CSV

In [9]:
# Guardar los conjuntos de datos divididos en archivos CSV
train_df.to_csv(dataset_path + "train.csv", index=False)
val_df.to_csv(dataset_path + "val.csv", index=False)
test_df.to_csv(dataset_path + "test.csv", index=False)