In [63]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
import random

In [64]:
# Constantes
N_STATIONS = 3

# División de proporciones
train_ratio = 0.8
val_ratio = 0.1
test_ratio = 0.1

In [65]:
# Ruta del archivo
PATH = "../"
input_csv_file = PATH + "Data/CSVs/dataset_caltech_bb.csv"

train_file = PATH + "Data/CSVs/train_dataset_caltech.csv"
validation_file = PATH + "Data/CSVs/validation_dataset_caltech.csv"
test_file = PATH + "Data/CSVs/test_dataset_caltech.csv"

In [66]:
# Cargar el archivo CSV
data = pd.read_csv(input_csv_file)

### Locations selection

Unique locations selection for validation and test

In [67]:
# Obtener localizaciones únicas
unique_locations = data['station'].unique()

# Calcular el número de localizaciones a seleccionar (10%)
num_lowest_locations = int(0.2 * len(unique_locations))

# Ordenar las localizaciones por cantidad de imágenes
location_counts = data['station'].value_counts()
sorted_locations = location_counts.sort_values().index

# Seleccionar las localizaciones con menos entradas
selected_locations = sorted_locations[:num_lowest_locations].tolist()

# Calcular clases minoritarias
minority_classes = data['class'].value_counts().tail(5).index.tolist()

# Filtrar las localizaciones que contienen clases minoritarias
filtered_locations = [loc for loc in selected_locations if all(data[(data['station'] == loc) & (data['class'] == cls)].shape[0] == 0 for cls in minority_classes)]

# Barajar las localizaciones seleccionadas
random.shuffle(selected_locations)

val_locations = selected_locations[:N_STATIONS]
test_locations = selected_locations[len(selected_locations) - N_STATIONS:]

# Dividir en conjuntos de validación y prueba basado en localizaciones
unique_location_val_data = data[data['station'].isin(val_locations)]
unique_location_test_data = data[data['station'].isin(test_locations)]
remaining_data = data[~data['station'].isin(val_locations + test_locations)]

assert len(data) == len(remaining_data) + len(unique_location_test_data) + len(unique_location_val_data)

assert [loc not in remaining_data["station"] for loc in val_locations + test_locations]

In [68]:
# Crear la categoría temporal 'station_season_day_part'
remaining_data['station_season_day_part'] = remaining_data['station'] + '_' + remaining_data['season'] + '_' + remaining_data['day_part']

# Obtener las clases únicas
unique_classes = remaining_data['class'].unique()

# Inicializar listas para los subconjuntos
train_data = []
val_data = []
test_data = []

# Distribución de proporciones por clase
for class_name in unique_classes:
    class_data = remaining_data[remaining_data['class'] == class_name]
    class_grouped = class_data.groupby('station_season_day_part')
    
    for group, group_data in class_grouped:
        # Calcula la cantidad de imágenes para cada subconjunto
        num_images = len(group_data)
        num_train = int(num_images * train_ratio)
        num_val = int(num_images * val_ratio)
        num_test = num_images - num_train - num_val

        # Realiza la división según las reglas establecidas
        if num_images == 1:
            train_data.append(group_data)
        elif num_images == 2:
            train_data.append(group_data.iloc[0:num_train])
            val_data.append(group_data.iloc[num_train:])
        else:
            train_data.append(group_data.iloc[0:num_train])
            val_data.append(group_data.iloc[num_train:num_train+num_val])
            test_data.append(group_data.iloc[num_train+num_val:])

# Concatenar los subconjuntos para obtener los conjuntos finales
train_data = pd.concat(train_data)
val_data = pd.concat(val_data)
test_data = pd.concat(test_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  remaining_data['station_season_day_part'] = remaining_data['station'] + '_' + remaining_data['season'] + '_' + remaining_data['day_part']


In [69]:
def fix_seq_id_collisions(train_data, test_data, val_data):
    # Combine the subdatasets into a single dataset
    combined_data = pd.concat([train_data, test_data, val_data], ignore_index=True)

    # Calculate class proportions for later use
    class_proportions = combined_data['class'].value_counts(normalize=True)

    # Sort the combined dataset by seq_id
    combined_data_sorted = combined_data.sort_values(by='seq_id')

    # Initialize new subdatasets
    new_train_data = pd.DataFrame(columns=combined_data.columns)
    new_test_data = pd.DataFrame(columns=combined_data.columns)
    new_val_data = pd.DataFrame(columns=combined_data.columns)

    # Dictionary to track seq_id and their respective subsets
    seq_id_subsets = {}

    # Counters for the desired distribution
    train_count = int(len(combined_data) * train_ratio)
    test_count = int(len(combined_data) * test_ratio)
    val_count = len(combined_data) - train_count - test_count

    # Iterate through the sorted dataset and distribute images by seq_id
    for _, row in combined_data_sorted.iterrows():
        seq_id = row['seq_id']

        # Check if the seq_id is already in the new subdatasets
        if seq_id in seq_id_subsets:
            subset = seq_id_subsets[seq_id]
        else:
            # Determine the subdataset to add the image to
            counts = {
                'train': len(new_train_data),
                'test': len(new_test_data),
                'val': len(new_val_data)
            }

            # Distribute based on the counters
            if counts['train'] < train_count:
                subset = 'train'
            elif counts['test'] < test_count:
                subset = 'test'
            else:
                subset = 'val'

            seq_id_subsets[seq_id] = subset

        # Add the image to the appropriate subdataset
        if subset == 'train':
            new_train_data = new_train_data.append(row)
        elif subset == 'test':
            new_test_data = new_test_data.append(row)
        elif subset == 'val':
            new_val_data = new_val_data.append(row)

    # Return the fixed subdatasets
    return new_train_data, new_test_data, new_val_data


In [70]:
new_train_data, new_test_data, new_val_data = fix_seq_id_collisions(train_data, test_data, val_data)

  new_train_data = new_train_data.append(row)
  new_train_data = new_train_data.append(row)
  new_train_data = new_train_data.append(row)
  new_train_data = new_train_data.append(row)
  new_train_data = new_train_data.append(row)
  new_train_data = new_train_data.append(row)
  new_train_data = new_train_data.append(row)
  new_train_data = new_train_data.append(row)
  new_train_data = new_train_data.append(row)
  new_train_data = new_train_data.append(row)
  new_train_data = new_train_data.append(row)
  new_train_data = new_train_data.append(row)
  new_train_data = new_train_data.append(row)
  new_train_data = new_train_data.append(row)
  new_train_data = new_train_data.append(row)
  new_train_data = new_train_data.append(row)
  new_train_data = new_train_data.append(row)
  new_train_data = new_train_data.append(row)
  new_train_data = new_train_data.append(row)
  new_train_data = new_train_data.append(row)
  new_train_data = new_train_data.append(row)
  new_train_data = new_train_data.

In [87]:
assert (len(new_train_data) + len(new_test_data) + len(new_val_data) + len(unique_location_test_data) + len(unique_location_val_data)) == len(data)

In [75]:
test_data = pd.concat([unique_location_test_data, new_test_data])
val_data = pd.concat([unique_location_val_data, new_val_data])

In [88]:
# Remove the extra column from the new subdatasets
new_train_data.drop(columns=['station_season_day_part'], inplace=True)
test_data.drop(columns=['station_season_day_part'], inplace=True)
val_data.drop(columns=['station_season_day_part'], inplace=True)

In [90]:
new_train_data.to_csv(train_file, index=False)
val_data.to_csv(validation_file, index=False)
test_data.to_csv(test_file, index=False)

# Test

In [91]:
# Cargar los DataFrames de los archivos CSV
train_df = pd.read_csv(train_file)
validation_df = pd.read_csv(validation_file)
test_df = pd.read_csv(test_file)

# Obtener las secuencias en cada subconjunto
train_seqs = train_df['seq_id'].unique()
validation_seqs = validation_df['seq_id'].unique()
test_seqs = test_df['seq_id'].unique()

# Verificar si hay secuencias en más de un subconjunto
overlapping_seqs = []

for seq_id in train_seqs:
    if seq_id in validation_seqs or seq_id in test_seqs:
        overlapping_seqs.append(seq_id)

for seq_id in validation_seqs:
    if seq_id in test_seqs:
        overlapping_seqs.append(seq_id)

# Obtener localizaciones exclusivas de validación y prueba
validation_only_locations = validation_df[~validation_df['station'].isin(test_df['station'])]['station'].unique()
test_only_locations = test_df[~test_df['station'].isin(validation_df['station'])]['station'].unique()

# Mostrar las localizaciones exclusivas de validación y prueba
print("Localizaciones exclusivas en Validation:", validation_only_locations)
print("Localizaciones exclusivas en Test:", test_only_locations)

# Mostrar las secuencias en cada subconjunto
#print("Secuencias en el conjunto de entrenamiento:", train_seqs)
#print("Secuencias en el conjunto de validación:", validation_seqs)
#print("Secuencias en el conjunto de prueba:", test_seqs)

# Mostrar el resultado de la verificación de secuencias
if not overlapping_seqs:
    print("No hay imágenes de la misma secuencia separadas entre los conjuntos.")
else:
    print("Se encontraron las siguientes secuencias en múltiples conjuntos:")
    for seq_id in overlapping_seqs:
        print(seq_id)

Localizaciones exclusivas en Validation: ['40' '67' '45']
Localizaciones exclusivas en Test: ['23' '58' '63']
No hay imágenes de la misma secuencia separadas entre los conjuntos.
