<a href="https://colab.research.google.com/github/Morsalah/M.Sc-Research-HRI-using-DIGIT-tactile-sensor/blob/main/Data_Handling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Walking through the dataset

In [None]:
import os
import pandas as pd
import shutil

def walk_through_folders(base_path, dataset_path):
    """Copy every image under the YES and NO folders into one flat dataset folder.

    Images are searched recursively under ``base_path/YES`` and then
    ``base_path/NO``. Each one is copied into ``dataset_path`` under a new name
    of the form ``<key>_<yes|no>.png`` (first image seen for a key) or
    ``<key>_<yes|no>_<n>.png`` (later images with the same key), where ``<key>``
    is the third underscore-separated field of the original filename.

    Args:
        base_path: Directory expected to contain the ``YES`` and ``NO`` folders.
            A missing folder is skipped.
        dataset_path: Destination directory; created if it does not exist.

    Returns:
        A pandas DataFrame with one row per copied image and the columns
        ``Original_Folder``, ``Original_Image_Path`` (relative to ``base_path``),
        ``New_Image_Path`` and ``New_Filename``. The columns are present even
        when no image was found.
    """
    folders = ['YES', 'NO']
    columns = ['Original_Folder', 'Original_Image_Path', 'New_Image_Path', 'New_Filename']
    # Leading dot so that only real extensions match (not e.g. a file called "rawpng").
    image_suffixes = ('.png', '.jpg', '.jpeg')
    data = []
    # Occurrences per key; the counter is shared by both folders, so a repeated
    # key always gets a fresh numeric suffix and never overwrites an earlier copy.
    seen_filenames = {}

    # Ensure dataset directory exists
    os.makedirs(dataset_path, exist_ok=True)

    for folder in folders:
        folder_path = os.path.join(base_path, folder)
        if not os.path.exists(folder_path):
            continue

        for root, dirs, files in os.walk(folder_path):
            # Sorting in place fixes the order in which os.walk descends into
            # subdirectories, so the numeric suffixes are reproducible.
            dirs.sort()

            for file in sorted(files):
                if not file.lower().endswith(image_suffixes):
                    continue

                original_path = os.path.join(root, file)
                relative_path = os.path.relpath(original_path, base_path)

                # For a name like "<prefix>_D21114_20250217_094231_<suffix>" the
                # key is "20250217"; names with fewer than four fields fall back
                # to "unknown".
                parts = file.split('_')
                timestamp = parts[2] if len(parts) >= 4 else "unknown"

                count = seen_filenames.get(timestamp, 0) + 1
                seen_filenames[timestamp] = count
                if count == 1:
                    new_filename = f"{timestamp}_{folder.lower()}.png"
                else:
                    new_filename = f"{timestamp}_{folder.lower()}_{count}.png"

                new_path = os.path.join(dataset_path, new_filename)

                # NOTE(review): the copy is always named ".png", even for JPEG
                # sources, and the bytes are copied unchanged (no conversion).
                # Kept as-is because the split step in this notebook only lists
                # ".png" files.
                shutil.copy2(original_path, new_path)

                data.append({
                    'Original_Folder': folder,
                    'Original_Image_Path': relative_path,
                    'New_Image_Path': new_path,
                    'New_Filename': new_filename
                })

    # Explicit columns keep the schema stable when nothing was found.
    df = pd.DataFrame(data, columns=columns)
    print(f"Processed {len(df)} images.")
    return df

if __name__ == "__main__":
    # Source tree holding the YES/NO capture folders, and the flat destination folder.
    base_directory = "./captured_images"
    dataset_directory = "./dataset"

    df_images = walk_through_folders(base_directory, dataset_directory)

    # Write the original-to-new filename mapping next to the captured images.
    summary_csv = os.path.join(base_directory, "captured_images_summary.csv")
    df_images.to_csv(summary_csv, index=False)


Splitting the data

In [None]:
import os
import shutil
from sklearn.model_selection import train_test_split
from tqdm import tqdm

def split_dataset(input_dir, output_dir, train_ratio, val_ratio, test_ratio, overwrite=True):
    """Split a flat folder of PNG images into train/val/test sets with 'yes'/'no' subfolders.

    The images are copied (the originals stay in ``input_dir``) to
    ``output_dir/<train|val|test>/<yes|no>/``. An image goes to ``yes`` when its
    lowercased filename contains ``"yes"``, otherwise to ``no``.

    Args:
        input_dir: Folder containing the ``.png`` images to split.
        output_dir: Root folder of the split dataset; created if needed.
        train_ratio: Fraction of all images assigned to the training set.
        val_ratio: Together with ``test_ratio``, decides how the remaining
            images are divided: ``val_ratio / (val_ratio + test_ratio)`` of
            them go to validation.
        test_ratio: See ``val_ratio``; the images left over form the test set.
        overwrite: When true and ``output_dir`` already exists, it is deleted
            and rebuilt from scratch.

    Returns:
        None. Progress, a summary, and any validation errors are printed.
        When a validation error is printed, ``output_dir`` is left untouched.
    """
    # Ensure input directory exists
    if not os.path.exists(input_dir):
        print(f"Error: Input directory '{input_dir}' does not exist!")
        return

    # Deleting output_dir must never take the source images with it.
    wipes_output = overwrite and os.path.exists(output_dir)
    input_real = os.path.realpath(input_dir)
    output_real = os.path.realpath(output_dir)
    if wipes_output and (
        input_real == output_real
        or input_real.startswith(os.path.join(output_real, ""))
    ):
        print(f"Error: Input directory '{input_dir}' lies inside output directory "
              f"'{output_dir}'; refusing to overwrite it!")
        return

    # Collect the images before touching output_dir, so that an empty input
    # cannot wipe an existing split. Sorting makes the fixed random_state give
    # the same split regardless of the order os.listdir happens to return.
    images = sorted(f for f in os.listdir(input_dir) if f.lower().endswith('.png'))
    if not images:
        print(f"Error: No PNG images found in '{input_dir}'!")
        return

    # Remove and recreate output directory if overwrite is enabled
    if wipes_output:
        print(f"Removing existing directory: {output_dir}")
        shutil.rmtree(output_dir)

    # Prepare output directories
    subsets = ['train', 'val', 'test']
    classes = ['yes', 'no']

    for subset in subsets:
        for cls in classes:
            os.makedirs(os.path.join(output_dir, subset, cls), exist_ok=True)

    # First carve off the training set, then divide the remainder into val/test.
    train_images, temp_images = train_test_split(images, train_size=train_ratio, random_state=42)
    val_images, test_images = train_test_split(temp_images, train_size=val_ratio / (val_ratio + test_ratio), random_state=42)

    def copy_files(image_list, subset_name):
        """Copy each image into the class subfolder of the given subset."""
        for img in tqdm(image_list, desc=f"Processing {subset_name}"):
            # Determine class based on filename
            cls = 'yes' if 'yes' in img.lower() else 'no'
            src = os.path.join(input_dir, img)
            dst = os.path.join(output_dir, subset_name, cls, img)
            shutil.copy(src, dst)

    copy_files(train_images, 'train')
    copy_files(val_images, 'val')
    copy_files(test_images, 'test')

    # Print summary
    print("\nDataset split complete!")
    print(f"Training: {len(train_images)} images")
    print(f"Validation: {len(val_images)} images")
    print(f"Testing: {len(test_images)} images")

if __name__ == "__main__":
    # Flat folder of labelled images in, train/val/test folder tree out.
    dataset_directory = "./dataset"
    output_directory = "./split_dataset"

    split_dataset(
        dataset_directory,
        output_directory,
        train_ratio=0.7,
        val_ratio=0.2,
        test_ratio=0.1,
        overwrite=True,
    )
