In [4]:
import os
import shutil
import pandas as pd
from sklearn.model_selection import train_test_split

def split_data_and_move_images(image_folder, csv_file, output_dir, train_size=0.8, test_size=0.1, val_size=0.1):
    """Split a labelled image set into train/test/val folders.

    Rows of ``csv_file`` are split with two seeded ``train_test_split`` calls
    (``random_state=42``). For every split the function writes

    * ``<output_dir>/<split>/images/<image>`` - a copy of each listed image
    * ``<output_dir>/<split>/output_label.csv`` - the rows of that split

    Despite the function's name, images are *copied* (``shutil.copy``), so
    ``image_folder`` is left untouched.

    Args:
        image_folder: Directory that holds the source images.
        csv_file: Path of the label CSV; it must have an ``image`` column
            whose values are file names inside ``image_folder``.
        output_dir: Root directory for the split folders (created if needed).
        train_size: Forwarded to ``train_test_split`` for the first split.
        test_size: Weight of the test split within the held-out rows.
        val_size: Weight of the validation split within the held-out rows.
            Only the ratio ``test_size / (test_size + val_size)`` is used to
            divide whatever the first split leaves over.

    Raises:
        ValueError: If ``test_size`` or ``val_size`` is negative, or if they
            sum to zero (previously a bare ``ZeroDivisionError``).
        KeyError: If the CSV has no ``image`` column.
        FileNotFoundError: If any listed image is absent from
            ``image_folder``; raised before anything is written.
    """
    # Validate the ratios before touching the filesystem.
    if test_size < 0 or val_size < 0:
        raise ValueError(
            f"test_size and val_size must be non-negative, got {test_size} and {val_size}"
        )
    holdout_size = test_size + val_size
    if holdout_size <= 0:
        raise ValueError("test_size + val_size must be greater than 0")

    # Read the CSV file
    df = pd.read_csv(csv_file)
    if 'image' not in df.columns:
        raise KeyError(f"'image' column not found in {csv_file}")

    # Fail early instead of leaving a half-populated output directory behind.
    missing = [
        name for name in df['image']
        if not os.path.isfile(os.path.join(image_folder, name))
    ]
    if missing:
        raise FileNotFoundError(
            f"{len(missing)} image(s) listed in {csv_file} are missing from "
            f"{image_folder}; first few: {missing[:5]}"
        )

    # First carve off the training rows, then divide the remainder between
    # validation and test according to their relative weights.
    train_df, temp_df = train_test_split(df, train_size=train_size, random_state=42, shuffle=True)
    val_df, test_df = train_test_split(
        temp_df, test_size=test_size / holdout_size, random_state=42, shuffle=True
    )

    # Same processing order as before: train, test, val.
    splits = {'train': train_df, 'test': test_df, 'val': val_df}
    for split_name, split_df in splits.items():
        split_folder = os.path.join(output_dir, split_name)
        split_image_folder = os.path.join(split_folder, 'images')
        # makedirs also creates output_dir and the split folder on the way.
        os.makedirs(split_image_folder, exist_ok=True)

        # Copy (not move) every image that belongs to this split.
        for image_name in split_df['image']:
            shutil.copy(
                os.path.join(image_folder, image_name),
                os.path.join(split_image_folder, image_name),
            )

        # Save the split dataframe to a CSV file (directly in the split folder)
        split_df.to_csv(os.path.join(split_folder, 'output_label.csv'), index=False)

    print("Data split into train, test, and val folders. CSVs created.")

# Example usage: run the split with the default train/test/val fractions.
image_folder = 'Transform_Image'  # source directory that holds every image
csv_file = 'output_label.csv'  # label table listing the image names and their output values
output_dir = 'model_data'  # root under which train/, test/ and val/ are written

split_data_and_move_images(
    image_folder=image_folder,
    csv_file=csv_file,
    output_dir=output_dir,
)


Data split into train, test, and val folders. CSVs created.


In [5]:
import os
import pandas as pd

def check_images_in_csv(image_folder, csv_file, extensions=('.jpg',)):
    """Cross-check the images under a folder against the names in a CSV.

    Walks ``image_folder`` recursively and compares file base names with the
    ``image`` column of ``csv_file``. Two findings are printed:

    * image files (matching ``extensions``) with no row in the CSV
    * CSV rows whose file is not present anywhere under the folder

    The comparison uses set differences, so a base name that occurs in
    several sub-directories is counted once and is no longer misreported as
    missing from the CSV. CSV rows are looked up against every file in the
    folder, regardless of extension.

    Args:
        image_folder: Directory to scan (sub-directories included).
        csv_file: Path of a CSV file with an ``image`` column.
        extensions: File suffix, or tuple of suffixes, that mark a file as an
            image when looking for files absent from the CSV. Matching is
            case-insensitive. Defaults to ``('.jpg',)``.

    Returns:
        None. Results are reported via ``print``.
    """
    # Read the CSV file to get the image names
    df = pd.read_csv(csv_file)
    csv_image_names = set(df['image'])

    # Accept a single suffix string as well as a tuple of suffixes.
    if isinstance(extensions, str):
        extensions = (extensions,)
    image_suffixes = tuple(ext.lower() for ext in extensions)

    # Collect the base name of every file below image_folder.
    folder_file_names = set()
    for _root, _dirs, files in os.walk(image_folder):
        folder_file_names.update(files)

    folder_image_names = {
        name for name in folder_file_names if name.lower().endswith(image_suffixes)
    }

    # Sorted so the report is stable across runs and filesystems.
    missing_in_csv = sorted(folder_image_names - csv_image_names)
    # key=str keeps sorting safe if the column holds non-string values (e.g. NaN).
    missing_in_folder = sorted(csv_image_names - folder_file_names, key=str)

    # If there are any images in the folder that the CSV does not list
    if missing_in_csv:
        print(f"These images are missing from the CSV: {missing_in_csv}")
    else:
        print("All images have corresponding entries in the CSV.")

    # If there are any images in the CSV not found in the folder
    if missing_in_folder:
        print(f"These images are in the CSV but not found in the folder: {missing_in_folder}")
    else:
        print("No missing images from the CSV in the folder.")
        
# Example usage: image folder and label CSV produced for each split.
image_folder_train = 'model_data/train/images'  # training images
csv_file_train = 'model_data/train/output_label.csv'  # training labels
image_folder_test = 'model_data/test/images'  # test images
csv_file_test = 'model_data/test/output_label.csv'  # test labels
image_folder_val = 'model_data/val/images'  # validation images
csv_file_val = 'model_data/val/output_label.csv'  # validation labels

# Run the same consistency check on every split, in train/test/val order.
for split_header, split_folder, split_csv in (
    ("Checking train images:", image_folder_train, csv_file_train),
    ("\nChecking test images:", image_folder_test, csv_file_test),
    ("\nChecking val images:", image_folder_val, csv_file_val),
):
    print(split_header)
    check_images_in_csv(split_folder, split_csv)


Checking train images:
All images have corresponding entries in the CSV.
No missing images from the CSV in the folder.

Checking test images:
All images have corresponding entries in the CSV.
No missing images from the CSV in the folder.

Checking val images:
All images have corresponding entries in the CSV.
No missing images from the CSV in the folder.
