## Image Processing

The code organizes a dataset of images into training, validation, and test sets for two classes: human-made images and AI-generated images (e.g., from DALL·E or Stable Diffusion). The file names of each class are shuffled and split 60% / 20% / 20% into training, validation, and test sets. The images are resized to 256×256, converted to RGB, and turned into numpy arrays scaled to [0, 1]. Training and validation data are written as 20 pickle batches each, and the test data as a single pickle file, for later use.

In [1]:
import pickle as pkl
from PIL import Image
import numpy as np
import os
import random
from tqdm import tqdm

def get_random_samples(file_list, sample_size):
    """Draw ``sample_size`` distinct entries from ``file_list`` at random.

    If more entries are requested than are available, a warning is printed
    and ``file_list`` itself (the same object, not a copy) is returned.
    """
    available = len(file_list)
    if sample_size <= available:
        return random.sample(file_list, sample_size)

    print(f"Warning: Requested {sample_size} samples but only {len(file_list)} available.")
    return file_list

def get_remaining_samples(file_list, selected_samples):
    """Return the entries of ``file_list`` that are not in ``selected_samples``.

    The order (and any duplicates) of ``file_list`` are preserved.
    """
    # Materialize once so one-shot iterables are handled consistently.
    selected = list(selected_samples)
    try:
        # Hash the selection once so each membership test is O(1) instead of
        # rescanning the whole selection for every candidate.
        excluded = set(selected)
        return [sample for sample in file_list if sample not in excluded]
    except TypeError:
        # Unhashable entries: fall back to linear membership tests.
        return [sample for sample in file_list if sample not in selected]

def preprocess_image(image_path, target_size):
    """Load an image, resize it, and divide its RGB pixel values by 255.

    Args:
        image_path: Path of the image file to open.
        target_size: Size passed to ``Image.resize``.

    Returns:
        The resized image, converted to RGB, as a numpy array divided by 255,
        or ``None`` if the image could not be opened or processed. Errors are
        printed rather than raised so one bad file does not abort a batch.
    """
    try:
        # The context manager releases the underlying file handle even when
        # resizing or conversion raises.
        with Image.open(image_path) as img:
            return np.array(img.resize(target_size).convert("RGB")) / 255.
    except Exception as e:
        # Deliberately broad: any failure means "skip this file".
        print(f"Error processing image {image_path}: {e}")
        return None

def save_pickle_file(data, file_path):
    """Pickle ``data`` and write it to ``file_path``.

    The file is opened in binary write mode, so existing content is replaced.
    """
    with open(file_path, mode='wb') as sink:
        pkl.Pickler(sink).dump(data)

def create_batches_with_labels(file_paths, folder, label, target_size=(256, 256)):
    """Load the images named in ``file_paths`` and pair each with ``label``.

    Args:
        file_paths: File names relative to ``folder``.
        folder: Directory that contains the files.
        label: Label attached to every image that loads successfully.
        target_size: Size each image is resized to; defaults to ``(256, 256)``.

    Returns:
        A ``(data, labels)`` tuple of numpy arrays with one entry per loaded
        image. Files whose extension is not a recognised image extension are
        reported and skipped; files that fail to load are skipped as well.
        If nothing was loaded, both arrays are empty (``np.array([])``).
    """
    valid_extensions = ('.jpg', '.jpeg', '.png', '.webp')
    data = []

    for path in file_paths:
        if os.path.splitext(path)[1].lower() not in valid_extensions:
            print(f"Skipped non-image file: {path}")
            continue

        img = preprocess_image(os.path.join(folder, path), target_size)
        if img is not None:
            data.append(img)

    if not data:
        return np.array([]), np.array([])

    # Every kept image carries the same label, so repeat it once per image.
    return np.stack(data), np.array([label] * len(data))

def _batch_bounds(total, num_batches, index):
    """Return ``(start, end)`` for batch ``index``; the last batch takes the remainder."""
    size = total // num_batches
    start = index * size
    end = total if index == num_batches - 1 else start + size
    return start, end


def _merge_class_arrays(human_data, human_labels, ai_data, ai_labels):
    """Stack the non-empty per-class arrays into a batch dict, or return None if both are empty."""
    parts = [
        (data, labels)
        for data, labels in ((human_data, human_labels), (ai_data, ai_labels))
        if data.size > 0
    ]
    if not parts:
        return None
    return {
        'data': np.vstack([data for data, _ in parts]),
        'labels': np.vstack([labels for _, labels in parts]),
    }


def _save_split_batches(human_files, ai_files, human_path, ai_path,
                        human_label, ai_label, num_batches,
                        output_prefix, desc, batch_name):
    """Process one split in ``num_batches`` slices and pickle each non-empty slice."""
    for batch in tqdm(range(num_batches), desc=desc):
        human_start, human_end = _batch_bounds(len(human_files), num_batches, batch)
        ai_start, ai_end = _batch_bounds(len(ai_files), num_batches, batch)

        human_data, human_labels = create_batches_with_labels(
            human_files[human_start:human_end], human_path, human_label)
        ai_data, ai_labels = create_batches_with_labels(
            ai_files[ai_start:ai_end], ai_path, ai_label)

        data_batch = _merge_class_arrays(human_data, human_labels, ai_data, ai_labels)
        if data_batch is None:
            print(f"Warning: {batch_name} {batch} has empty data and will not be saved.")
            continue
        if human_data.size == 0 or ai_data.size == 0:
            # Keep the images of the class that did load instead of dropping them.
            print(f"Warning: {batch_name} {batch} contains images from only one class.")

        save_pickle_file(data_batch, f"{output_prefix}{batch}.pickle")


def organize_data(human_path, ai_path, folder, num_batches=20, seed=None):
    """Organize data into training, validation, and test sets.

    The files of each class are shuffled and split 60% / 20% / remainder into
    training, validation, and test lists. The lists are pickled, the training
    and validation images are written as ``num_batches`` pickle batches each,
    and the test images are written as a single pickle file.

    Args:
        human_path: Directory listing the human images.
        ai_path: Directory listing the AI images.
        folder: Output location. It is used as a plain string prefix: the
            sub-directory names (``'train_batches/'`` etc.) are appended to it
            directly, so pass a trailing separator if ``folder`` is meant to
            be a parent directory.
        num_batches: Number of batches per split for training and validation.
        seed: Optional seed. When given, the directory listings are sorted and
            shuffled with a dedicated ``random.Random(seed)`` so the split is
            repeatable; when ``None`` the global ``random`` state is used.

    Raises:
        ValueError: If ``num_batches`` is smaller than 1.
    """
    if num_batches < 1:
        raise ValueError("num_batches must be at least 1")

    for subdir in ('file_names/', 'train_batches/', 'val_batches/', 'test_batches/'):
        os.makedirs(folder + subdir, exist_ok=True)

    # Get all files
    human_files = os.listdir(human_path)
    ai_files = os.listdir(ai_path)

    train_ratio = 0.6
    val_ratio = 0.2
    # Whatever is left after the training and validation slices is the test set.

    rng = random if seed is None else random.Random(seed)

    def split_by_ratio(files):
        if seed is not None:
            # os.listdir order is arbitrary; sort so the seed fixes the split.
            files.sort()
        rng.shuffle(files)
        train_size = int(len(files) * train_ratio)
        val_size = int(len(files) * val_ratio)
        val_end = train_size + val_size
        return files[:train_size], files[train_size:val_end], files[val_end:]

    human_train, human_val, human_test = split_by_ratio(human_files)
    ai_train, ai_val, ai_test = split_by_ratio(ai_files)

    # Save file names so the exact split can be recovered later
    save_pickle_file([human_train, ai_train], folder + 'file_names/train.pickle')
    save_pickle_file([human_val, ai_val], folder + 'file_names/val.pickle')
    save_pickle_file([human_test, ai_test], folder + 'file_names/test.pickle')

    # One-hot labels: column 0 flags "ai", column 1 flags "human".
    human_label = [0, 1]
    ai_label = [1, 0]

    # Process training data in batches
    _save_split_batches(
        human_train, ai_train, human_path, ai_path, human_label, ai_label,
        num_batches, f"{folder}train_batches/batch_",
        "Processing training batches", "Batch",
    )

    # Process validation data in batches
    _save_split_batches(
        human_val, ai_val, human_path, ai_path, human_label, ai_label,
        num_batches, f"{folder}val_batches/val_batch_",
        "Processing validation batches", "Validation batch",
    )

    # Process test data as one batch
    human_test_data, human_test_labels = create_batches_with_labels(human_test, human_path, human_label)
    ai_test_data, ai_test_labels = create_batches_with_labels(ai_test, ai_path, ai_label)

    test_batch = _merge_class_arrays(human_test_data, human_test_labels, ai_test_data, ai_test_labels)
    if test_batch is None:
        print("Warning: Test data is empty and will not be saved.")
    else:
        if human_test_data.size == 0 or ai_test_data.size == 0:
            print("Warning: Test data contains images from only one class.")
        save_pickle_file(test_batch, f"{folder}test_batches/test_batch.pickle")

# Build the train/validation/test pickles from the Real (human) and Fake (AI) image folders.
# NOTE: organize_data appends its sub-directory names directly onto `folder`,
# so with no trailing slash the outputs land in "train_data/normalfile_names/",
# "train_data/normaltrain_batches/", "train_data/normalval_batches/" and
# "train_data/normaltest_batches/".
organize_data(
    human_path="train_data/normaldataset/Test/Real",
    ai_path="train_data/normaldataset/Test/Fake",
    folder="train_data/normal"
)

Processing training batches: 100%|██████████| 20/20 [07:58<00:00, 23.94s/it]
Processing validation batches: 100%|██████████| 20/20 [02:39<00:00,  7.98s/it]
