## Image Processing

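The code organizes the dataset of images into training, validation, and test sets for two classes: human (real) images and AI-generated images, with the AI images further grouped into easy, mid, and hard difficulty levels according to their file-name prefix. The files in each group are shuffled and split 60/20/20; the first part is used for training and the remaining samples are used for validation and testing. The images are resized to 256×256, converted to RGB, scaled to the range [0, 1], and stored as NumPy arrays. Training, validation, and test batches are created and saved as pickle files for later use.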

In [1]:
import os
import random
import pickle as pkl
import numpy as np
from PIL import Image
from tqdm import tqdm

# Image preprocessing
def preprocess_image(image_path, target_size=(256, 256)):
    """Load an image, resize it, and return it as a scaled RGB array.

    Args:
        image_path: Path of the image file to open.
        target_size: Size passed to ``Image.resize``; defaults to (256, 256).

    Returns:
        A NumPy array holding the resized RGB image, with every pixel value
        divided by 255.
    """
    # Image.open is lazy and keeps the underlying file open; the context
    # manager releases the handle deterministically, even for multi-frame
    # formats that Pillow does not close automatically after loading.
    with Image.open(image_path) as img:
        resized = img.resize(target_size)
        return np.array(resized.convert("RGB")) / 255.

# Build image data together with its labels
def create_data_with_labels(file_paths, folder, label):
    """Preprocess the image files in `folder` and pair each with `label`.

    Args:
        file_paths: File names (relative to `folder`) to consider.
        folder: Directory that contains the files.
        label: Label attached to every successfully processed image.

    Returns:
        A tuple ``(data, labels)`` of NumPy arrays; ``labels`` has dtype
        float32. Files without a .jpg/.jpeg/.png/.webp extension are
        ignored, and files that fail to process are reported and skipped.
    """
    allowed_exts = ['.jpg', '.jpeg', '.png', '.webp']
    images, targets = [], []

    for name in file_paths:
        _, ext = os.path.splitext(name)
        if ext.lower() not in allowed_exts:
            continue
        try:
            pixels = preprocess_image(os.path.join(folder, name))
        except Exception as e:
            # Best effort: report the failing file and keep going.
            print(f"Error processing {name}: {e}")
            continue
        images.append(pixels)
        targets.append(label)

    return np.array(images), np.array(targets, dtype=np.float32)

# Group AI files by difficulty
def split_ai_by_difficulty(ai_files):
    """Group AI file names by their difficulty prefix.

    A file belongs to a level when its lower-cased name starts with
    "easy", "mid" or "hard"; names matching none of them are dropped.

    Returns:
        A dict with the keys 'easy', 'mid' and 'hard', each mapping to the
        matching file names in their original order.
    """
    return {
        level: [name for name in ai_files if name.lower().startswith(level)]
        for level in ("easy", "mid", "hard")
    }

# Save a pickle file
def save_pickle_file(data, file_path):
    """Pickle `data` to `file_path`, creating parent directories as needed.

    Args:
        data: Any picklable object.
        file_path: Destination path; may be a bare file name.
    """
    parent_dir = os.path.dirname(file_path)
    # dirname() is '' for a bare file name, and os.makedirs('') raises
    # FileNotFoundError, so only create a directory when there is one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(file_path, 'wb') as f:
        pkl.dump(data, f)

# Main routine that organizes the dataset
def organize_data(human_path, ai_path, folder, seed=None):
    """Split human and AI images into train/val/test sets and save batches.

    AI files are grouped by difficulty (easy/mid/hard) and each group is
    shuffled and split 60/20/20. Human files are shuffled and split 60/20/20
    once; every difficulty then receives its own reshuffled copy of each
    human split, so a human image never ends up in the training split of one
    difficulty and in the validation/test split of another.

    Written under `folder`:
        file_names/{train,val,test}_{difficulty}.pickle
            ``[human_file_names, ai_file_names]`` for that split.
        train_batches/batch_{difficulty}_{i}.pickle
            Up to 10 batches, each built from 32 human and 32 AI files.
        val_batches/val_{difficulty}_batch.pickle
        test_batches/test_{difficulty}_batch.pickle
            One balanced batch each (at most 32 files per class).

    Each batch is a dict with 'data' (human images followed by AI images)
    and 'labels' (one-hot rows: ``[0, 1]`` for human, ``[1, 0]`` for AI).

    Args:
        human_path: Directory containing the human (real) images.
        ai_path: Directory containing the AI images, whose names start with
            "easy", "mid" or "hard".
        folder: Output root directory (with or without a trailing slash).
        seed: Optional seed for reproducible splits. When None, the
            module-level ``random`` generator is used.
    """
    difficulties = ('easy', 'mid', 'hard')
    split_names = ('train', 'val', 'test')

    for sub_dir in ('file_names', 'train_batches', 'val_batches', 'test_batches'):
        os.makedirs(os.path.join(folder, sub_dir), exist_ok=True)

    # os.listdir order is arbitrary; sorting makes a seeded run reproducible.
    human_files = sorted(os.listdir(human_path))
    ai_files = sorted(os.listdir(ai_path))

    print(f"Total human files: {len(human_files)}")
    print(f"Total AI files: {len(ai_files)}")

    # Split ratios; the test split receives whatever remains (about 0.2).
    train_ratio = 0.6
    val_ratio = 0.2

    rng = random.Random(seed) if seed is not None else random

    # Group AI files by difficulty
    ai_by_difficulty = split_ai_by_difficulty(ai_files)
    for diff, files in ai_by_difficulty.items():
        print(f"{diff.capitalize()} AI files: {len(files)}")

    def split_by_ratio(files):
        """Shuffle a copy of `files` and return its train/val/test parts."""
        shuffled = list(files)
        rng.shuffle(shuffled)
        train_size = int(len(shuffled) * train_ratio)
        val_size = int(len(shuffled) * val_ratio)
        return {
            'train': shuffled[:train_size],
            'val': shuffled[train_size:train_size + val_size],
            'test': shuffled[train_size + val_size:],
        }

    # Split the human files ONCE so that train/val/test membership is the
    # same for every difficulty; re-splitting per difficulty would let one
    # image be a training sample for "easy" and a test sample for "mid".
    human_pools = split_by_ratio(human_files)

    human_splits = {}
    ai_splits = {}
    for diff in difficulties:
        # Each difficulty gets its own ordering of the shared human pools.
        human_splits[diff] = {
            split: rng.sample(pool, len(pool))
            for split, pool in human_pools.items()
        }
        ai_splits[diff] = split_by_ratio(ai_by_difficulty[diff])

    # Save the file-name lists per split and difficulty
    for split in split_names:
        for diff in difficulties:
            save_pickle_file(
                [human_splits[diff][split], ai_splits[diff][split]],
                os.path.join(folder, 'file_names', f'{split}_{diff}.pickle'),
            )

    # Batch settings
    num_batches = 10
    batch_size = 32
    # One-hot layout is [AI/fake, human/real] for both labels.
    human_label = [0, 1]
    ai_label = [1, 0]

    def build_batch(human_names, ai_names):
        """Load both classes and merge them; return None if either is empty."""
        human_data, human_labels = create_data_with_labels(human_names, human_path, human_label)
        ai_data, ai_labels = create_data_with_labels(ai_names, ai_path, ai_label)
        if human_data.size == 0 or ai_data.size == 0:
            return None
        return {
            'data': np.concatenate((human_data, ai_data), axis=0),
            'labels': np.concatenate((human_labels, ai_labels), axis=0),
        }

    # Build the training batches
    for difficulty in difficulties:
        ai_train_files = ai_splits[difficulty]['train']
        human_train_files = human_splits[difficulty]['train']

        # Only produce batches that can be filled completely by both classes.
        max_batch = min(
            len(ai_train_files) // batch_size,
            len(human_train_files) // batch_size,
            num_batches,
        )

        for batch in tqdm(range(max_batch), desc=f"Processing {difficulty} train batches"):
            start = batch * batch_size
            stop = start + batch_size
            data_batch = build_batch(human_train_files[start:stop], ai_train_files[start:stop])

            if data_batch is None:
                print(f"⚠️ Skipping empty batch {batch} for {difficulty}")
                continue

            save_pickle_file(
                data_batch,
                os.path.join(folder, 'train_batches', f'batch_{difficulty}_{batch}.pickle'),
            )

    # Build the validation batches, then the test batches
    for split, split_text in (('val', 'validation'), ('test', 'test')):
        for difficulty in difficulties:
            ai_eval_files = ai_splits[difficulty][split]
            human_eval_files = human_splits[difficulty][split]

            eval_size = min(len(ai_eval_files), len(human_eval_files), batch_size * 2)

            if eval_size < batch_size:
                print(f"⚠️ Warning: Not enough {split_text} data for {difficulty}. Using all available.")

            # Half of the batch comes from each class to keep it balanced.
            per_class = eval_size // 2
            eval_batch = build_batch(human_eval_files[:per_class], ai_eval_files[:per_class])

            if eval_batch is None:
                print(f"⚠️ Skipping empty {split_text} batch for {difficulty}")
                continue

            save_pickle_file(
                eval_batch,
                os.path.join(folder, f'{split}_batches', f'{split}_{difficulty}_batch.pickle'),
            )
            print(f"✅ Saved {difficulty} {split_text} batch")

# Run the pipeline on the real/fake face dataset.
# `folder` is given with a trailing slash so that sub-paths appended to it
# land inside the directory.
organize_data(
    human_path="train_data/real_and_fake_face/training_real",
    ai_path="train_data/real_and_fake_face/training_fake",
    folder="train_data/processed/"
)

Total human files: 1081
Total AI files: 960
Easy AI files: 240
Mid AI files: 480
Hard AI files: 240


Processing easy train batches: 100%|██████████| 4/4 [00:19<00:00,  4.78s/it]
Processing mid train batches: 100%|██████████| 9/9 [00:42<00:00,  4.71s/it]
Processing hard train batches: 100%|██████████| 4/4 [00:18<00:00,  4.64s/it]


✅ Saved easy validation batch
✅ Saved mid validation batch
✅ Saved hard validation batch
✅ Saved easy test batch
✅ Saved mid test batch
✅ Saved hard test batch
