## Image Processing

The code organizes the image dataset into training and test sets for two classes: human (real) images and AI-generated images, with the AI-generated images further grouped by difficulty (easy, mid, hard) according to their file-name prefix. Random samples are selected from each group for training, and the samples that were not selected are used for testing. The images are resized, converted to RGB, and turned into numpy arrays scaled to the range [0, 1]. Training and test batches are then created and saved as pickle files for later use.

In [3]:
import os
import random
import pickle as pkl
import numpy as np
from PIL import Image
from tqdm import tqdm

# Image preprocessing
def preprocess_image(image_path, target_size=(256, 256)):
    """Load an image as an RGB array scaled to the range [0, 1].

    Args:
        image_path: Path of the image file to open.
        target_size: ``(width, height)`` passed to ``Image.resize``.

    Returns:
        A float numpy array holding the resized RGB pixels divided by 255.

    Raises:
        OSError: If Pillow cannot open or decode the file.
    """
    # The context manager releases the underlying file handle even when
    # decoding fails part-way or the file holds several frames.
    with Image.open(image_path) as img:
        # Convert first: Pillow always uses nearest-neighbour resampling for
        # palette ("P") and bilevel ("1") images, so resizing before the RGB
        # conversion would degrade those inputs.
        rgb = img.convert("RGB").resize(target_size)
    return np.array(rgb) / 255.

# Build the image data together with its labels
def create_data_with_labels(file_paths, folder, label):
    """Load every supported image in *file_paths* and pair it with *label*.

    Entries whose extension is not .jpg/.jpeg/.png/.webp (case-insensitive)
    are ignored. An image that fails to load is reported on stdout and
    skipped, so one bad file does not abort the whole batch.

    Args:
        file_paths: File names relative to *folder*.
        folder: Directory that contains the files.
        label: Label value attached to every image that loads successfully.

    Returns:
        A ``(data, labels)`` tuple of numpy arrays; ``labels`` is float32.
    """
    image_suffixes = ['.jpg', '.jpeg', '.png', '.webp']
    images, targets = [], []
    for name in file_paths:
        suffix = os.path.splitext(name)[1].lower()
        if suffix not in image_suffixes:
            continue
        try:
            pixels = preprocess_image(os.path.join(folder, name))
            images.append(pixels)
            targets.append(label)
        except Exception as e:
            print(f"Error processing {name}: {e}")
    return np.array(images), np.array(targets, dtype=np.float32)

# Draw a random sample
def get_random_samples(file_list, sample_size):
    """Return *sample_size* distinct entries picked at random from *file_list*.

    The input list is left untouched. ``random.sample`` raises ``ValueError``
    when *sample_size* is larger than the list.
    """
    chosen = random.sample(file_list, k=sample_size)
    return chosen

# Entries left over after sampling
def get_remaining_samples(file_list, selected_samples):
    """Return the entries of *file_list* that are not in *selected_samples*.

    The order (and any duplicates) of *file_list* is preserved.

    Args:
        file_list: All candidate entries.
        selected_samples: Entries to exclude.

    Returns:
        A new list with the non-selected entries.
    """
    try:
        # Set lookup keeps this linear instead of scanning the selected list
        # once per candidate.
        excluded = set(selected_samples)
        return [sample for sample in file_list if sample not in excluded]
    except TypeError:
        # Unhashable entries cannot go through a set; fall back to a plain
        # membership scan.
        return [sample for sample in file_list if sample not in selected_samples]

# Group the AI images by difficulty
def split_ai_by_difficulty(ai_files):
    """Bucket AI file names by their difficulty prefix.

    A name belongs to a bucket when it starts with ``easy``, ``mid`` or
    ``hard``, compared case-insensitively. Names with no such prefix are
    dropped.

    Returns:
        A dict with the keys ``'easy'``, ``'mid'`` and ``'hard'``, each
        mapping to the matching names in their original order.
    """
    return {
        level: [name for name in ai_files if name.lower().startswith(level)]
        for level in ('easy', 'mid', 'hard')
    }

# Write a pickle file
def save_pickle_file(data, file_path):
    """Pickle *data* into *file_path*, replacing any existing file."""
    with open(file_path, mode='wb') as handle:
        pkl.Pickler(handle).dump(data)

# Main routine that organizes the dataset
def organize_data(human_path, ai_path, folder, human_train_size=700, ai_train_size=640):
    """Split the human/AI image folders into train/test sets and pickle them.

    Three sub-directories are created under *folder*:

    * ``file_names``: the selected file-name lists (``train.pickle``,
      ``test.pickle`` and one ``train_<level>.pickle`` / ``test_<level>.pickle``
      pair per difficulty level).
    * ``train_batches``: ``batch_<level>_<i>.pickle`` dicts with ``'data'``
      and ``'labels'`` arrays.
    * ``test_batches``: one ``test_<level>_batch.pickle`` per difficulty.

    Args:
        human_path: Directory holding the human (real) images.
        ai_path: Directory holding the AI images. A file's difficulty is
            taken from its name prefix (``easy``/``mid``/``hard``).
        folder: Output root directory, with or without a trailing separator.
        human_train_size: Number of human files sampled for each difficulty's
            training list.
        ai_train_size: Number of AI files sampled for the combined
            ``train.pickle`` list.

    Raises:
        ValueError: If a requested sample size exceeds the number of files
            available (raised by ``random.sample``).
    """
    # os.path.join handles a missing trailing separator; plain string
    # concatenation would turn "out" + "file_names/" into "outfile_names/".
    names_dir = os.path.join(folder, 'file_names')
    train_dir = os.path.join(folder, 'train_batches')
    test_dir = os.path.join(folder, 'test_batches')
    for directory in (names_dir, train_dir, test_dir):
        os.makedirs(directory, exist_ok=True)

    # os.listdir order is arbitrary; sorting makes the splits reproducible
    # when the caller seeds the random module.
    human_files = sorted(os.listdir(human_path))
    ai_files = sorted(os.listdir(ai_path))

    difficulties = ('easy', 'mid', 'hard')

    # NOTE(review): each difficulty draws its human training list
    # independently from the same pool, so the three lists may overlap.
    human_train = {
        level: get_random_samples(human_files, human_train_size)
        for level in difficulties
    }
    all_human_train = human_train['easy'] + human_train['mid'] + human_train['hard']
    # Human test files are the ones not used by any training list.
    human_test = get_remaining_samples(human_files, set(all_human_train))

    ai_train = get_random_samples(ai_files, ai_train_size)
    ai_test = get_remaining_samples(ai_files, set(ai_train))

    save_pickle_file([all_human_train, ai_train], os.path.join(names_dir, 'train.pickle'))
    save_pickle_file([human_test, ai_test], os.path.join(names_dir, 'test.pickle'))

    # NOTE(review): the per-difficulty AI splits below are drawn independently
    # of the combined ai_train/ai_test split above, so a file listed as
    # training in train.pickle can appear in a per-difficulty test list.
    ai_by_difficulty = split_ai_by_difficulty(ai_files)
    train_ratio = 0.7

    def split_train_test(files):
        """Randomly split *files* into train (train_ratio) and test lists."""
        train_size = int(len(files) * train_ratio)
        train_files = get_random_samples(files, train_size)
        test_files = get_remaining_samples(files, set(train_files))
        return train_files, test_files

    # Explicit dicts replace the eval()-based lookup of local variable names.
    ai_train_by_level = {}
    ai_test_by_level = {}
    for level in difficulties:
        ai_train_by_level[level], ai_test_by_level[level] = split_train_test(
            ai_by_difficulty[level]
        )

    for level in difficulties:
        save_pickle_file(
            [human_train[level], ai_train_by_level[level]],
            os.path.join(names_dir, f'train_{level}.pickle'),
        )
    for level in difficulties:
        save_pickle_file(
            [human_test, ai_test_by_level[level]],
            os.path.join(names_dir, f'test_{level}.pickle'),
        )

    # At most num_batches * batch_size files per class end up in the training
    # batches; any further selected files are not loaded.
    num_batches = 10
    batch_size = 32
    human_label = [0, 1]
    ai_label = [1, 0]

    for difficulty in difficulties:
        ai_train_files = ai_train_by_level[difficulty]
        human_train_files = human_train[difficulty]

        for batch in tqdm(range(num_batches), desc=f"Processing {difficulty} train batches"):
            start = batch * batch_size
            stop = start + batch_size
            human_batch_files = human_train_files[start:stop]
            ai_batch_files = ai_train_files[start:stop]

            human_data, human_labels = create_data_with_labels(human_batch_files, human_path, human_label)
            ai_data, ai_labels = create_data_with_labels(ai_batch_files, ai_path, ai_label)

            # A batch is only written when both classes contributed images.
            if human_data.size == 0 or ai_data.size == 0:
                print(f"⚠️ Skipping empty batch {batch} for {difficulty}")
                continue

            data_batch = {
                'data': np.concatenate((human_data, ai_data), axis=0),
                'labels': np.concatenate((human_labels, ai_labels), axis=0)
            }

            save_pickle_file(data_batch, os.path.join(train_dir, f"batch_{difficulty}_{batch}.pickle"))

    # The human test set is the same for every difficulty, so decode it once
    # instead of once per difficulty.
    human_test_data, human_test_labels = create_data_with_labels(human_test, human_path, human_label)

    for difficulty in difficulties:
        ai_test_data, ai_test_labels = create_data_with_labels(
            ai_test_by_level[difficulty], ai_path, ai_label
        )

        if human_test_data.size == 0 or ai_test_data.size == 0:
            print(f"⚠️ Skipping empty test batch for {difficulty}")
            continue

        test_batch = {
            'data': np.concatenate((human_test_data, ai_test_data), axis=0),
            'labels': np.concatenate((human_test_labels, ai_test_labels), axis=0),
        }

        save_pickle_file(test_batch, os.path.join(test_dir, f"test_{difficulty}_batch.pickle"))
        print(f"✅ Saved {difficulty} test batch")

# Run the pipeline on the real-vs-fake face dataset
organize_data(
    human_path="train_data/real_and_fake_face/training_real",
    ai_path="train_data/real_and_fake_face/training_fake",
    folder="train_data/proccessed"
)


Processing easy train batches:  70%|███████   | 7/10 [00:47<00:12,  4.18s/it]

⚠️ Skipping empty batch 6 for easy


Processing easy train batches:  80%|████████  | 8/10 [00:48<00:06,  3.14s/it]

⚠️ Skipping empty batch 7 for easy


Processing easy train batches:  90%|█████████ | 9/10 [00:48<00:02,  2.42s/it]

⚠️ Skipping empty batch 8 for easy


Processing easy train batches: 100%|██████████| 10/10 [00:49<00:00,  4.96s/it]


⚠️ Skipping empty batch 9 for easy


Processing mid train batches: 100%|██████████| 10/10 [00:47<00:00,  4.75s/it]
Processing hard train batches:  70%|███████   | 7/10 [00:23<00:07,  2.49s/it]

⚠️ Skipping empty batch 6 for hard


Processing hard train batches:  80%|████████  | 8/10 [00:23<00:03,  1.88s/it]

⚠️ Skipping empty batch 7 for hard


Processing hard train batches:  90%|█████████ | 9/10 [00:24<00:01,  1.47s/it]

⚠️ Skipping empty batch 8 for hard


Processing hard train batches: 100%|██████████| 10/10 [00:24<00:00,  2.48s/it]

⚠️ Skipping empty batch 9 for hard





✅ Saved easy test batch
✅ Saved mid test batch
✅ Saved hard test batch
