# In [None]:
import os
import cv2
import numpy as np
import shutil
import random
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split

# Set the dataset folder path
DATASET_DIR = "path/to/your/folder"  # Change this to your actual path
OUTPUT_DIR = "processed_data"  # Output folder for processed images

# Select only relevant folders (Upper GI tract: Esophagus, Stomach, Duodenum)
# NOTE(review): "Mucosal inflammation large bowel" reads like a lower-GI
# class -- confirm that it is meant to be part of this selection.
UPPER_GI_FOLDERS = [
    "Barrett's esophagus", "Esophageal varices", "Esophagitis", "Gastroesophageal_junction_normal z-line",
    "Gastric polyps", "Erythema", "Mucosal inflammation large bowel", "Normal stomach",
    "Duodenal bulb", "Ulcer"
]

# Image preprocessing parameters
IMG_SIZE = (224, 224)  # Resize to 224x224 (passed to cv2.resize as the target size)
VALID_SPLIT = 0.1  # 10% validation
TEST_SPLIT = 0.1   # 10% test

# Ensure output directories exist.
# exist_ok=True avoids the check-then-create race of a separate exists() test.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Function to preprocess images
def preprocess_and_save_images():
    """Resize the images of every selected class and store them in OUTPUT_DIR.

    For each folder named in UPPER_GI_FOLDERS, every entry of
    ``DATASET_DIR/<folder>`` is read with OpenCV, resized to IMG_SIZE and
    written under the same file name to ``OUTPUT_DIR/<folder>``.

    Entries OpenCV cannot decode are skipped with a message. Any other
    per-file failure (including a failed write) is reported and processing
    continues with the next file.

    Raises:
        OSError: If a class folder below DATASET_DIR cannot be listed
            (for example because it does not exist).
    """
    for folder in UPPER_GI_FOLDERS:
        input_path = os.path.join(DATASET_DIR, folder)
        output_path = os.path.join(OUTPUT_DIR, folder)
        os.makedirs(output_path, exist_ok=True)

        for filename in os.listdir(input_path):
            img_path = os.path.join(input_path, filename)

            try:
                # cv2.imread signals an unreadable file by returning None
                # instead of raising.
                img = cv2.imread(img_path)
                if img is None:
                    print(f"Skipping corrupted image: {filename}")
                    continue

                # Resize image
                img = cv2.resize(img, IMG_SIZE)

                # The file on disk stores 8-bit pixels, so scaling to [0, 1]
                # and back before writing changes nothing in the output; the
                # resized uint8 image is written directly. Scaling to [0, 1]
                # belongs to the step that loads the data for training.
                save_path = os.path.join(output_path, filename)

                # cv2.imwrite reports a failed write through its return
                # value, so it has to be checked explicitly.
                if not cv2.imwrite(save_path, img):
                    print(f"Error processing {filename}: could not write {save_path}")

            except Exception as e:
                # Best effort: one bad file must not abort the whole run.
                print(f"Error processing {filename}: {e}")

# Function to split dataset into train/val/test
def split_dataset():
    """Move the images of every class into train/val/test sub-folders.

    For each folder named in UPPER_GI_FOLDERS, the entries of
    ``OUTPUT_DIR/<folder>`` are partitioned and moved to
    ``OUTPUT_DIR/<split>/<folder>`` with ``<split>`` being ``train``,
    ``val`` or ``test``.

    TEST_SPLIT and VALID_SPLIT are both interpreted as fractions of the
    whole class, so the defaults give roughly 80/10/10. A class with fewer
    than three entries cannot fill three splits; it is kept entirely in
    ``train`` and a message is printed. This also makes a second run over
    already emptied folders a no-op instead of an error.

    Raises:
        OSError: If ``OUTPUT_DIR/<folder>`` cannot be listed.
    """
    for folder in UPPER_GI_FOLDERS:
        folder_path = os.path.join(OUTPUT_DIR, folder)
        # os.listdir yields entries in arbitrary order; sorting is required
        # for random_state to produce the same split on every run/machine.
        images = sorted(os.listdir(folder_path))

        if len(images) < 3:
            print(f"Too few images to split in {folder_path} ({len(images)}); keeping all in train")
            train, val, test = images, [], []
        else:
            train, test = train_test_split(images, test_size=TEST_SPLIT, random_state=42)

            # Taking VALID_SPLIT of the *remaining* data would shrink the
            # validation set (0.1 * 0.9 = 9%). Use an absolute count derived
            # from the full class size instead; an int test_size is an
            # absolute number of samples. Keep at least one training item.
            val_count = min(max(1, round(VALID_SPLIT * len(images))), len(train) - 1)
            if val_count >= 1:
                train, val = train_test_split(train, test_size=val_count, random_state=42)
            else:
                val = []

        for split, split_data in zip(["train", "val", "test"], [train, val, test]):
            split_folder = os.path.join(OUTPUT_DIR, split, folder)
            os.makedirs(split_folder, exist_ok=True)
            for img in split_data:
                shutil.move(os.path.join(folder_path, img), os.path.join(split_folder, img))

# Function to apply Data Augmentation
def augment_images():
    """Write two randomly augmented copies of every training image.

    For each folder named in UPPER_GI_FOLDERS, every ``.png``/``.jpg``/
    ``.jpeg`` file (extension matched case-insensitively) in
    ``OUTPUT_DIR/train/<folder>`` is loaded, resized to IMG_SIZE and passed
    through a Keras ImageDataGenerator (rotation, shifts, brightness,
    horizontal flip). The results are stored next to the source image as
    ``aug_<i>_<original name>``.

    Files whose name already starts with ``aug_`` are treated as output of
    an earlier run and are not augmented again. Unreadable files are
    skipped with a message.

    Raises:
        OSError: If ``OUTPUT_DIR/train/<folder>`` cannot be listed.
    """
    datagen = ImageDataGenerator(
        rotation_range=20,
        width_shift_range=0.1,
        height_shift_range=0.1,
        brightness_range=[0.8, 1.2],
        horizontal_flip=True
    )

    for folder in UPPER_GI_FOLDERS:
        folder_path = os.path.join(OUTPUT_DIR, "train", folder)
        # The listing is taken once, before any file is written, so copies
        # created below are never picked up within the same run.
        images = [
            f for f in os.listdir(folder_path)
            if f.lower().endswith(('.png', '.jpg', '.jpeg')) and not f.startswith("aug_")
        ]

        for img_name in images:
            img_path = os.path.join(folder_path, img_name)
            img = cv2.imread(img_path)
            if img is None:
                # cv2.imread returns None for files it cannot decode.
                print(f"Skipping corrupted image: {img_name}")
                continue
            img = cv2.resize(img, IMG_SIZE)

            # Keep pixel values in [0, 255]: the generator's brightness step
            # round-trips through an 8-bit PIL image, so feeding [0, 1]
            # floats would destroy the picture.
            img = img.astype(np.float32)

            img = np.expand_dims(img, axis=0)  # Add batch dimension
            aug_iter = datagen.flow(img, batch_size=1)

            for i in range(2):  # Generate 2 augmented versions per image
                aug_img = next(aug_iter)[0]
                # Round and clip before the cast so that interpolation
                # results cannot wrap around in uint8.
                aug_img = np.clip(np.rint(aug_img), 0, 255).astype(np.uint8)

                aug_name = f"aug_{i}_{img_name}"
                aug_path = os.path.join(folder_path, aug_name)
                # cv2.imwrite reports failure through its return value.
                if not cv2.imwrite(aug_path, aug_img):
                    print(f"Error processing {img_name}: could not write {aug_path}")

# Execute the pipeline stages in order: resize -> split -> augment.
# Each stage works on the files the previous one left in OUTPUT_DIR.
for pipeline_step in (preprocess_and_save_images, split_dataset, augment_images):
    pipeline_step()

print("Preprocessing complete! Images saved in:", OUTPUT_DIR)
