<a href="https://colab.research.google.com/github/MahiKhan5360/Segmentation-using-Capsule-layers-and-CNN/blob/main/Dataset_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Check if running in Colab
def is_colab():
    """Return True when the `google.colab` package can be imported, else False.

    Only ImportError is treated as "not in Colab"; other exceptions
    (including KeyboardInterrupt) are allowed to propagate instead of being
    silently swallowed.
    """
    try:
        import google.colab  # noqa: F401  (imported only to probe availability)
    except ImportError:
        return False
    return True

# Probe the environment once, report it, and mount Google Drive when on Colab
_in_colab = is_colab()
print(
    "Running in Google Colab environment."
    if _in_colab
    else "Not running in Google Colab environment."
)

if _in_colab:
    from google.colab import drive
    drive.mount('/content/drive')
    print("Google Drive mounted successfully.")

# Fix for keras.utils.generic_utils error.
# The environment variable must be set before any keras-dependent package is imported.
# NOTE(review): segmentation_models presumably selects its backend via SM_FRAMEWORK
# rather than TF_KERAS -- confirm which library this flag is meant for.
import os
os.environ['TF_KERAS'] = '1'  # Force using tf.keras instead of standalone keras

# Install required packages
# (sys and subprocess are used further down to invoke pip for missing packages)
import sys
import subprocess

# Fix for TensorFlow Addons compatibility: remove whatever version is present,
# then install the pinned 0.17.1 release.
# The leading `!` is an IPython shell escape, so these two lines only run inside a notebook.
!pip uninstall -y tensorflow-addons
!pip install tensorflow-addons==0.17.1

# Monkey patch for missing generic_utils: when the installed keras has no
# `keras.utils.generic_utils` attribute, alias it to `tf.keras.utils` so that
# attribute lookups on `keras.utils.generic_utils` resolve against tf.keras.utils.
import keras
if not hasattr(keras.utils, 'generic_utils'):
    import tensorflow as tf
    keras.utils.generic_utils = tf.keras.utils
# Check if packages are already installed to avoid reinstallation.
# Keys are the names shown in status messages; values are the pip requirement
# specifiers passed to `pip install`.
required_packages = {
    'tensorflow': 'tensorflow>=2.8.0',
    'tensorflow_addons': 'tensorflow-addons==0.17.1',  # with specific version
    'segmentation_models': 'git+https://github.com/qubvel/segmentation_models',  # Use GitHub version
    'albumentations': 'albumentations>=1.1.0',
    'opencv-python': 'opencv-python>=4.5.5',
    'scikit-image': 'scikit-image>=0.19.2',
    'matplotlib': 'matplotlib>=3.5.1',
    'pandas': 'pandas>=1.4.2',
    'tqdm': 'tqdm>=4.64.0',
    'einops': 'einops>=0.4.1',
    'timm': 'timm>=0.5.4',
    'seaborn': 'seaborn>=0.11.2'
}

# Some distributions are imported under a different module name. Probing the
# distribution name (e.g. 'opencv-python') can never succeed, which previously
# forced a pip install on every run.
_import_names = {
    'opencv-python': 'cv2',
    'scikit-image': 'skimage',
}

import importlib

# Only install packages that are not already importable
for package, install_name in required_packages.items():
    if package == 'tensorflow_addons':  # Skip as we already installed it
        continue
    try:
        importlib.import_module(_import_names.get(package, package))
    except ImportError:
        print(f"Installing {package}...")
        # Use the running interpreter's pip so the package lands in this environment
        subprocess.check_call([sys.executable, "-m", "pip", "install", install_name])
        print(f"{package} installed successfully.")
    else:
        print(f"{package} is already installed.")

# Set up TensorFlow and GPU configurations
import tensorflow as tf

# Check for GPU availability
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        # Set memory growth to avoid allocating all memory at once
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print(f"Found {len(gpus)} GPU(s). Memory growth enabled.")

        # Print GPU information.
        # `name = !cmd` is IPython shell-capture syntax (notebook only); the
        # captured output is iterated line by line below.
        gpu_info = !nvidia-smi
        print("GPU Information:")
        for line in gpu_info:
            print(line)

    except RuntimeError as e:
        print(f"GPU configuration error: {e}")
else:
    print("No GPU found. Running on CPU.")

# Set up mixed precision for faster training
try:
    policy = tf.keras.mixed_precision.Policy('mixed_float16')
    tf.keras.mixed_precision.set_global_policy(policy)
    print(f"Mixed precision policy set to: {policy.name}")
except Exception as e:
    # Best-effort: continue with the default policy, but report why it failed.
    # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
    print("Mixed precision not supported or failed to set up.")
    print(f"  Reason: {e}")

# Print TensorFlow version
print(f"TensorFlow version: {tf.__version__}")

# Verify dataset paths
import os

# Where the original images and ground-truth masks are expected on Drive
original_input_path = '/content/drive/MyDrive/ISIC2018_original/ISIC2018_Task1_2_Training_Input'
original_gt_path = '/content/drive/MyDrive/ISIC2018_original/ISIC2018_Task1_Training_GroundTruth'

# Root folder for the processed dataset
processed_base_path = '/content/drive/MyDrive/ISIC2018'

# Report entry counts for the original folders, or warn when either one is missing
_originals_present = all(
    os.path.exists(path) for path in (original_input_path, original_gt_path)
)
if not _originals_present:
    print("Warning: Original dataset not found at the specified path.")
    print(f"Expected paths: \n  {original_input_path}\n  {original_gt_path}")
else:
    input_files = len(os.listdir(original_input_path))
    gt_files = len(os.listdir(original_gt_path))
    print("Original dataset found:")
    print(f"  - Input images: {input_files}")
    print(f"  - Ground truth masks: {gt_files}")

# Make sure the processed dataset root exists, creating it when absent
if os.path.exists(processed_base_path):
    print(f"Processed dataset directory exists at: {processed_base_path}")
else:
    print(f"Creating processed dataset directory at: {processed_base_path}")
    os.makedirs(processed_base_path, exist_ok=True)


In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import cv2
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import albumentations as A
from tqdm import tqdm
import shutil
from glob import glob
import os
import tensorflow as tf

# Dataset locations; keep these identical to the values set in the setup_environment cell
original_input_path, original_gt_path, processed_base_path = (
    '/content/drive/MyDrive/ISIC2018_original/ISIC2018_Task1_2_Training_Input',
    '/content/drive/MyDrive/ISIC2018_original/ISIC2018_Task1_Training_GroundTruth',
    '/content/drive/MyDrive/ISIC2018',
)

class ISIC2018DataPreprocessor:
    """Match images with masks, split them, and write resized copies to disk.

    The processed dataset is laid out as
    ``<processed_base_path>/{train,validation,test}/{images,masks}``.
    """

    # Extensions treated as image files when scanning the original folders.
    # Anything else (e.g. text files stored alongside the data) is ignored.
    _IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.bmp', '.tif', '.tiff')

    def __init__(self,
                 original_input_path,
                 original_gt_path,
                 processed_base_path,
                 img_size=(256, 256),
                 val_split=0.15,
                 test_split=0.15):
        """
        Initialize the ISIC2018 data preprocessor.

        Creates the processed dataset directory tree as a side effect.

        Args:
            original_input_path: Path to original input images
            original_gt_path: Path to original ground truth masks
            processed_base_path: Base path for processed dataset
            img_size: Target image size (height, width)
            val_split: Validation split ratio
            test_split: Test split ratio

        Raises:
            ValueError: If a split ratio is negative or the two ratios leave
                no room for a training set (val_split + test_split >= 1).
        """
        # Validate before touching the filesystem
        if val_split < 0 or test_split < 0 or val_split + test_split >= 1:
            raise ValueError(
                "val_split and test_split must be non-negative and sum to less than 1; "
                f"got val_split={val_split}, test_split={test_split}"
            )

        self.original_input_path = original_input_path
        self.original_gt_path = original_gt_path
        self.processed_base_path = processed_base_path
        self.img_size = img_size
        self.val_split = val_split
        self.test_split = test_split

        # Create processed dataset directories
        self.train_img_dir = os.path.join(processed_base_path, 'train', 'images')
        self.train_mask_dir = os.path.join(processed_base_path, 'train', 'masks')
        self.val_img_dir = os.path.join(processed_base_path, 'validation', 'images')
        self.val_mask_dir = os.path.join(processed_base_path, 'validation', 'masks')
        self.test_img_dir = os.path.join(processed_base_path, 'test', 'images')
        self.test_mask_dir = os.path.join(processed_base_path, 'test', 'masks')

        self._create_directories()

    def _create_directories(self):
        """Create necessary directories for processed dataset."""
        directories = [
            self.train_img_dir, self.train_mask_dir,
            self.val_img_dir, self.val_mask_dir,
            self.test_img_dir, self.test_mask_dir
        ]

        for directory in directories:
            os.makedirs(directory, exist_ok=True)

        print("Created processed dataset directories.")

    @classmethod
    def _index_images(cls, directory, strip_suffix=''):
        """Map image ID -> filename for every image file found in `directory`.

        The ID is the filename without its extension, with `strip_suffix`
        removed when given. Files whose extension is not in
        `_IMAGE_EXTENSIONS` are skipped.
        """
        index = {}
        for filename in sorted(os.listdir(directory)):
            if not filename.lower().endswith(cls._IMAGE_EXTENSIONS):
                continue
            image_id = os.path.splitext(filename)[0]
            if strip_suffix:
                image_id = image_id.replace(strip_suffix, '')
            index[image_id] = filename
        return index

    def _split_ids(self, ids):
        """Split `ids` into (train, validation, test) lists using the configured ratios.

        A ratio of zero yields an empty list for that subset instead of an error.
        """
        holdout = self.val_split + self.test_split
        if holdout <= 0:
            return list(ids), [], []

        train_ids, temp_ids = train_test_split(
            ids,
            test_size=holdout,
            random_state=42
        )

        # With only one hold-out subset requested there is nothing left to split
        if self.test_split <= 0:
            return train_ids, temp_ids, []
        if self.val_split <= 0:
            return train_ids, [], temp_ids

        # Further split the hold-out part into validation and test
        val_ratio = self.val_split / holdout
        val_ids, test_ids = train_test_split(
            temp_ids,
            test_size=1 - val_ratio,
            random_state=42
        )
        return train_ids, val_ids, test_ids

    def organize_dataset(self):
        """
        Organize the dataset by matching input images with their corresponding masks
        and splitting into train, validation, and test sets.

        Returns:
            Dictionary with keys 'train_ids', 'val_ids' and 'test_ids', each
            holding the list of image IDs assigned to that subset.

        Raises:
            ValueError: If no image has a corresponding mask.
        """
        print("Organizing dataset...")

        # Map image IDs to filenames; mask names carry a '_segmentation' suffix
        input_map = self._index_images(self.original_input_path)
        mask_map = self._index_images(self.original_gt_path, strip_suffix='_segmentation')

        # Find common IDs (images that have corresponding masks)
        common_ids = sorted(set(input_map) & set(mask_map))

        print(f"Found {len(common_ids)} images with corresponding masks.")

        if not common_ids:
            raise ValueError(
                "No image/mask pairs found in "
                f"{self.original_input_path} and {self.original_gt_path}"
            )

        train_ids, val_ids, test_ids = self._split_ids(common_ids)

        print(f"Split dataset: {len(train_ids)} train, {len(val_ids)} validation, {len(test_ids)} test")

        # Copy files to their respective directories
        self._copy_files(train_ids, input_map, mask_map, self.train_img_dir, self.train_mask_dir)
        self._copy_files(val_ids, input_map, mask_map, self.val_img_dir, self.val_mask_dir)
        self._copy_files(test_ids, input_map, mask_map, self.test_img_dir, self.test_mask_dir)

        return {
            'train_ids': train_ids,
            'val_ids': val_ids,
            'test_ids': test_ids
        }

    def _copy_files(self, ids, input_map, mask_map, img_dir, mask_dir):
        """
        Resize each image/mask pair and write it to the destination directories.

        Images are saved as ``<id>.jpg`` and masks as ``<id>_mask.png``.

        Args:
            ids: List of image IDs to copy
            input_map: Mapping from image ID to input filename
            mask_map: Mapping from image ID to mask filename
            img_dir: Destination directory for images
            mask_dir: Destination directory for masks

        Raises:
            OSError: If an image or mask file cannot be read.
        """
        target_size = self.img_size[::-1]  # cv2 uses (width, height)
        split_name = os.path.basename(os.path.dirname(img_dir))

        for image_id in tqdm(ids, desc=f"Copying files to {split_name}"):
            src_img = os.path.join(self.original_input_path, input_map[image_id])
            dst_img = os.path.join(img_dir, f"{image_id}.jpg")

            src_mask = os.path.join(self.original_gt_path, mask_map[image_id])
            dst_mask = os.path.join(mask_dir, f"{image_id}_mask.png")

            # Read, resize, and save image
            img = cv2.imread(src_img)
            if img is None:
                # The None result would otherwise surface as an opaque cv2.resize error
                raise OSError(f"Could not read image: {src_img}")
            img = cv2.resize(img, target_size)
            cv2.imwrite(dst_img, img)

            # Read, resize, and save mask
            mask = cv2.imread(src_mask, cv2.IMREAD_GRAYSCALE)
            if mask is None:
                raise OSError(f"Could not read mask: {src_mask}")
            # Nearest-neighbour keeps mask values from being blended at edges
            mask = cv2.resize(mask, target_size, interpolation=cv2.INTER_NEAREST)
            # Ensure binary mask (0 or 255)
            mask = (mask > 127).astype(np.uint8) * 255
            cv2.imwrite(dst_mask, mask)

    def create_augmentation_pipeline(self):
        """
        Create data augmentation pipelines for training and validation.

        Returns:
            Dictionary containing train and validation augmentation pipelines
            under the keys 'train' and 'val'.
        """
        # Training augmentations: random flips/rotations, an optional
        # distortion (one of three), then normalization
        train_augmentation = A.Compose([
            A.HorizontalFlip(p=0.5),
            A.VerticalFlip(p=0.5),
            A.RandomRotate90(p=0.5),
            A.ShiftScaleRotate(shift_limit=0.0625, scale_limit=0.1, rotate_limit=15, p=0.5),
            A.OneOf([
                A.ElasticTransform(alpha=120, sigma=120 * 0.05, p=0.5),  # alpha_affine not passed
                A.GridDistortion(p=0.5),
                A.OpticalDistortion(distort_limit=1.0, p=0.5),  # shift_limit not passed
            ], p=0.3),

            A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
        ])

        # Validation/Test augmentations (only normalization)
        val_augmentation = A.Compose([
            A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
        ])

        return {
            'train': train_augmentation,
            'val': val_augmentation
        }