# Basic Image Preprocessing_1

### Perona–Malik Diffusion (PMD) Filter
PMD is a type of anisotropic diffusion that reduces image noise while preserving important edges and structures, unlike simple Gaussian blur which blurs everything indiscriminately.
  
In medical imaging tasks like Pap smear classification, PMD helps keep key details (like nuclei edges) intact while smoothing out grainy background noise.

### Contrast-Limited Adaptive Histogram Equalization (CLAHE)
CLAHE enhances local contrast by dividing the image into small tiles and applying histogram equalization within each tile. A clip limit caps each tile's histogram so that noise is not over-amplified.
  
It is particularly effective on unevenly illuminated images, such as cervical cell images, where it makes subtle morphological features more visible without introducing artifacts.

### Why Combine PMD + CLAHE?
The hybrid PMD → CLAHE pipeline has been empirically shown to significantly improve CNN-based cervical cell classification:
  
Gains in accuracy (up to ~13.6%), precision, recall, and F1‑score have been reported with this combined preprocessing, especially for models such as ResNet‑50, EfficientNet, and DenseNet (source: IJEECS).

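We process each color channel separately: the denoising step and CLAHE are applied per channel, and the channels are then recombined to preserve color balance. Note that the second code cell below approximates PMD with a 3×3 median filter followed by a morphological opening, rather than running true Perona–Malik diffusion.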

In [1]:
import os
import cv2
import numpy as np
from pathlib import Path
from tqdm import tqdm

# Base directories (relative paths, resolved against the current working directory)
RAW_DATA_DIR = Path("../Dataset/Raw Dataset/SipakMed Dataset")
PREPROCESSED_DATA_DIR = Path("../Dataset/Preprocessed Dataset")

# Get all class folders.
# Path.iterdir() yields entries in arbitrary, filesystem-dependent order, so the
# list is sorted to make the processing order reproducible across runs/machines.
classes = sorted(d for d in RAW_DATA_DIR.iterdir() if d.is_dir())

def extract_and_save_crops(class_path, class_name):
    """Crop every annotated region of one class and save it as a 224x224 PNG.

    Each ``.dat`` file in ``class_path / class_name`` whose name contains
    ``_cyt`` or ``_nuc`` is read as a list of comma-separated ``x,y`` polygon
    points. The bounding rectangle of that polygon is cut out of the matching
    ``<image_id>.bmp`` image, resized to 224x224 and written to
    ``PREPROCESSED_DATA_DIR / class_name``.

    Annotation files that cannot be processed (missing or unreadable image,
    malformed or degenerate polygon, bounding box outside the image, failed
    write) are reported with ``print`` and skipped; they never abort the run.

    Args:
        class_path: ``Path`` of the class folder inside the raw dataset.
        class_name: Name of the class; used both as the nested input folder
            name and as the output sub-folder name.

    Returns:
        None. Results are written to disk.
    """
    inner_dir = class_path / class_name  # e.g., im_Dyskeratotic/im_Dyskeratotic
    output_dir = PREPROCESSED_DATA_DIR / class_name
    output_dir.mkdir(parents=True, exist_ok=True)

    # Get .dat files for cyt/nuc. Sorting keeps all annotations of one image
    # adjacent, so each image only has to be decoded once (see cache below).
    dat_files = sorted(
        f for f in inner_dir.glob("*.dat") if "_cyt" in f.name or "_nuc" in f.name
    )

    # One-entry cache: the most recently decoded image and its path.
    loaded_path = None
    loaded_img = None

    for dat_path in tqdm(dat_files, desc=f"Processing {class_name}"):
        stem_parts = dat_path.stem.split("_")
        image_id = stem_parts[0]  # '001'
        image_path = inner_dir / f"{image_id}.bmp"

        if image_path != loaded_path:
            if not image_path.exists():
                print(f"Image not found for {dat_path.name}, skipping.")
                continue
            loaded_img = cv2.imread(str(image_path))
            loaded_path = image_path

        img = loaded_img
        if img is None:
            print(f"Failed to load image: {image_path}")
            continue

        # Read (x, y) points from .dat file. A non-numeric or ragged line
        # raises ValueError; treat it like any other invalid polygon instead
        # of crashing the whole run.
        try:
            with open(dat_path, "r") as f:
                coords = [
                    [float(value) for value in line.strip().split(",")]
                    for line in f
                    if "," in line
                ]
            points = np.array(coords, dtype=np.int32)
        except ValueError:
            print(f"Invalid polygon in {dat_path.name}, skipping.")
            continue

        # A usable polygon is an (N, 2) array with at least three vertices.
        if points.ndim != 2 or points.shape[1] != 2 or len(points) < 3:
            print(f"Invalid polygon in {dat_path.name}, skipping.")
            continue

        x, y, w, h = cv2.boundingRect(points)

        # Clamp the box to the image: a negative start index would otherwise
        # be interpreted by NumPy as "count from the end" and yield a wrong
        # or empty crop, and cv2.resize raises on an empty input.
        img_h, img_w = img.shape[:2]
        x0, y0 = max(x, 0), max(y, 0)
        x1, y1 = min(x + w, img_w), min(y + h, img_h)
        if x1 <= x0 or y1 <= y0:
            print(f"Empty crop for {dat_path.name}, skipping.")
            continue

        # Crop and resize
        crop = img[y0:y1, x0:x1]
        resized_crop = cv2.resize(crop, (224, 224), interpolation=cv2.INTER_AREA)

        # Save cropped image
        label = "cyt" if "_cyt" in dat_path.name else "nuc"
        save_name = f"{image_id}_{label}_{stem_parts[-1]}.png"
        save_path = output_dir / save_name
        # cv2.imwrite signals failure through its return value.
        if not cv2.imwrite(str(save_path), resized_crop):
            print(f"Failed to write image: {save_path}")

# Run the crop extraction once per class folder; the folder's own name
# (e.g., im_Dyskeratotic) doubles as the class name.
for class_dir in classes:
    class_name = class_dir.name
    extract_and_save_crops(class_path=class_dir, class_name=class_name)

print("✅ All classes processed and saved to Preprocessed Dataset.")


Processing im_Dyskeratotic: 100%|██████████| 1626/1626 [00:48<00:00, 33.26it/s]
Processing im_Koilocytotic: 100%|██████████| 1650/1650 [00:59<00:00, 27.94it/s]
Processing im_Metaplastic: 100%|██████████| 1586/1586 [00:56<00:00, 27.97it/s]
Processing im_Parabasal: 100%|██████████| 1574/1574 [00:51<00:00, 30.65it/s]
Processing im_Superficial-Intermediate: 100%|██████████| 1662/1662 [00:55<00:00, 29.71it/s]

✅ All classes processed and saved to Preprocessed Dataset.





## Basic Image Processing and Polygon Parsing from .dat Files

In [9]:
import os
import cv2
import numpy as np
from pathlib import Path
from tqdm import tqdm

# Base directories (relative paths, resolved against the current working directory)
RAW_DATA_DIR = Path("../Dataset/Raw Dataset/SipakMed Dataset")
PREPROCESSED_DATA_DIR = Path("../Dataset/Preprocessed SipakMed_2")

# Ensure output dir exists
PREPROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)

# Get all class folders.
# Path.iterdir() yields entries in arbitrary, filesystem-dependent order, so the
# list is sorted to make the processing order reproducible across runs/machines.
classes = sorted(d for d in RAW_DATA_DIR.iterdir() if d.is_dir())

def apply_clahe(img, clip_limit=2.0, tile_grid_size=(8,8)):
    """Equalize local contrast with CLAHE.

    A 2-D (single-channel) image is equalized directly. Any other image is
    split into its channels, each channel is equalized independently with the
    same CLAHE object, and the results are merged back together.

    Args:
        img: Image array to enhance.
        clip_limit: Value passed to ``cv2.createCLAHE`` as ``clipLimit``.
        tile_grid_size: Value passed to ``cv2.createCLAHE`` as ``tileGridSize``.

    Returns:
        The contrast-enhanced image.
    """
    equalizer = cv2.createCLAHE(clipLimit=clip_limit, tileGridSize=tile_grid_size)

    if len(img.shape) != 2:
        # Multi-channel input: equalize each plane on its own, then recombine.
        equalized_planes = []
        for plane in cv2.split(img):
            equalized_planes.append(equalizer.apply(plane))
        return cv2.merge(equalized_planes)

    # Single-channel input.
    return equalizer.apply(img)

def apply_median_filter(img, ksize=3):
    """Denoise an image with a median blur, one channel at a time.

    This is a plain median filter (``cv2.medianBlur``), not an implementation
    of Perona–Malik diffusion; the notebook uses it as its denoising step.

    Args:
        img: Image array; either 2-D (single channel) or with channels on the
            third axis.
        ksize: Aperture size passed to ``cv2.medianBlur``.

    Returns:
        The filtered image.
    """
    if len(img.shape) != 2:
        # Blur every channel separately and stitch them back together.
        blurred_planes = []
        for channel_idx in range(img.shape[2]):
            blurred_planes.append(cv2.medianBlur(img[:, :, channel_idx], ksize))
        return cv2.merge(blurred_planes)

    return cv2.medianBlur(img, ksize)

def extract_and_save_crops(class_path, class_name):
    """Crop, denoise, contrast-enhance and save every annotated region of one class.

    Each ``.dat`` file in ``class_path / class_name`` whose name contains
    ``_cyt`` or ``_nuc`` is read as a list of comma-separated ``x,y`` polygon
    points. The bounding rectangle of that polygon is cut out of the matching
    ``<image_id>.bmp`` image and then processed in this order:

    1. median filter (``apply_median_filter``, 3x3),
    2. morphological opening with a 3x3 kernel,
    3. CLAHE (``apply_clahe``),
    4. resize to 224x224.

    The result is written as a PNG to ``PREPROCESSED_DATA_DIR / class_name``.

    Annotation files that cannot be processed (missing or unreadable image,
    malformed or degenerate polygon, bounding box outside the image, failed
    write) are reported with ``print`` and skipped; they never abort the run.

    Args:
        class_path: ``Path`` of the class folder inside the raw dataset.
        class_name: Name of the class; used both as the nested input folder
            name and as the output sub-folder name.

    Returns:
        None. Results are written to disk.
    """
    inner_dir = class_path / class_name
    output_dir = PREPROCESSED_DATA_DIR / class_name
    output_dir.mkdir(parents=True, exist_ok=True)

    # Get .dat files for cyt/nuc. Sorting keeps all annotations of one image
    # adjacent, so each image only has to be decoded once (see cache below).
    dat_files = sorted(
        f for f in inner_dir.glob("*.dat") if "_cyt" in f.name or "_nuc" in f.name
    )

    # Structuring element for the morphological opening; loop-invariant.
    open_kernel = np.ones((3, 3), np.uint8)

    # One-entry cache: the most recently decoded image and its path.
    loaded_path = None
    loaded_img = None

    for dat_path in tqdm(dat_files, desc=f"Processing {class_name}"):
        stem_parts = dat_path.stem.split("_")
        image_id = stem_parts[0]
        image_path = inner_dir / f"{image_id}.bmp"

        if image_path != loaded_path:
            if not image_path.exists():
                print(f"Image not found for {dat_path.name}, skipping.")
                continue
            loaded_img = cv2.imread(str(image_path))
            loaded_path = image_path

        img = loaded_img
        if img is None:
            print(f"Failed to load image: {image_path}")
            continue

        # Parse polygon from .dat. A non-numeric or ragged line raises
        # ValueError; treat it like any other invalid polygon instead of
        # crashing the whole run.
        try:
            with open(dat_path, "r") as f:
                coords = [
                    [float(value) for value in line.strip().split(",")]
                    for line in f
                    if "," in line
                ]
            points = np.array(coords, dtype=np.int32)
        except ValueError:
            print(f"Invalid polygon in {dat_path.name}, skipping.")
            continue

        # A usable polygon is an (N, 2) array with at least three vertices.
        if points.ndim != 2 or points.shape[1] != 2 or len(points) < 3:
            print(f"Invalid polygon in {dat_path.name}, skipping.")
            continue

        x, y, w, h = cv2.boundingRect(points)

        # Clamp the box to the image: a negative start index would otherwise
        # be interpreted by NumPy as "count from the end" and yield a wrong
        # or empty crop, and the OpenCV calls below fail on an empty input.
        img_h, img_w = img.shape[:2]
        x0, y0 = max(x, 0), max(y, 0)
        x1, y1 = min(x + w, img_w), min(y + h, img_h)
        if x1 <= x0 or y1 <= y0:
            print(f"Empty crop for {dat_path.name}, skipping.")
            continue

        # Crop region of interest
        crop = img[y0:y1, x0:x1]

        # --- Denoising: 3x3 median filter (stand-in for Perona–Malik diffusion)
        crop_pmd = apply_median_filter(crop, ksize=3)

        # --- Morphological opening to smooth small noise (can be tuned)
        crop_pmd = cv2.morphologyEx(crop_pmd, cv2.MORPH_OPEN, open_kernel)

        # --- CLAHE for contrast enhancement
        crop_clahe = apply_clahe(crop_pmd)

        # Resize
        resized_crop = cv2.resize(crop_clahe, (224, 224), interpolation=cv2.INTER_AREA)

        # Save processed crop
        label = "cyt" if "_cyt" in dat_path.name else "nuc"
        save_name = f"{image_id}_{label}_{stem_parts[-1]}.png"
        save_path = output_dir / save_name
        # cv2.imwrite signals failure through its return value.
        if not cv2.imwrite(str(save_path), resized_crop):
            print(f"Failed to write image: {save_path}")

# Run the full crop -> denoise -> CLAHE -> resize pipeline once per class
# folder; the folder's own name doubles as the class name.
for class_dir in classes:
    class_name = class_dir.name
    extract_and_save_crops(class_path=class_dir, class_name=class_name)

print("✅ All classes processed and saved to Preprocessed SipakMed_2 (with PMD & CLAHE).")


Processing im_Dyskeratotic: 100%|██████████| 1626/1626 [00:56<00:00, 28.86it/s]
Processing im_Koilocytotic: 100%|██████████| 1650/1650 [00:58<00:00, 28.18it/s]
Processing im_Metaplastic: 100%|██████████| 1586/1586 [00:56<00:00, 27.86it/s]
Processing im_Parabasal: 100%|██████████| 1574/1574 [00:50<00:00, 31.19it/s]
Processing im_Superficial-Intermediate: 100%|██████████| 1662/1662 [00:56<00:00, 29.58it/s]

✅ All classes processed and saved to Preprocessed SipakMed_2 (with PMD & CLAHE).



