In [None]:
# Hair removal: locate hair with a black-hat filter, then inpaint over it.

import cv2
import numpy as np

def remove_hair(img, kernel_size=17, threshold=10, inpaint_radius=1):
    """Remove hair-like structures from an image by black-hat masking and inpainting.

    The image is converted to grayscale, a morphological black-hat transform
    (closing minus input) is applied to it, the result is thresholded into a
    binary mask, and the masked pixels are filled in with Telea inpainting.

    Args:
        img: Image array in OpenCV's BGR channel order (it is converted with
            ``cv2.COLOR_BGR2GRAY``).
        kernel_size: Side length, in pixels, of the cross-shaped structuring
            element used for the black-hat transform.
        threshold: Black-hat responses strictly greater than this value are
            treated as hair.
        inpaint_radius: Neighbourhood radius passed to ``cv2.inpaint``.

    Returns:
        The inpainted image as returned by ``cv2.inpaint``.
    """
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Black-hat filtering to find hair. MORPH_CROSS is the named form of the
    # shape code 1 that was previously passed as a bare integer.
    kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (kernel_size, kernel_size))
    blackhat = cv2.morphologyEx(gray, cv2.MORPH_BLACKHAT, kernel)

    # Threshold to create a binary (0 / 255) hair mask.
    _, hair_mask = cv2.threshold(blackhat, threshold, 255, cv2.THRESH_BINARY)

    # Inpaint the masked pixels from their surroundings.
    return cv2.inpaint(img, hair_mask, inpaint_radius, cv2.INPAINT_TELEA)


In [None]:
# Reduce color bias (gray-world white balance).
def gray_world_normalization(img):
    """Balance the colour channels of an image using the gray-world assumption.

    Each of the first three channels is multiplied by ``gray_mean / channel_mean``,
    where ``gray_mean`` is the average of the three channel means, so that all
    three channels end up with the same mean.

    Args:
        img: Array indexed as ``img[row, col, channel]`` with at least three
            channels. It is not modified; the work is done on a float32 copy.

    Returns:
        A ``uint8`` array of the same shape with values clipped to ``[0, 255]``.

    Notes:
        A channel whose mean is exactly zero is left unscaled. Without this
        guard the scale factor would be a division by zero, producing inf/NaN
        values that cannot be meaningfully cast to ``uint8``.
    """
    balanced = img.astype(np.float32)  # astype copies, so the caller's array is untouched
    if balanced.size == 0:
        # Nothing to balance; avoids taking the mean of an empty slice.
        return balanced.astype('uint8')

    channel_means = [np.mean(balanced[:, :, c]) for c in range(3)]
    gray_mean = (channel_means[0] + channel_means[1] + channel_means[2]) / 3

    for c, channel_mean in enumerate(channel_means):
        if channel_mean != 0:
            balanced[:, :, c] *= gray_mean / channel_mean

    return np.clip(balanced, 0, 255).astype('uint8')


In [None]:
# Darkening augmentation intended to help the model perform better on darker skin tones.
def dark_skin_augment(img, strength=0.25):
    """Darken an image and slightly boost its saturation in HSV space.

    Args:
        img: Image array in BGR channel order (converted with
            ``cv2.COLOR_BGR2HSV``).
        strength: Amount of darkening; 0 leaves the image unchanged and 0.5 is
            a strong darkening. The value channel is scaled by ``1 - strength``
            and the saturation channel by ``1 + strength / 2``.

    Returns:
        The adjusted image converted back to BGR.
    """
    hsv_pixels = cv2.cvtColor(img, cv2.COLOR_BGR2HSV).astype(np.float32)

    saturation_gain = 1 + strength / 2   # mild saturation increase
    value_gain = 1 - strength            # brightness reduction
    hsv_pixels[:, :, 1] *= saturation_gain
    hsv_pixels[:, :, 2] *= value_gain

    # Clamp back into the 8-bit range before converting to BGR.
    hsv_u8 = np.clip(hsv_pixels, 0, 255).astype('uint8')
    return cv2.cvtColor(hsv_u8, cv2.COLOR_HSV2BGR)


In [None]:
import torchvision.transforms as T

# Settings shared by the training and test pipelines.
INPUT_SIZE = (224, 224)
# Per-channel mean / std used for normalisation (the widely used ImageNet statistics).
NORM_MEAN = [0.485, 0.456, 0.406]
NORM_STD = [0.229, 0.224, 0.225]

# Training: resize, random flip / rotation / colour jitter, then tensor + normalise.
transform_train = T.Compose([
    T.ToPILImage(),
    T.Resize(INPUT_SIZE),
    T.RandomHorizontalFlip(),
    T.RandomRotation(20),
    T.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
    T.ToTensor(),
    T.Normalize(NORM_MEAN, NORM_STD),
])

# Evaluation: deterministic resize, then tensor + normalise (no augmentation).
transform_test = T.Compose([
    T.ToPILImage(),
    T.Resize(INPUT_SIZE),
    T.ToTensor(),
    T.Normalize(NORM_MEAN, NORM_STD),
])


In [None]:
def preprocess_image(img, dark_skin_aug=False):
    """Run the preprocessing chain on a single image.

    Applies ``remove_hair`` and then ``gray_world_normalization``. When
    ``dark_skin_aug`` is true, ``dark_skin_augment`` with ``strength=0.3`` is
    applied as a final step.

    Args:
        img: Image array accepted by ``remove_hair``.
        dark_skin_aug: Whether to apply the optional darkening augmentation.

    Returns:
        The processed image.
    """
    # Steps 1 + 2: hair removal followed by colour normalisation.
    cleaned = gray_world_normalization(remove_hair(img))

    # Step 3 is optional: return early when the augmentation is disabled.
    if not dark_skin_aug:
        return cleaned
    return dark_skin_augment(cleaned, strength=0.3)


In [None]:
import kagglehub

# Download the latest version of the dataset; kagglehub returns the local
# directory it was stored in.
# NOTE(review): the output of the preprocessing cell further down reports
# HAM10000_images_part_1 / HAM10000_images_part_2 as missing under this path —
# confirm that this is the dataset that actually holds the lesion images.
DATASET_HANDLE = "tschandl/ham10000-lesion-segmentations"
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/tschandl/ham10000-lesion-segmentations?dataset_version_number=1...


100%|██████████| 10.3M/10.3M [00:00<00:00, 48.4MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/tschandl/ham10000-lesion-segmentations/versions/1


In [None]:
import os
import cv2
from tqdm import tqdm

# ----- your pipeline -----
def preprocess_image(img, dark_skin_aug=False):
    """Hair removal, gray-world normalisation and optional darkening.

    Note: this re-declares ``preprocess_image`` with the same steps as the
    definition in the earlier cell; once this cell runs, this is the version
    in effect.

    Args:
        img: Image array accepted by ``remove_hair``.
        dark_skin_aug: When true, ``dark_skin_augment`` (``strength=0.3``) is
            applied after normalisation.

    Returns:
        The processed image.
    """
    hairless = remove_hair(img)
    balanced = gray_world_normalization(hairless)
    return dark_skin_augment(balanced, strength=0.3) if dark_skin_aug else balanced
# --------------------------


# ROOT = the folder returned by kagglehub
ROOT = path

# Source folders expected to hold the raw .jpg images.
input_dirs = [
    os.path.join(ROOT, "HAM10000_images_part_1"),
    os.path.join(ROOT, "HAM10000_images_part_2")
]

# Processed images are written here under their original file names.
output_dir = os.path.join(ROOT, "HAM10000_preprocessed")
os.makedirs(output_dir, exist_ok=True)

n_written = 0
for folder in input_dirs:
    if not os.path.isdir(folder):
        print("Missing:", folder)
        continue

    # Sorted so the processing order is the same on every run.
    for fname in tqdm(sorted(os.listdir(folder)), desc=f"Processing {folder}"):
        if not fname.lower().endswith(".jpg"):
            continue

        fpath = os.path.join(folder, fname)
        img = cv2.imread(fpath)  # cv2.imread yields BGR channel order
        if img is None:
            print("Bad image:", fpath)
            continue

        # The pipeline helpers convert with COLOR_BGR2GRAY / COLOR_BGR2HSV, so
        # they must receive the BGR image as loaded. Converting to RGB first
        # would swap the red/blue weights of the grayscale used for hair
        # detection.
        processed = preprocess_image(img)

        # cv2.imwrite signals failure through its return value, not an exception.
        out_path = os.path.join(output_dir, fname)
        if cv2.imwrite(out_path, processed):
            n_written += 1
        else:
            print("Failed to write:", out_path)

print(f"Wrote {n_written} preprocessed image(s) to {output_dir}")


Missing: /root/.cache/kagglehub/datasets/tschandl/ham10000-lesion-segmentations/versions/1/HAM10000_images_part_1
Missing: /root/.cache/kagglehub/datasets/tschandl/ham10000-lesion-segmentations/versions/1/HAM10000_images_part_2
