Before running this code, place the `kaggle.json` file containing your Kaggle credentials in the root directory (`/content`).

In [None]:
# Install the third-party packages that are imported in the next cell
!pip install h5py
!pip install pillow
!pip install scikit-learn
!pip install numpy
!pip install opencv-python
!pip install torchvision

In [None]:
import h5py
import matplotlib.pyplot as plt
from PIL import Image
from io import BytesIO
import os
import random
import cv2
import numpy as np
from torchvision import transforms
import shutil
from sklearn.model_selection import train_test_split

In [None]:
# Move the Kaggle credentials file from /content into ~/.kaggle and
# restrict its permissions to read/write for the owner only
!mkdir -p ~/.kaggle
!mv /content/kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the data archive of the isic-2024-challenge competition with the Kaggle CLI
!kaggle competitions download -c isic-2024-challenge

In [None]:
# Extract the downloaded archive into /content/isic-2024-challenge
!unzip /content/isic-2024-challenge.zip  -d /content/isic-2024-challenge

In [None]:
# Directory the competition archive was extracted into; the data paths in
# the following cells are built from it
root_dir = '/content/isic-2024-challenge'

In [None]:
# Inspect the test data provided in HDF5 format: list the top-level keys and
# report, for every entry, whether it is a scalar or an array.

with h5py.File(f'{root_dir}/test-image.hdf5', 'r') as f:
    keys = list(f.keys())
    print("Keys in the file:", keys)
    for key in keys:
        dataset = f[key]
        print(f"\nInspecting '{key}':")
        # Only datasets carry a shape; a group would raise AttributeError
        # on `.shape`, so report it separately instead of crashing.
        if not isinstance(dataset, h5py.Dataset):
            print(f"'{key}' is a group, not a dataset")
        elif dataset.shape == ():
            print(f"'{key}' is a scalar with value: {dataset[()]}")
        else:
            print(f"'{key}' is an array with shape: {dataset.shape}")


In [None]:
# Display every image stored in the test HDF5 file, one figure per key

with h5py.File(f'{root_dir}/test-image.hdf5', 'r') as f:
    # Iterating an h5py file yields its top-level keys
    keys = [name for name in f]
    print("Keys in the file:", keys)

    for key in keys:
        # Each entry holds the encoded image bytes; decode them with PIL
        encoded = BytesIO(f[key][()])
        picture = Image.open(encoded)
        plt.imshow(picture)
        plt.title(key)
        plt.axis('off')
        plt.show()


In [None]:
# Show ten randomly chosen training images in a 2 x 5 grid

image_dir = f'{root_dir}/train-image/image'
images = [name for name in os.listdir(image_dir) if name.endswith('.jpg')]
random_images = random.sample(images, 10)

# Create the whole grid up front and fill one axis per sampled image
fig, axes = plt.subplots(2, 5, figsize=(15, 10))
for ax, img_name in zip(axes.flat, random_images):
    ax.imshow(Image.open(os.path.join(image_dir, img_name)))
    ax.axis('off')
    ax.set_title(img_name)

plt.tight_layout()
plt.show()


In [None]:
def remove_hair(image, kernel_size=(7, 7), threshold=12, inpaint_radius=1):
    """
    Removes hair artifacts from an image using a DullRazor-style approach.

    The image is converted to grayscale, a black-hat morphological filter
    highlights thin dark structures, the filter response is thresholded into
    a binary mask, and the masked pixels are filled in by inpainting.

    :param image: np.ndarray, RGB image (it is converted with
        cv2.COLOR_RGB2GRAY and passed to cv2.inpaint unchanged)
    :param kernel_size: tuple, size of the rectangular structuring element
        used by the black-hat filter
    :param threshold: int, black-hat responses above this value are treated
        as hair and added to the inpainting mask
    :param inpaint_radius: int, neighbourhood radius used by cv2.inpaint
    :return: np.ndarray, the inpainted image
    """
    gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, kernel_size)
    # Black-hat = closing(gray) - gray: bright where dark details smaller
    # than the kernel (e.g. hairs) were removed by the closing.
    blackhat = cv2.morphologyEx(gray, cv2.MORPH_BLACKHAT, kernel)
    _, hair_mask = cv2.threshold(blackhat, threshold, 255, cv2.THRESH_BINARY)
    return cv2.inpaint(
        image, hair_mask, inpaintRadius=inpaint_radius, flags=cv2.INPAINT_TELEA
    )


In [None]:
def preprocess_image(image_path, output_size=(224, 224), show=True):
    """
    Preprocess an image by removing hair, resizing, and normalizing
    :param image_path: str, path of the image file to load
    :param output_size: tuple, target size passed to transforms.Resize
    :param show: bool, when True (default) plot the original image next to
        the hair-removed one; pass False for batch preprocessing
    :return: torch.Tensor, the resized and normalized image
    """
    # Open inside a context manager so the file handle is released as soon
    # as the pixel data has been copied into the array.
    with Image.open(image_path) as source:
        image_np = np.array(source.convert('RGB'))
    hair_removed_image = remove_hair(image_np)

    if show:
        plt.figure(figsize=(10, 5))
        plt.subplot(1, 2, 1)
        plt.imshow(image_np)
        plt.title("Original Image")
        plt.axis('off')
        plt.subplot(1, 2, 2)
        plt.imshow(hair_removed_image)
        plt.title("After Hair Removal")
        plt.axis('off')
        plt.show()

    # The torchvision transforms below operate on a PIL image
    hair_removed_image = Image.fromarray(hair_removed_image)
    preprocess_transform = transforms.Compose([
        transforms.Resize(output_size),
        transforms.ToTensor(),
        # Per-channel mean/std (the standard ImageNet statistics)
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    image_tensor = preprocess_transform(hair_removed_image)

    return image_tensor

# Run the preprocessing pipeline on a single training image as a check
image_path = f'{root_dir}/train-image/image/ISIC_5186979.jpg'
preprocessed_image = preprocess_image(image_path)

In [None]:
def split_data(ratio):
    """
    Splits train data into two datasets - train and validation

    Every file in `{root_dir}/train-image/image` is copied into either
    `{root_dir}/train-image/train` or `{root_dir}/train-image/val`; the
    original files are left in place.
    :param ratio: float, share of the images that goes into the train set
        (passed to train_test_split as train_size)
    :return: None
    """
    base_dir = f'{root_dir}/train-image'
    source_dir = os.path.join(base_dir, 'image')
    train_dir = os.path.join(base_dir, 'train')
    val_dir = os.path.join(base_dir, 'val')
    os.makedirs(train_dir, exist_ok=True)
    os.makedirs(val_dir, exist_ok=True)

    # os.listdir returns entries in arbitrary order; sort them so the fixed
    # random_state below always produces the same split.
    all_images = sorted(
        name for name in os.listdir(source_dir)
        if os.path.isfile(os.path.join(source_dir, name))
    )
    train_images, val_images = train_test_split(all_images, train_size=ratio, random_state=42)

    for target_dir, names in ((train_dir, train_images), (val_dir, val_images)):
        for img_name in names:
            shutil.copyfile(
                os.path.join(source_dir, img_name),
                os.path.join(target_dir, img_name),
            )

    print("Dataset split completed.")
