In [25]:
import pandas as pd
from pathlib import Path 
from torch.utils.data import Dataset, Subset
from PIL import Image
from torchvision import transforms
from collections import Counter
import os
import torch

In [19]:
# Diagnostic class names. Their order defines the integer labels, because
# ISICDataset takes the argmax over these ground-truth CSV columns.
CLASSES = ["MEL", "NV", "BCC", "AKIEC", "BKL", "DF", "VASC"]

# Preprocessing applied to every image: resize to 224x224, then convert to a tensor.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor()   # [0,255] → [0,1]
])

In [20]:
class ISICDataset(Dataset):
    """Image dataset backed by a folder of JPEGs and a one-hot ground-truth CSV.

    The CSV must contain an ``image`` column holding the image identifier
    (the file name without the ``.jpg`` extension) and one column per class
    name. The integer label of a row is the position of the largest value
    among the class columns (``argmax``), so the order of ``classes``
    defines the label indices.

    Args:
        images_dir: Directory containing the ``<image_id>.jpg`` files.
        groundtruth_csv: Path to the ground-truth CSV described above.
        transform: Optional callable applied to the RGB ``PIL.Image`` before
            it is returned by ``__getitem__``.
        classes: Optional sequence of class column names. Defaults to the
            module-level ``CLASSES`` list.

    Attributes:
        images_dir: ``Path`` to the image directory.
        df: The ground-truth CSV loaded as a DataFrame.
        classes: List of class column names used to derive the labels.
        image_ids: Array of image identifiers, aligned with ``labels``.
        labels: Array of integer class indices, one per CSV row.
    """

    def __init__(self, images_dir, groundtruth_csv, transform=None, classes=None):
        self.images_dir = Path(images_dir)
        self.df = pd.read_csv(groundtruth_csv)
        self.transform = transform
        # The global default is resolved at call time, so passing ``classes``
        # explicitly makes the dataset independent of the notebook's globals.
        self.classes = list(CLASSES if classes is None else classes)

        self.image_ids = self.df["image"].values
        # NOTE: argmax returns 0 for a row whose class columns are all equal
        # (e.g. all zeros), so such a row is silently labelled as the first class.
        self.labels = self.df[self.classes].values.argmax(axis=1)

    def __len__(self):
        """Return the number of rows in the ground-truth CSV."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the ``(image, label)`` pair for the sample at position ``idx``.

        The image is loaded from ``images_dir / f"{image_id}.jpg"``, converted
        to RGB and passed through ``transform`` when one was given.
        """
        image_id = self.image_ids[idx]
        label = self.labels[idx]
        img_path = self.images_dir / f"{image_id}.jpg"

        # The context manager closes the underlying file deterministically;
        # ``convert`` returns a fully loaded image that stays valid afterwards.
        with Image.open(img_path) as raw_img:
            img = raw_img.convert("RGB")

        if self.transform:
            img = self.transform(img)

        return img, label


In [21]:
# Root folder holding one sub-folder per split (train / val / test); each
# sub-folder contains the images and a groundtruth.csv.
path_data = Path("../data/isic")


def _load_split(split_name):
    """Build the ISICDataset for the split stored under ``path_data / split_name``."""
    split_dir = path_data / split_name
    return ISICDataset(
        images_dir=split_dir,
        groundtruth_csv=split_dir / "groundtruth.csv",
        transform=transform,
    )


train_dataset, val_dataset, test_dataset = (
    _load_split(split_name) for split_name in ("train", "val", "test")
)

print("Datasets loaded")
for split_label, split_dataset in (
    ("Train:", train_dataset),
    ("Val  :", val_dataset),
    ("Test :", test_dataset),
):
    print(split_label, len(split_dataset))


Datasets loaded
Train: 10015
Val  : 193
Test : 1512


In [None]:
# Sanity check: fetch the first sample as an (image tensor, label) pair.
train_dataset[0]


(tensor([[[0.9490, 0.9569, 0.9569,  ..., 0.9569, 0.9529, 0.9490],
          [0.9451, 0.9490, 0.9529,  ..., 0.9451, 0.9412, 0.9451],
          [0.9255, 0.9294, 0.9373,  ..., 0.9490, 0.9451, 0.9412],
          ...,
          [0.8863, 0.8902, 0.8784,  ..., 0.8667, 0.8549, 0.8549],
          [0.8863, 0.8824, 0.8863,  ..., 0.8667, 0.8667, 0.8588],
          [0.8863, 0.8784, 0.8824,  ..., 0.8706, 0.8706, 0.8549]],
 
         [[0.5804, 0.5922, 0.5922,  ..., 0.5922, 0.5922, 0.5843],
          [0.5608, 0.5725, 0.5843,  ..., 0.5804, 0.5804, 0.5843],
          [0.5373, 0.5451, 0.5608,  ..., 0.5843, 0.5843, 0.5882],
          ...,
          [0.5451, 0.5490, 0.5333,  ..., 0.5569, 0.5569, 0.5490],
          [0.5608, 0.5608, 0.5608,  ..., 0.5569, 0.5569, 0.5569],
          [0.5608, 0.5608, 0.5608,  ..., 0.5569, 0.5647, 0.5608]],
 
         [[0.6549, 0.6627, 0.6706,  ..., 0.6745, 0.6745, 0.6627],
          [0.6431, 0.6549, 0.6588,  ..., 0.6627, 0.6588, 0.6627],
          [0.6235, 0.6314, 0.6392,  ...,

In [22]:
# Per-split label frequencies; the Counter keys are indices into CLASSES.
for split_title, split_dataset in (
    ("Train distribution:", train_dataset),
    ("Val distribution  :", val_dataset),
    ("Test distribution :", test_dataset),
):
    print(split_title, Counter(split_dataset.labels))


Train distribution: Counter({1: 6705, 0: 1113, 4: 1099, 2: 514, 3: 327, 6: 142, 5: 115})
Val distribution  : Counter({1: 123, 4: 22, 0: 21, 2: 15, 3: 8, 6: 3, 5: 1})
Test distribution : Counter({1: 909, 4: 217, 0: 171, 2: 93, 5: 44, 3: 43, 6: 35})


In [None]:
def split_dataset_by_class(dataset, num_classes=None):
    """Group a dataset's sample indices by label and wrap each group in a ``Subset``.

    Args:
        dataset: Object exposing a ``labels`` sequence aligned with its
            samples (position ``i`` holds the integer label of sample ``i``).
        num_classes: Number of classes to build subsets for. Defaults to
            ``len(CLASSES)``.

    Returns:
        dict mapping every class index in ``range(num_classes)`` to a
        ``Subset`` of ``dataset`` holding the samples of that class, in their
        original order. A class without samples maps to an empty ``Subset``;
        labels outside ``range(num_classes)`` are ignored.
    """
    if num_classes is None:
        num_classes = len(CLASSES)

    # Single pass over the labels instead of one full scan per class.
    indices_by_class = {class_idx: [] for class_idx in range(num_classes)}
    for sample_idx, label in enumerate(dataset.labels):
        # int() normalises NumPy / tensor scalars so they match the dict keys.
        bucket = indices_by_class.get(int(label))
        if bucket is not None:
            bucket.append(sample_idx)

    return {
        class_idx: Subset(dataset, indices)
        for class_idx, indices in indices_by_class.items()
    }

In [None]:
# One Subset of the training data per class, keyed by class index.
train_sets_by_class = split_dataset_by_class(train_dataset)
for class_idx, class_subset in train_sets_by_class.items():
    print(f"Class {CLASSES[class_idx]}: {len(class_subset)} images")

Class MEL: 1113 images
Class NV: 6705 images
Class BCC: 514 images
Class AKIEC: 327 images
Class BKL: 1099 images
Class DF: 115 images
Class VASC: 142 images


In [29]:
# Destination folder for the serialized dataset objects (created if missing).
SAVE_DIR = Path("../data/processed_data/isic")
SAVE_DIR.mkdir(parents=True, exist_ok=True)

# torch.save serializes the Python objects themselves (the ISICDataset
# instances and the per-class Subset dict), one file per object.
for file_name, saved_object in (
    ("train_dataset.pt", train_dataset),
    ("val_dataset.pt", val_dataset),
    ("test_dataset.pt", test_dataset),
    ("train_sets_by_class.pt", train_sets_by_class),
):
    torch.save(saved_object, SAVE_DIR / file_name)

In [None]:
# This is how to reload the saved objects from the files.
#
# The files contain pickled Python objects (ISICDataset / Subset instances),
# not plain tensors, so:
#   * the ISICDataset class must be defined (or importable) before loading;
#   * weights_only=False is required, because since PyTorch 2.6 torch.load
#     defaults to weights_only=True and refuses to unpickle arbitrary classes.
# WARNING: full unpickling can execute arbitrary code. Only load files that
# you created yourself or that come from a trusted source.

train_dataset = torch.load("../data/processed_data/isic/train_dataset.pt", weights_only=False)
val_dataset = torch.load("../data/processed_data/isic/val_dataset.pt", weights_only=False)
test_dataset = torch.load("../data/processed_data/isic/test_dataset.pt", weights_only=False)
train_sets_by_class = torch.load("../data/processed_data/isic/train_sets_by_class.pt", weights_only=False)
