<a href="https://colab.research.google.com/github/JonasRiber/Dl_project_HAM10000/blob/main/DL_project_HAM10000_w_dataloader.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Deep learning project - HAM10000

In [3]:
# Mount Google Drive into the Colab filesystem at /content/gdrive
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


The dataset consists of 10015 images of skin lesions with several associated labels. Its tensors are: ages, genders, image_ids, images, lesion_categories, lesion_ids, localizations and sources.

In [1]:
# download dataset package
# https://github.com/activeloopai/deeplake
!pip3 install deeplake # Uncomment to install


# Loading dataset
import deeplake
ds = deeplake.load("hub://activeloop/ham10000")
ds.summary()



-

Opening dataset in read-only mode as you don't have write permissions.


|

This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/activeloop/ham10000



|

hub://activeloop/ham10000 loaded successfully.



 /

Dataset(path='hub://activeloop/ham10000', read_only=True, tensors=['ages', 'genders', 'image_ids', 'images', 'lesion_categories', 'lesion_ids', 'localizations', 'sources'])

      tensor           htype            shape           dtype  compression
      -------         -------          -------         -------  ------- 
       ages         class_label       (10015, 1)       uint32    None   
      genders       class_label       (10015, 1)       uint32    None   
     image_ids      class_label       (10015, 1)       uint32    None   
      images           image     (10015, 450, 600, 3)   uint8    jpeg   
 lesion_categories  class_label       (10015, 1)       uint32    None   
    lesion_ids      class_label       (10015, 1)       uint32    None   
   localizations    class_label       (10015, 1)       uint32    None   
      sources       class_label       (10015, 1)       uint32    None   


 

In [15]:
# other useful imports
import torch
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from torch.utils.data import DataLoader, Dataset, Subset
from sklearn.model_selection import train_test_split


In [4]:
# Show the tensors (columns) the dataset exposes, keyed by name
ds.tensors

{'ages': Tensor(key='ages'),
 'genders': Tensor(key='genders'),
 'image_ids': Tensor(key='image_ids'),
 'images': Tensor(key='images'),
 'lesion_categories': Tensor(key='lesion_categories'),
 'lesion_ids': Tensor(key='lesion_ids'),
 'localizations': Tensor(key='localizations'),
 'sources': Tensor(key='sources')}

In [6]:
# Metadata from the dataset, collected into a DataFrame (one row per image).
# Each label tensor is read with a single bulk call instead of sample by
# sample: the per-index version issued six separate reads for every row, which
# is very slow when the data is streamed from a `hub://` path.
meta_columns = [
    'image_ids',
    'lesion_ids',
    'lesion_categories',
    'genders',
    'ages',
    'localizations',
]

# Every label tensor has shape (n_samples, 1) according to ds.summary();
# flatten it to one value per row and widen to int64 so the columns match the
# plain Python ints the per-sample `.item()` calls used to produce.
data = {
    name: ds[name].numpy().reshape(-1).astype('int64')
    for name in meta_columns
}

df_meta = pd.DataFrame(data)


# A bare expression is only displayed when it is the last line of a cell, so
# the shape has to be printed explicitly to be seen.
print(df_meta.shape)
df_meta.head()



Unnamed: 0,image_ids,lesion_ids,lesion_categories,genders,ages,localizations
0,9668,7232,1,0,4,6
1,8784,6704,1,0,1,3
2,6475,5335,1,1,7,9
3,9332,6993,1,0,15,3
4,8796,6711,1,0,10,9


In [None]:
# showcase a single image
# Decode the first sample of the `images` tensor into a NumPy array
image = ds.images[0].numpy()

# ds.summary() lists the images tensor as (10015, 450, 600, 3), so a single
# sample is expected to print (450, 600, 3)
print(image.shape) # shape of the images
#show the image
plt.imshow(image)
plt.show()

In [None]:
# Count how many rows of the metadata table carry each lesion category code
distribution_classes = df_meta['lesion_categories'].value_counts()
# Display the counts as the cell output
distribution_classes

In [None]:
# class translation: category code -> (short code, display name)
# Display names are written without surrounding spaces so the tick labels of
# the bar chart are not padded.
# NOTE(review): the code -> name assignment is hard-coded here; confirm it
# against the label names stored with the dataset's `lesion_categories` tensor.
classes = {4: ('nv', 'Nevus'),
           6: ('mel', 'Melanoma'),
           2: ('bkl', 'Seborrheic Keratosis'),
           1: ('bcc', 'Basal Cell Carcinoma'),
           5: ('vasc', 'Vascular Lesion'),
           0: ('akiec', 'Actinic Keratosis'),
           3: ('df', 'Dermatofibroma')}

# Codes, names and counts are all listed in the dict's insertion order so the
# three sequences stay aligned.
class_codes = list(classes)
class_names = [display_name for _, display_name in classes.values()]
# .get(..., 0) keeps a (zero-height) bar for any code absent from the data
counts = [distribution_classes.get(class_code, 0) for class_code in class_codes]

# plotting
plt.figure(figsize=(8, 6))
plt.bar(class_names, counts, color='skyblue', edgecolor="black")
# labels
plt.xlabel('Class Names')
plt.ylabel('Frequency')
plt.title('Distribution of classes within lesion categories')
plt.xticks(rotation=60)
plt.tight_layout()

# display the plot
plt.show()

There is a big difference in size between the classes within the main category of interest.
To correct for this imbalance we can try a few different things:
- downsampling: only take as many images from each class as the smallest class has
- upsampling: repeatedly draw from the classes we have few datapoints in; this could be done with a dataloader
- data augmentation: perhaps in combination with upsampling, augment the classes we have few of
- weighting: images from classes we have a lot of should be weighted less

#### Basic model

We need a dataloader that loads the data in batches, since loading all 10015 images at once requires too many resources.

In [14]:
# Create a custom dataset class for images and metadata
class DeepLakeImageDataset(Dataset):
    """Map-style PyTorch dataset over a Deep Lake dataset, or a subset of it.

    Each item is a pair ``(image, metadata)`` where ``metadata`` is a dict
    with the keys ``age``, ``gender``, ``image_id``, ``lesion_category``,
    ``lesion_id``, ``localization`` and ``source``.

    Args:
        deeplake_dataset: Object indexable by tensor name (``'images'``,
            ``'ages'``, ...); each tensor is indexable by sample position and
            each sample exposes ``.numpy()``.
        indices: Optional sequence of sample positions to expose. When
            omitted, every position of the ``'images'`` tensor is used.
        transform: Optional callable applied to the loaded image before it is
            returned (for example resizing, normalisation or reordering the
            channels). When omitted, the image is returned exactly as loaded.
    """

    # (key in the returned dict, tensor name, keyword arguments for .numpy()).
    # The order here is the key order of the returned metadata dict.
    _METADATA_FIELDS = (
        ('age', 'ages', {}),
        ('gender', 'genders', {}),
        ('image_id', 'image_ids', {'aslist': True}),
        ('lesion_category', 'lesion_categories', {'aslist': True}),
        ('lesion_id', 'lesion_ids', {'aslist': True}),
        ('localization', 'localizations', {'aslist': True}),
        ('source', 'sources', {'aslist': True}),
    )

    def __init__(self, deeplake_dataset, indices=None, transform=None):
        self.dataset = deeplake_dataset
        # If no indices are provided, use all indices
        self.indices = indices if indices is not None else range(len(deeplake_dataset['images']))
        self.transform = transform

    def __len__(self):
        """Return the number of exposed samples (the size of the subset)."""
        return len(self.indices)

    def __getitem__(self, idx):
        """Return ``(image, metadata)`` for position ``idx`` of the subset."""
        # Translate the position within the subset into a position in the
        # full dataset
        actual_idx = self.indices[idx]

        image = self.dataset['images'][actual_idx].numpy()
        if self.transform is not None:
            image = self.transform(image)

        metadata = {
            key: self.dataset[tensor_name][actual_idx].numpy(**numpy_kwargs)
            for key, tensor_name, numpy_kwargs in self._METADATA_FIELDS
        }
        return image, metadata

# Work on a small slice of the data: the first `subset_size` samples
subset_size = 100
subset_indices = [*range(subset_size)]
# torch Subset over the raw dataset, restricted to those positions
subset_dataset = Subset(ds, subset_indices)

# Dataset wrapper yielding (image, metadata) pairs for the chosen positions
image_dataset = DeepLakeImageDataset(ds, subset_indices)

# Serve the wrapper in shuffled batches of 8
dataloader = DataLoader(dataset=image_dataset, batch_size=8, shuffle=True)

# Walk over every batch and print what it contains
for images, metadata in dataloader:
    print(f"Images batch shape: {images.shape}")  # Shape of the batch of images
    print("Metadata sample:")
    for label, field in (("Age:", 'age'), ("Gender:", 'gender'), ("Image ID:", 'image_id')):
        print(label, metadata[field])
    # Add your training or testing code here

Images batch shape: torch.Size([8, 450, 600, 3])
Metadata sample:
Age: tensor([[2],
        [7],
        [4],
        [5],
        [4],
        [7],
        [2],
        [2]], dtype=torch.uint32)
Gender: tensor([[0],
        [0],
        [0],
        [1],
        [1],
        [0],
        [1],
        [1]], dtype=torch.uint32)
Image ID: tensor([[4192],
        [9460],
        [4526],
        [2764],
        [ 138],
        [4727],
        [8970],
        [1692]], dtype=torch.uint32)
Images batch shape: torch.Size([8, 450, 600, 3])
Metadata sample:
Age: tensor([[ 5],
        [ 2],
        [15],
        [ 4],
        [ 9],
        [ 8],
        [ 4],
        [ 5]], dtype=torch.uint32)
Gender: tensor([[0],
        [0],
        [1],
        [1],
        [1],
        [1],
        [0],
        [1]], dtype=torch.uint32)
Image ID: tensor([[9838],
        [1427],
        [8533],
        [3750],
        [1073],
        [6183],
        [1477],
        [2659]], dtype=torch.uint32)
Images batch sha