This notebook is made to run in Google Colab.

Sources:

Understanding how to interpret the HAM10000 dataset: https://github.com/rtharungowda/Skin-Lesion-Pytorch-HAM10000

Understanding PyTorch Lightning: https://uvadlc-notebooks.readthedocs.io/en/latest/tutorial_notebooks/tutorial5/Inception_ResNet_DenseNet.html

# Downloading the HAM10000 data (you'll need a Kaggle API key)

In [None]:
#upload your Kaggle key (kaggle.json)
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))
  
# Then move kaggle.json into the folder where the API expects to find it.
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

In [None]:
# downloading the data into content
!mkdir /content/input
!kaggle datasets download -d kmader/skin-cancer-mnist-ham10000 -p /content/input
!mkdir /content/input/HAM10000
#unzip the data
!unzip -q /content/input/skin-cancer-mnist-ham10000.zip -d /content/input/HAM10000/
#merge the patial folders
!cp -a /content/input/HAM10000/HAM10000_images_part_2/. /content/input/HAM10000/HAM10000_images_part_1/
!cp -a /content/input/HAM10000/ham10000_images_part_2/. /content/input/HAM10000/ham10000_images_part_1/
#remove part 2 stuff an rename part 1
!rm -r /content/input/HAM10000/HAM10000_images_part_2
!rm -r /content/input/HAM10000/ham10000_images_part_2
!mv /content/input/HAM10000/HAM10000_images_part_1 /content/input/HAM10000/HAM10000_images
!mv /content/input/HAM10000/ham10000_images_part_1 /content/input/HAM10000/ham10000_images
# where did that one come from? The dataset seems to be twice in the zip...
!rm -r /content/input/HAM10000/ham10000_images

# Imports

In [38]:
# install libraries that are not installed on colab by default
!pip install --quiet pytorch-lightning

# general tools
from collections import Counter
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from PIL import Image

# torch
import torch
from torch import optim,nn
from torch.autograd import Variable
from torch.utils.data import DataLoader,Dataset
from torchvision import models,transforms
import pytorch_lightning as pl
from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint

# sklearn
from sklearn.model_selection import train_test_split

# Preparing the Data

In [39]:
# Locations of the extracted dataset. `dataset_dir` keeps its trailing slash
# because image file names are appended to it by plain string concatenation.
dataset_dir = '/content/input/HAM10000/HAM10000_images/'
metadata_path = '/content/input/HAM10000/HAM10000_metadata.csv'

# Root folder for the checkpoints written during training.
CHECKPOINT_PATH = '/content/saved_models'

In [40]:
dataset = pd.read_csv(metadata_path)

# Human-readable names for the diagnosis codes found in the `dx` column.
lesion_type_dict = {
    'nv': 'Melanocytic nevi',
    'mel': 'Melanoma',
    'bkl': 'Benign keratosis-like lesions ',
    'bcc': 'Basal cell carcinoma',
    'akiec': 'Actinic keratoses',
    'vasc': 'Vascular lesions',
    'df': 'Dermatofibroma'
}

# Seed for the train/validation split. It has to be defined before the split
# below uses it, and fixing it keeps the split reproducible across runs.
shuffle_random = 42

# Add a column holding the dx as an integer code; we need that to get a tensor.
dataset['dx_num'] = pd.Categorical(dataset['dx']).codes

# Define what our inputs (x, image file paths) and labels (y, dx codes) are.
x = [dataset_dir + image_id + ".jpg" for image_id in dataset['image_id'].values.tolist()]
y = dataset['dx_num'].values.tolist()

# Split our data into 80% training and 20% validation data.
x_train, x_validation, y_train, y_validation = train_test_split(
    x, y, test_size=0.2, random_state=shuffle_random)

In [None]:
# Count how many entries we have for each label. Printing the numeric codes
# next to the raw `dx` strings lets us see which code represents which dx.
print(Counter(y), Counter(dataset['dx'].values.tolist()), sep='\n')

# The same tally for each side of the train/validation split.
print(Counter(y_train), Counter(y_validation), sep='\n')

In [42]:
def compute_img_mean_std(image_paths):
    """Compute the per-channel mean and standard deviation of a set of images.

    Pixel values are scaled to [0, 1] before the statistics are taken. The
    images are processed one at a time with running sums, so memory use stays
    constant and the images do not need to share one size.

    Args:
        image_paths: Sequence of paths to image files that PIL can open.

    Returns:
        A tuple ``(means, stdevs)`` of two lists with three floats each, in
        RGB channel order.

    Raises:
        ValueError: If ``image_paths`` is empty.
    """
    if len(image_paths) == 0:
        raise ValueError("image_paths must contain at least one image")

    # Accumulate in float64 so that summing many pixels does not lose precision.
    channel_sum = np.zeros(3, dtype=np.float64)
    channel_sq_sum = np.zeros(3, dtype=np.float64)
    pixel_count = 0

    for path in tqdm(image_paths):
        with Image.open(path) as img:
            # Force three channels so grayscale/RGBA files cannot break the math.
            pixels = np.asarray(img.convert('RGB'), dtype=np.float64) / 255.
        pixels = pixels.reshape(-1, 3)
        channel_sum += pixels.sum(axis=0)
        channel_sq_sum += np.square(pixels).sum(axis=0)
        pixel_count += pixels.shape[0]

    means = channel_sum / pixel_count
    # Var[x] = E[x^2] - E[x]^2; clamp at zero to absorb rounding error.
    variances = np.maximum(channel_sq_sum / pixel_count - np.square(means), 0.0)
    stdevs = np.sqrt(variances)

    # PIL already yields RGB, so no channel reversal is needed. Note that
    # list.reverse() works in place and returns None, so it must not be
    # used as a return value.
    return means.tolist(), stdevs.tolist()

In [43]:
# norm_mean, norm_std = compute_img_mean_std(x)
# The call above sometimes crashes due to a memory shortage, so here are the
# mean and standard deviation from a run where it worked.
norm_mean = [
    0.76303625,
    0.5456404,
    0.5700425,
]
norm_std = [
    0.140928,
    0.15261285,
    0.1699707,
]

In [44]:
# Transformation of the training images: resize to 224x224, augment with
# random flips, a random rotation and colour jitter, then convert to a tensor
# and normalise with the dataset statistics.
train_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    transforms.RandomRotation(20),
    transforms.ColorJitter(brightness=0.1, contrast=0.1, hue=0.1),
    transforms.ToTensor(),
    transforms.Normalize(norm_mean, norm_std),
])

# Transformation of the validation images: the same resize and normalisation,
# but without any random augmentation.
val_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(norm_mean, norm_std),
])

In [45]:
#defining the dataset
class image_dataset(Dataset):
  """Dataset that loads images from disk on demand and pairs them with labels.

  Args:
    x: Sequence of image file paths.
    y: Sequence of labels, one per entry of ``x``.
    transform: Optional callable applied to each loaded PIL image. When it is
      None, the RGB PIL image is returned unchanged.

  Raises:
    ValueError: If ``x`` and ``y`` differ in length.
  """

  def __init__(self, x, y, transform=None):
    if len(x) != len(y):
      raise ValueError(
          "x and y must have the same length, got {} and {}".format(len(x), len(y)))
    self.x = x
    self.y = y
    self.transform = transform

  def __len__(self):
    """Return the number of samples."""
    return len(self.x)

  def __getitem__(self, index):
    """Load the image at ``index`` and return it together with its label."""
    # The context manager closes the file handle right away; convert() returns
    # an independent copy and guarantees three channels for Normalize.
    with Image.open(self.x[index]) as img:
      image = img.convert('RGB')
    # The transform is optional, so only apply it when one was given.
    if self.transform is not None:
      image = self.transform(image)
    return image, self.y[index]

In [46]:
# Training data: augmented images, reshuffled every epoch.
training_set = image_dataset(x_train, y_train, transform=train_transform)
training_loader = DataLoader(training_set, batch_size=8, shuffle=True, num_workers=2)

# Validation data: the loader must wrap `validation_set`. Wrapping the training
# set here would make every validation metric an (augmented) training metric.
validation_set = image_dataset(x_validation, y_validation, transform=val_transform)
validation_loader = DataLoader(validation_set, batch_size=8, shuffle=False, num_workers=2)

# Training

In [47]:
class mymodule(pl.LightningModule):
    """DenseNet-121 image classifier with a 7-class output layer."""

    def __init__(self):
        super().__init__()

        # Create model: pretrained DenseNet-121 whose classifier is replaced
        # by a fresh linear layer with 7 outputs.
        self.model = models.densenet121(pretrained=True)
        num_ftrs = self.model.classifier.in_features
        self.model.classifier = nn.Linear(num_ftrs, 7)

        # Create loss module
        self.loss_module = nn.CrossEntropyLoss()

        # Example input for visualizing the graph in Tensorboard; it uses the
        # same 224x224 resolution that the data transforms produce.
        self.example_input_array = torch.zeros((1, 3, 224, 224), dtype=torch.float32)

    def forward(self, imgs):
        """Return the class logits for a batch of images."""
        return self.model(imgs)

    def configure_optimizers(self):
        """Return Adam plus a step scheduler that decays the LR by 10x.

        NOTE(review): the milestones (epochs 100 and 150) lie beyond the 10
        epochs `train_model` runs by default, so the decay never triggers
        there -- confirm whether that is intended.
        """
        optimizer = optim.Adam(self.model.parameters(), lr=1e-3)

        scheduler = optim.lr_scheduler.MultiStepLR(
            optimizer, milestones=[100, 150], gamma=0.1)
        return [optimizer], [scheduler]

    def _batch_accuracy(self, batch):
        """Return the fraction of correctly classified samples in `batch`."""
        imgs, labels = batch
        preds = self.model(imgs).argmax(dim=-1)
        return (labels == preds).float().mean()

    def training_step(self, batch, batch_idx):
        """Compute the loss for one training batch and log loss and accuracy."""
        imgs, labels = batch
        preds = self.model(imgs)
        loss = self.loss_module(preds, labels)
        acc = (preds.argmax(dim=-1) == labels).float().mean()

        # Log the accuracy instead of discarding it: averaged per epoch.
        self.log('train_acc', acc, on_step=False, on_epoch=True)
        self.log('train_loss', loss)
        return loss  # Return tensor to call ".backward" on

    def validation_step(self, batch, batch_idx):
        """Log the accuracy of one validation batch as `val_acc`."""
        # By default logs it per epoch (weighted average over batches)
        self.log('val_acc', self._batch_accuracy(batch))

    def test_step(self, batch, batch_idx):
        """Log the accuracy of one test batch as `test_acc`."""
        # By default logs it per epoch (weighted average over batches), and returns it afterwards
        self.log('test_acc', self._batch_accuracy(batch))

In [48]:
from torch._C import device
def train_model(model_name, max_epochs=10):
    """Train a fresh `mymodule` and evaluate its best checkpoint.

    Uses the notebook-level `training_loader` and `validation_loader` and
    writes checkpoints below `CHECKPOINT_PATH/<model_name>`.

    Args:
        model_name: Name of the sub-folder used for logs and checkpoints.
        max_epochs: Number of epochs to train for (default 10).

    Returns:
        A tuple ``(model, result)``: the model restored from the checkpoint
        with the highest `val_acc`, and ``{"val": <accuracy>}`` measured on the
        validation loader.
    """
    # Use the GPU when one is available instead of hard-coding it, so the
    # notebook also runs on a CPU-only runtime.
    accelerator = "gpu" if torch.cuda.is_available() else "cpu"

    # Create a PyTorch Lightning trainer that keeps the best checkpoint
    # (highest validation accuracy) and records the learning rate per epoch.
    trainer = pl.Trainer(
        default_root_dir=os.path.join(CHECKPOINT_PATH, model_name),
        accelerator=accelerator,
        devices=1,
        max_epochs=max_epochs,
        callbacks=[
            ModelCheckpoint(save_weights_only=True, mode="max", monitor="val_acc"),
            LearningRateMonitor("epoch"),
        ],
    )

    model = mymodule()
    trainer.fit(model, training_loader, validation_loader)
    # Load best checkpoint after training
    model = mymodule.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)

    # Test best model on validation set
    val_result = trainer.test(model, dataloaders=validation_loader, verbose=False)
    result = {"val": val_result[0]["test_acc"]}

    return model, result

In [None]:
# Train the DenseNet model; keep the best checkpointed model and its validation result.
densenet_model, densenet_results = train_model("DenseNet")

# Export

In [35]:
# Save only the trained weights (the state dict), not the whole module object.
torch.save(
    densenet_model.state_dict(),
    '/content/HAM10000-Densenet.pth',
)