First, we will need to upload the data. On the left hand sidebar, click the "folder" icon to open the file view. Click the upload icon at the top of the pane. Select the file "data.zip" to upload it. It will take some time to upload; there should be a progress wheel to monitor its status. This file contains a large number of images (about 5000) along with a CSV (comma-separated values) file with information on each file. Once the file is uploaded, run the following cell ("Ctrl + Enter") to unzip it.

In [None]:
! if [ -e /data.zip ]; then unzip -o /data.zip; elif [ -e /content/data.zip ]; then unzip -o /content/data.zip; else echo "Data not found! Did you forget to upload the data?"; fi

When doing machine learning, it is always a good idea to look at the data! Start by opening up the directory /content/data in the sidebar, and double click on some of the images to look at them.

Here we define some Python classes and functions that we will use for training and testing our models. We provide these since the primary aim of this lab is not to learn to code or to use machine learning libraries; instead, we want to focus on a high-level understanding of machine learning.

In this lab, we use a popular machine-learning library called _PyTorch_, which is used with the _Python_ programming language.

In [None]:
import torchvision # deep learning library
import torch # deep learning library
import pandas as pd # for loading CSV files
import sklearn.metrics # for assessing performance
from PIL import Image # for loading images
from tqdm import tqdm # for progress bars


# Define a pytorch dataset.
class DermatologyDataset(torch.utils.data.Dataset):
    '''
    A map-style pytorch dataset of dermatology images with binary
    (malignant / benign) labels.

    The image metadata is read from '/content/data/labels.csv' and restricted
    to the rows whose `diagnosis` is listed in DIAGNOSES. The resulting table
    is exposed as `self.data` (a pandas DataFrame) so that callers can
    subsample or filter it after construction.
    '''
    # Diagnoses that are kept; rows with any other diagnosis are dropped.
    DIAGNOSES = [
            'actinic keratosis',
            'basal cell carcinoma',
            'dermatofibroma',
            'melanoma',
            'nevus',
            'seborrheic keratosis',
            'solar lentigo',
            'squamous cell carcinoma'
            ]
    # Subset of DIAGNOSES that is mapped to the positive label (1).
    MALIGNANCIES = [
            'basal cell carcinoma',
            'melanoma',
            'squamous cell carcinoma'
            ]

    def __init__(self, transform):
        '''
        Args:
            transform: (torchvision transform) A series of transforms that
                converts from a PIL Image to a torch tensor
        '''
        self.data = pd.read_csv('/content/data/labels.csv')
        self.data = self.data.query('diagnosis in @self.DIAGNOSES')
        self.transform = transform

    def __len__(self):
        '''
        Returns:
            (int) The number of rows currently in `self.data`.
        '''
        return len(self.data)

    def __getitem__(self, idx):
        '''
        Args:
            idx: (int) Positional index into `self.data`.
        Returns:
            img: An image, typically represented as a pytorch tensor, though
                this is determined by the transform supplied when initializing
                the dataset.
            label: (int) A binary indicator that is 1 if the image represents a
                malignancy, and 0 otherwise.
        '''
        # Look the row up once instead of once per field.
        row = self.data.iloc[idx]
        # Load inside a context manager so the file handle is released, and
        # force three channels so grayscale/RGBA files do not break a
        # transform that normalizes RGB input.
        with Image.open(f'/content/data/{row.isic_id}.jpg') as raw_img:
            img = raw_img.convert('RGB')
        img = self.transform(img)
        label = 1 if row.diagnosis in self.MALIGNANCIES else 0
        return img, label

def get_mobilenet():
    '''
    Build a MobileNetV3-Small network set up for binary classification.

    Returns:
        model: (torch.nn.Module) A MobileNetV3-Small network initialized with
           the default pretrained torchvision weights, whose last classifier
           layer is replaced by a single-output linear layer. Only that new
           layer has gradients enabled.
        transform: (torchvision transforms) The preprocessing transforms that
           ship with the pretrained weights (these transform from a
           "PIL image" to a properly scaled pytorch tensor).
    '''
    pretrained = torchvision.models.MobileNet_V3_Small_Weights.DEFAULT
    net = torchvision.models.mobilenet_v3_small(weights=pretrained)

    # Replace the final layer with a one-logit head for the binary task.
    head_in_features = net.classifier[-1].in_features
    net.classifier[-1] = torch.nn.Linear(head_in_features, 1)

    preprocessing = pretrained.transforms()

    # Freeze every parameter, then re-enable only the new head (makes the
    # network train faster).
    net.requires_grad_(False)
    net.classifier[-1].requires_grad_(True)
    return net, preprocessing

def train(model, train_dataset, n_epochs=5):
    '''
    Train a pytorch neural network using the data in train_dataset.

    Args:
        model: (torch.nn.Module) A pytorch module, i.e., the neural network
            model to train. It is updated in place.
        train_dataset: (torch.utils.data.Dataset) A pytorch dataset containing
            the training data. Each item must be an (image, label) pair.
        n_epochs: (int) The number of epochs, i.e., the number of passes to take
            through the dataset when training.
    Returns:
        model: (torch.nn.Module) The trained pytorch neural network (the same
            object that was passed in).
    '''
    train_dataloader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=4,
        shuffle=True
    )
    criterion = torch.nn.BCEWithLogitsLoss() # The training objective to minimize
    opt = torch.optim.Adam(model.parameters()) # The optimizer
    # Be explicit about the mode so training behaves the same even if the
    # model was previously switched to evaluation mode.
    model.train()
    # Honor the requested number of passes (previously hard-coded to 5).
    for _ in range(n_epochs):
        for imgs, labels in tqdm(train_dataloader, total=len(train_dataloader)):
            predictions = model(imgs)
            # The model emits one logit per sample with shape (batch, 1), so
            # the labels are reshaped and cast to float to match.
            loss = criterion(predictions, labels.unsqueeze(1).to(torch.float32))
            opt.zero_grad() # reset all previously stored gradients to zero
            loss.backward() # backpropagate to calculate gradients
            opt.step() # use the gradients to update model's parameters
    return model

def predict(model, test_dataset):
    '''
    Calculate the model's predictions for the given dataset.

    The model is temporarily switched to evaluation mode so that layers such
    as dropout and batch normalization behave deterministically and the
    model's internal statistics are not altered by evaluation. The mode the
    model was in before the call is restored afterwards.

    Args:
        model: (torch.nn.Module) A pytorch module, i.e., the (trained) neural
            network
        test_dataset: The dataset to pass through the model to calculate
            predictions. Each item must be an (image, label) pair.
    Returns:
        predictions: (torch.Tensor) A pytorch tensor of shape (n,), where n is
            the number of samples in `test_dataset`. Each element is a floating
            point value between 0 and 1, which may be interpreted as the
            probability of the positive label (e.g., malignant).
        labels: (torch.Tensor) A pytorch tensor of shape (n,), where n is the
            number of samples in `test_dataset`. Each element is either zero or
            one, where one indicates a positive label (e.g., malignant).
    '''
    test_dataloader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=4,
        shuffle=False
    )
    all_predictions = []
    all_labels = []
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad(): # gradient calculations are not needed during testing
            for imgs, labels in tqdm(test_dataloader, total=len(test_dataloader)):
                all_predictions.append(model(imgs))
                all_labels.append(labels)
    finally:
        # Leave the model in whatever mode the caller had it in.
        model.train(was_training)

    if not all_predictions:
        # Empty dataset: concatenating an empty list would raise.
        return torch.empty(0), torch.empty(0, dtype=torch.long)

    predictions_logits = torch.concat(all_predictions)
    predictions_probs = torch.sigmoid(predictions_logits) # squish to [0,1] range
    # reshape(-1) keeps the documented (n,) shape even when n == 1, where a
    # bare squeeze() would collapse the result to a 0-d tensor.
    return predictions_probs.reshape(-1), torch.concat(all_labels).reshape(-1)
def classification_report(labels, predictions, threshold):
    '''
    Print summary metrics for binary predictions at a given threshold.

    Args:
        labels: The true labels (0 or 1) for each sample.
        predictions: The predicted scores for each sample; a sample is counted
            as positive when its score is strictly greater than `threshold`.
        threshold: The cut-off applied to `predictions`.
    Prints:
        Sensitivity, specificity, precision and accuracy at the threshold, and
        the ROC-AUC score computed from the unthresholded predictions. Each
        value is printed with three decimals.
    '''
    predicted_positive = predictions > threshold
    per_class = sklearn.metrics.classification_report(
        labels, predicted_positive, output_dict=True)
    rows = [
        ('Sensitivity: ', per_class['1']['recall']),
        # specificity for malignancy is the same as recall for "benign"
        ('Specificity: ', per_class['0']['recall']),
        ('Precision: ', per_class['1']['precision']),
        ('Accuracy: ', sklearn.metrics.accuracy_score(labels, predicted_positive)),
        ('ROC-AUC score: ', sklearn.metrics.roc_auc_score(labels, predictions)),
    ]
    for caption, value in rows:
        print(caption, f"{value:.03f}")

In [None]:
# Build the network together with the preprocessing transform it expects.
model, transform = get_mobilenet()

# Both datasets start from the full table; they are narrowed down below.
train_dataset = DermatologyDataset(transform)
test_dataset = DermatologyDataset(transform)

# Keep a random 90% of the rows for training...
train_dataset.data = train_dataset.data.sample(frac=0.9)
# ...and use every image that was not drawn for training as the test set.
test_dataset.data = test_dataset.data[
    ~test_dataset.data.isic_id.isin(train_dataset.data.isic_id)
]

In [None]:
# Fit the model on the training split (n_epochs is left at train()'s default).
model = train(model, train_dataset)


The following two blocks of code evaluate the machine's performance on the test data.

In [None]:
# Run the trained model over the test split, collecting the predicted
# probabilities and the matching true labels.
predictions, labels = predict(model, test_dataset)


In [None]:
# Print the test-set metrics; predictions above the threshold count as positive.
classification_report(labels, predictions, 0.5) # specify threshold here

The following two blocks of code evaluate the machine's performance on the training data.

In [None]:
# Run the same model over the data it was trained on, for comparison with
# the test-set results.
train_predictions, train_labels = predict(model, train_dataset)

In [None]:
# Print the training-set metrics at a threshold of 0.5.
classification_report(train_labels, train_predictions, 0.5)

Next, let's look at how generalizable our model is. Instead of splitting the data randomly into "test" and "training" groups, we will separate our data by hospital site. We will reserve all the images from Memorial Sloan Kettering Cancer Center for testing.

In [None]:
# Start from a fresh, untrained-head model for the second experiment.
model_2, transform = get_mobilenet()
train_dataset_2 = DermatologyDataset(transform)
test_dataset_2 = DermatologyDataset(transform)
# This time, we want our test data to come only from a single institution
test_dataset_2.data = test_dataset_2.data.query("attribution == 'Memorial Sloan Kettering Cancer Center'")

# We want the training dataset to contain the remainder of the images (from
# institutions other than the Memorial Sloan Kettering Cancer Center).
# The filtered table must be stored on train_dataset_2 (not train_dataset),
# otherwise the test images stay in the training set and the first
# experiment's dataset is overwritten.
train_dataset_2.data = train_dataset_2.data.query("isic_id not in @test_dataset_2.data.isic_id")
print("Length of train dataset: ", len(train_dataset_2))
print("Length of test dataset: ", len(test_dataset_2))

Length of train dataset:  4352
Length of test dataset:  531


In [None]:
# Fit the second model on the site-based training split.
model_2 = train(model_2, train_dataset_2)


In [None]:
# Evaluate the second model on the images from the held-out institution.
predictions_2, labels_2 = predict(model_2, test_dataset_2)

In [None]:
# Print the metrics for the held-out institution at a threshold of 0.5.
classification_report(labels_2, predictions_2, 0.5)

To save you from having to program this yourself, we pre-programmed the code in the following box to allow you to create datasets split on sex, age, and the type of instrument (dermatoscope) used to acquire the image. You should not need to edit this code at all, but you can read the description under each function to see how to use it; see the comments under the functions marked with triple hash-tags ("###").

In [None]:
def check_dataset(dataset):
    '''
    Check that a dataset contains both malignant and benign lesions.

    Args:
      dataset: An object exposing `MALIGNANCIES` (the diagnoses counted as
        malignant) and `data.diagnosis` (an iterable of diagnosis names).
    Returns:
      True if at least one malignant and at least one benign diagnosis are
      present. Otherwise a warning is printed and False is returned; a dataset
      without any malignant entry (including an empty one) is reported as
      containing only benign lesions.
    '''
    warning = ("Warning! A dataset contains only {} lesions. This will cause "
               "unexpected behavior when training and testing models. Choose a "
               "different way to split data in order to avoid this situation.")
    is_malignant = [diagnosis in dataset.MALIGNANCIES
                    for diagnosis in dataset.data.diagnosis]
    if not any(is_malignant):
        only_kind = "benign"
    elif all(is_malignant):
        only_kind = "malignant"
    else:
        return True
    print(warning.format(only_kind))
    return False

def run_precheck(dataset_a, dataset_b):
    '''
    Print the size and malignancy count of two datasets, then run
    check_dataset on each of them (which prints a warning when a dataset
    lacks malignant or benign lesions).

    Args:
      dataset_a: The first dataset.
      dataset_b: The second dataset.
    '''
    def count_malignant(dataset):
        # Number of entries whose diagnosis is listed as a malignancy.
        return sum(diagnosis in dataset.MALIGNANCIES
                   for diagnosis in dataset.data.diagnosis)

    print("Length of first dataset is: ", len(dataset_a))
    print("Length of second dataset is: ", len(dataset_b))
    print("Number of malignancies in first dataset is: ",
          count_malignant(dataset_a))
    print("Number of malignancies in second dataset is: ",
          count_malignant(dataset_b))
    for dataset in (dataset_a, dataset_b):
        check_dataset(dataset)

### FOR SPLITTING DATASETS ON SEX ###
def get_dataset_split_sex(transform, dataset_a_sex='female'):
    '''
    Returns two disjoint datasets split on patient sex, and prints a short
    summary (sizes, malignancy counts, and a warning if either dataset lacks
    malignant or benign lesions).

    Args:
      transform: The transforms to be used in the dataset
      dataset_a_sex: (str) Either 'female' or 'male' (case-insensitive). The
        first dataset returned will contain patients of only this sex, while
        the second dataset will contain the remaining patients.
    Returns:
      dataset_a: A dataset of dermatology images.
      dataset_b: Another dataset of dermatology images, disjoint from dataset_a.
    Raises:
      ValueError: If dataset_a_sex is neither 'female' nor 'male'.
    '''
    dataset_a_sex = dataset_a_sex.lower()
    if dataset_a_sex not in ('female', 'male'):
        raise ValueError('dataset_a_sex needs to be either "female" or "male"')
    dataset_a = DermatologyDataset(transform)
    dataset_b = DermatologyDataset(transform)
    dataset_a.data = dataset_a.data.query('sex == @dataset_a_sex')
    # dataset_b is everything not selected above.
    # NOTE(review): this presumably also includes rows whose sex is missing
    # or otherwise different from 'female'/'male' -- confirm against the CSV.
    dataset_b.data = dataset_b.data.query('isic_id not in @dataset_a.data.isic_id')
    # Report the split the same way the age and dermoscopic-type splits do.
    run_precheck(dataset_a, dataset_b)
    return dataset_a, dataset_b

### FOR SPLITTING DATASETS ON AGE ###
def get_dataset_split_age(transform, dataset_a_upper_bound=30):
    '''
    Split the dermatology data into two disjoint datasets by approximate
    patient age, and print a short summary of the split.

    Args:
      transform: The transforms to be used in the dataset
      dataset_a_upper_bound: (int) An integer between 5 and 75, specifying the
        maximum age (inclusive) of patients in dataset_a.
    Returns:
      dataset_a: A dataset holding the images whose `age_approx` is at most
        dataset_a_upper_bound.
      dataset_b: A dataset holding every image that is not in dataset_a.
    Raises:
      ValueError: If dataset_a_upper_bound is not an integer, or lies outside
        the range 5 to 75.
    '''
    if not isinstance(dataset_a_upper_bound, int):
        raise ValueError('dataset_a_upper_bound needs to be an integer.')
    if not 5 <= dataset_a_upper_bound <= 75:
        raise ValueError('dataset_a_upper_bound needs to be between 5 and 75')

    dataset_a, dataset_b = DermatologyDataset(transform), DermatologyDataset(transform)
    # Younger patients go to dataset_a ...
    dataset_a.data = dataset_a.data.query('age_approx <= @dataset_a_upper_bound')
    # ... and dataset_b receives all remaining images.
    dataset_b.data = dataset_b.data.query('isic_id not in @dataset_a.data.isic_id')

    run_precheck(dataset_a, dataset_b)
    return dataset_a, dataset_b

### FOR SPLITTING DATASETS ON IMAGE ACQUISITION METHOD ###
def get_dataset_split_dermoscopic_type(
        transform,
        dataset_a_type='contact polarized',
        dataset_b_type='contact non-polarized'):
    '''
    Returns two datasets, where each dataset contains images acquired with
    different types of dermoscopy. Most dermoscopy is "contact" dermoscopy in
    which the dermatoscope touches the skin, and contact dermoscopy can use
    either polarized or non-polarized light. There is also non-contact
    dermoscopy, which always uses polarized light.

    A short summary of the two datasets is printed before returning.

    Args:
      transform: The transforms to be used in the dataset
      dataset_a_type: (str) Either 'contact polarized', 'contact non-polarized',
        or 'non-contact polarized' (case-insensitive). Specifies the type of
        dermatoscope used to acquire images in dataset_a.
      dataset_b_type: (str) Either 'contact polarized', 'contact non-polarized',
        or 'non-contact polarized' (case-insensitive). Specifies the type of
        dermatoscope used to acquire images in dataset_b. THIS MUST BE
        DIFFERENT THAN dataset_a_type.
    Returns:
      dataset_a: A dataset of dermatology images.
      dataset_b: Another dataset of dermatology images, disjoint from dataset_a.
    Raises:
      ValueError: If either type is not one of the three options, or if both
        types are the same.
    '''
    # Normalize case before validating. (These were previously `==`
    # comparisons whose result was discarded, so nothing was normalized.)
    dataset_a_type = dataset_a_type.lower()
    dataset_b_type = dataset_b_type.lower()
    options = ['contact polarized', 'contact non-polarized', 'non-contact polarized']
    if dataset_a_type not in options:
        raise ValueError("dataset_a_type must be one of " + str(options))
    if dataset_b_type not in options:
        raise ValueError("dataset_b_type must be one of " + str(options))
    if dataset_a_type == dataset_b_type:
        raise ValueError("dataset_a_type and dataset_b_type must differ.")
    dataset_a = DermatologyDataset(transform)
    dataset_b = DermatologyDataset(transform)
    dataset_a.data = dataset_a.data.query('dermoscopic_type == @dataset_a_type')
    dataset_b.data = dataset_b.data.query('dermoscopic_type == @dataset_b_type')
    run_precheck(dataset_a, dataset_b)
    return dataset_a, dataset_b