**Breast Cancer Detection**
=

***Advanced Methods of Artificial Vision***

**Authors:** *Alejandro Araque Robles, Ander Bodegas Díez, Lucía Gonzalez Ratón and Gonzalo Sabando Alonso*

# 1. Introduction

This is the introduction for the final project.

# 2. Data Loading

First we need to import all the packages that we are going to need for the project.

In [4]:
# Set the git identity used by this session
!git config --global user.name "Lgonrat"
!git config --global user.email "lgonrat@teleco.upv.es"
# Clone the project repository (git refuses when the destination folder already exists and is not empty)
!git clone https://github.com/Lgonrat/trabajo-admeav.git
# Move the notebooks found in the working directory into the cloned repository
!mv *.ipynb trabajo-admeav/

fatal: destination path 'trabajo-admeav' already exists and is not an empty directory.
mv: cannot stat '*.ipynb': No such file or directory


In [None]:
from google.colab import drive

import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
import torch
import time

Then, we mount the drive to get access to the dataset.

In [None]:
# Mount Google Drive at /content/drive (drive comes from google.colab)
drive.mount('/content/drive')
# Root folder of the project inside the mounted drive; later cells build their paths from it
pathBase = '/content/drive/MyDrive/FinalProjectAdmeav'

Mounted at /content/drive


# 3. Data Exploration and Preprocessing

In this section, we explore the data structure and apply the preprocessing needed to obtain a consistent dataset.

In [None]:
# # Benign and malignant paths
# pathBenign = os.path.join(pathBase, 'breast_ultrasound', 'benign')
# pathMalignant = os.path.join(pathBase, 'breast_ultrasound', 'malignant')

# # Loop to read, preprocess each image and save in the new directory
# for pathType in [pathBenign, pathMalignant]:
#   for name in os.listdir(pathType):

#     # Remove '.png' from name
#     name = name[:-4]

#     # Skip if mask
#     if 'mask' in name:
#       continue

#     # Read image
#     img = cv2.imread(
#       filename = os.path.join(pathType, f'{name}.png'),
#       flags = cv2.IMREAD_UNCHANGED
#     )

#     # Read base mask
#     mas = cv2.imread(
#       filename = os.path.join(pathType, f'{name}_mask.png'),
#       flags = cv2.IMREAD_UNCHANGED
#     )

#     # Read additional masks if exist
#     for i in range(1, 3):
#       if f'{name}_mask_{i}.png' in os.listdir(pathType):

#         # Read additional mask
#         masAdd = cv2.imread(
#           filename = os.path.join(pathType, f'{name}_mask_{i}.png'),
#           flags = cv2.IMREAD_UNCHANGED
#         )

#         # Add additional mask to base mask
#         try:
#           mas = cv2.bitwise_or(src1 = mas, src2 = masAdd)
#         except:
#           print(name)
#           mas = cv2.bitwise_or(src1 = mas, src2 = masAdd[:, :, 0])

#     # Resize image to 512x512 using linear interpolation
#     img = cv2.resize(
#       src = img,
#       dsize = (512, 512),
#       interpolation = cv2.INTER_LINEAR
#     )

#     # Resize mask to 512x512 using nearest neighbor interpolation
#     mas = cv2.resize(
#       src = mas,
#       dsize = (512, 512),
#       interpolation = cv2.INTER_NEAREST
#     )

#     # Save image and mask
#     cv2.imwrite(
#       os.path.join(pathBase, 'DATASET/IMAGES', f'{name}.png'),
#       img
#     )
#     np.save(
#       os.path.join(pathBase, 'DATASET/MASKS', f'{name}.npy'),
#       mas
#     )

In order to train the segmentation model, we first need to create a PyTorch Dataset.

In [None]:
class BreastCancerDataset(torch.utils.data.Dataset):
  """Paired image / segmentation mask dataset.

  Images are read from ``<dir>/IMAGES`` and masks from ``<dir>/MASKS``. Every
  image is paired with the ``.npy`` mask that shares its file stem, so the
  pairing never depends on the arbitrary order returned by ``os.listdir``.

  Args:
    dir: Root folder that contains the ``IMAGES`` and ``MASKS`` sub-folders.
    tra: Optional callable applied to the image tensor only (not to the mask).

  Raises:
    FileNotFoundError: If one of the folders does not exist or if an image has
      no mask with the same stem.
  """

  def __init__(self, dir: str, tra = None):
    super(BreastCancerDataset, self).__init__()
    self.tra = tra
    self.imgDir = os.path.join(dir, 'IMAGES')
    self.masDir = os.path.join(dir, 'MASKS')

    # Sorted so that an index always refers to the same sample, which also
    # keeps a seeded random split reproducible across runs and machines
    self.imgLis = sorted(os.listdir(self.imgDir))

    # Derive each mask name from its image instead of listing both folders
    # independently and hoping that the two orders line up
    self.masLis = [os.path.splitext(name)[0] + '.npy' for name in self.imgLis]

    # Fail early and explicitly rather than in the middle of training
    available = set(os.listdir(self.masDir))
    missing = [name for name in self.masLis if name not in available]
    if missing:
      raise FileNotFoundError(
        f'{len(missing)} image(s) in {self.imgDir} have no mask in '
        f'{self.masDir} (first missing mask: {missing[0]})'
      )

  def __getitem__(self, idx):
    """Return the ``(image, mask)`` pair stored at position ``idx``.

    The image is returned as a float32 tensor with its last axis moved to the
    front; the mask is returned as a long tensor.
    """
    img = cv2.imread(
      filename = os.path.join(self.imgDir, self.imgLis[idx]),
      flags = cv2.IMREAD_UNCHANGED
    )
    img = torch.from_numpy(img).to(dtype = torch.float32)

    # Last axis first; assumes a 3-dimensional (H, W, C) image - TODO confirm
    img = img.permute(2, 0, 1)

    mas = np.load(file = os.path.join(self.masDir, self.masLis[idx]))

    # Dividing by 255 and truncating to long maps 255 to 1 and any smaller
    # value to 0
    mas = torch.from_numpy(mas / 255.).to(dtype = torch.long)

    if self.tra:
      img = self.tra(img)
    return img, mas

  def __len__(self):
    """Return the number of images in the dataset."""
    return len(self.imgLis)

# 4. Model Selection and Design

We're going to design a simple UNet architecture: an encoder and a decoder joined by optional skip connections.

In [None]:
def convBlock(inChannels: int, outChannels: int, k: int):
  """Build a double convolution block.

  The block is two (Conv2d -> BatchNorm2d -> ReLU) stages with stride 1 and
  'same' padding. Only the first convolution uses the ``k`` x ``k`` kernel; the
  second one always uses a 3 x 3 kernel.

  Args:
    inChannels: Channels of the input feature map.
    outChannels: Channels produced by both convolutions.
    k: Kernel size of the first convolution.

  Returns:
    A ``torch.nn.Sequential`` holding the six layers in order.
  """
  # (input channels, kernel size) of each of the two stages
  stageSpecs = ((inChannels, (k, k)), (outChannels, (3, 3)))

  stages = []
  for sourceChannels, kernel in stageSpecs:
    stages.append(
      torch.nn.Conv2d(
        in_channels = sourceChannels,
        out_channels = outChannels,
        kernel_size = kernel,
        stride = 1,
        padding = 'same'
      )
    )
    stages.append(torch.nn.BatchNorm2d(outChannels))
    stages.append(torch.nn.ReLU())

  return torch.nn.Sequential(*stages)

#------------------------------------------------------------------------------#

def up(inChannels: int, outChannels: int, k: int):
  """Build an upsampling block made of a single transposed convolution.

  The kernel is ``k`` x ``k`` and the stride is ``k``.

  Args:
    inChannels: Channels of the input feature map.
    outChannels: Channels of the upsampled feature map.
    k: Kernel size and stride of the transposed convolution.

  Returns:
    A ``torch.nn.Sequential`` wrapping the transposed convolution.
  """
  transposed = torch.nn.ConvTranspose2d(
    in_channels = inChannels,
    out_channels = outChannels,
    kernel_size = (k, k),
    stride = k
  )
  return torch.nn.Sequential(transposed)

#------------------------------------------------------------------------------#

class Encoder(torch.nn.Module):
  """Contracting path of the UNet.

  Stacks ``depthLevels`` pairs of (convolutional block, 2 x 2 max pooling)
  followed by a bottleneck convolutional block. The number of channels starts
  at ``initChannels`` and doubles at every level, so the bottleneck outputs
  ``initChannels * 2 ** depthLevels`` channels, exposed as ``featureDim``.

  Args:
    inChannels: Channels of the input image.
    initChannels: Channels produced by the first convolutional block.
    depthLevels: Number of (convolutional block, pooling) levels.
  """

  def __init__(
    self,
    inChannels: int = 1,
    initChannels: int = 16,
    depthLevels: int = 4
  ):
    super(Encoder, self).__init__()
    self.depthLevels = depthLevels

    # Initialize list of modules
    self.layers = torch.nn.ModuleList()

    # Initialize out channels
    outChannels = initChannels

    # Loop to generate all layers
    for _ in range(depthLevels):

      # Convolutional block
      self.layers.append(
        convBlock(
          inChannels = inChannels,
          outChannels = outChannels,
          k = 3
        )
      )

      # Max pooling
      self.layers.append(
        torch.nn.MaxPool2d(kernel_size = (2, 2), stride = 2)
      )

      # Update in and out channels
      inChannels = outChannels
      outChannels = outChannels * 2

    # Bottle neck (last layer of the encoder)
    self.layers.append(
      convBlock(
        inChannels = inChannels,
        outChannels = outChannels,
        k = 3
      )
    )

    # Registered as a submodule so that model.train() / model.eval() switch it
    # on and off; a Dropout created inside forward() is always in training mode
    # and would keep dropping activations during validation and inference.
    # Dropout has no parameters, so the state dict keys are unchanged.
    self.dropout = torch.nn.Dropout(0.2)

    # Save feature dimensions
    self.featureDim = outChannels

  def forward(self, x: torch.Tensor):
    """Encode ``x`` and return every intermediate feature map.

    Returns:
      A list with ``depthLevels + 1`` tensors: the output of each level taken
      before its pooling, followed by the bottleneck output (after dropout).
    """
    features = list()

    # Forward loop
    for i in range(self.depthLevels):
      x = self.layers[2 * i](x)
      features.append(x)
      x = self.layers[2 * i + 1](x)

    # Bottleneck followed by dropout (active only in training mode)
    x = self.dropout(self.layers[-1](x))
    features.append(x)
    return features

#------------------------------------------------------------------------------#

class Decoder(torch.nn.Module):
  """Expanding path of the UNet.

  Each of the ``depthLevels`` levels halves the number of channels with an
  ``up`` block and refines the result with a ``convBlock``. When ``skip`` is
  enabled, the upsampled map is concatenated along the channel axis with the
  encoder feature map of the same level before the convolutional block. A
  final 3 x 3 convolution maps the last level to ``nClasses`` channels.

  Args:
    inChannels: Channels of the bottleneck feature map.
    nClasses: Channels of the output map.
    depthLevels: Number of (up block, convolutional block) levels.
    skip: Whether encoder features are concatenated at every level.
  """

  def __init__(
    self,
    inChannels: int,
    nClasses: int = 1,
    depthLevels: int = 4,
    skip: bool = True
  ):
    super().__init__()
    self.depthLevels = depthLevels
    self.skip = skip

    # Modules are stored flat: [up, conv, up, conv, ..., final conv]
    self.layers = torch.nn.ModuleList()

    # Channels entering the current level
    width = inChannels

    for _ in range(depthLevels):
      half = width // 2

      # Upsampling halves the channels
      self.layers.append(up(inChannels = width, outChannels = half, k = 2))

      # The concatenated encoder features double the input of the conv block
      fused = half * 2 if self.skip else half
      self.layers.append(
        convBlock(inChannels = fused, outChannels = half, k = 3)
      )

      width = half

    # Output head (last layer of the decoder)
    head = torch.nn.Conv2d(
      in_channels = width,
      out_channels = nClasses,
      kernel_size = (3, 3),
      stride = 1,
      padding = 'same'
    )
    self.layers.append(head)

  def forward(self, features: list):
    """Decode the encoder outputs into the output map.

    Args:
      features: Feature maps ordered from the first level to the bottleneck;
        the last element is the bottleneck output.

    Returns:
      The tensor produced by the final convolution.
    """
    # Start from the bottleneck
    x = features[-1]

    for level in range(self.depthLevels):
      upsample = self.layers[2 * level]
      refine = self.layers[2 * level + 1]

      x = upsample(x)
      if self.skip:
        # Walk the encoder features backwards, starting just before the bottleneck
        x = torch.cat([x, features[-(level + 2)]], dim = 1)
      x = refine(x)

    head = self.layers[-1]
    return head(x)

#-----------------------------------------------------------------------------#

class UNet(torch.nn.Module):
  """UNet made of an ``Encoder`` and a ``Decoder``.

  Args:
    inChannels: Channels of the input image.
    nClasses: Channels of the output map.
    initChannels: Channels produced by the first encoder level.
    depthLevels: Number of levels of both the encoder and the decoder.
    skip: Whether the decoder uses skip connections.
  """

  def __init__(
    self,
    inChannels: int = 3,
    nClasses: int = 1,
    initChannels: int = 16,
    depthLevels: int = 4,
    skip: bool = True
  ):
    super().__init__()

    # Contracting path
    encoder = Encoder(
      inChannels = inChannels,
      initChannels = initChannels,
      depthLevels = depthLevels
    )
    self.encoder = encoder

    # Expanding path, sized from the bottleneck width reported by the encoder
    self.decoder = Decoder(
      inChannels = encoder.featureDim,
      nClasses = nClasses,
      depthLevels = depthLevels,
      skip = skip
    )

  def forward(self, x: torch.tensor):
    """Run the encoder and feed all of its feature maps to the decoder."""
    return self.decoder(self.encoder(x))

# 5. Training and Validation

First we define the Dice loss function.

In [None]:
def diceLoss(preds: torch.Tensor, targs: torch.Tensor):
  """Dice loss computed over the whole batch at once.

  Args:
    preds: Raw model outputs; only index 0 of the second axis is used and it
      is turned into probabilities with a sigmoid.
    targs: Target masks, multiplied element-wise with the probabilities.

  Returns:
    A scalar tensor equal to ``1 - dice``.
  """
  # Smoothing term that keeps the ratio defined when both sums are zero
  eps = 1e-6

  # Probabilities of the first channel
  probs = preds[:, 0].sigmoid()

  # Overlap and total mass (named intersection and union in the Dice formula)
  overlap = torch.sum(probs * targs)
  total = torch.sum(probs) + torch.sum(targs)

  return 1 - (2 * overlap + eps) / (total + eps)

Then we create a training function.

In [None]:
def train(
  model: torch.nn.Module,
  optimizer,
  criterion,
  epochs: int,
  dlT: torch.utils.data.DataLoader,
  dlV: torch.utils.data.DataLoader,
  dev: torch.device
):
  """Train ``model`` and evaluate it on the validation data after every epoch.

  Args:
    model: Network to train; it is moved to ``dev``.
    optimizer: Optimizer already bound to the parameters of ``model``.
    criterion: Callable ``criterion(preds, mas)`` returning a scalar loss.
    epochs: Number of epochs to run.
    dlT: Iterable of ``(img, mas)`` training batches that supports ``len``.
    dlV: Iterable of ``(img, mas)`` validation batches that supports ``len``.
    dev: Device on which the model and the batches are placed.

  Returns:
    A dict with the keys ``'epoch'`` (zero-based epoch index), ``'time'``
    (minutes spent in the epoch), ``'lossT'`` and ``'lossV'`` (mean loss per
    batch). Each value is a list with one entry per epoch. The mean loss of an
    empty loader is NaN.
  """
  # Number of steps for the training and validation dataset
  stepsT = len(dlT)
  stepsV = len(dlV)

  # Logs dataframe
  logs = {
    'epoch': list(),
    'time': list(),
    'lossT': list(),
    'lossV': list()
  }

  # Move model to device
  model.to(device = dev)

  # Training loop
  for epoch in range(epochs):

    # Initialize time
    t0 = time.time()

    # Running sums of the batch losses of this epoch
    lossT = 0.0
    lossV = 0.0

    # Set model in training mode
    model.train()

    # Training batch loop
    for img, mas in dlT:

      # Move batch to device
      img = img.to(device = dev)
      mas = mas.to(device = dev)

      # Reset gradient
      optimizer.zero_grad()

      # Forward pass
      preds = model(img)

      # Compute loss
      loss = criterion(preds, mas)

      # Backward pass
      loss.backward()
      optimizer.step()

      # Accumulate loss
      lossT += loss.item()

    # Set model in evaluation mode
    model.eval()

    # Disable gradient calculations
    with torch.no_grad():

      # Validation batch loop
      for img, mas in dlV:

        # Move batch to device
        img = img.to(device = dev)
        mas = mas.to(device = dev)

        # Forward pass
        preds = model(img)

        # Compute loss
        loss = criterion(preds, mas)

        # Accumulate loss
        lossV += loss.item()

    # Mean loss per step; an empty loader yields NaN instead of dividing by zero
    lossT = lossT / stepsT if stepsT else float('nan')
    lossV = lossV / stepsV if stepsV else float('nan')

    # Elapsed time in minutes
    elapsed = (time.time() - t0) / 60.0

    # Update logs
    logs['epoch'].append(epoch)
    logs['time'].append(elapsed)
    logs['lossT'].append(lossT)
    logs['lossV'].append(lossV)

    # Print progress (plain names inside the f-strings: reusing the enclosing
    # quote inside a replacement field is a syntax error before Python 3.12)
    print(
      f'Epoch [{epoch} / {epochs}] | '
      f'Time: {elapsed: .2f} | '
      f'T. Loss: {lossT: .2f} | '
      f'V. Loss: {lossV: .2f} | '
    )

  return logs

Then we can create the dataset, dataloaders, initialize the model and start training.

In [None]:
# Check device
dev = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Device: {dev}')

# Set seed for reproducibility before anything draws random numbers: the model
# weights are randomly initialized when the model is constructed
torch.manual_seed(42)

# Model initialization
model = UNet(inChannels = 3, nClasses = 1, initChannels = 16, depthLevels = 4)

# Base dataset
ds = BreastCancerDataset(dir = os.path.join(pathBase, 'DATASET'))

# Train (70%), validation (15%) and test (15%); the test split takes the
# remainder so that the three sizes always add up to n
n = len(ds)
nTrain = int(0.7 * n)
nVal = int(0.15 * n)
nTest = n - nTrain - nVal

# Split with a dedicated generator so that the partition depends only on its
# seed and not on how many random numbers were consumed before this point
dsTrain, dsVal, dsTest = torch.utils.data.random_split(
  ds,
  [nTrain, nVal, nTest],
  generator = torch.Generator().manual_seed(42)
)

# Dataloaders (only the training data is shuffled)
dlT = torch.utils.data.DataLoader(
  dataset = dsTrain,
  batch_size = 16,
  shuffle = True
)
dlV = torch.utils.data.DataLoader(
  dataset = dsVal,
  batch_size = 16,
  shuffle = False
)

# Define criterion and optimizer
criterion = diceLoss
optimizer = torch.optim.AdamW(
  params = model.parameters(),
  lr = 1e-3,
  weight_decay = 1e-4
)

# Train
logs = train(
  model = model,
  optimizer = optimizer,
  criterion = criterion,
  epochs = 30,
  dlT = dlT,
  dlV = dlV,
  dev = dev
)

Device: cuda


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/FinalProjectAdmeav/DATASET/IMAGES'

Plot training process.

In [None]:
# Plot the mean training and validation loss of every epoch
epochs = len(logs['lossT'])
plt.style.use('ggplot')

# Both curves share the color and are told apart by their line style
for key, label, linestyle in (
  ('lossT', 'Train Loss', '-'),
  ('lossV', 'Val Loss', '--')
):
  plt.plot(
    range(1, epochs + 1),
    logs[key],
    label = label,
    color = 'red',
    linestyle = linestyle
  )

# Titles, axis labels and fixed loss range
plt.title('Training Process')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.ylim(0, 1)
plt.legend()

# 6. Model Evaluation and Metrics

Here we evaluate the trained model and report its metrics.

# 7. Discussion and Conclusion

Here we discuss the results.