In [1]:
import os

import cv2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from PIL import Image
from tqdm import tqdm

np.random.seed(42)

# Load Datasets

In [2]:
# Root folder of the dataset: the split CSV files and every image path listed
# in their "Image" column are resolved relative to this directory.
DATASET_DIR = "../datasets/CCBIS-DDSM/"

In [3]:
# Load the train / validation / test split tables. Each table is later passed
# to create_S1, which groups rows by the "Image" column and treats the last
# four columns of every row as that lesion's bounding box.
train_df = pd.read_csv(os.path.join(DATASET_DIR, "train.csv"))
validation_df = pd.read_csv(os.path.join(DATASET_DIR, "validation.csv"))
test_df = pd.read_csv(os.path.join(DATASET_DIR, "test.csv"))

## Patch Generation

In [4]:
# Define helper functions
def file_exists(path):
    """Tell whether `path` points to an existing regular file."""
    is_regular_file = os.path.isfile(path)
    return is_regular_file

In [5]:
def overlap(bbox_a, bbox_b):
    """
    Compute the intersection area of two axis-aligned bounding boxes.

    Boxes must be in (x, y, w, h) format: (x, y) is the top-left corner and
    the box spans the half-open ranges [x, x + w) and [y, y + h), which is the
    same convention used when patches are sliced out of an image.

    Returns the intersection area; 0 means the boxes share no area (boxes
    that merely touch along an edge or corner do not overlap).
    """
    # Edges of the intersection rectangle (right/bottom edges are exclusive)
    left = max(bbox_a[0], bbox_b[0])
    right = min(bbox_a[0] + bbox_a[2], bbox_b[0] + bbox_b[2])
    top = max(bbox_a[1], bbox_b[1])
    bottom = min(bbox_a[1] + bbox_a[3], bbox_b[1] + bbox_b[3])

    # No "+1": x + w is already one past the last covered column. The "+1"
    # belongs to the inclusive (x1, y1, x2, y2) corner format and, in this
    # format, inflated the area and flagged adjacent boxes as overlapping.
    return max(0, right - left) * max(0, bottom - top)

def get_patch_location(bbox, image_shape, patch_width=None, patch_height=None):
    """
    Get the top-left corner of the patch extracted for a lesion in the image.

    The patch is centered on the center of the lesion's bounding box and then
    shifted, where necessary, so that it stays inside the image.

    Parameters
    ----------
    bbox : sequence of four numbers
        Lesion bounding box in (x, y, w, h) format.
    image_shape : sequence
        Image shape as (height, width, ...); only the first two entries are read.
    patch_width, patch_height : int, optional
        Patch size in pixels. When omitted, the module-level PATCH_WIDTH /
        PATCH_HEIGHT globals are read at call time.

    Returns
    -------
    (int, int)
        (patch_x, patch_y), both >= 0. If the image is smaller than the patch
        along an axis, the start on that axis is 0.
    """
    if patch_width is None:
        patch_width = PATCH_WIDTH
    if patch_height is None:
        patch_height = PATCH_HEIGHT

    # Unpack bbox
    x, y, w, h = bbox

    # Starting point that puts the bbox center in the middle of the patch
    patch_x = x + (w / 2) - (patch_width / 2)
    patch_y = y + (h / 2) - (patch_height / 2)

    # Keep the patch inside the image. The upper bound is applied first and
    # the lower bound last so the result can never be negative: a negative
    # start would be interpreted by slicing as an offset from the far edge.
    patch_x = max(0, min(patch_x, image_shape[1] - patch_width))
    patch_y = max(0, min(patch_y, image_shape[0] - patch_height))

    return int(patch_x), int(patch_y)

def get_background_location(bboxes, image_shape, patch_width=None, patch_height=None):
    """
    Get the top-left corner of a random background patch in the image.

    Candidate positions are drawn uniformly at random (from NumPy's global
    random state) until one is found whose patch does not overlap any lesion
    bounding box.

    Parameters
    ----------
    bboxes : iterable of (x, y, w, h)
        Lesion bounding boxes the background patch must avoid.
    image_shape : sequence
        Image shape as (height, width, ...); only the first two entries are read.
    patch_width, patch_height : int, optional
        Patch size in pixels. When omitted, the module-level PATCH_WIDTH /
        PATCH_HEIGHT globals are read at call time.

    Returns
    -------
    (int, int)
        (bg_x, bg_y) of a patch that overlaps none of `bboxes`.

    Raises
    ------
    ValueError
        If the image is smaller than the patch along either axis.

    Notes
    -----
    The search has no attempt limit: if the lesion boxes leave no free
    position for a patch, this function never returns.
    """
    if patch_width is None:
        patch_width = PATCH_WIDTH
    if patch_height is None:
        patch_height = PATCH_HEIGHT

    # Largest valid starting coordinates (inclusive)
    max_x = image_shape[1] - patch_width
    max_y = image_shape[0] - patch_height
    if max_x < 0 or max_y < 0:
        raise ValueError(
            "Image of shape {} is smaller than the {}x{} patch".format(
                tuple(image_shape[:2]), patch_height, patch_width
            )
        )

    while True:
        # randint's upper bound is exclusive, so "+ 1" keeps the last valid
        # start reachable (and makes an image exactly one patch wide/tall work)
        bg_x = np.random.randint(max_x + 1)
        bg_y = np.random.randint(max_y + 1)

        # Accept the candidate only if it collides with no lesion bounding box
        candidate = [bg_x, bg_y, patch_width, patch_height]
        if all(overlap(bbox, candidate) == 0 for bbox in bboxes):
            return int(bg_x), int(bg_y)

In [6]:
def get_lession_patches(bboxes, image):
    """
    Cut one patch out of `image` for every lesion bounding box.

    Each patch starts at the position given by get_patch_location and is
    sliced with the PATCH_HEIGHT / PATCH_WIDTH globals. The returned list
    follows the order of `bboxes`.
    """
    def crop(bbox):
        # Top-left corner of the patch for this lesion
        left, top = get_patch_location(bbox, image.shape)
        return image[top:top + PATCH_HEIGHT, left:left + PATCH_WIDTH]

    return [crop(bbox) for bbox in bboxes]
    
def get_background_patches(bboxes, image):
    """
    Collect as many background patches as there are lesion bounding boxes.

    Positions come from get_background_location; a candidate whose maximum
    pixel value is 0 carries no information and is redrawn. There is no retry
    limit, so drawing continues until a usable patch is found.
    """
    backgrounds = []
    for _ in range(len(bboxes)):
        while True:
            left, top = get_background_location(bboxes, image.shape)
            candidate = image[top:top + PATCH_HEIGHT, left:left + PATCH_WIDTH]
            # Stop redrawing as soon as the patch has a non-zero maximum
            if np.amax(candidate) != 0:
                break
        backgrounds.append(candidate)
    return backgrounds

In [7]:
def create_S1(df, target_folder):
    """
    Extract lesion and background patches for every image listed in `df` and
    save them as PNG files under `target_folder`.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per lesion. The columns "Image", "Abnormality_Type",
        "Pathology", "Patient_ID", "Left_Right_Breast" and "Image_View" are
        read by name; the last four columns are taken positionally as the
        lesion bounding box (x, y, w, h).
    target_folder : str
        Output root. Lesion patches are written to
        "<Abnormality_Type>_<Pathology>/" sub-folders and background patches
        to "background/"; missing folders are created.

    Images that cannot be found under DATASET_DIR are reported and skipped.
    """
    # Background files are numbered consecutively across all images
    background_dir = os.path.join(target_folder, "background")
    os.makedirs(background_dir, exist_ok=True)
    background_count = 0

    for image_name in tqdm(df["Image"].unique()):
        # Load the image
        image_path = os.path.join(DATASET_DIR, image_name)
        if not file_exists(image_path):
            print("[ERROR] {} does not exist!".format(image_name))
            continue
        # The context manager closes the file handle once the pixels are copied
        with Image.open(image_path) as image_file:
            image = np.array(image_file)

        # Get all lesions related to this specific image
        lesions_df = df[df["Image"] == image_name]

        # Bounding boxes are taken from the last four columns of the table
        bboxes = np.array(lesions_df.iloc[:, -4:].values)

        # Lesion patches first, then background patches (both read the image)
        patches = get_lession_patches(bboxes, image)
        backgrounds = get_background_patches(bboxes, image)

        # Save lesion patches, one class folder per abnormality type / pathology
        for i, patch in enumerate(patches):
            row = lesions_df.iloc[i]
            class_dir = os.path.join(
                target_folder, row["Abnormality_Type"] + "_" + row["Pathology"]
            )
            os.makedirs(class_dir, exist_ok=True)
            file_name = (
                row["Patient_ID"] + "_" + row["Left_Right_Breast"] + "_"
                + row["Image_View"] + "_" + str(i) + ".png"
            )
            Image.fromarray(patch).save(os.path.join(class_dir, file_name))

        # Save background patches
        for background in backgrounds:
            file_path = os.path.join(background_dir, str(background_count) + ".png")
            background_count += 1
            Image.fromarray(background).save(file_path)

## S1 Dataset Creation

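The S1 dataset consists of small, fixed-size patches (224x224), one per lesion bounding box.
Each patch is centered on its bounding box and shifted where necessary so that it stays inside the image. For every lesion patch, one background patch that does not overlap any lesion bounding box is also sampled at random.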

We also create the S1-Big dataset, which uses 512x512 patches; this size was chosen taking into consideration the average size of the bounding boxes found in the CBIS-DDSM dataset.

In [8]:
# Patch size in pixels for the S1 dataset. The patch helpers read these
# module-level globals at call time, so they must be set before create_S1 runs.
PATCH_HEIGHT = 224
PATCH_WIDTH = 224

In [9]:
create_S1(train_df, "../datasets/S1/train/")

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1938/1938 [04:16<00:00,  7.55it/s]


In [11]:
create_S1(validation_df, "../datasets/S1/validation/")

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 548/548 [01:19<00:00,  6.93it/s]


In [12]:
create_S1(test_df, "../datasets/S1/test/")

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 632/632 [01:28<00:00,  7.14it/s]


## S1-Big Dataset Creation

In [13]:
# Rebind the patch-size globals for the S1-Big dataset. Every create_S1 call
# below this cell uses 512x512 patches; re-running an earlier S1 cell without
# resetting these values would also produce 512x512 patches.
PATCH_HEIGHT = 512
PATCH_WIDTH = 512

In [14]:
create_S1(train_df, "../datasets/S1-Big/train/")

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1938/1938 [06:17<00:00,  5.14it/s]


In [15]:
create_S1(validation_df, "../datasets/S1-Big/validation/")

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 548/548 [01:44<00:00,  5.22it/s]


In [16]:
create_S1(test_df, "../datasets/S1-Big/test/")

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 632/632 [02:01<00:00,  5.20it/s]
