In [1]:
import os
from PIL import Image
import numpy as np
import math
import pydicom as pdcm
from skimage.draw import polygon
import xml.etree.ElementTree as ET
from matplotlib import pyplot as plt
import random
import shutil

In [2]:
class Annotation:
  """Three-channel binary mask built from the ROIs of one XML annotation file.

  The mask has the height and width given by `shape` and three channels:
  channel 0 holds "Mass" ROIs, channel 1 holds "Calcification" ROIs and
  channel 2 holds every other ROI type. A pixel is 1 where an ROI polygon
  covers it and 0 elsewhere. If the XML file does not exist the mask stays
  all zeros.
  """

  def __init__(self, xml_path, filename, shape):
    """
    Inputs:
      -- xml_path: Folder of the XML files; concatenated as-is, so it must end
                   with a path separator
      -- filename: Name of the XML file without the '.xml' extension
      -- shape: Shape of the image the mask belongs to (first two entries used)
    """
    self.xml_path = xml_path + filename + '.xml'
    self.mask     = self.create_mask_array(shape)
    self.fill_mask()

  def fill_mask(self):
    """
    Creates the polygon of every ROI found in the XML file and sets the covered
    pixels to 1 in the mask channel selected by the ROI type.

    Polygons are clipped to the mask extent, so an ROI that partly leaves the
    image is still drawn inside it instead of raising IndexError (or wrapping
    around through negative indices).
    """
    if not os.path.exists(self.xml_path):
      return

    rois, _ = self.parse_XML(self.xml_path)
    mask_extent = self.mask.shape[:2]
    for roi in rois:
      roi_info       = self.get_roi_info(roi)
      r_poly, c_poly = self.create_polygon_lists(self.mask, roi_info["points"])
      if r_poly.size == 0:
        # An ROI without points has no area to draw.
        continue
      rr, cc         = polygon(r_poly, c_poly, shape=mask_extent)
      roi_channel    = self.select_mask_channel(roi_info["roi_type"])
      self.mask[rr, cc, roi_channel] = 1

  def parse_XML(self, xml_path):
    """
    Inputs:
      -- xml_path: Path to the corresponding xml file
    Outputs:
      -- rois: Element whose children are the ROI objects
      -- num_of_rois: Number of ROIs as stated in the file

    The file is read purely by child position, so it must follow the layout
    these fixed indices expect.
    """
    tree        = ET.parse(xml_path)
    root        = tree.getroot()       # The root of the XML file
    data        = root[0][1]           # The essential info
    rois        = data[0][5]           # Array containing the ROI objects
    num_of_rois = int(data[0][3].text) # Number of ROI objects

    return rois, num_of_rois

  def create_mask_array(self, img_shape):
    """
    Inputs:
      -- img_shape: The preferred shape of the mask to be created
    Outputs:
      -- 3-dimensional numpy array of zeros, type uint8, with 3 channels
    """
    return np.zeros((img_shape[0], img_shape[1], 3), dtype = np.uint8)

  def get_roi_info(self, roi):
    """
    Inputs:
      -- roi: One ROI element, read by child position
    Outputs:
      -- dict with the keys "points", "num_of_points", "roi_index", "roi_type"
    """
    roi_info      = {
      "points":        roi[21],           # Array containing the points of a ROI
      "num_of_points": int(roi[17].text), # Number of points of the area
      "roi_index":     int(roi[7].text),  # Identifier of the ROI
      "roi_type":      roi[15].text       # (Mass, Calcification, other)
    }

    return roi_info

  def create_polygon_lists(self, mask, points):
    """
    Inputs:
      -- mask: numpy object of the mask (unused; kept so existing calls work)
      -- points: iterable of elements whose text is two comma-separated numbers
                 wrapped in one leading and one trailing character
    Outputs:
      -- r_poly: float array of row indices (the SECOND number of each point)
      -- c_poly: float array of column indices (the FIRST number of each point)

    Each number is truncated towards zero.
    """
    rows = []
    cols = []
    for point in points:
      coords = point.text[1:-1].split(",")
      cols.append(math.trunc(float(coords[0])))
      rows.append(math.trunc(float(coords[1])))

    # Built once from lists; float64 matches the dtype previously returned.
    return np.array(rows, dtype=np.float64), np.array(cols, dtype=np.float64)

  def select_mask_channel(self, roi_type):
    """
    Input:
      -- roi_type: The type of a specific ROI, extracted from the XML file
    Output:
      -- roi_channel: 0 for "Mass", 1 for "Calcification", 2 for anything else
    """
    return {"Mass": 0, "Calcification": 1}.get(roi_type, 2)

In [5]:
# Input folders: XML annotation files and DICOM images.
# Keep the trailing slash: file paths are built from these values by plain
# string concatenation (e.g. Annotation does xml_path + filename + '.xml').
XML_PATH = "./INBreast/AllXML/"
DCM_PATH = "./INBreast/AllDICOMs/"

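Rename the DICOM files so that each name keeps only its first two underscore-separated fields: the image id (used to look up the XML annotation) followed by the patient id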

In [10]:
# Shorten every DICOM file name to its first two underscore-separated fields:
# "<first>_<second>_<anything else>.dcm" -> "<first>_<second>.dcm".
# The cell is safe to re-run: names that are already short are left alone
# (previously a second run produced "<first>_<second>.dcm.dcm").

# List all files in the folder
files = os.listdir(DCM_PATH)

# Iterate through each file and rename it
for file_name in files:
    stem, extension = os.path.splitext(file_name)
    if extension != '.dcm':
        continue

    fields = stem.split('_')
    if len(fields) < 2:
        # Not in the expected "<first>_<second>..." form; leave it untouched.
        continue

    new_name = fields[0] + "_" + fields[1] + '.dcm'
    if new_name == file_name:
        # Already shortened, nothing to do.
        continue

    # Create the full path for the old and new filenames
    old_path = os.path.join(DCM_PATH, file_name)
    new_path = os.path.join(DCM_PATH, new_name)

    if os.path.exists(new_path):
        # os.rename can silently replace an existing file; never clobber one.
        print("Skipping " + file_name + ": " + new_name + " already exists.")
        continue

    # Rename the file
    os.rename(old_path, new_path)

print("Files renamed successfully.")


Files renamed successfully.


Define folder structure where extracted images will be saved

In [11]:
# Output layout: the root folder plus one sub-folder for the extracted PNG
# images and one for the PNG label masks saved under the same file name.
inbreast_path = './Extracted'
images_output_path = './Extracted/images'
labels_output_path = './Extracted/labels'

In [12]:
# Create the output folders. exist_ok=True makes the cell safe to re-run and
# avoids the gap between a separate existence check and the creation.
for output_dir in (inbreast_path, images_output_path, labels_output_path):
    os.makedirs(output_dir, exist_ok=True)

Define palette for each class (None, Mass)

In [13]:
# One colour triple per class, indexed by class id: 0 -> [0,0,0], 1 -> [255,0,0]
# (presumably RGB, i.e. black and red — confirm).
# NOTE(review): `palette` is not referenced by any of the visible cells — confirm it is still needed.
palette = ([0,0,0],[255,0,0])

Extract images and masks

In [None]:
# For every DICOM file: build the annotation mask, and if it contains at least
# one "Mass" pixel (mask channel 0) save the mask and the image as PNG files
# that share the same name.
for file_name in sorted(os.listdir(DCM_PATH)):
    # Ignore anything that is not a DICOM file (hidden files, stray outputs, ...).
    if not file_name.endswith('.dcm'):
        continue

    img_name = os.path.splitext(file_name)[0]
    # The XML annotation is named after the first underscore-separated field.
    mask_img_name = img_name.split('_')[0]

    dcm = pdcm.dcmread(os.path.join(DCM_PATH, file_name))
    img = dcm.pixel_array
    a = Annotation(XML_PATH, mask_img_name, img.shape)

    # Channel 0 is the "Mass" channel; skip images without any mass pixel.
    mask = a.mask[:, :, 0]
    if not mask.any():
        continue

    # The mask keeps its raw 0/1 values, i.e. it stores class indices.
    mask = Image.fromarray(mask)
    mask.save(os.path.join(labels_output_path, img_name + '.png'))

    # Rescale the pixel range to 0..255 before converting to uint8. A plain
    # astype(np.uint8) wraps every value above 255 modulo 256, which corrupts
    # images stored with more than 8 bits per pixel.
    pixels = img.astype(np.float32)
    lowest, highest = pixels.min(), pixels.max()
    if highest > lowest:
        pixels = (pixels - lowest) * (255.0 / (highest - lowest))
    else:
        # Constant image: nothing to stretch.
        pixels = np.zeros_like(pixels)
    img = Image.fromarray(np.clip(np.rint(pixels), 0, 255).astype(np.uint8))

    img.save(os.path.join(images_output_path, img_name + '.png'))

Count number of images for each patient

In [15]:
# Tally the extracted images per patient. The key is the second
# underscore-separated field of the file name, extension included.
patient_num_of_files = {}
for fn in os.listdir('./Extracted/images'):
    patient_key = fn.split('_')[1]
    patient_num_of_files[patient_key] = patient_num_of_files.get(patient_key, 0) + 1

In [16]:
# Bare last expression: the notebook displays the per-patient counts.
patient_num_of_files

{'6c613a14b80a8591.png': 4,
 'f4b2d377f43ba0bd.png': 2,
 '81cd83d2f4d78528.png': 2,
 '8dbbd4e51f549ff0.png': 2,
 '024ee3569b2605dc.png': 2,
 '8d0b9620c53c0268.png': 2,
 '493155e17143edef.png': 2,
 'bf1a6aaadb05e3df.png': 2,
 '036aff49b8ac84f0.png': 1,
 'd713ef5849f98b6c.png': 2,
 'bbd6a3a35438c11b.png': 2,
 '6200187f3f1ccc18.png': 2,
 '5530d5782fc89dd7.png': 2,
 '5eae9beae14d26fd.png': 4,
 'fe7d005dcbbfb46d.png': 4,
 'dcafa6ba6374ec07.png': 2,
 '45c7f44839fd9e68.png': 2,
 '6bd24a0a42c19ce1.png': 2,
 '1e5c3af078f74b05.png': 2,
 'd065adcb9905b973.png': 2,
 '2dec4948fbe6336d.png': 2,
 'e1f51192f7bf3f5f.png': 2,
 '98429c0bdf78c0c7.png': 2,
 '7e677f3d530e41ed.png': 2,
 'e15a16f87b4f9782.png': 1,
 '0b7396cdccacca82.png': 2,
 '61b13c59bcba149e.png': 2,
 '64a22c47765f0c5c.png': 2,
 '1e10aef17c9fe149.png': 2,
 'ac3185e18ffdc7b6.png': 4,
 'c4b995eddb3c510c.png': 2,
 'd8205a09c8173f44.png': 2,
 '5291e1aee2bbf5df.png': 2,
 'fbb55bf7fff48540.png': 2,
 '349323117bf0fd93.png': 2,
 '6968748e66837bc7.p

Create 5 subfolders, one per cross-validation split, making sure that all images of one patient end up in the same folder

In [17]:
def move_random_images(img_source_folder, lab_source_folder, destination_folder_img, destination_folder_lab, num_images_to_move, seed=None):
    """Move whole patients (images plus label files) into one split folder.

    The files of `img_source_folder` are grouped by patient id, i.e. the second
    underscore-separated field of the file name without its extension. Patients
    are moved one at a time, always with all of their images, until at least
    `num_images_to_move` images have been moved or no patient is left. One
    patient can therefore never be spread over two destination folders, and
    running out of files simply ends the move instead of raising IndexError.

    Args:
        img_source_folder: Folder holding the image files.
        lab_source_folder: Folder holding the label files; each label has the
            name of its image up to the first '.', plus '.png'.
        destination_folder_img: Target folder for the images (created if missing).
        destination_folder_lab: Target folder for the labels (created if missing).
        num_images_to_move: Minimum number of images to move. The last patient
            moved may push the total above this number.
        seed: Optional seed. When given, the patient order is shuffled with
            random.Random(seed); when None, patients are taken in the order of
            their first file in the sorted directory listing.

    Returns:
        The number of images that were moved.

    Raises:
        FileNotFoundError: If a label file of a moved image does not exist.
    """
    os.makedirs(destination_folder_img, exist_ok=True)
    os.makedirs(destination_folder_lab, exist_ok=True)

    # Group explicitly by patient: file names start with another field, so the
    # images of one patient are not necessarily neighbours in the listing.
    images_by_patient = {}
    for image in sorted(os.listdir(img_source_folder)):
        fields = os.path.splitext(image)[0].split('_')
        patient_id = fields[1] if len(fields) > 1 else fields[0]
        images_by_patient.setdefault(patient_id, []).append(image)

    patient_ids = list(images_by_patient)
    if seed is not None:
        random.Random(seed).shuffle(patient_ids)

    moved = 0
    for patient_id in patient_ids:
        if moved >= num_images_to_move:
            break
        for image in images_by_patient[patient_id]:
            lab_image = image.split('.')[0] + '.png'
            shutil.move(os.path.join(img_source_folder, image),
                        os.path.join(destination_folder_img, image))
            shutil.move(os.path.join(lab_source_folder, lab_image),
                        os.path.join(destination_folder_lab, lab_image))
            moved += 1

    return moved

In [19]:
# Root folder for the cross-validation splits; exist_ok=True keeps the cell safe to re-run.
os.makedirs("./Final_Images", exist_ok=True)

In [None]:
# One image folder and one '<name>_labels' folder per cross-validation split.
folder_names = ['A', 'B', 'C', 'D', 'E']

# Source folders filled by the extraction step; the same for every split.
img_source_folder = './Extracted/images/'
lab_source_folder = './Extracted/labels/'

# Number of images after which move_random_images stops filling a split
# (presumably about one fifth of the extracted images — confirm when the
# dataset changes).
num_images_to_move = 21

for fn in folder_names:
    destination_folder_img = './Final_Images/' + fn
    destination_folder_lab = './Final_Images/' + fn + '_labels'
    os.makedirs(destination_folder_img, exist_ok=True)
    os.makedirs(destination_folder_lab, exist_ok=True)

    # Move the next batch of images and their labels into this split.
    move_random_images(img_source_folder, lab_source_folder, destination_folder_img, destination_folder_lab, num_images_to_move)


Resize images to 512x512

In [22]:
def resize_images(input_directory, output_directory, size=(512, 512), resample=None):
    """Resize every '.png' file of a folder and save it under the same name.

    Each source image is closed before its resized copy is written, so the
    function also works when `input_directory` and `output_directory` are the
    same folder and the files are overwritten in place.

    Args:
        input_directory: Folder that is scanned for files ending in '.png'.
        output_directory: Folder the resized files are written to (created if
            missing).
        size: Target (width, height) in pixels; defaults to (512, 512).
        resample: Optional Pillow resampling filter passed to Image.resize,
            e.g. Image.NEAREST for label masks. When None, Pillow's default
            filter is used.

    Returns:
        The number of images that were resized.
    """
    os.makedirs(output_directory, exist_ok=True)

    # Only forward `resample` when it was set, so Pillow picks its own default otherwise.
    resize_options = {} if resample is None else {"resample": resample}

    resized_count = 0
    for filename in sorted(os.listdir(input_directory)):
        if not filename.endswith(".png"):
            continue

        original_path = os.path.join(input_directory, filename)
        with Image.open(original_path) as original_image:
            # resize() returns an independent image, usable after the source is closed.
            resized_image = original_image.resize(size, **resize_options)

        resized_image.save(os.path.join(output_directory, filename))
        resized_count += 1

    return resized_count

In [23]:
# Resize the images of every split in place (input and output folder are the same).
for fn in folder_names:
    directory = "./Final_Images/" + fn
    resize_images(directory, directory)

In [24]:
# Resize the label masks of every split in place. The folder must be the
# '<split>_labels' one: it used to be overwritten with the image folder, so
# the labels were never resized.
# NOTE(review): resize_images is called with its default resampling; label
# masks usually call for nearest-neighbour resampling — confirm.
for fn in folder_names:
    directory = "./Final_Images/" + fn + "_labels"
    resize_images(directory, directory)