In [1]:
import os
import argparse
import torch
import math
import random
import numpy as np
import torch.nn as nn
import torch.optim as optim
from torchvision import transforms
from torch.optim.lr_scheduler import CosineAnnealingLR, StepLR

from torch.utils.data import DataLoader
from torch.utils.data import Dataset, DataLoader
import sklearn.metrics as metrics
from skimage.exposure import rescale_intensity
from skimage import morphology
from scipy import ndimage

from tqdm.notebook import tqdm
from importlib import reload

import pydicom as dicom
import pylibjpeg

import matplotlib.pyplot as plt
import cv2

In [2]:
""" Reads in a dicom file and returns the pixel array as a numpy array. """
def get_np_from_dicom(path, num_slices=7, out_size=(2048, 2048)):
    """Read a DICOM volume and return its central slices as a NumPy array.

    Pipeline: decompress with the pylibjpeg handler, window the intensities
    with the Window Center / Window Width stored in the file, min-max
    normalise, keep the central ``num_slices`` frames, mask every kept frame
    to its largest connected non-zero region and resize it to ``out_size``.

    Args:
        path: Path of the DICOM file to read.
        num_slices: Number of adjacent central frames to keep (default 7).
        out_size: ``(width, height)`` handed to ``cv2.resize``
            (default 2048 x 2048).

    Returns:
        np.ndarray with one resized frame per kept slice, stacked on axis 0.
        When the volume holds fewer than ``num_slices`` frames, every frame
        is kept.

    Raises:
        ValueError: If ``num_slices`` is smaller than 1.

    NOTE(review): assumes ``ds.pixel_array`` is 3-D with frames on axis 0 --
    confirm that every input file is multi-frame.
    """
    if num_slices < 1:
        raise ValueError("num_slices must be at least 1")

    def _voi_lut_value(ds, element_tag):
        # (5200,9229) Shared Functional Groups Sequence ->
        # (0028,9132) Frame VOI LUT Sequence -> requested element.
        voi_item = ds[0x5200, 0x9229][0][0x0028, 0x9132][0]
        return np.float32(voi_item[element_tag].value)

    def _remove_noise(image_slice):
        # NOTE(review): a 1x1 footprint looks like an identity dilation; kept
        # as in the original pipeline -- confirm whether it is intentional.
        segmentation = morphology.dilation(image_slice, np.ones((1, 1)))
        labels, _ = ndimage.label(segmentation)

        # ndimage.label already returns an integer array, so no cast is needed
        # (the former np.int alias was deprecated in NumPy 1.20).
        label_count = np.bincount(labels.ravel())
        label_count[0] = 0  # ignore the background label

        # Keep only the largest connected component.
        mask = labels == label_count.argmax()
        mask = morphology.dilation(mask, np.ones((1, 1)))
        # Top-level ndimage function; the ndimage.morphology namespace is deprecated.
        mask = ndimage.binary_fill_holes(mask)
        mask = morphology.dilation(mask, np.ones((3, 3)))
        return mask * image_slice

    ds = dicom.dcmread(path)
    ds.decompress(handler_name="pylibjpeg")
    pixel_array = ds.pixel_array

    # Rescale image pixel intensity values to the stored display window.
    window_center = _voi_lut_value(ds, (0x0028, 0x1050))  # Window Center
    window_width = _voi_lut_value(ds, (0x0028, 0x1051))   # Window Width
    low = (2 * window_center - window_width) / 2
    high = (2 * window_center + window_width) / 2
    pixel_array = rescale_intensity(
        pixel_array, in_range=(low, high), out_range="dtype"
    )

    # Normalize the image pixel values to [0, 1]; a constant volume would
    # otherwise divide by zero and fill the result with NaN.
    lowest = pixel_array.min()
    value_range = pixel_array.max() - lowest
    if value_range == 0:
        pixel_array = np.zeros(pixel_array.shape, dtype=np.float32)
    else:
        pixel_array = (pixel_array - lowest) / value_range

    # Get the middle adjacent slices. The start index is clamped so that a
    # volume with too few frames does not wrap around via a negative index.
    middle = pixel_array.shape[0] // 2
    half = num_slices // 2
    start = max(middle - half, 0)
    stop = middle - half + num_slices
    pixel_array = pixel_array[start:stop]

    # Remove noise from the slices.
    slices = np.array([_remove_noise(s) for s in pixel_array])

    # Normalize the slices to be the same size.
    slices = np.array([cv2.resize(s, out_size) for s in slices])

    return slices

In [7]:
""" Converts every .dcm file contained inside root_fodler from DICOM to a numpy array and stores it to inside dest_folder. """
def convert_dataset_from_DICOM_to_np(root_folder, dest_folder):
    """Convert every ``.dcm`` file under ``root_folder`` to a ``.npy`` file.

    Each DICOM file is passed through ``get_np_from_dicom``, cast to 32-bit
    floats and saved in ``dest_folder`` as ``<i>.npy``, where ``i`` is a
    1-based running counter over the files found.

    Args:
        root_folder: Directory that is walked recursively for ``.dcm`` files.
        dest_folder: Directory that receives the ``.npy`` files; it is created
            if it does not exist yet.

    Returns:
        None. Progress is reported with ``print``.
    """
    # np.save does not create missing directories.
    os.makedirs(dest_folder, exist_ok=True)

    file_index = 0
    for root, dirs, files in os.walk(root_folder):
        # Sorting in place fixes the traversal order, so the numbering of the
        # output files is reproducible across runs and platforms.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith(".dcm"):
                continue
            file_index += 1
            file_path = os.path.join(root, file)
            np_array = get_np_from_dicom(file_path)
            # print shape of each file
            print("Shape of np array is : ", np_array.shape)
            # convert np array to 32 bit float precision
            np_array = np_array.astype(np.float32)
            np.save(os.path.join(dest_folder, str(file_index) + '.npy'), np_array)
            print("Completed decompressing file: " + file_path)

In [8]:
# DICOM file counts noted for the source folders:
#   data/training -> 1029
#   data/testing  -> 134

# Create the training and testing sets, one (source, destination) pair at a time.
for dicom_dir, numpy_dir in (
    ("data/training", "data_np/training"),
    ("data/testing", "data_np/testing"),
):
    convert_dataset_from_DICOM_to_np(dicom_dir, numpy_dir)

  f"The (0028,0101) Bits Stored value '{bits_stored}' in the "
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  


Shape of np array is :  (7, 2048, 2048)
Completed decompressing file: data/training\manifest-1617905855234\Breast-Cancer-Screening-DBT\DBT-P00697\01-01-2000-DBT-S04996-MAMMO screening digital bilateral-57783\3233.000000-NA-84307\1-1.dcm
Shape of np array is :  (7, 2048, 2048)
Completed decompressing file: data/training\manifest-1617905855234\Breast-Cancer-Screening-DBT\DBT-P00697\01-01-2000-DBT-S04996-MAMMO screening digital bilateral-57783\3234.000000-NA-18984\1-1.dcm
Shape of np array is :  (7, 2048, 2048)
Completed decompressing file: data/training\manifest-1617905855234\Breast-Cancer-Screening-DBT\DBT-P00697\01-01-2000-DBT-S04996-MAMMO screening digital bilateral-57783\3235.000000-NA-23462\1-1.dcm
Shape of np array is :  (7, 2048, 2048)
Completed decompressing file: data/training\manifest-1617905855234\Breast-Cancer-Screening-DBT\DBT-P00697\01-01-2000-DBT-S04996-MAMMO screening digital bilateral-57783\3236.000000-NA-64868\1-1.dcm
Shape of np array is :  (7, 2048, 2048)
Completed de