In [None]:
import numpy as np
from skimage import morphology
from skimage import measure
from sklearn.cluster import KMeans
from skimage.transform import resize
from glob import glob
import os
import matplotlib.pyplot as plt


In [None]:
# Directory holding the .npy stacks read and written by this notebook. The
# trailing slash matters: file names are appended by plain string concatenation.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
working_path = "D:/Desktop/LUNA16-Data/Output/"
# Every image stack (images_*.npy) found in the working directory.
file_list=glob(working_path+"images_*.npy")

In [None]:
def standardize_image(img):
    """Shift and scale ``img`` to zero mean and unit standard deviation.

    Args:
        img: numeric NumPy array. It is not modified.

    Returns:
        A new array equal to ``(img - mean) / std``. When the standard
        deviation is exactly zero (a constant image) the zero-centred array
        is returned without dividing, so the result is all zeros instead of
        NaN/inf.
    """
    centered = img - np.mean(img)
    std = np.std(img)
    # A constant image has std == 0; dividing would fill the result with
    # NaN/inf, which the clustering step downstream cannot handle.
    if std == 0:
        return centered
    return centered / std


def find_threshold(img):
    """Estimate a two-class intensity threshold from the centre of ``img``.

    The window ``img[100:400, 100:400]`` is clustered into two groups with
    k-means; the threshold is the midpoint between the two cluster centres.
    The threshold is also printed.

    Args:
        img: 2-D array indexable as ``img[100:400, 100:400]``.

    Returns:
        Tuple ``(middle, threshold)``: the central window and the threshold.
    """
    middle = img[100:400, 100:400]
    # KMeans expects a (n_samples, n_features) matrix: one pixel per row.
    pixels = middle.reshape(-1, 1)
    model = KMeans(n_clusters=2, n_init=10)
    model.fit(pixels)
    low_center, high_center = sorted(model.cluster_centers_.ravel())
    threshold = np.mean([low_center, high_center])
    print("Threshold : " , threshold)
    return middle, threshold

def threshold_image(img, threshold):
    """Binarise ``img``: 1.0 where a pixel is strictly below ``threshold``, else 0.0.

    Args:
        img: numeric NumPy array.
        threshold: scalar cut-off value.

    Returns:
        Float array with the same shape as ``img`` containing only 0.0 and 1.0.
    """
    below = np.less(img, threshold)
    return np.where(below, 1.0, 0.0)

def erode_image(img):
    """Return ``img`` eroded with a 4x4 all-ones footprint."""
    footprint = np.ones((4, 4))
    return morphology.erosion(img, footprint)

def dilate_image(img):
    """Return ``img`` dilated with a 10x10 all-ones footprint."""
    footprint = np.ones((10, 10))
    return morphology.dilation(img, footprint)

def label_regions(img):
    """Build a binary mask from the connected regions of ``img`` that pass a bounding-box filter.

    ``img`` is labelled into connected regions; a region is kept when its
    bounding box is less than 475 pixels tall and wide, starts below row 40
    and ends above row 472. These limits are absolute pixel values
    (presumably tuned for 512x512 slices - confirm before using other sizes).

    Args:
        img: 2-D binary image.

    Returns:
        ``np.int8`` array with the same shape as ``img``: 1 inside the kept
        regions, 0 elsewhere. The dtype is int8 whether or not any region is
        kept.
    """
    labels = measure.label(img)
    good_labels = []
    for prop in measure.regionprops(labels):
        # bbox is (min_row, min_col, max_row, max_col)
        box = prop.bbox
        height = box[2] - box[0]
        width = box[3] - box[1]
        if height < 475 and width < 475 and box[0] > 40 and box[2] < 472:
            good_labels.append(prop.label)
    # A single membership test replaces one full-image comparison per label
    # and makes the output follow the input's shape.
    return np.isin(labels, good_labels).astype(np.int8)

def process_image(img_file):
    """Compute a lung mask for every slice stored in ``img_file`` and save the stack.

    Each slice is standardised, thresholded with a k-means derived cut-off,
    eroded, dilated and finally reduced to the regions accepted by
    ``label_regions``. The resulting masks are written once, after all
    slices are processed, next to the input file with "images" replaced by
    "lungmask" in the file name.

    Args:
        img_file (str): path to a .npy file containing a stack of slices.

    Returns:
        None
    """
    # Load image stack
    slices = np.load(img_file).astype(np.float64)
    # Masks go into their own float64 array so the file written below never
    # mixes raw slices with masks.
    lung_masks = np.zeros_like(slices)

    # Loop through all images
    for i, raw_slice in enumerate(slices):
        img = standardize_image(raw_slice)

        # Only the threshold is needed; the central window is discarded.
        _, threshold = find_threshold(img)
        thresh_img = threshold_image(img, threshold)

        # Erosion followed by dilation
        eroded_img = erode_image(thresh_img)
        dilated_img = dilate_image(eroded_img)

        lung_masks[i] = label_regions(dilated_img)

    # Rename only the file-name component so a directory that happens to
    # contain "images" is left untouched, and save once instead of rewriting
    # the whole stack for every slice.
    folder, name = os.path.split(img_file)
    np.save(os.path.join(folder, name.replace("images", "lungmask")), lung_masks)
        

In [None]:
# Run the lung-segmentation pipeline on every image stack; process_image
# writes a matching lungmask file for each input file.
for img_file in file_list:
    print("Processing image:", img_file)
    process_image(img_file)
    

In [None]:

# Build the final image / node-mask pairs: for every lung-mask stack, isolate
# the lungs in the matching image slices, renormalise them, crop to a square
# box around the lungs and rescale the crop (and its node mask) to 512x512.
#
# The lung-mask files get their own variable so the `file_list` of image
# stacks defined earlier in the notebook is not overwritten, and the glob
# result is sorted because glob returns files in arbitrary order.
lungmask_files = sorted(glob(working_path+"lungmask_*.npy"))
out_images = []      # final set of images
out_nodemasks = []   # final set of node masks
for fname in lungmask_files:
    print ("working on file ", fname)
    imgs_to_process = np.load(fname.replace("lungmask","images"))
    masks = np.load(fname)
    node_masks = np.load(fname.replace("lungmask","masks"))

    for i in range(len(imgs_to_process)):
        mask = masks[i]
        node_mask = node_masks[i]

        # Bounding boxes (min row, min col, max row, max col) of the mask regions.
        regions = measure.regionprops(measure.label(mask))
        if not regions:
            # No lung region in this slice: nothing to crop, and the
            # statistics below would be computed over an empty selection.
            continue

        img = mask*imgs_to_process[i]   # apply lung mask

        # Renormalise the masked image using statistics of the mask region.
        lung_values = img[mask > 0]
        new_mean = np.mean(lung_values)
        new_std = np.std(lung_values)
        if not new_std > 0:
            # Zero (or undefined) spread would turn the slice into NaN/inf.
            continue

        # Pull the background colour up to the lower end of the pixel range
        # for the lungs.
        old_min = np.min(img)       # background color
        img[img == old_min] = new_mean - 1.2*new_std   # resetting background color
        img = (img - new_mean) / new_std

        # Global bounding box over all regions.
        boxes = np.array([prop.bbox for prop in regions])
        min_row = int(boxes[:, 0].min())
        min_col = int(boxes[:, 1].min())
        max_row = int(boxes[:, 2].max())
        max_col = int(boxes[:, 3].max())

        # Make the box square by extending its shorter side. Slicing clips
        # at the image border, so the actual crop can end up non-square.
        width = max_col - min_col
        height = max_row - min_row
        if width > height:
            max_row = min_row + width
        else:
            max_col = min_col + height

        # Skip slices with no good regions (box too small to be useful).
        if max_row - min_row < 5 or max_col - min_col < 5:
            continue

        # Crop the image down to the bounding box for all regions.
        img = img[min_row:max_row, min_col:max_col]

        # Centre on zero and scale by the value range so the data lies within
        # [-1, 1] for the resize function. Local names are used here so the
        # builtins `min` and `max` are not shadowed for the rest of the notebook.
        img = img - np.mean(img)
        value_range = np.max(img) - np.min(img)
        if value_range == 0:
            # A flat crop carries no information and cannot be rescaled.
            continue
        img = img / value_range

        new_img = resize(img,[512,512])
        new_node_mask = resize(node_mask[min_row:max_row,min_col:max_col],[512,512])
        out_images.append(new_img)
        out_nodemasks.append(new_node_mask)

print(len(out_nodemasks))


In [None]:

def split_dataset(out_images, out_nodemasks, working_path, test_ratio=0.2, seed=None):
    """
    Shuffles the image / nodemask pairs, splits them into training and testing
    sets, and saves the four resulting arrays as .npy files in ``working_path``
    (trainImages.npy, trainMasks.npy, testImages.npy, testMasks.npy).

    Images and nodemasks are shuffled with the same permutation, so entry k of
    an images file always corresponds to entry k of the matching masks file.

    Args:
        out_images (sequence of arrays or numpy array): input images, all of the same shape
        out_nodemasks (sequence of arrays or numpy array): nodemasks, one per image
        working_path (str): directory path where the numpy arrays will be saved
        test_ratio (float): fraction of the dataset to be used for testing, in [0, 1]
        seed (int or None): seed for a reproducible shuffle; None uses NumPy's
            global random state

    Returns:
        None

    Raises:
        ValueError: if the number of images and nodemasks differ, or if
            test_ratio is outside [0, 1].
    """
    # Lists cannot be indexed with an index array, so stack them first.
    images = np.asarray(out_images)
    nodemasks = np.asarray(out_nodemasks)

    num_images = len(images)
    if len(nodemasks) != num_images:
        raise ValueError(
            "out_images and out_nodemasks must have the same length "
            "(got %d and %d)" % (num_images, len(nodemasks))
        )
    if not 0.0 <= test_ratio <= 1.0:
        raise ValueError("test_ratio must be between 0 and 1, got %r" % (test_ratio,))

    rng = np.random if seed is None else np.random.RandomState(seed)
    rand_i = rng.permutation(num_images)
    test_i = int(test_ratio * num_images)   # first test_i shuffled entries form the test set

    np.save(os.path.join(working_path, "trainImages.npy"), images[rand_i[test_i:]])
    np.save(os.path.join(working_path, "trainMasks.npy"), nodemasks[rand_i[test_i:]])
    np.save(os.path.join(working_path, "testImages.npy"), images[rand_i[:test_i]])
    np.save(os.path.join(working_path, "testMasks.npy"), nodemasks[rand_i[:test_i]])

In [None]:
# Split the processed images and node masks into train/test sets and save them
# under working_path (test_ratio is left at its default of 0.2).
split_dataset(out_images,out_nodemasks,working_path)