In [2]:
import glob
import os
import re
import pandas as pd
import numpy as np
import SimpleITK as sitk
import matplotlib.pyplot as plt

import skimage.transform
import scipy.ndimage
from skimage.morphology import ball, disk, dilation, binary_erosion, remove_small_objects, erosion, closing, reconstruction, binary_closing
from skimage.measure import label,regionprops, perimeter
from skimage.morphology import binary_dilation, binary_opening
from skimage.filters import roberts, sobel
from skimage import measure, feature
from skimage.segmentation import clear_border
from skimage import data

import scipy.misc
from mpl_toolkits.mplot3d.art3d import Poly3DCollection

# Root folder of the raw scans; the processing loop globs 'subset?/<uid>.mhd'
# underneath it, and the CSV tables sit in its 'csv/' sub-folder.
DATA_PATH = '/kaggle_2/luna/luna16/data/original_lungs/'
_CSV_DIR = DATA_PATH + 'csv/'
ANNOTATIONS_PATH = _CSV_DIR + 'annotations.csv'
ANNOTATIONS_EXCLUDED_PATH = _CSV_DIR + 'annotations_excluded.csv'
CANDIDATES_PATH = _CSV_DIR + 'candidates.csv'

# Where the per-patient '<uid>_X.npy' / '<uid>_Y.npy' arrays are written.
OUTPUT_FOLDER = '/kaggle_2/luna/luna16/data/pre_processed_chunks_segmented/'

# Edge length (in voxels) of the cube cut around every annotation.
CHUNK_SIZE = 64
# Number of class-0 rows randomly drawn from the candidates table.
CANDIDATES_SIZE = 10000
# Width of the one-hot label vector (get_label produces classes 0..6).
NUM_CLASSES = 7

In [3]:
# Load the three annotation tables.
annotations = pd.read_csv(ANNOTATIONS_PATH)
annotations_excluded = pd.read_csv(ANNOTATIONS_EXCLUDED_PATH)
candidates = pd.read_csv(CANDIDATES_PATH)

# Draw the negative (class 0) candidates with a fixed seed so that a
# Restart & Run All picks the same rows every time. This matters because the
# processing loop skips patients whose '_X.npy' already exists: with an
# unseeded sample, a resumed run would mix chunks from different draws.
RANDOM_SEED = 42
negative_candidates = candidates[candidates['class'] == 0]
candidates_sampled = negative_candidates.sample(
    # Never ask for more rows than exist (sample() would raise).
    min(CANDIDATES_SIZE, len(negative_candidates)),
    random_state=RANDOM_SEED,
)
# Show a preview only instead of dumping the whole frame.
candidates_sampled.head(10)

Unnamed: 0,seriesuid,coordX,coordY,coordZ,class
134798,1.3.6.1.4.1.14519.5.2.1.6279.6001.174449669706...,-85.416904,-46.239316,-198.198174,0
19829,1.3.6.1.4.1.14519.5.2.1.6279.6001.111172165674...,-79.650000,122.600000,-198.770000,0
77775,1.3.6.1.4.1.14519.5.2.1.6279.6001.141345499716...,-125.277095,35.802345,-181.915714,0
199450,1.3.6.1.4.1.14519.5.2.1.6279.6001.206097113343...,-60.211014,-70.732742,-206.832101,0
437597,1.3.6.1.4.1.14519.5.2.1.6279.6001.416701701108...,49.678844,44.125300,-61.016406,0
517475,1.3.6.1.4.1.14519.5.2.1.6279.6001.822128649427...,-85.290000,-36.010000,-178.170000,0
281143,1.3.6.1.4.1.14519.5.2.1.6279.6001.254254303842...,-59.792482,-102.312889,1455.333333,0
211511,1.3.6.1.4.1.14519.5.2.1.6279.6001.215785045378...,-47.460000,219.730000,-625.550000,0
44913,1.3.6.1.4.1.14519.5.2.1.6279.6001.124663713663...,50.450000,36.970000,-97.210000,0
428782,1.3.6.1.4.1.14519.5.2.1.6279.6001.387954549120...,-32.324834,-108.451841,-242.652213,0


In [4]:
def load_itk(filename):
    """Read a scan with SimpleITK and return it together with its geometry.

    Parameters
    ----------
    filename : path of the image file handed to ``sitk.ReadImage``.

    Returns
    -------
    (ct_scan, origin, spacing)
        ``ct_scan`` is the numpy array produced by ``sitk.GetArrayFromImage``;
        ``origin`` and ``spacing`` are numpy arrays holding the image's origin
        and spacing in reversed component order, so that they line up with the
        array's axes (z, y, x per the original author's note).
    """
    def _reversed_array(components):
        # SimpleITK hands back a tuple; flip it to match the array axis order.
        return np.array(components[::-1])

    itkimage = sitk.ReadImage(filename)
    ct_scan = sitk.GetArrayFromImage(itkimage)

    # Origin is needed to convert between world and voxel coordinates.
    origin = _reversed_array(itkimage.GetOrigin())
    # Spacing along each dimension.
    spacing = _reversed_array(itkimage.GetSpacing())

    return ct_scan, origin, spacing

In [5]:
def get_label(diameter_mm):
    """Map a diameter in millimetres to an integer class label.

    * a value whose integer part is -1 (the "no diameter" marker) -> 0
    * 0 <= diameter <= 15 -> ``int(diameter / 3) + 1`` (3 mm wide bins, 1..6)
    * diameter > 15 -> 6
    * any other value falls through and yields ``None``
    """
    # The sentinel check comes first, exactly as before.
    if int(diameter_mm) == -1:
        return 0
    if diameter_mm > 15.0:
        return 6
    if 0.0 <= diameter_mm <= 15.0:
        return int(diameter_mm / 3.0) + 1
    # Remaining negative inputs are not mapped to any class.
    return None

# Stack the excluded and the regular annotations, then turn each diameter
# into a class label via get_label.
all_annotations = pd.concat([annotations_excluded, annotations], ignore_index=True)
all_annotations['class'] = all_annotations.diameter_mm.apply(get_label)
# Pass the axis by keyword: the positional form drop(label, 1) was deprecated
# in pandas 1.3 and no longer works in pandas 2.x.
all_annotations = all_annotations.drop('diameter_mm', axis=1)

# Final training table: labelled annotations plus the sampled class-0 candidates.
train_annotations = pd.concat([all_annotations, candidates_sampled], ignore_index=True)

In [10]:
# Number of training rows per class label.
class_counts = train_annotations['class'].value_counts()
class_counts

0    40513
2     2876
3     1511
4      587
6      528
5      315
1       48
Name: class, dtype: int64

In [5]:
def world_2_voxel(world_coordinates, origin, spacing):
    """Convert world coordinates to (fractional) voxel coordinates.

    The absolute distance of ``world_coordinates`` from ``origin`` is divided
    element-wise by ``spacing``. All three arguments must be given in the same
    axis order.
    """
    return np.absolute(world_coordinates - origin) / spacing

In [6]:
def largest_label_volume(im, bg=-1):
    """Return the most frequent value in ``im``, ignoring the value ``bg``.

    Returns ``None`` when ``im`` holds nothing but ``bg``. On a tie the
    smallest value wins (``np.unique`` sorts, ``argmax`` takes the first hit).
    """
    labels, sizes = np.unique(im, return_counts=True)

    # Drop the background entry from both arrays in one go.
    foreground = labels != bg
    labels, sizes = labels[foreground], sizes[foreground]

    if sizes.size == 0:
        return None
    return labels[sizes.argmax()]

def segment_lung_mask(image, fill_lung_structures=True):
    """Build a binary lung mask (int8, lungs == 1) from a 3-D scan.

    Steps:
      1. Threshold at -320 (presumably Hounsfield units -- confirm): voxels
         above the threshold are encoded as 2, everything else as 1. The
         values are 1/2 rather than 0/1 so that ``measure.label`` does not
         treat the low-intensity class as background.
      2. The connected component containing voxel ``[0, 0, 0]`` is taken to be
         the air around the patient and is re-encoded as 2.
      3. If ``fill_lung_structures`` is true, every slice along the first axis
         keeps only its largest labelled structure as 2; everything else in
         the slice becomes 1 (fills structures inside the lungs).
      4. The encoding is turned into 0/1 with lungs == 1, and only the largest
         connected component of ones is kept (drops other air pockets).

    Parameters
    ----------
    image : 3-D array, indexed slice-first.
    fill_lung_structures : run step 3 when true.

    Returns
    -------
    int8 array of the same shape as ``image`` containing 0 and 1.
    """
    # Step 1: 1 = at/below threshold, 2 = above threshold.
    binary_image = np.array(image > -320, dtype=np.int8) + 1
    labels = measure.label(binary_image)

    # Step 2: the corner voxel decides which label is the surrounding air.
    #   Possible improvement (from the original author): sample several
    #   background labels around the patient, which is more resistant to a
    #   tray under the patient splitting the surrounding air in two.
    background_label = labels[0, 0, 0]
    binary_image[labels == background_label] = 2

    # Step 3: per-slice fill; the original author considers this superior to
    # something like morphological closing.
    if fill_lung_structures:
        for slice_index in range(len(binary_image)):
            slice_labels = measure.label(binary_image[slice_index] - 1)
            dominant = largest_label_volume(slice_labels, bg=0)

            # None means the slice has no labelled structure at all.
            if dominant is not None:
                binary_image[slice_index][slice_labels != dominant] = 1

    # Step 4: {1, 2} -> {1, 0}, i.e. real binary with lungs == 1.
    binary_image = 2 - binary_image

    # Remove other air pockets inside the body: keep the largest component.
    labels = measure.label(binary_image, background=0)
    dominant = largest_label_volume(labels, bg=0)
    if dominant is not None:
        binary_image[labels != dominant] = 0

    return binary_image

In [None]:
def _extract_centered_chunk(volume, center, size, fill_value):
    """Cut a ``size``^3 cube out of ``volume`` centred on ``center``.

    ``center`` holds voxel coordinates in the same axis order as ``volume``.
    Parts of the cube that fall outside the volume keep ``fill_value``. The
    in-bounds part is written at its matching offset inside the cube, so the
    requested centre stays in the middle even when the cube is clipped at the
    low edge of an axis. A cube lying completely outside the volume is
    returned fully padded instead of raising.
    """
    chunk = np.full((size, size, size), fill_value)
    src, dst = [], []
    for axis in range(3):
        start = int(np.floor(center[axis] - size / 2))
        stop = start + size
        lo = max(start, 0)
        hi = min(stop, volume.shape[axis])
        if hi <= lo:
            # No overlap with the volume along this axis: nothing to copy.
            return chunk
        src.append(slice(lo, hi))
        dst.append(slice(lo - start, hi - start))
    chunk[tuple(dst)] = volume[tuple(src)]
    return chunk


# Target voxel spacing after resampling (1 x 1 x 1).
RESIZE_SPACING = np.array([1.0, 1.0, 1.0])
# Value used for the part of a chunk that lies outside the scan.
# NOTE(review): the chunks are cut from the binary mask returned by
# segment_lung_mask (values 0/1), so -1000.0 padding looks inconsistent --
# confirm whether 0 (or masking the original intensities) was intended.
CHUNK_FILL_VALUE = -1000.0

patient_uids = train_annotations.seriesuid.unique()

# Patients whose '<uid>_X.npy' already exists are skipped, so the loop can be
# resumed after an interruption.
processed_name = re.compile(r'([0-9.]*)_X\.npy')
patients_processed = set()
for filename in glob.glob(os.path.join(OUTPUT_FOLDER, '[0-9]*_X.npy')):
    m = processed_name.fullmatch(os.path.basename(filename))
    if m is not None:  # ignore stray files that merely look similar
        patients_processed.add(m.group(1))

for patient_uid in patient_uids:
    if patient_uid in patients_processed:
        print('Skipping already processed patient {}'.format(patient_uid))
        continue
    print('Processing patient {}'.format(patient_uid))

    patient_annotations = train_annotations[train_annotations.seriesuid == patient_uid]
    scan_paths = glob.glob(os.path.join(DATA_PATH, 'subset?', '{}.mhd'.format(patient_uid)))
    if not scan_paths:
        # Fail with a readable message instead of "list index out of range".
        raise IOError('No .mhd scan found for patient {} under {}'.format(patient_uid, DATA_PATH))
    img, origin, spacing = load_itk(scan_paths[0])

    # Resize factor: the target shape must be integral, so the spacing that is
    # actually achieved (new_spacing) differs slightly from RESIZE_SPACING.
    resize_factor = spacing / RESIZE_SPACING
    new_shape = np.round(img.shape * resize_factor)
    real_resize = new_shape / img.shape
    new_spacing = spacing / real_resize

    # Resample to ~1mm x 1mm x 1mm. scipy.ndimage.zoom is the public name;
    # the scipy.ndimage.interpolation namespace is deprecated.
    lung_img = scipy.ndimage.zoom(img, real_resize)

    # Segment: from here on lung_img is the binary lung mask.
    lung_img = segment_lung_mask(lung_img, True)

    num_samples = patient_annotations.shape[0]
    X = np.empty((num_samples, CHUNK_SIZE, CHUNK_SIZE, CHUNK_SIZE), dtype=np.int16)
    Y = np.empty((num_samples, NUM_CLASSES), dtype=np.int16)
    for count, annotation in enumerate(patient_annotations.itertuples()):
        # Tuple layout: (Index, seriesuid, coordX, coordY, coordZ, class).
        # 'class' is a Python keyword, so it is read by position.
        y = annotation[5]
        world_zyx = np.array((annotation[4], annotation[3], annotation[2]))

        # World -> voxel coordinates in the resampled volume.
        voxel_zyx = world_2_voxel(world_zyx, origin, new_spacing)

        X[count] = _extract_centered_chunk(lung_img, voxel_zyx, CHUNK_SIZE, CHUNK_FILL_VALUE)
        # One-hot encode the class label.
        Y[count] = (np.arange(NUM_CLASSES) == y) + 0

    # Write the labels first: the resume check above keys on '_X.npy', so an
    # interruption between the two saves must not leave X without its Y.
    np.save(os.path.join(OUTPUT_FOLDER, patient_uid + '_Y.npy'), Y)
    np.save(os.path.join(OUTPUT_FOLDER, patient_uid + '_X.npy'), X)

Processing patient 1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222365663678666836860
Processing patient 1.3.6.1.4.1.14519.5.2.1.6279.6001.100332161840553388986847034053
Processing patient 1.3.6.1.4.1.14519.5.2.1.6279.6001.100398138793540579077826395208
Processing patient 1.3.6.1.4.1.14519.5.2.1.6279.6001.100530488926682752765845212286
Processing patient 1.3.6.1.4.1.14519.5.2.1.6279.6001.100620385482151095585000946543
Processing patient 1.3.6.1.4.1.14519.5.2.1.6279.6001.100621383016233746780170740405
Processing patient 1.3.6.1.4.1.14519.5.2.1.6279.6001.100684836163890911914061745866
Processing patient 1.3.6.1.4.1.14519.5.2.1.6279.6001.100953483028192176989979435275
Processing patient 1.3.6.1.4.1.14519.5.2.1.6279.6001.101228986346984399347858840086
Processing patient 1.3.6.1.4.1.14519.5.2.1.6279.6001.102133688497886810253331438797
Processing patient 1.3.6.1.4.1.14519.5.2.1.6279.6001.102681962408431413578140925249
Processing patient 1.3.6.1.4.1.14519.5.2.1.6279.6001.10311520171407599357978



Processing patient 1.3.6.1.4.1.14519.5.2.1.6279.6001.120196332569034738680965284519
Processing patient 1.3.6.1.4.1.14519.5.2.1.6279.6001.120842785645314664964010792308
Processing patient 1.3.6.1.4.1.14519.5.2.1.6279.6001.121108220866971173712229588402
Processing patient 1.3.6.1.4.1.14519.5.2.1.6279.6001.121391737347333465796214915391
Processing patient 1.3.6.1.4.1.14519.5.2.1.6279.6001.121805476976020513950614465787
Processing patient 1.3.6.1.4.1.14519.5.2.1.6279.6001.121824995088859376862458155637
Processing patient 1.3.6.1.4.1.14519.5.2.1.6279.6001.121993590721161347818774929286
Processing patient 1.3.6.1.4.1.14519.5.2.1.6279.6001.122621219961396951727742490470
Processing patient 1.3.6.1.4.1.14519.5.2.1.6279.6001.122763913896761494371822656720
Processing patient 1.3.6.1.4.1.14519.5.2.1.6279.6001.122914038048856168343065566972
Processing patient 1.3.6.1.4.1.14519.5.2.1.6279.6001.123654356399290048011621921476
Processing patient 1.3.6.1.4.1.14519.5.2.1.6279.6001.12369763745143752206594