In [235]:
import glob
import os
import re
import pandas as pd
import numpy as np
import SimpleITK as sitk
import matplotlib.pyplot as plt

import skimage.transform
import scipy.ndimage
from skimage.morphology import ball, disk, dilation, binary_erosion, remove_small_objects, erosion, closing, reconstruction, binary_closing
from skimage.measure import label,regionprops, perimeter
from skimage.morphology import binary_dilation, binary_opening
from skimage.filters import roberts, sobel
from skimage import measure, feature
from skimage.segmentation import clear_border
from skimage import data

import scipy.misc
from mpl_toolkits.mplot3d.art3d import Poly3DCollection

# Root folder of the original scans; the notebook globs `subset*/<seriesuid>.mhd` below it.
DATA_PATH = '/kaggle_2/luna/luna16/data/original_lungs/'
# Folder that receives the per-patient `<seriesuid>_X.npy` / `<seriesuid>_Y.npy` arrays.
OUTPUT_FOLDER = '/kaggle_2/luna/luna16/data/pre_processed_chunks/'
# CSV tables read with pandas: annotations, excluded annotations and candidates.
ANNOTATIONS_PATH = '/kaggle_2/luna/luna16/data/original_lungs/csv/annotations.csv'
ANNOTATIONS_EXCLUDED_PATH = '/kaggle_2/luna/luna16/data/original_lungs/csv/annotations_excluded.csv'
CANDIDATES_PATH = '/kaggle_2/luna/luna16/data/original_lungs/csv/candidates.csv'
# Edge length, in voxels of the resampled scan, of the cube cut out around each annotation.
CHUNK_SIZE = 64
# Number of class-0 rows drawn at random from the candidates table.
CANDIDATES_SIZE = 10000
# Length of the one-hot label vector; get_label() yields the classes 0..6.
NUM_CLASSES = 7

In [208]:
# Load the three CSV tables that describe the scans.
annotations = pd.read_csv(ANNOTATIONS_PATH)
annotations_excluded = pd.read_csv(ANNOTATIONS_EXCLUDED_PATH)
candidates = pd.read_csv(CANDIDATES_PATH)

# Draw a fixed-size random subset of the class-0 candidates.
# The draw is seeded so that a restarted kernel selects the same rows; the
# processing loop skips patients whose output already exists, so an unseeded
# re-draw would leave old and new outputs built from different samples.
SAMPLING_SEED = 42
negative_candidates = candidates[candidates['class'] == 0]
candidates_sampled = negative_candidates.sample(
    # Never ask for more rows than exist (sample() would raise otherwise).
    n=min(CANDIDATES_SIZE, len(negative_candidates)),
    random_state=SAMPLING_SEED,
)
# Show a preview only instead of rendering the whole sample.
candidates_sampled.head(10)

Unnamed: 0,seriesuid,coordX,coordY,coordZ,class
262934,1.3.6.1.4.1.14519.5.2.1.6279.6001.244447966386...,-115.467000,192.111835,-438.928473,0
192180,1.3.6.1.4.1.14519.5.2.1.6279.6001.203179378754...,-62.330343,-4.349188,-146.953165,0
28538,1.3.6.1.4.1.14519.5.2.1.6279.6001.116492508532...,-51.497966,9.673401,-152.873565,0
85269,1.3.6.1.4.1.14519.5.2.1.6279.6001.145283812746...,46.066819,-14.695406,-198.011389,0
520031,1.3.6.1.4.1.14519.5.2.1.6279.6001.837810280808...,-129.044134,43.836266,-245.591334,0
376899,1.3.6.1.4.1.14519.5.2.1.6279.6001.312704771348...,39.423554,30.046510,-84.098177,0
51490,1.3.6.1.4.1.14519.5.2.1.6279.6001.128059192202...,-30.507234,34.870097,-133.341464,0
461902,1.3.6.1.4.1.14519.5.2.1.6279.6001.513023675145...,-120.825428,8.921853,-200.477510,0
235075,1.3.6.1.4.1.14519.5.2.1.6279.6001.227796349777...,-22.710000,-126.980000,-413.490000,0
373051,1.3.6.1.4.1.14519.5.2.1.6279.6001.310548927038...,79.443844,5.875151,-112.677778,0


In [209]:
def createImageList(subset, cads):
    """Return the .mhd paths of one subset whose series UID appears in ``cads``.

    Parameters
    ----------
    subset
        Value formatted into the folder name ``subset{}`` below ``DATA_PATH``.
    cads : pandas.DataFrame
        Table with a ``seriesuid`` column.

    Returns
    -------
    list of str
        Paths, in glob order, whose file name without the ``.mhd`` extension
        occurs in ``cads['seriesuid']``.
    """
    subsetDir = DATA_PATH + 'subset{}'.format(subset)
    imagePaths = glob.glob("{}/*.mhd".format(subsetDir))

    # Build the lookup once: a set membership test per image replaces a full
    # scan of the DataFrame per image.
    knownSeriesUids = set(cads['seriesuid'])

    return [
        imagePath
        for imagePath in imagePaths
        # splitext removes only the trailing extension, never an inner '.mhd'.
        if os.path.splitext(os.path.basename(imagePath))[0] in knownSeriesUids
    ]

def load_itk(filename):
    """Read a scan with SimpleITK and return its voxels, origin and spacing.

    Parameters
    ----------
    filename : str
        Path handed to ``sitk.ReadImage``.

    Returns
    -------
    tuple
        ``(ct_scan, origin, spacing)``: the voxel array from
        ``sitk.GetArrayFromImage`` plus the image origin and spacing as numpy
        arrays.  Origin and spacing are reversed relative to what SimpleITK
        reports so that they line up with the array axes (z, y, x according to
        the original author's note).
    """
    image = sitk.ReadImage(filename)

    # Voxel data as a numpy array.
    voxels = sitk.GetArrayFromImage(image)

    # Origin and spacing are needed to translate world coordinates to voxel
    # indices; flip them into the same axis order as the voxel array.
    origin_reversed = np.array(image.GetOrigin()[::-1])
    spacing_reversed = np.array(image.GetSpacing()[::-1])

    return voxels, origin_reversed, spacing_reversed

In [233]:
def get_label(diameter_mm):
    """Map a diameter to a class index in ``0..6``.

    Parameters
    ----------
    diameter_mm : int or float
        Diameter value.  Negative values (the data uses ``-1``, presumably as
        the "no diameter" sentinel -- confirm against the CSV) map to class 0.

    Returns
    -------
    int
        ``0`` for negative diameters, ``int(diameter_mm / 3) + 1`` for
        ``0 <= diameter_mm <= 15`` and ``6`` for anything larger.

    Raises
    ------
    ValueError
        If ``diameter_mm`` is NaN.
    """
    if np.isnan(diameter_mm):
        raise ValueError('diameter_mm is NaN; cannot assign a class')
    # Every negative value is "no diameter".  The previous int()-based check
    # truncated toward zero, so values such as -0.5 or -3 fell through all
    # branches and produced None.
    if diameter_mm < 0:
        return 0
    if diameter_mm > 15.0:
        return 6
    return int(diameter_mm / 3.0) + 1

# Stack the excluded and regular annotations and turn each diameter into a class.
all_annotations = pd.concat([annotations_excluded, annotations], ignore_index=True)
all_annotations['class'] = all_annotations.diameter_mm.apply(get_label)
# Pass `axis` by keyword: the positional form drop(label, 1) was deprecated in
# pandas 1.3 and removed in pandas 2.0.
all_annotations = all_annotations.drop('diameter_mm', axis=1)

# Add the sampled class-0 candidates to obtain the full training table.
train_annotations = pd.concat([all_annotations, candidates_sampled], ignore_index=True)

array([0, 2, 6, 3, 4, 5, 1])

In [214]:
def world_2_voxel(world_coordinates, origin, spacing):
    """Convert world coordinates to (fractional) voxel coordinates.

    The absolute offset between ``world_coordinates`` and ``origin`` is divided
    element-wise by ``spacing``; all three arguments must use the same axis
    order.
    """
    offset_from_origin = np.absolute(world_coordinates - origin)
    return offset_from_origin / spacing

In [None]:
# Value written to the parts of a chunk that fall outside the scan
# (presumably air in Hounsfield units -- confirm).
PAD_VALUE = -1000
# Target voxel spacing per axis; the scan is resampled to 1mm x 1mm x 1mm.
RESIZE_SPACING = np.array([1.0, 1.0, 1.0])


def chunk_slices(center_zyx, volume_shape, size):
    """Return (source, destination) slices for a size^3 cube around a voxel.

    ``source`` selects the part of the cube that lies inside a volume of shape
    ``volume_shape``; ``destination`` selects where that part belongs inside
    the cube, so data clipped at any border keeps the centre voxel at index
    ``size // 2``.  Returns ``None`` when the cube does not overlap the volume.
    """
    half = size // 2
    starts = np.floor(center_zyx).astype(int) - half
    shape = np.asarray(volume_shape)
    src_lo = np.clip(starts, 0, shape)
    src_hi = np.clip(starts + size, 0, shape)
    if np.any(src_hi <= src_lo):
        return None
    dst_lo = src_lo - starts
    dst_hi = dst_lo + (src_hi - src_lo)
    source = tuple(slice(int(lo), int(hi)) for lo, hi in zip(src_lo, src_hi))
    destination = tuple(slice(int(lo), int(hi)) for lo, hi in zip(dst_lo, dst_hi))
    return source, destination


patient_uids = train_annotations.seriesuid.unique()

# Collect the patients that already have an output file so a restarted run
# can resume where it stopped.
processed_name = re.compile(r'([0-9.]+)_X\.npy$')
patients_processed = set()
for filename in glob.glob(OUTPUT_FOLDER + '*_X.npy'):
    m = processed_name.match(os.path.basename(filename))
    if m:  # ignore files that do not follow the <seriesuid>_X.npy naming
        patients_processed.add(m.group(1))

for patient_uid in patient_uids:
    if patient_uid in patients_processed:
        print('Skipping already processed patient {}'.format(patient_uid))
        continue
    print('Processing patient {}'.format(patient_uid))

    patient_annotations = train_annotations[train_annotations.seriesuid == patient_uid]
    scan_paths = glob.glob(DATA_PATH + 'subset?/{}.mhd'.format(patient_uid))
    if not scan_paths:
        # No output is written, so the patient is retried on the next run.
        print('No scan found for patient {}, skipping'.format(patient_uid))
        continue
    img, origin, spacing = load_itk(scan_paths[0])

    # The resampled shape must be integral, so round it and derive the zoom
    # factor (and the spacing it really produces) from the rounded shape.
    resize_factor = spacing / RESIZE_SPACING
    new_shape = np.round(img.shape * resize_factor)
    real_resize = new_shape / img.shape
    new_spacing = spacing / real_resize

    # Resize the image to (approximately) 1mm x 1mm x 1mm spacing.
    lung_img = scipy.ndimage.zoom(img, real_resize)

    num_annotations = patient_annotations.shape[0]
    # X starts out fully padded; only the voxels covered by the scan are overwritten.
    X = np.full((num_annotations, CHUNK_SIZE, CHUNK_SIZE, CHUNK_SIZE), PAD_VALUE, dtype=np.int16)
    Y = np.zeros((num_annotations, NUM_CLASSES), dtype=np.int16)

    # Select the columns by name so the loop does not depend on column order.
    rows = patient_annotations[['coordX', 'coordY', 'coordZ', 'class']].itertuples(index=False)
    for row, (coordX, coordY, coordZ, y) in enumerate(rows):
        # World -> voxel coordinates in the resampled image, axis order z, y, x.
        imageCoord = world_2_voxel(np.array((coordZ, coordY, coordX)), origin, new_spacing)

        slices = chunk_slices(imageCoord, lung_img.shape, CHUNK_SIZE)
        if slices is not None:
            source, destination = slices
            X[(row,) + destination] = lung_img[source]

        # One-hot encode the class.
        Y[row] = (np.arange(NUM_CLASSES) == y) + 0

    # Save Y first: the *_X.npy file is the "already processed" marker, so it
    # must only appear once the labels are on disk.
    np.save(OUTPUT_FOLDER + patient_uid + '_Y.npy', Y)
    np.save(OUTPUT_FOLDER + patient_uid + '_X.npy', X)

Skipping already processed patient 1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222365663678666836860
Processing patient 1.3.6.1.4.1.14519.5.2.1.6279.6001.100332161840553388986847034053
