In [1]:
import os
import cv2
import numpy as np
import pandas as pd
from PIL import Image
from glob import glob
from tqdm import tqdm
import SimpleITK as sitk
from skimage import measure
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

def get_filename(file_list, file):
    """Return the first entry of ``file_list`` that contains ``file`` as a substring.

    Entries are checked in order; ``None`` is returned when nothing matches.
    """
    return next((path for path in file_list if file in path), None)

def load_mhd(file):
    """Read an image with SimpleITK and return its voxel array plus geometry.

    Returns a 3-tuple ``(ct_scan, origin, space)``: the array produced by
    ``sitk.GetArrayFromImage`` and the image's ``GetOrigin()`` and
    ``GetSpacing()`` values, each converted to a NumPy array.
    """
    image = sitk.ReadImage(file)
    voxels = sitk.GetArrayFromImage(image)
    origin, space = (
        np.array(list(values))
        for values in (image.GetOrigin(), image.GetSpacing())
    )
    return voxels, origin, space

In [None]:
!wget https://zenodo.org/records/4121926/files/subset9.zip?download=1
!unzip /kaggle/working/subset9.zip?download=1
!rm /kaggle/working/subset9.zip?download=1

In [4]:
# Root directories: subsets 0-5 and 6-8 live in two directories under /kaggle/input,
# subset 9 under /kaggle/working.
subset0_5_path = '/kaggle/input/luna16/'
subset6_8_path = '/kaggle/input/d/wuwu55/luna16/'
subset9_path = '/kaggle/working/subset9'

# One glob pattern per subset, in subset order (0..9).
mhd_patterns = (
    [f"{subset0_5_path}subset{subset}/subset{subset}/*.mhd" for subset in range(0, 6)]
    + [f"{subset6_8_path}subset{subset}/subset{subset}/*.mhd" for subset in range(6, 9)]
    + [f"{subset9_path}/*.mhd"]
)

# Flat list of every .mhd path matched by the patterns above.
file_list = [mhd_path for pattern in mhd_patterns for mhd_path in glob(pattern)]


In [5]:
# Load the candidate table and keep only rows labelled class 0.
candidates_negative = pd.read_csv('/kaggle/input/luna16/candidates.csv')
candidates_negative = candidates_negative[candidates_negative['class'] == 0]

# Resolve each distinct seriesuid to its .mhd path once; looking it up per row
# would rescan file_list for every candidate.
candidate_uid_to_path = {
    uid: get_filename(file_list, uid)
    for uid in candidates_negative['seriesuid'].unique()
}

candidates_df = (
    candidates_negative
    # assign() returns a new frame, so nothing is written into the filtered slice
    # (avoids SettingWithCopyWarning).
    .assign(filename=candidates_negative['seriesuid'].map(candidate_uid_to_path))
    # Series whose file is not in file_list map to None and are dropped here.
    .dropna()
)

In [6]:
# Annotations with a diameter below this value (mm) are excluded.
MIN_NODULE_DIAMETER_MM = 3.9

annotations_raw = pd.read_csv(f"{subset0_5_path}/annotations.csv")
annotations_df = (
    annotations_raw[annotations_raw['diameter_mm'] >= MIN_NODULE_DIAMETER_MM]
    # assign() returns a new frame, so nothing is written into the filtered slice
    # (avoids SettingWithCopyWarning).
    .assign(filename=lambda frame: frame["seriesuid"].map(lambda uid: get_filename(file_list, uid)))
    # Series whose file is not in file_list get filename None and are dropped here.
    .dropna()
)

In [60]:
# Directory, relative to the current working directory, that receives the saved .npy crops.
OUTPUT_FOLDER = 'Label0_cropped_100x100_nodules'
# exist_ok=True lets this cell be re-run when the folder is already present.
os.makedirs(name=OUTPUT_FOLDER, exist_ok=True)

In [61]:
crop_size = 100      # side length, in pixels, of each square crop
samples_per_ct = 5   # number of class-0 candidates exported per CT scan
i = 0                # running counter used to number the output files

for each_ct in tqdm(annotations_df['filename'].unique()):
    # Load the CT volume with its origin and voxel spacing.
    ct, origin, space = load_mhd(each_ct)

    # Annotation centres for this scan, converted to voxel coordinates.
    # NOTE(review): the conversion uses origin and spacing only; the image
    # direction matrix is ignored — confirm that is valid for every scan.
    annotations = annotations_df[annotations_df['filename'] == each_ct][['coordX', 'coordY', 'coordZ']]
    annotations = (annotations - origin) / space
    # Slice index of every annotation, plus the slice directly before and after it.
    nodule_slices = annotations['coordZ'].round().astype(int).values
    anno_indices = [z + dz for z in nodule_slices for dz in (-1, 0, 1)]

    # Candidates for this scan, converted to voxel coordinates.
    list_of_candidates = candidates_df[candidates_df['filename'] == each_ct]
    list_of_candidates = (list_of_candidates[['coordX', 'coordY', 'coordZ']] - origin) / space
    # Keep candidates inside the central in-plane window.
    # NOTE(review): presumably chosen so a 100 px crop stays inside a 512x512
    # slice — confirm the slice size.
    list_of_candidates = list_of_candidates.query('100 < coordX < 400 and 100 < coordY < 400')
    # Round Z to a slice index; assign() returns a new frame instead of writing
    # into the slice produced by query().
    list_of_candidates = list_of_candidates.assign(coordZ=list_of_candidates['coordZ'].round())
    # Drop candidates whose slice index falls outside the volume: a negative
    # index would silently wrap around and a too-large one would raise IndexError.
    inside_volume = (list_of_candidates['coordZ'] >= 0) & (list_of_candidates['coordZ'] < ct.shape[0])
    list_of_candidates = list_of_candidates[inside_volume]
    if list_of_candidates.empty:
        continue

    # Prefer candidates on a slice where an annotation is visible; fall back to
    # all central candidates when fewer than samples_per_ct of those exist.
    filtered_candidates = list_of_candidates[list_of_candidates['coordZ'].isin(anno_indices)]
    if len(filtered_candidates) < samples_per_ct:
        sampling_pool = list_of_candidates
    else:
        sampling_pool = filtered_candidates
    # Cap n at the pool size: sample() raises ValueError when asked for more
    # rows than exist (without replacement).
    random_selection = sampling_pool.sample(
        n=min(samples_per_ct, len(sampling_pool)), random_state=1
    )

    for _, row in random_selection.iterrows():
        # Top-left corner of the crop, clamped at the image border.
        x_start = max(0, int(row['coordX'] - crop_size / 2))
        y_start = max(0, int(row['coordY'] - crop_size / 2))
        x_end = x_start + crop_size
        y_end = y_start + crop_size
        cropped_image = ct[int(row['coordZ'])][y_start:y_end, x_start:x_end]
        # Min-max rescale each crop independently to the 0..255 range.
        cropped_image = cv2.normalize(cropped_image, None, 0, 255, norm_type=cv2.NORM_MINMAX)
        np.save(f'{OUTPUT_FOLDER}/non_nodule_{i}.npy', cropped_image)
        i += 1

100%|██████████| 593/593 [10:36<00:00,  1.07s/it]


In [None]:
# Recursively zip the folder of saved crops; the folder name matches OUTPUT_FOLDER
# and is addressed by its absolute path under /kaggle/working.
!zip -r Label0_cropped_100x100_nodules.zip /kaggle/working/Label0_cropped_100x100_nodules