In [1]:
import os
import cv2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from skimage import measure
import SimpleITK as stk
from glob import glob
from tqdm import tqdm

In [8]:
# Input: folder that holds the subset<N>/ directories with the .mhd scans.
root = "Luna16/Dataset/"
# Output: folder under which the extracted candidate patches are written.
target_root = "Luna16/FPR_Processed/"

In [9]:
# Index of the subset folder processed by this run of the notebook.
subset = 6
# Collect every .mhd header inside the chosen subset folder.
mhd_pattern = f"{root}subset{subset}/*.mhd"
file_list = glob(mhd_pattern)
print("Files Count:", len(file_list))

Files Count: 89


In [15]:
# Candidate table; later cells read its seriesuid, coordX/coordY/coordZ and class columns.
candidates_csv = "Candidates/candidates.csv"
candidates_df = pd.read_csv(candidates_csv)

In [16]:
# Preview the first rows of the freshly loaded candidate table.
candidates_df.head()

Unnamed: 0,seriesuid,coordX,coordY,coordZ,class
0,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,-56.08,-67.85,-311.92,0
1,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,53.21,-244.41,-245.17,0
2,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,103.66,-121.8,-286.62,0
3,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,-33.66,-72.75,-308.41,0
4,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,-32.25,-85.36,-362.51,0


In [17]:
# Overall size of the candidate table and number of rows labelled class == 1.
total_candidates = len(candidates_df)
positive_candidates = candidates_df["class"].sum()
print("Total Candidates: ", total_candidates)
print("Positives: ", positive_candidates)

Total Candidates:  551065
Positives:  1351


In [18]:
# Summarise column dtypes, non-null counts and memory usage of the candidate table.
candidates_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 551065 entries, 0 to 551064
Data columns (total 5 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   seriesuid  551065 non-null  object 
 1   coordX     551065 non-null  float64
 2   coordY     551065 non-null  float64
 3   coordZ     551065 non-null  float64
 4   class      551065 non-null  int64  
dtypes: float64(3), int64(1), object(1)
memory usage: 21.0+ MB


In [19]:
def get_filename(file_list, file):
    """Return the first entry of ``file_list`` that contains ``file`` as a substring.

    Parameters
    ----------
    file_list : iterable of str
        Candidate paths to search, in order.
    file : str
        Text to look for inside each path (the notebook passes a series UID).

    Returns
    -------
    str or None
        The first matching path, or ``None`` when nothing matches.
    """
    return next((path for path in file_list if file in path), None)

In [20]:
def load_mhd(file):
    """Read an image with SimpleITK and return its voxel data and geometry.

    Parameters
    ----------
    file : str
        Path handed to ``stk.ReadImage``.

    Returns
    -------
    tuple
        ``(ct_scan, origin, space)``: the array produced by
        ``stk.GetArrayFromImage``, the image origin as an ndarray and the
        voxel spacing as an ndarray. The caller in this notebook unpacks the
        array shape as (z, height, width) and treats origin/spacing as
        (x, y, z).
    """
    image = stk.ReadImage(file)
    return (
        stk.GetArrayFromImage(image),
        np.array(list(image.GetOrigin())),
        np.array(list(image.GetSpacing())),
    )

In [21]:
# Resolve each candidate's series UID to the path of its scan in the current subset;
# get_filename yields None when the scan is not among the globbed files.
candidates_df["filename"] = candidates_df["seriesuid"].map(
    lambda series_uid: get_filename(file_list, series_uid)
)
# Drop every row that has a missing value in any column (this removes the unmatched candidates).
candidates_df = candidates_df.dropna(axis=0, how="any")
print(candidates_df.shape[0])

52537


In [22]:
# Preview the candidates kept for the current subset, now carrying the "filename" column.
candidates_df.head()

Unnamed: 0,seriesuid,coordX,coordY,coordZ,class,filename
12105,1.3.6.1.4.1.14519.5.2.1.6279.6001.106630482085...,-76.16,52.87,-209.26,0,Luna16/Dataset/subset6/1.3.6.1.4.1.14519.5.2.1...
12106,1.3.6.1.4.1.14519.5.2.1.6279.6001.106630482085...,-97.16,-12.95,-93.76,0,Luna16/Dataset/subset6/1.3.6.1.4.1.14519.5.2.1...
12107,1.3.6.1.4.1.14519.5.2.1.6279.6001.106630482085...,-120.27,29.06,-284.61,0,Luna16/Dataset/subset6/1.3.6.1.4.1.14519.5.2.1...
12108,1.3.6.1.4.1.14519.5.2.1.6279.6001.106630482085...,20.46,24.86,-277.57,0,Luna16/Dataset/subset6/1.3.6.1.4.1.14519.5.2.1...
12109,1.3.6.1.4.1.14519.5.2.1.6279.6001.106630482085...,79.98,-12.95,-237.43,0,Luna16/Dataset/subset6/1.3.6.1.4.1.14519.5.2.1...


In [23]:
# CLAHE (Contrast Limited Adaptive Histogram Equalization) enhances the local contrast of a slice.
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))

SLICE_SIZE = 512                              # every slice is resized to SLICE_SIZE x SLICE_SIZE pixels
HALF_BOX = 25                                 # half the patch side: patches are 50x50 pixels
BOX_SHAPE = (2 * HALF_BOX, 2 * HALF_BOX)

# cv2.imwrite returns False instead of raising when the target folder is missing,
# so make sure both output folders exist before any patch is written.
nodule_dir = os.path.join(target_root, "nodule")
non_nodule_dir = os.path.join(target_root, "non-nodule-initial")
for out_dir in (nodule_dir, non_nodule_dir):
    os.makedirs(out_dir, exist_ok=True)

# Copies written for every positive candidate, to increase the number of nodule samples:
# (file-name suffix, transform applied to the patch).
nodule_variants = (
    ("", lambda patch: patch),
    ("_1", lambda patch: cv2.rotate(patch, cv2.ROTATE_90_CLOCKWISE)),
    ("_2", lambda patch: cv2.rotate(patch, cv2.ROTATE_90_COUNTERCLOCKWISE)),
    ("_3", lambda patch: cv2.rotate(patch, cv2.ROTATE_180)),
    ("_4", lambda patch: cv2.flip(patch, 1)),
)

# One group per scan file, so each scan is loaded once and the frame is split in a single pass.
scans = candidates_df.groupby("filename")
for file, candidates in tqdm(scans, total=scans.ngroups):
    ct, origin, space = load_mhd(file)
    num_z, height, width = ct.shape
    ct_norm = cv2.normalize(ct, None, 0, 255, cv2.NORM_MINMAX)   # rescale the whole volume to 0..255

    # Factors that carry in-plane voxel indices over to the resized slice
    # (both are 1.0 when the scan is already SLICE_SIZE x SLICE_SIZE).
    scale_x = SLICE_SIZE / width
    scale_y = SLICE_SIZE / height

    for idx, row in candidates.iterrows():
        c = int(row["class"])           # class of the candidate (1: nodule, 0: non-nodule)

        # Keep the world coordinates as floats: truncating them to int before the
        # world -> voxel conversion can shift the centre by more than one voxel.
        center = np.array([row["coordX"], row["coordY"], row["coordZ"]], dtype=float)
        v_center = np.rint((center - origin) / space)   # candidate centre in voxel space (x, y, z order)

        # Clamp the slice index so a centre just outside the volume uses the nearest
        # slice instead of wrapping around (negative index) or raising IndexError.
        z = int(np.clip(v_center[2], 0, num_z - 1))
        img_norm = ct_norm[z, :, :]                                   # slice containing the candidate
        img_norm = cv2.resize(img_norm, (SLICE_SIZE, SLICE_SIZE))     # resize the slice to 512x512
        img_norm_improved = clahe.apply(img_norm.astype(np.uint8))    # apply CLAHE to the slice

        # abs() is kept from the original code - presumably it handles scans whose
        # in-plane axes are stored flipped; TODO confirm against the scan direction.
        # The indices are rescaled to the resized slice and clamped so the crop is never empty.
        x = min(int(round(abs(v_center[0]) * scale_x)), SLICE_SIZE - 1)
        y = min(int(round(abs(v_center[1]) * scale_y)), SLICE_SIZE - 1)

        # Crop a 50x50 box around the candidate; boxes cut off by the slice border
        # are stretched back to 50x50.
        box = img_norm_improved[max(0, y - HALF_BOX):min(y + HALF_BOX, SLICE_SIZE),
                                max(0, x - HALF_BOX):min(x + HALF_BOX, SLICE_SIZE)]
        if box.shape != BOX_SHAPE:
            box = cv2.resize(box, BOX_SHAPE)

        if c:  # nodule: write the patch plus its rotated / flipped copies
            for suffix, transform in nodule_variants:
                cv2.imwrite(os.path.join(nodule_dir, f"candidate_{subset}_{c}_{idx}{suffix}.jpg"), transform(box))
        else:  # non-nodule: write the patch once
            cv2.imwrite(os.path.join(non_nodule_dir, f"candidate_{subset}_{c}_{idx}.jpg"), box)

100%|███████████████████████████████████████████| 89/89 [00:52<00:00,  1.68it/s]
