In [None]:
###############################
# 准备肺结节真假阳性分类模型的数据
###############################
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import pydicom as dicom
import os
import scipy.ndimage

from sklearn.cluster import KMeans
from skimage import measure, morphology
import cell_magic_wand as cmw
import random
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader

# --- Configuration -----------------------------------------------------------
list32file="LIDC/list3.2.csv"
metafile="LIDC/LIDC-IDRI_MetaData.csv"
unetweightspath="modelpths/unet9.pth"                               # weight file of the U-Net model
datafolder="processeddata"                                          # folder holding the data produced by the first preprocessing step
Datapath='H:/Datasets/LIDC-IDRI/manifest-1600709154662/LIDC-IDRI/'  # path of the LIDC dataset

# --- Metadata ----------------------------------------------------------------
nodulelocations=pd.read_csv(list32file)
meta=pd.read_csv(metafile)

# Keep only the CT series and renumber the rows 0..n-1 so that label-based and
# positional indexing agree from here on.
meta=meta.drop(meta[meta['Modality']!='CT'].index)
meta=meta.reset_index()

#Get folder names of CT data for each patient
patients=[Datapath+meta['Patient Id'][i] for i in range(len(meta))]
datfolder=[]
# Iterate over every row of `meta`; the previous `len(meta)-1` bound silently
# dropped the last CT series.
for i in range(len(meta)):
    for path in os.listdir(patients[i]):
        if os.path.exists(patients[i]+'/'+path+'/'+meta['Series UID'][i]):
            datfolder.append(patients[i]+'/'+path+'/'+meta['Series UID'][i])
# NOTE(review): later cells pair patients[i] with meta.iloc[i]; that pairing only
# holds if exactly one series folder is found for every row of `meta` -- confirm.
patients=datfolder


In [None]:
# Use CUDA when it is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    # Selecting a CUDA device and tuning cuDNN is only done when a GPU exists;
    # calling torch.cuda.set_device unconditionally raises on CPU-only machines.
    torch.cuda.set_device(0)
    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.enabled = True
print(device)


In [None]:
# Load the model
from NoduleUNet import UNet

model = UNet().to(device)
# map_location remaps the stored tensors onto the selected device, so weights
# saved from a GPU session can still be loaded when `device` is the CPU.
model.load_state_dict(torch.load(unetweightspath, map_location=device))
model.eval()


In [None]:
# Load the scans in given folder path
# code sourced from https://www.kaggle.com/gzuidhof/full-preprocessing-tutorial
def load_scan(path):
    """Read every ``.dcm`` file in ``path`` and return the slices in order.

    Args:
        path: Directory containing the DICOM files of one series.

    Returns:
        List of datasets sorted by descending ``ImagePositionPatient[2]``.
        ``SliceThickness`` of every slice is overwritten with the spacing
        measured between the first two sorted slices.
    """
    # dcmread is the supported reader; `read_file` is a deprecated alias that
    # was removed in pydicom 3.0.
    slices = [dicom.dcmread(path + '/' + s, force=True) for s in os.listdir(path) if s.endswith('.dcm')]
    slices.sort(key = lambda x: float(x.ImagePositionPatient[2]), reverse=True)
    try:
        slice_thickness = np.abs(slices[0].ImagePositionPatient[2] - slices[1].ImagePositionPatient[2])
    except Exception:
        # Fall back to SliceLocation. Catching Exception (not a bare except)
        # keeps KeyboardInterrupt/SystemExit from being swallowed here.
        slice_thickness = np.abs(slices[0].SliceLocation - slices[1].SliceLocation)

    for s in slices:
        s.SliceThickness = slice_thickness

    return slices

#convert to ndarray
def get_pixels_hu(slices):
    """Stack the slices' pixel data into one int16 volume in Hounsfield units.

    Args:
        slices: Sequence of objects exposing ``pixel_array``,
            ``RescaleSlope`` and ``RescaleIntercept``.

    Returns:
        int16 array with one entry along the first axis per slice.
    """
    # Work in int16 throughout; values are expected to stay well below 32k.
    volume = np.stack([scan_slice.pixel_array for scan_slice in slices]).astype(np.int16)

    # Pixels equal to -2000 are reset to 0 before rescaling
    # (the intercept is usually -1024, so air is approximately 0).
    volume[volume == -2000] = 0

    # Apply each slice's own linear rescale: HU = slope * raw + intercept
    for idx, scan_slice in enumerate(slices):
        intercept = scan_slice.RescaleIntercept
        slope = scan_slice.RescaleSlope

        if slope != 1:
            scaled = slope * volume[idx].astype(np.float64)
            # Cast back to int16 when storing into the volume.
            volume[idx] = scaled.astype(np.int16)

        volume[idx] += np.int16(intercept)

    return np.array(volume, dtype=np.int16)

def processimage(img):
    """Standardise one slice and multiply it by a thresholded region mask.

    The slice is normalised, split into two intensity clusters with KMeans on
    its central crop, and the below-threshold regions that pass a bounding-box
    filter are kept, dilated and used as a multiplicative mask.

    Args:
        img: 2-D array. The mask is allocated as 512x512, so the slice is
            presumably expected to have that shape -- TODO confirm.

    Returns:
        The standardised slice multiplied element-wise by the mask.
    """
    #function sourced from https://www.kaggle.com/c/data-science-bowl-2017#tutorial
    # Standardise the pixel values (zero mean, unit standard deviation).
    img = (img - np.mean(img)) / np.std(img)

    # The central crop is a *view* of img, so the replacements below are
    # visible through it when KMeans is fitted.
    centre_crop = img[100:400, 100:400]
    crop_mean = np.mean(centre_crop)
    highest = np.max(img)
    lowest = np.min(img)
    # Move the extreme (overflow/underflow) values onto the crop mean.
    img[img == highest] = crop_mean
    img[img == lowest] = crop_mean

    # Two-cluster KMeans on the crop; the threshold is midway between centres.
    clusterer = KMeans(n_clusters=2).fit(np.reshape(centre_crop, [np.prod(centre_crop.shape), 1]))
    threshold = np.mean(sorted(clusterer.cluster_centers_.flatten()))
    del centre_crop, crop_mean, highest, lowest, clusterer

    # Threshold, then erode followed by a larger dilation.
    candidate = np.where(img < threshold, 1.0, 0.0)
    candidate = morphology.erosion(candidate, np.ones([4, 4]))
    candidate = morphology.dilation(candidate, np.ones([10, 10]))

    labels = measure.label(candidate)
    del candidate

    # Keep the regions whose bounding box satisfies the size/position limits.
    good_labels = [
        region.label
        for region in measure.regionprops(labels)
        if region.bbox[2] - region.bbox[0] < 475
        and region.bbox[3] - region.bbox[1] < 475
        and region.bbox[0] > 40
        and region.bbox[2] < 472
    ]

    #
    #  The mask here is the mask for the lungs--not the nodes
    #  After just the lungs are left, we do another large dilation
    #  in order to fill in and out the lung mask
    #
    mask = np.zeros([512, 512], dtype=np.int8)
    for label_id in good_labels:
        mask = mask + np.where(labels == label_id, 1, 0)
    del labels, good_labels

    mask = morphology.dilation(mask, np.ones([10, 10]))  # one last dilation
    return mask * img

def processimagefromfile(ppix):
    """Run ``processimage`` on every slice of a volume.

    Args:
        ppix: Array whose first axis indexes the slices.

    Returns:
        Array of shape ``(ppix.shape[0], 512, 512)`` holding the processed
        slices.
    """
    slice_total = ppix.shape[0]
    processed = np.ndarray([slice_total, 512, 512])
    for idx, slice_pixels in enumerate(ppix):
        processed[idx] = processimage(slice_pixels)
    return processed

#predict mask from images
def predictmask(images):
    """Run the module-level ``model`` on every slice, one slice at a time.

    Args:
        images: Array that can be reshaped to ``(n, 1, 512, 512)``.

    Returns:
        float32 array of shape ``(n, 1, 512, 512)`` with the model outputs.
    """
    images=images.reshape(images.shape[0],1,512,512)
    num_test=images.shape[0]
    imgs_mask_test = np.ndarray([num_test,1,512,512],dtype=np.float32)
    # Inference only: no_grad keeps autograd from recording a graph for each
    # forward pass, which would otherwise hold on to memory for nothing.
    with torch.no_grad():
        for i in range(num_test):
            batch = torch.from_numpy(images[i:i+1]).to(device).float()
            # Call the module itself rather than .forward() so hooks run.
            imgs_mask_test[i] = model(batch).cpu().detach().numpy()[0]
    return imgs_mask_test

#find number of slices where a nodule is detected
def getnoduleindex(imgs_mask_test):
    """Return the indices of the slices whose mask sums to more than 5.

    Args:
        imgs_mask_test: Sequence of per-slice entries; element ``[0]`` of each
            entry is the mask that gets summed.

    Returns:
        List of integer positions, in ascending order.
    """
    detected = []
    for idx, maskslice in enumerate(imgs_mask_test):
        if np.sum(maskslice[0]) > 5:
            detected.append(idx)
    return detected

def nodule_coordinates(nodulelocations,meta):
    """Collect the zero-based nodule coordinates listed for one patient.

    Args:
        nodulelocations: Table with the columns ``"case"``, ``"slice no."``,
            ``"x loc."`` and ``"y loc."``.
        meta: Mapping/row whose ``"Patient Id"`` string ends with the 4-digit
            case number.

    Returns:
        List of ``[slice, x, y]`` triples, each value shifted down by one.
    """
    case_number = int(meta["Patient Id"][-4:])
    case_rows = nodulelocations.index[nodulelocations["case"] == case_number]

    slice_values = nodulelocations["slice no."][case_rows].values
    x_values = nodulelocations["x loc."][case_rows].values
    y_values = nodulelocations["y loc."][case_rows].values

    # The table values are shifted by one to obtain zero-based indices.
    return [
        [slice_no - 1, x_loc - 1, y_loc - 1]
        for slice_no, x_loc, y_loc in zip(slice_values, x_values, y_values)
    ]

#generate nodule or non-nodule labels for mask predictions
def truenodules(noduleindex,masks,nodulecoords):
    """Label every detected slice as a true nodule (True) or not (False).

    A detection counts as true when some annotated nodule lies within one
    slice of it and the mask's centre of mass is within 2 pixels of the
    annotated position on both in-plane axes.

    The previous version attached ``else`` to the slice-distance test, so a
    near-slice candidate with a centre-of-mass mismatch produced no label at
    all, and one label was emitted per (detection, annotation) pair. This
    version returns exactly one label per entry of ``noduleindex``.

    Args:
        noduleindex: Indices into ``masks`` of the detected slices.
        masks: Per-slice masks; ``masks[ind]`` is passed to
            ``scipy.ndimage.center_of_mass`` and must be 3-D (components 1
            and 2 of the result are compared with the annotation).
        nodulecoords: List of ``[slice, x, y]`` annotations.

    Returns:
        List of bools with the same length as ``noduleindex``.
    """
    label=[]
    for ind in noduleindex:
        # Annotations close enough along the slice axis to be candidates.
        nearby=[cord for cord in nodulecoords if abs(ind-cord[0])<2]
        matched=False
        if nearby:
            com=scipy.ndimage.center_of_mass(masks[ind])
            # com[1] is compared with the annotation's y, com[2] with its x.
            matched=any(
                abs(com[1]-cord[2])<2 and abs(com[2]-cord[1])<2
                for cord in nearby
            )
        label.append(matched)
    return label

# def slicecount(start,end):
#     slicecounts=[]
#     for i in range(start,end):
#         if len(nodule_coordinates(nodulelocations,meta.iloc[i]))>0:
#             patient_scan=load_scan(patients[i])
#             slicecounts.append(len(patient_scan))
#     return slicecounts

In [None]:
import pickle

# Slice count of every patient that has at least one listed nodule.
slicecounts = [
    len(load_scan(patients[idx]))
    for idx in range(len(patients))
    if len(nodule_coordinates(nodulelocations, meta.iloc[idx])) > 0
]

# Persist the counts next to the other preprocessed data.
with open(datafolder+"/slicecountsCNN.pkl", 'wb') as f:
    pickle.dump(slicecounts, f)

In [None]:
# Obtain the TP and FP labels

def get_nodule_label(i):
    """Run detection for patient ``i`` and label the detections TP/FP.

    Args:
        i: Position of the patient in ``patients`` / ``meta``.

    Returns:
        Tuple ``(noduleimages, nodulelabels, nodulesensitivity, slicecounts)``:
        * noduleimages -- processed slices on which the model detected
          something (``[]`` when the patient has no listed nodules);
        * nodulelabels -- list holding one boolean array, True where the
          detection overlaps a listed nodule;
        * nodulesensitivity -- one boolean per listed nodule, True when it
          was detected;
        * slicecounts -- list holding the number of slices in the scan.
        All four are empty when the patient has no listed nodules.
    """
    nodulelabels=[]
    nodulesensitivity=[]
    slicecounts=[]
    noduleimages = []
    print("Processing patient#",i)

    # Look the annotations up once; the same row drives the diameter lookup
    # below so coordinates and diameters always refer to the same case.
    patient_meta = meta.iloc[i]
    coord = nodule_coordinates(nodulelocations, patient_meta)
    if len(coord)==0:
        return noduleimages, nodulelabels, nodulesensitivity, slicecounts

    patient_scan=load_scan(patients[i])
    slicecounts.append(len(patient_scan))
    patient_pix=get_pixels_hu(patient_scan)
    del patient_scan
    processed_pix = processimagefromfile(patient_pix)
    print(coord)

    # Values of the "eq. diam." column for this case (the original code called
    # this "radius"); used both as a size filter and to bound the wand search.
    case_number = int(patient_meta["Patient Id"][-4:])
    diameters = nodulelocations["eq. diam."][nodulelocations.index[nodulelocations["case"]==case_number]]
    print(diameters)

    mask = predictmask(processed_pix)
    noduleindex = getnoduleindex(mask)
    labels = np.zeros(len(noduleindex), dtype=bool)
    cordlabels = np.zeros(len(coord), dtype=bool)
    for j,cord in enumerate(coord): # loop through labeled nodules
        if diameters.iloc[j]>5:
            # Cast once so every use of the slice index is a plain int
            # (previously only one of the two indexings was cast).
            slice_no = int(cord[0])
            nodulemask = cmw.cell_magic_wand(-patient_pix[slice_no],[int(cord[2]),int(cord[1])],2,int(diameters.iloc[j])+2)
            nodulepix = nodulemask*patient_pix[slice_no]
            nodulepix[nodulepix<-500]=0 #lower HU threshold for nodule segmentation
            nodulemask = nodulepix!=0
            del nodulepix
            for k,ind in enumerate(noduleindex): # loop through detected nodules
                if abs(ind-slice_no)<2 and np.sum(nodulemask*mask[ind][0])>1:
                    print("Nodule Detected at slice#",ind,"with actual coord",cord)
                    labels[k] = True
                    cordlabels[j] = True
    del patient_pix, mask, diameters

    nodulesensitivity.extend(cordlabels)
    nodulelabels.append(labels)
    # Fancy indexing copies just the detected slices out of the volume.
    noduleimages = processed_pix[noduleindex]

    return noduleimages, nodulelabels, nodulesensitivity, slicecounts

# Multithreaded processing; very demanding on memory and CPU

MAX_WORKERS = 28 # set according to the number of CPU cores you have

from tqdm.contrib.concurrent import thread_map
results = thread_map(get_nodule_label, range(len(patients)), max_workers=MAX_WORKERS)

# Save the results to corresponding variables
noduleimages = []
nodulelabels = []
nodulesensitivity = []
slicecounts = []

for res in results:
    # Labels, sensitivity flags and slice counts are kept for every patient.
    # Previously they were dropped whenever the model detected nothing, which
    # removed missed nodules from the sensitivity statistics.
    nodulelabels.extend(res[1])
    nodulesensitivity.extend(res[2])
    slicecounts.extend(res[3])
    # Only non-empty image stacks are collected so the later np.concatenate
    # receives arrays of matching dimensionality.
    if len(res[0]) > 0:
        noduleimages.append(res[0])

del results

In [None]:
# Process the data

# nodulelabels is a list of per-patient boolean arrays, so a single flattening
# pass yields one label per detected slice. (A second pass would iterate over
# numpy.bool_ scalars and raise TypeError.)
nodulelabels = [item for sublist in nodulelabels for item in sublist]

noduleimages=np.concatenate(noduleimages)

# Keep at most one image per label, then add the channel axis.
noduleimages=noduleimages[:len(nodulelabels)]
noduleimages=noduleimages.reshape([noduleimages.shape[0],1,512,512])
nodulelabels=np.array(nodulelabels)

In [None]:
# Save the data

import pickle

# Arrays go to .npy files ...
np.save(datafolder+"/noduleimagesCNN.npy",noduleimages)
np.save(datafolder+"/nodulelabelsCNN.npy",nodulelabels)

# ... and the plain Python lists are pickled.
for pkl_name, payload in (
    ("/nodulesensitivityCNN.pkl", nodulesensitivity),
    ("/slicecountsCNN.pkl", slicecounts),
):
    with open(datafolder+pkl_name, 'wb') as f:
        pickle.dump(payload, f)
del pkl_name, payload

# Release the large arrays now that they are on disk.
del noduleimages, nodulelabels, nodulesensitivity, slicecounts