# p1. 读取和处理candidates.csv和annotations.csv

In [1]:
import numpy as np
import copy
import pandas as pd
import functools
import glob
import os
import csv

from collections import namedtuple, defaultdict

In [2]:
path = "D:/Code/data/luna16/"

In [3]:
df = pd.read_csv(path + "candidates.csv")

In [4]:
df.head()

Unnamed: 0,seriesuid,coordX,coordY,coordZ,class
0,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,-56.08,-67.85,-311.92,0
1,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,53.21,-244.41,-245.17,0
2,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,103.66,-121.8,-286.62,0
3,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,-33.66,-72.75,-308.41,0
4,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,-32.25,-85.36,-362.51,0


In [5]:
df.shape

(551065, 5)

> https://luna16.grand-challenge.org/Data/  

官网上写了candidates.csv与candidates_V2.csv的区别。  
视频中使用的是candidates.csv

In [6]:
df_v2 = pd.read_csv(path + "candidates_V2.csv")
df_v2.head(5), df_v2.shape

(                                           seriesuid     coordX      coordY  \
 0  1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...  68.420000  -74.480000   
 1  1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222... -95.209361  -91.809406   
 2  1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222... -24.766755 -120.379294   
 3  1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222... -63.080000  -65.740000   
 4  1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...  52.946688  -92.688873   
 
        coordZ  class  
 0 -288.700000      0  
 1 -377.426350      0  
 2 -273.361539      0  
 3 -344.240000      0  
 4 -241.067872      0  ,
 (754975, 5))

官网上表示：
- 1120/1186 个结节
- **It has to be noted that there can be multiple candidates per nodule.**  
也就是说，同一个结节可能对应多个候选位置：candidates.csv中class为1的候选有1351个，多于annotations.csv中标注的1186个结节，说明部分结节被重复标记为多个候选。

In [7]:
df.iloc[:, -1].value_counts()

class
0    549714
1      1351
Name: count, dtype: int64

In [8]:
df_v2.iloc[:, -1].value_counts()

class
0    753418
1      1557
Name: count, dtype: int64

In [9]:
annotations = pd.read_csv(path + "annotations.csv")
annotations.head()

Unnamed: 0,seriesuid,coordX,coordY,coordZ,diameter_mm
0,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,-128.699421,-175.319272,-298.387506,5.651471
1,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,103.783651,-211.925149,-227.12125,4.224708
2,1.3.6.1.4.1.14519.5.2.1.6279.6001.100398138793...,69.639017,-140.944586,876.374496,5.786348
3,1.3.6.1.4.1.14519.5.2.1.6279.6001.100621383016...,-24.013824,192.102405,-391.081276,8.143262
4,1.3.6.1.4.1.14519.5.2.1.6279.6001.100621383016...,2.441547,172.464881,-405.493732,18.54515


In [10]:
annotations.shape

(1186, 5)

当使用 pandas 读取数据时，如果表头中某一列的列名为空（例如保存CSV时把行索引也一并写入了文件），pandas 会自动把该列命名为 "Unnamed: 0"、"Unnamed: 1" 等。
使用 column.startswith("Unnamed") 可以方便地识别并筛掉这些自动生成的列名，以便进行后续的数据清理和处理。annotations.csv 中并没有这样的列，所以下面筛选后仍然是全部5列。

In [11]:
columns = [column for column in annotations.columns if not column.startswith("Unnamed")]
columns, len(columns)

(['seriesuid', 'coordX', 'coordY', 'coordZ', 'diameter_mm'], 5)

这里额外获取了文件系统中的真正存在的数据文件，因为此处我们为了简化代码，只在data目录里解压了subset0和subset1的数据，后续处理就只需处理这两个子集，以防因数据集不全而出bug

In [12]:
mhd_list = glob.glob(path + "subset*/*.mhd")

In [13]:
len(mhd_list), mhd_list[0]

(178,
 'D:/Code/data/luna16\\subset0\\1.3.6.1.4.1.14519.5.2.1.6279.6001.105756658031515062000744821260.mhd')

In [14]:
# Reduce every .mhd path to its bare series UID: keep only the file name,
# then strip the extension.
present_on_disk_set = {
    os.path.splitext(os.path.basename(mhd_file))[0] for mhd_file in mhd_list
}

In [15]:
len(present_on_disk_set), present_on_disk_set

(178,
 {'1.3.6.1.4.1.14519.5.2.1.6279.6001.100684836163890911914061745866',
  '1.3.6.1.4.1.14519.5.2.1.6279.6001.104562737760173137525888934217',
  '1.3.6.1.4.1.14519.5.2.1.6279.6001.105756658031515062000744821260',
  '1.3.6.1.4.1.14519.5.2.1.6279.6001.106719103982792863757268101375',
  '1.3.6.1.4.1.14519.5.2.1.6279.6001.108197895896446896160048741492',
  '1.3.6.1.4.1.14519.5.2.1.6279.6001.108231420525711026834210228428',
  '1.3.6.1.4.1.14519.5.2.1.6279.6001.109002525524522225658609808059',
  '1.3.6.1.4.1.14519.5.2.1.6279.6001.111017101339429664883879536171',
  '1.3.6.1.4.1.14519.5.2.1.6279.6001.111172165674661221381920536987',
  '1.3.6.1.4.1.14519.5.2.1.6279.6001.113697708991260454310623082679',
  '1.3.6.1.4.1.14519.5.2.1.6279.6001.114218724025049818743426522343',
  '1.3.6.1.4.1.14519.5.2.1.6279.6001.121824995088859376862458155637',
  '1.3.6.1.4.1.14519.5.2.1.6279.6001.122763913896761494371822656720',
  '1.3.6.1.4.1.14519.5.2.1.6279.6001.124154461048929153767743874565',
  '1.3.6.1.4.1

In [16]:
# Count the data rows of annotations.csv with the csv module (header excluded).
# newline="" is what the csv documentation asks for when a file object is
# handed to csv.reader, so line endings inside quoted fields are parsed correctly.
with open(path + "annotations.csv", "r", newline="") as f:
    reader = csv.reader(f)
    columns = next(reader)  # consume the header row so it is not counted
    line_n = sum(1 for _ in reader)
    print(line_n)

1186


In [17]:
"""
diameter_dict = {}
with open(path + "annotations.csv", "r") as f:
    reader = csv.reader(f)
    columns = next(reader)
    for row in reader:
        # row是一个list
        series_uid = row[0]
        annotation_center_xyz = tuple([float(x) for x in row[1:4]])
        annotation_diameter_mm = float(row[4])
        # 通过使用setdefault()方法，可以确保每个series_uid都有一个对应的列表，即使在遍历annotations.csv文件时遇到多个具有相同series_uid的行也不会覆盖先前的值。
        diameter_dict.setdefault(series_uid, []).append(
            (annotation_center_xyz, annotation_diameter_mm)
        )
"""

'\ndiameter_dict = {}\nwith open(path + "annotations.csv", "r") as f:\n    reader = csv.reader(f)\n    columns = next(reader)\n    for row in reader:\n        # row是一个list\n        series_uid = row[0]\n        annotation_center_xyz = tuple([float(x) for x in row[1:4]])\n        annotation_diameter_mm = float(row[4])\n        # 通过使用setdefault()方法，可以确保每个series_uid都有一个对应的列表，即使在遍历annotations.csv文件时遇到多个具有相同series_uid的行也不会覆盖先前的值。\n        diameter_dict.setdefault(series_uid, []).append(\n            (annotation_center_xyz, annotation_diameter_mm)\n        )\n'

In [18]:
# Group the annotated nodules by series UID:
#   series_uid -> [((x, y, z), diameter_mm), ...]
# defaultdict(list) creates the per-series list on first access, which replaces
# the setdefault() call used in the commented-out variant above.
diameter_dict = defaultdict(list)
# newline="" is what the csv documentation asks for when a file object is
# handed to csv.reader.
with open(path + "annotations.csv", "r", newline="") as f:
    reader = csv.reader(f)
    columns = next(reader)  # skip the header row
    for row in reader:
        # Column order: seriesuid, coordX, coordY, coordZ, diameter_mm.
        series_uid = row[0]
        annotation_center_xyz = tuple(float(x) for x in row[1:4])
        annotation_diameter_mm = float(row[4])
        diameter_dict[series_uid].append(
            (annotation_center_xyz, annotation_diameter_mm)
        )

In [46]:
annotations_df = pd.read_csv(path + "annotations.csv")
annotations_df.shape, annotations_df.head()

((1186, 5),
                                            seriesuid      coordX      coordY  \
 0  1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222... -128.699421 -175.319272   
 1  1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...  103.783651 -211.925149   
 2  1.3.6.1.4.1.14519.5.2.1.6279.6001.100398138793...   69.639017 -140.944586   
 3  1.3.6.1.4.1.14519.5.2.1.6279.6001.100621383016...  -24.013824  192.102405   
 4  1.3.6.1.4.1.14519.5.2.1.6279.6001.100621383016...    2.441547  172.464881   
 
        coordZ  diameter_mm  
 0 -298.387506     5.651471  
 1 -227.121250     4.224708  
 2  876.374496     5.786348  
 3 -391.081276     8.143262  
 4 -405.493732    18.545150  )

In [53]:
# Print the first five annotation rows as plain Python lists.
for idx, row in annotations_df.head(5).iterrows():
    print(row.tolist())

['1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222365663678666836860', -128.6994211, -175.3192718, -298.3875064, 5.651470635]
['1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222365663678666836860', 103.7836509, -211.9251487, -227.12125, 4.224708481]
['1.3.6.1.4.1.14519.5.2.1.6279.6001.100398138793540579077826395208', 69.63901724, -140.9445859, 876.3744957, 5.786347814]
['1.3.6.1.4.1.14519.5.2.1.6279.6001.100621383016233746780170740405', -24.0138242, 192.1024053, -391.0812764, 8.143261683]
['1.3.6.1.4.1.14519.5.2.1.6279.6001.100621383016233746780170740405', 2.441546798, 172.4648812, -405.4937318, 18.54514997]


In [50]:
first_line = np.array(annotations_df.iloc[0])
first_line

array(['1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222365663678666836860',
       -128.6994211, -175.3192718, -298.3875064, 5.651470635],
      dtype=object)

In [19]:
diameter_dict["1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222365663678666836860"]

[((-128.6994211, -175.3192718, -298.3875064), 5.651470635),
 ((103.7836509, -211.9251487, -227.12125), 4.224708481)]

In [20]:
len(diameter_dict)

601

In [21]:
# When True, candidates whose series UID is not in present_on_disk_set are skipped.
require_on_disk_bool = True

# Lightweight record describing a single candidate location.
CandidateInfoTuple = namedtuple(
    "CandidateInfoTuple",
    "is_nodule_bool diameter_mm series_uid center_xyz",
)

In [22]:
# Build one CandidateInfoTuple per row of candidates.csv.
candidate_info_list = []
# newline="" is what the csv documentation asks for when a file object is
# handed to csv.reader.
with open(path + "candidates.csv", "r", newline="") as f:
    reader = csv.reader(f)
    columns = next(reader)  # skip the header row
    for row in reader:
        series_uid = row[0]

        # Optionally ignore candidates whose CT series was not found on disk.
        if require_on_disk_bool and series_uid not in present_on_disk_set:
            continue

        is_nodule_bool = bool(int(row[4]))
        candidate_center_xyz = tuple(float(x) for x in row[1:4])

        # A candidate takes the diameter of the first annotation of the same
        # series whose centre is within diameter / 4 of the candidate centre on
        # every axis; candidates that match no annotation keep 0.0.
        # Only the centres are compared; the candidate has no diameter of its own.
        #
        # .get() is used instead of diameter_dict[series_uid]: indexing a
        # defaultdict inserts an empty list for every series that has no
        # annotation, which silently grows diameter_dict.
        candidate_diameter_mm = 0.0
        for annotation_center_xyz, annotation_diameter_mm in diameter_dict.get(
            series_uid, ()
        ):
            tolerance_mm = annotation_diameter_mm / 4
            too_far = any(
                abs(candidate_val - annotation_val) > tolerance_mm
                for candidate_val, annotation_val in zip(
                    candidate_center_xyz, annotation_center_xyz
                )
            )
            if not too_far:
                candidate_diameter_mm = annotation_diameter_mm
                break

        candidate_info_list.append(
            CandidateInfoTuple(
                is_nodule_bool, candidate_diameter_mm, series_uid, candidate_center_xyz
            )
        )

In [23]:
candidate_info_list[:5]

[CandidateInfoTuple(is_nodule_bool=False, diameter_mm=0.0, series_uid='1.3.6.1.4.1.14519.5.2.1.6279.6001.100684836163890911914061745866', center_xyz=(-25.40019966, -199.1034346, -114.4721508)),
 CandidateInfoTuple(is_nodule_bool=False, diameter_mm=0.0, series_uid='1.3.6.1.4.1.14519.5.2.1.6279.6001.100684836163890911914061745866', center_xyz=(-42.31, -162.9, -170.18)),
 CandidateInfoTuple(is_nodule_bool=False, diameter_mm=0.0, series_uid='1.3.6.1.4.1.14519.5.2.1.6279.6001.100684836163890911914061745866', center_xyz=(-72.42, -209.82, -206.73)),
 CandidateInfoTuple(is_nodule_bool=False, diameter_mm=0.0, series_uid='1.3.6.1.4.1.14519.5.2.1.6279.6001.100684836163890911914061745866', center_xyz=(-35.31, -125.79, -145.59)),
 CandidateInfoTuple(is_nodule_bool=False, diameter_mm=0.0, series_uid='1.3.6.1.4.1.14519.5.2.1.6279.6001.100684836163890911914061745866', center_xyz=(-57.72, -113.88, -123.8))]

In [24]:
len(candidate_info_list)

110143

In [25]:
candidate_info_list

[CandidateInfoTuple(is_nodule_bool=False, diameter_mm=0.0, series_uid='1.3.6.1.4.1.14519.5.2.1.6279.6001.100684836163890911914061745866', center_xyz=(-25.40019966, -199.1034346, -114.4721508)),
 CandidateInfoTuple(is_nodule_bool=False, diameter_mm=0.0, series_uid='1.3.6.1.4.1.14519.5.2.1.6279.6001.100684836163890911914061745866', center_xyz=(-42.31, -162.9, -170.18)),
 CandidateInfoTuple(is_nodule_bool=False, diameter_mm=0.0, series_uid='1.3.6.1.4.1.14519.5.2.1.6279.6001.100684836163890911914061745866', center_xyz=(-72.42, -209.82, -206.73)),
 CandidateInfoTuple(is_nodule_bool=False, diameter_mm=0.0, series_uid='1.3.6.1.4.1.14519.5.2.1.6279.6001.100684836163890911914061745866', center_xyz=(-35.31, -125.79, -145.59)),
 CandidateInfoTuple(is_nodule_bool=False, diameter_mm=0.0, series_uid='1.3.6.1.4.1.14519.5.2.1.6279.6001.100684836163890911914061745866', center_xyz=(-57.72, -113.88, -123.8)),
 CandidateInfoTuple(is_nodule_bool=False, diameter_mm=0.0, series_uid='1.3.6.1.4.1.14519.5.2.1.6

## p2. 读取和处理CT文件

In [26]:
import SimpleITK as sitk

In [27]:
# Locate the .mhd header of one series under any subset directory and load it.
series_uid = "1.3.6.1.4.1.14519.5.2.1.6279.6001.100684836163890911914061745866"
mhd_pattern = path + f"subset*/{series_uid}.mhd"
mhd_path = glob.glob(mhd_pattern)[0]  # first match; IndexError if the series is missing
ct_mhd = sitk.ReadImage(mhd_path)
# Copy the voxel data into a float32 NumPy array.
ct_arr = np.array(sitk.GetArrayFromImage(ct_mhd), dtype=np.float32)

In [28]:
ct_arr, ct_arr.shape

(array([[[ -990., -1007., -1016., ...,  -988.,  -973.,  -966.],
         [ -985., -1001., -1014., ...,  -978.,  -960.,  -961.],
         [ -994., -1013., -1017., ...,  -972.,  -956.,  -969.],
         ...,
         [ -941.,  -953.,  -959., ...,  -918.,  -917.,  -943.],
         [ -960.,  -981.,  -979., ...,  -929.,  -903.,  -926.],
         [ -978.,  -980.,  -965., ...,  -953.,  -925.,  -924.]],
 
        [[ -996., -1019., -1019., ...,  -998.,  -997.,  -994.],
         [ -989., -1005., -1011., ...,  -991.,  -991.,  -988.],
         [ -987.,  -996.,  -996., ...,  -989.,  -993.,  -991.],
         ...,
         [ -974.,  -972.,  -953., ...,  -946.,  -949.,  -964.],
         [ -974.,  -984.,  -965., ...,  -948.,  -944.,  -963.],
         [ -989.,  -992.,  -969., ...,  -948.,  -949.,  -971.]],
 
        [[-1009., -1014., -1008., ..., -1010., -1014., -1011.],
         [ -997.,  -992.,  -999., ..., -1003., -1005.,  -991.],
         [ -999.,  -987.,  -981., ...,  -988.,  -994.,  -986.],
      

In [29]:
import sys

sys.path.append("../../src/")

import util.util as ut

In [30]:
ut.irc2xyz

<function util.util.irc2xyz(coord_irc, origin_xyz, vxSize_xyz, direction_a)>

In [31]:
# Clamp voxel values to [-1000, 1000] in place; hu_arr is a second name for
# the same (now clipped) array, not a copy.
np.clip(ct_arr, -1000, 1000, out=ct_arr)
hu_arr = ct_arr

# Small coordinate records: array-index order (index, row, col) and x/y/z order.
IrcTuple = namedtuple("IrcTuple", "index row col")
XyzTuple = namedtuple("XyzTuple", "x y z")

# Geometry metadata read from the image header.
origin_xyz = XyzTuple(*ct_mhd.GetOrigin())
vx_size_xyz = XyzTuple(*ct_mhd.GetSpacing())
# GetDirection() is flat; reshape it into a 3x3 matrix.
direction_arr = np.reshape(np.array(ct_mhd.GetDirection()), (3, 3))

In [32]:
origin_xyz, vx_size_xyz, direction_arr

(XyzTuple(x=-127.755859375, y=-277.755859375, z=-351.5),
 XyzTuple(x=0.48828125, y=0.48828125, z=1.7999999523162842),
 array([[1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.]]))

In [33]:
def getRawCandidate(center_xyz, width_irc):
    """Cut a box of voxels out of the global ``hu_arr`` around ``center_xyz``.

    The centre is converted to array indices with ``ut.xyz2irc`` using the
    module-level ``origin_xyz``, ``vx_size_xyz`` and ``direction_arr``. On each
    axis a window of ``width_irc[axis]`` voxels is placed around the centre;
    a window that would extend past either end of the array is shifted back
    inside, and a window wider than the axis is reduced to the whole axis.

    Args:
        center_xyz: centre of the box, passed unchanged to ``ut.xyz2irc``.
        width_irc: requested box size per array axis, in voxels.

    Returns:
        ``(ct_chunk, center_irc)``: the sliced view of ``hu_arr`` and the
        centre as returned by ``ut.xyz2irc``.

    Raises:
        AssertionError: if the converted centre lies outside ``hu_arr`` on
            any axis.
    """
    center_irc = ut.xyz2irc(center_xyz, origin_xyz, vx_size_xyz, direction_arr)

    slice_list = []
    for axis, center_val in enumerate(center_irc):
        axis_len = hu_arr.shape[axis]

        assert 0 <= center_val < axis_len, repr(
            [series_uid, center_xyz, origin_xyz, vx_size_xyz, center_irc, axis]
        )

        start_ndx = int(round(center_val - width_irc[axis] / 2))
        end_ndx = int(start_ndx + width_irc[axis])

        # Window starts before the array: shift it to begin at 0.
        if start_ndx < 0:
            start_ndx = 0
            end_ndx = int(width_irc[axis])

        # Window ends after the array: shift it to end at the last voxel.
        # The start is clamped at 0 because a width larger than the axis would
        # otherwise give a negative start, which a slice reads as "count from
        # the end" and returns too few voxels.
        if end_ndx > axis_len:
            end_ndx = axis_len
            start_ndx = max(0, int(axis_len - width_irc[axis]))

        slice_list.append(slice(start_ndx, end_ndx))

    ct_chunk = hu_arr[tuple(slice_list)]
    return ct_chunk, center_irc

In [34]:
temp = getRawCandidate((-30, -30, -60), (10, 10, 10))
temp

(array([[[-910., -923., -945., -938., -907., -903., -924., -920., -887.,
          -879.],
         [-927., -922., -930., -925., -894., -880., -901., -924., -918.,
          -904.],
         [-917., -907., -912., -917., -903., -884., -892., -919., -940.,
          -937.],
         [-903., -901., -910., -925., -920., -893., -877., -897., -935.,
          -952.],
         [-920., -911., -919., -937., -930., -889., -851., -862., -905.,
          -939.],
         [-954., -937., -934., -944., -938., -896., -844., -836., -874.,
          -910.],
         [-968., -941., -931., -932., -931., -909., -869., -844., -860.,
          -892.],
         [-942., -933., -913., -904., -904., -913., -907., -888., -879.,
          -889.],
         [-903., -912., -903., -892., -895., -915., -934., -931., -899.,
          -879.],
         [-875., -888., -897., -899., -907., -921., -934., -935., -910.,
          -889.]],
 
        [[-919., -913., -905., -898., -884., -892., -941., -967., -935.,
          -903

In [35]:
temp[0].shape, temp[1]

((10, 10, 10), IrcTuple(index=162, row=507, col=200))