In [3]:
import copy
import csv
import functools
import glob
import os

from pathlib import Path
from collections import namedtuple

import SimpleITK as sitk
import numpy as np 

import torch
import torch.cuda
from torch.utils.data import Dataset

from util.disk import getCache
from util.util import XyzTuple, xyz2irc
from util.logconf import logging

In [4]:
# Logger for this module, set to DEBUG level.
log = logging.getLogger(__name__)
log.setLevel(logging.DEBUG)

# Cache handle returned by util.disk.getCache for the name 'luna_raw'.
raw_cache = getCache('luna_raw')

# The "LUNA" directory that sits next to the current working directory.
data_dir = Path.cwd().parent.joinpath("LUNA")

In [7]:
# Record describing one candidate location. Tuples compare field by field in
# the order listed, so sorting orders by isNodule_bool first, then diameter_mm,
# then series_uid, then center_xyz.
CandidateInfoTuple = namedtuple(
    'CandidateInfoTuple',
    [
        'isNodule_bool',
        'diameter_mm',
        'series_uid',
        'center_xyz',
    ],
)

In [12]:
# Series UIDs found on disk: each .mhd file name with its 4-character extension removed.
{os.path.basename(mhd_path)[:-4] for mhd_path in glob.glob('../LUNA/subset*/*.mhd')}

.4.1.14519.5.2.1.6279.6001.307946352302138765071461362398',
 '1.3.6.1.4.1.14519.5.2.1.6279.6001.308153138776443962077214577161',
 '1.3.6.1.4.1.14519.5.2.1.6279.6001.308183340111270052562662456038',
 '1.3.6.1.4.1.14519.5.2.1.6279.6001.308655308958459380153492314021',
 '1.3.6.1.4.1.14519.5.2.1.6279.6001.309672797925724868457151381131',
 '1.3.6.1.4.1.14519.5.2.1.6279.6001.309901913847714156367981722205',
 '1.3.6.1.4.1.14519.5.2.1.6279.6001.309955814083231537823157605135',
 '1.3.6.1.4.1.14519.5.2.1.6279.6001.309955999522338651429118207446',
 '1.3.6.1.4.1.14519.5.2.1.6279.6001.310395752124284049604069960014',
 '1.3.6.1.4.1.14519.5.2.1.6279.6001.310548927038333190233889983845',
 '1.3.6.1.4.1.14519.5.2.1.6279.6001.310626494937915759224334597176',
 '1.3.6.1.4.1.14519.5.2.1.6279.6001.311236942972970815890902714604',
 '1.3.6.1.4.1.14519.5.2.1.6279.6001.311476128731958142981941696518',
 '1.3.6.1.4.1.14519.5.2.1.6279.6001.311981398931043315779172047718',
 '1.3.6.1.4.1.14519.5.2.1.6279.6001.3121279

In [13]:
@functools.lru_cache(1)
def getCandidateInfoList(requireOnDisk_bool=True):
    """Build the sorted list of candidates described by the LUNA CSV files.

    Reads ``../LUNA/annotations.csv`` and ``../LUNA/candidates.csv`` (paths are
    relative to the current working directory). Each candidate is given the
    diameter of the first annotation of the same series whose center is within
    a quarter of that annotation's diameter on every axis; candidates without
    such an annotation get a diameter of 0.0.

    Args:
        requireOnDisk_bool: When true (the default), candidates whose
            series_uid has no matching ``../LUNA/subset*/<series_uid>.mhd``
            file are skipped, so partially downloaded data can still be used.

    Returns:
        A list of CandidateInfoTuple sorted in descending order (tuple
        comparison, so isNodule_bool first, then diameter_mm, ...).

    Raises:
        OSError: If either CSV file cannot be opened.
        ValueError: If a coordinate, diameter or class field is not numeric.
        IndexError: If a non-empty row has fewer than five columns.

    Note:
        The result is memoized by ``functools.lru_cache``; repeated calls with
        the same argument return the same list object, so copy it before
        mutating.
    """
    # We construct a set with all series_uids that are present on disk.
    # This will let us use the data, even if we haven't downloaded all of
    # the subsets yet.
    mhd_list = glob.glob('../LUNA/subset*/*.mhd')
    presentOnDisk_set = {os.path.split(p)[-1][:-4] for p in mhd_list}

    # series_uid -> list of (center_xyz, diameter_mm) annotations.
    diameter_dict = {}
    # newline='' lets the csv module do its own newline handling.
    with open('../LUNA/annotations.csv', "r", newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # Skip the header row (tolerates an empty file).
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines; nothing to parse.
                continue

            series_uid = row[0]
            annotationCenter_xyz = tuple(float(x) for x in row[1:4])
            annotationDiameter_mm = float(row[4])

            diameter_dict.setdefault(series_uid, []).append(
                (annotationCenter_xyz, annotationDiameter_mm)
            )

    candidateInfo_list = []
    with open('../LUNA/candidates.csv', "r", newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # Skip the header row (tolerates an empty file).
        # Stream the rows instead of materializing the whole file in a list.
        for row in reader:
            if not row:
                continue

            series_uid = row[0]

            if requireOnDisk_bool and series_uid not in presentOnDisk_set:
                continue

            isNodule_bool = bool(int(row[4]))
            candidateCenter_xyz = tuple(float(x) for x in row[1:4])

            # Take the diameter of the first annotation close enough to the
            # candidate on every axis; 0.0 means no annotation matched.
            candidateDiameter_mm = 0.0
            for annotationCenter_xyz, annotationDiameter_mm in diameter_dict.get(series_uid, ()):
                tolerance_mm = annotationDiameter_mm / 4
                tooFar_bool = any(
                    abs(candidate_coord - annotation_coord) > tolerance_mm
                    for candidate_coord, annotation_coord
                    in zip(candidateCenter_xyz, annotationCenter_xyz)
                )
                if not tooFar_bool:
                    candidateDiameter_mm = annotationDiameter_mm
                    break

            candidateInfo_list.append(CandidateInfoTuple(
                isNodule_bool,
                candidateDiameter_mm,
                series_uid,
                candidateCenter_xyz,
            ))

    candidateInfo_list.sort(reverse=True)
    return candidateInfo_list