In [1]:
# Root folder of the downloaded dataset, relative to the notebook's working
# directory. It must contain metadata.csv; the 'File Location' entries of that
# file are resolved relative to this folder.
FOLDER = '../manifest-1557326747206'

In [2]:
import pandas as pd

df = pd.read_csv(f'{FOLDER}/metadata.csv')

# Keep only the columns needed to locate the series, order them so that the
# rows read test1-CT, test1-RT, test2-CT, test2-RT, ..., and keep just the
# first six rows so that experiments stay fast.
df_locations = (
    df.loc[:, ['Data Description URI', 'Manufacturer', 'File Location']]
    .sort_values(by=['Data Description URI', 'Manufacturer'])
    .head(6)
)

In [3]:
# Show the selected rows. NOTE(review): judging by the displayed values, the
# column headers look shifted by one: 'Data Description URI' holds the subject
# ID and 'Manufacturer' holds the modality (CT / RTSTRUCT) - confirm against
# metadata.csv.
df_locations

Unnamed: 0,Data Description URI,Manufacturer,File Location
1.3.6.1.4.1.14519.5.2.1.7014.4598.106943890850011666503487579262,LCTSC-Test-S1-101,CT,.\LCTSC\LCTSC-Test-S1-101\03-03-2004-NA-NA-081...
1.3.6.1.4.1.14519.5.2.1.7014.4598.280355341349691222365783556597,LCTSC-Test-S1-101,RTSTRUCT,.\LCTSC\LCTSC-Test-S1-101\03-03-2004-NA-NA-081...
1.3.6.1.4.1.14519.5.2.1.7014.4598.639871532605224417554459681163,LCTSC-Test-S1-102,CT,.\LCTSC\LCTSC-Test-S1-102\11-04-2003-NA-RTRCCT...
1.3.6.1.4.1.14519.5.2.1.7014.4598.110977663386843546355807661874,LCTSC-Test-S1-102,RTSTRUCT,.\LCTSC\LCTSC-Test-S1-102\11-04-2003-NA-RTRCCT...
1.3.6.1.4.1.14519.5.2.1.7014.4598.578895540487402949445746417374,LCTSC-Test-S1-103,CT,.\LCTSC\LCTSC-Test-S1-103\11-09-2003-NA-RTRCCT...
1.3.6.1.4.1.14519.5.2.1.7014.4598.941697026234857133685181755546,LCTSC-Test-S1-103,RTSTRUCT,.\LCTSC\LCTSC-Test-S1-103\11-09-2003-NA-RTRCCT...


In [4]:
# Get Numpy arrays from the CT
import numpy as np
import SimpleITK as sitk

def ct_to_np(path):
    '''Load a CT DICOM series into a NumPy array.

    Parameters
    ----------
    path : str
        Series folder as given in a 'File Location' value: relative to
        FOLDER, optionally prefixed with a dot and a separator, and possibly
        written with Windows backslashes.

    Returns
    -------
    numpy.ndarray
        An independent copy of the pixel data of the series found in that
        folder (3-D for the scans loaded in this notebook).
    '''
    # Normalise the Windows-style relative location to a POSIX path below
    # FOLDER. Only a leading './' is removed, so dots elsewhere in the path
    # (for example at the end of a folder name) are preserved.
    rel = path.replace('\\', '/')
    if rel.startswith('./'):
        rel = rel[2:]
    series_dir = f"{FOLDER}/{rel.lstrip('/')}"

    reader = sitk.ImageSeriesReader()
    dicom_names = reader.GetGDCMSeriesFileNames(series_dir)
    reader.SetFileNames(dicom_names)
    image = reader.Execute()

    # GetArrayViewFromImage only stays valid while `image` is alive, and
    # `image` is a local that is released when this function returns.
    # GetArrayFromImage copies the buffer, so the result is safe to keep.
    return sitk.GetArrayFromImage(image)

def get_metadata(path, tag=(0x0010, 0x0040)):
    '''Read one DICOM attribute from the RT file of a series folder.

    Parameters
    ----------
    path : str
        Series folder as given in a 'File Location' value: relative to
        FOLDER, optionally prefixed with a dot and a separator, and possibly
        written with Windows backslashes. The file '1-1.dcm' inside that
        folder is read.
    tag : tuple of int, optional
        (group, element) of the attribute to return. Defaults to
        (0010, 0040), Patient's Sex, which is what this function has always
        returned.

    Returns
    -------
    The value stored under `tag` (for the default tag, a string such as
    'M' or 'F').

    Raises
    ------
    KeyError
        If the file does not contain `tag`.
    '''
    # Normalise the Windows-style relative location to a POSIX path below
    # FOLDER. Only a leading './' is removed, so dots elsewhere in the path
    # are preserved.
    rel = path.replace('\\', '/')
    if rel.startswith('./'):
        rel = rel[2:]
    dcm_path = f"{FOLDER}/{rel.lstrip('/')}/1-1.dcm"

    ds = pydicom.dcmread(dcm_path)
    return ds[tag].value

In [5]:
import pydicom

# After the sort, rows alternate CT / RTSTRUCT for each patient, so even
# positions hold the CT series and odd positions the matching RTSTRUCT.
ct_rows = df_locations.iloc[0::2]
rt_rows = df_locations.iloc[1::2]

# Fail loudly instead of silently pairing a CT volume with another patient's
# label when a patient does not have exactly one CT and one RTSTRUCT.
# In this metadata.csv the 'Manufacturer' column holds the modality and the
# 'Data Description URI' column holds the subject ID (see the table displayed
# for df_locations).
assert len(ct_rows) == len(rt_rows), 'odd number of rows: a CT or RTSTRUCT series is missing'
assert (ct_rows['Manufacturer'] == 'CT').all(), 'expected a CT series at every even position'
assert (rt_rows['Manufacturer'] == 'RTSTRUCT').all(), 'expected an RTSTRUCT series at every odd position'
assert (
    ct_rows['Data Description URI'].values == rt_rows['Data Description URI'].values
).all(), 'CT and RTSTRUCT rows belong to different patients'

X = []  # one CT volume (NumPy array) per patient
y = []  # the label read from that patient's RTSTRUCT file

for ct, rt in zip(ct_rows['File Location'], rt_rows['File Location']):
    X.append(ct_to_np(ct))
    y.append(get_metadata(rt))

In [6]:
# Gender labels returned by get_metadata, one per CT volume in X
print(y)

['M', 'F', 'M']


In [7]:
# The first dimension differs from scan to scan, so the volumes cannot be
# stacked into one 4D NumPy array as they are.
tuple(X[i].shape for i in range(3))

((130, 512, 512), (148, 512, 512), (152, 512, 512))

So far I have only extracted the gender, so next I will look for other metadata.\
We want to find the following data:

- classification: gender, contrast vs. non-contrast, patient orientation
- regression: height, weight, age

In [8]:
# Second row = RTSTRUCT series of the first patient. Use .iloc because the
# frame is indexed by UID strings: a plain integer key on such a Series only
# works through the deprecated positional fallback.
rt = df_locations['File Location'].iloc[1]

# Normalise the Windows-style relative location to a POSIX path below FOLDER;
# only a leading './' is removed so other dots in the path are preserved.
rel = rt.replace('\\', '/')
if rel.startswith('./'):
    rel = rel[2:]
path = f"{FOLDER}/{rel.lstrip('/')}/1-1.dcm"
ds = pydicom.dcmread(path)

In [9]:
# Uncomment the next line to print every element of the dataset (long output).
# ds

(0010, 0030) Patient's Birth Date                DA: ''\
(0010, 0040) Patient's Sex                       CS: 'M'\
These are the only patient attributes of interest in the dump; I could not find any of the others.

In [10]:
# List every tag present in this file, to check for the attributes we need
ds.keys()

dict_keys([(0008, 0005), (0008, 0012), (0008, 0013), (0008, 0014), (0008, 0016), (0008, 0018), (0008, 0020), (0008, 0030), (0008, 0050), (0008, 0060), (0008, 0070), (0008, 0090), (0008, 1030), (0008, 103e), (0008, 1090), (0010, 0010), (0010, 0020), (0010, 0030), (0010, 0040), (0012, 0062), (0012, 0063), (0012, 0064), (0013, 0010), (0013, 1010), (0013, 1013), (0018, 0015), (0018, 1020), (0020, 000d), (0020, 000e), (0020, 0010), (0020, 0011), (0020, 0013), (0028, 0303), (3006, 0002), (3006, 0004), (3006, 0008), (3006, 0009), (3006, 0010), (3006, 0020), (3006, 0039), (3006, 0080)])

According to the DICOM standard, the attributes we are looking for are:\
(0010,1010) Patient's Age (Birth Date would work as well if it weren't empty)\
(0010,1020) Patient's Size, i.e. height\
(0010,1030) Patient's Weight\
(0020,0020) Patient Orientation\
(0018,0010) Contrast/Bolus Agent\
None of these tags appear in the key list above, so this file definitely does not contain them.