In [1]:
import os
import glob
import shutil
import zipfile
from tqdm import tqdm

import pandas as pd

adam_dir = '../../../data/ADAM'
fps = glob.glob(os.path.join(adam_dir, '*.zip'))

In [2]:
intermediate_dir = os.path.join(adam_dir, 'extracted')

output_dir = '../../../data/raw/ADAM'

# Extract them to raw dir

In [6]:
for scan_zip_fp  in tqdm(fps):
    scan_name = os.path.basename(scan_zip_fp).rstrip('.zip')
    tof_scan_bias_corrected_fp = os.path.join(scan_name, 'orig', 'TOF.nii.gz')
    aneurysm_fp = os.path.join(scan_name, 'aneurysms.nii.gz')
    location_fp = os.path.join(scan_name, 'location.txt')
    
    with zipfile.ZipFile(scan_zip_fp, 'r') as zip_ref:
        zip_ref.extract(tof_scan_bias_corrected_fp, intermediate_dir)
        zip_ref.extract(aneurysm_fp, intermediate_dir)
        zip_ref.extract(location_fp, intermediate_dir)

    intermediate_path_tof = os.path.join(intermediate_dir, scan_name, 'orig', 'TOF.nii.gz')
    intermediate_path_aneurysm = os.path.join(intermediate_dir, scan_name, 'aneurysms.nii.gz')
    intermidiate_aneurysm_file_info = os.path.join(intermediate_dir, scan_name, 'location.txt')
    
    # Make folder for each scan
    scan_dir_path = os.path.join(output_dir, scan_name)
    os.makedirs(scan_dir_path, exist_ok=True)
    
    # copy new_path_tof to output_dir_tof
    new_path_tof = os.path.join(scan_dir_path, f'{scan_name}_TOF.nii.gz')
    shutil.copy(intermediate_path_tof, new_path_tof)

    # copy new_path_aneurysm to output_dir_label
    new_path_aneurysm = os.path.join(scan_dir_path, f'{scan_name}_aneurysms.nii.gz')
    shutil.copy(intermediate_path_aneurysm, new_path_aneurysm)
    
    # copy intermidiate_aneurysm_file_info to output_dir_label
    new_path_aneurysm_file_info = os.path.join(scan_dir_path, f'{scan_name}_location.txt')
    shutil.copy(intermidiate_aneurysm_file_info, new_path_aneurysm_file_info)


100%|██████████| 113/113 [00:37<00:00,  2.98it/s]


## Extract the bias corrected TOF scans

In [11]:
split_dif  = 4
start_from = 2
split_id   = 3
ps         = []
for k in range(start_from + split_id * split_dif, start_from + split_dif * (split_id+1)):
    print(k)

14
15
16
17


In [5]:
output_dir = '../../../data/preprocessed/0_bias_corrected/ADAM'

In [6]:
for scan_zip_fp  in tqdm(fps):
    scan_name = os.path.basename(scan_zip_fp).rstrip('.zip')
    tof_scan_bias_corrected_fp = os.path.join(scan_name, 'pre', 'TOF.nii.gz')
    aneurysm_fp = os.path.join(scan_name, 'aneurysms.nii.gz')
    
    with zipfile.ZipFile(scan_zip_fp, 'r') as zip_ref:
        zip_ref.extract(tof_scan_bias_corrected_fp, intermediate_dir)
        zip_ref.extract(aneurysm_fp, intermediate_dir)

    intermediate_path_tof = os.path.join(intermediate_dir, scan_name, 'pre', 'TOF.nii.gz')
    intermediate_path_aneurysm = os.path.join(intermediate_dir, scan_name, 'aneurysms.nii.gz')
    
    # Make folder for each scan
    scan_dir_path = os.path.join(output_dir, scan_name)
    os.makedirs(scan_dir_path, exist_ok=True)
    
    # copy new_path_tof to output_dir_tof
    new_path_tof = os.path.join(scan_dir_path, f'{scan_name}_tof.nii.gz')
    shutil.copy(intermediate_path_tof, new_path_tof)

    # copy new_path_aneurysm to output_dir_label
    new_path_aneurysm = os.path.join(scan_dir_path, f'{scan_name}_seg.nii.gz')
    shutil.copy(intermediate_path_aneurysm, new_path_aneurysm)
    


100%|██████████| 113/113 [01:57<00:00,  1.04s/it]


## Extract them to nn_net_dir

In [None]:
intermediate_dir = os.path.join(adam_dir, 'extracted')

output_dir_tof = '../../../data/nnUNet_raw/Dataset005_ADAMBinaryAneurysmOnly/imagesTr'
output_dir_label = '../../../data/nnUNet_raw/Dataset005_ADAMBinaryAneurysmOnly/labelsTr'

In [3]:

for scan_zip_fp  in tqdm(fps):
    scan_name = os.path.basename(scan_zip_fp).rstrip('.zip')
    tof_scan_bias_corrected_fp = os.path.join(scan_name, 'pre', 'TOF.nii.gz')
    aneurysm_fp = os.path.join(scan_name, 'aneurysms.nii.gz')
    
    with zipfile.ZipFile(scan_zip_fp, 'r') as zip_ref:
        zip_ref.extract(tof_scan_bias_corrected_fp, intermediate_dir)
        zip_ref.extract(aneurysm_fp, intermediate_dir)

    intermediate_path_tof = os.path.join(intermediate_dir, scan_name, 'pre', 'TOF.nii.gz')
    intermediate_path_aneurysm = os.path.join(intermediate_dir, scan_name, 'aneurysms.nii.gz')
    

    os.makedirs(output_dir_tof, exist_ok=True)
    os.makedirs(output_dir_label, exist_ok=True)

    # copy new_path_tof to output_dir_tof
    new_path_tof = os.path.join(output_dir_tof, f'{scan_name}_0000.nii.gz')
    shutil.copy(intermediate_path_tof, new_path_tof)

    # copy new_path_aneurysm to output_dir_label
    new_path_aneurysm = os.path.join(output_dir_label, f'{scan_name}.nii.gz')
    shutil.copy(intermediate_path_aneurysm, new_path_aneurysm)


100%|██████████| 113/113 [02:35<00:00,  1.37s/it]


In [5]:
assert len(os.listdir(output_dir_label)) == len(os.listdir(output_dir_tof)) == len(fps)

In [6]:
rm -r $intermediate_dir

## Create 80/20 train/holdout splits

In [7]:
scans_names = os.listdir(output_dir_label)
scans_names = [name.rstrip('.nuii.gz') for name in scans_names if name.endswith('.nii.gz')]
scans_names

['10053B',
 '10043',
 '10044B',
 '10044F',
 '10051F',
 '10047B',
 '10061B',
 '10064F',
 '10019',
 '10012',
 '10071B',
 '10052F',
 '10005',
 '10072B',
 '10059B',
 '10042',
 '10021',
 '10050B',
 '10066B',
 '10023',
 '10065B',
 '10077F',
 '10035',
 '10036',
 '10046B',
 '10074B',
 '10047F',
 '10041',
 '10051B',
 '10045B',
 '10024',
 '10064B',
 '10048B',
 '10040',
 '10055B',
 '10020',
 '10062F',
 '10057F',
 '10069B',
 '10078B',
 '10006',
 '10056B',
 '10078F',
 '10055F',
 '10029',
 '10014',
 '10003',
 '10013',
 '10010',
 '10060F',
 '10049B',
 '10059F',
 '10054F',
 '10052B',
 '10034',
 '10075F',
 '10009',
 '10070F',
 '10026',
 '10072F',
 '10037',
 '10008',
 '10074F',
 '10062B',
 '10001',
 '10038',
 '10058B',
 '10065F',
 '10031',
 '10071F',
 '10007',
 '10028',
 '10017',
 '10033',
 '10030',
 '10061F',
 '10027',
 '10032',
 '10075B',
 '10049F',
 '10004',
 '10068F',
 '10053F',
 '10045F',
 '10063F',
 '10077B',
 '10073F',
 '10048F',
 '10073B',
 '10015',
 '10076B',
 '10066F',
 '10068B',
 '10058F',
 '

In [8]:
import re

# get patient id from scan name. It is only the numbers before the first letter
patient_ids = [re.findall(r'\d+', name)[0] for name in scans_names]

# get the scan type from the scan name. It is the letter after the patient id
scan_types = [name.split(patient_id)[-1] for name, patient_id in zip(scans_names, patient_ids)]
scan_types = ['U' if scan_type == '' else scan_type for scan_type in scan_types]

In [9]:
df = pd.DataFrame({'patient_id': patient_ids, 'scan_type': scan_types})

In [10]:
df['two_scans'] = df.scan_type.isin(['B', 'F'])

In [11]:
# Do a stratified sampling of 80% and stratify on patient_id and two_scans
train_df = df[['patient_id', 'two_scans']].drop_duplicates().groupby(['two_scans']).sample(frac=0.8, random_state=42)

In [12]:
train_df.value_counts('two_scans')

two_scans
False    34
True     28
Name: count, dtype: int64

In [13]:
train_df.set_index('patient_id', inplace=True)
df.set_index('patient_id', inplace=True)

In [14]:
train_df = train_df.join(df, how='inner', rsuffix='_').drop(columns=['two_scans_'])

In [15]:
train_df

Unnamed: 0_level_0,two_scans,scan_type
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1
10001,False,U
10002,False,U
10003,False,U
10004,False,U
10005,False,U
...,...,...
10075,True,B
10076,True,B
10076,True,F
10078,True,B


In [16]:
test_df = df.drop(train_df.index)

In [17]:
len(test_df)

23

In [18]:
assert len(test_df) + len(train_df) == len(df)
assert test_df.index.isin(train_df.index).sum() == 0

### Let's move the test_df tof and label volumes to a new folder

In [19]:
output_dir_tof

'../../../data/nnUNet_raw/Dataset005_ADAMBinaryAneurysmOnly/imagesTr'

In [20]:
os.makedirs('../../../data/nnUNet_raw/Dataset005_ADAMBinaryAneurysmOnly/imagesTs', exist_ok=True)
os.makedirs('../../../data/nnUNet_raw/Dataset005_ADAMBinaryAneurysmOnly/labelsTs', exist_ok=True)

for scan_id in test_df.index:
    os.system(f'mv {output_dir_tof}/{scan_id}* ../../../data/nnUNet_raw/Dataset005_ADAMBinaryAneurysmOnly/imagesTs/')
    os.system(f'mv {output_dir_label}/{scan_id}* ../../../data/nnUNet_raw/Dataset005_ADAMBinaryAneurysmOnly/labelsTs/')

mv: cannot stat '../../../data/nnUNet_raw/Dataset005_ADAMBinaryAneurysmOnly/imagesTr/10047*': No such file or directory
mv: cannot stat '../../../data/nnUNet_raw/Dataset005_ADAMBinaryAneurysmOnly/labelsTr/10047*': No such file or directory
mv: cannot stat '../../../data/nnUNet_raw/Dataset005_ADAMBinaryAneurysmOnly/imagesTr/10072*': No such file or directory
mv: cannot stat '../../../data/nnUNet_raw/Dataset005_ADAMBinaryAneurysmOnly/labelsTr/10072*': No such file or directory
mv: cannot stat '../../../data/nnUNet_raw/Dataset005_ADAMBinaryAneurysmOnly/imagesTr/10049*': No such file or directory
mv: cannot stat '../../../data/nnUNet_raw/Dataset005_ADAMBinaryAneurysmOnly/labelsTr/10049*': No such file or directory
mv: cannot stat '../../../data/nnUNet_raw/Dataset005_ADAMBinaryAneurysmOnly/imagesTr/10077*': No such file or directory
mv: cannot stat '../../../data/nnUNet_raw/Dataset005_ADAMBinaryAneurysmOnly/labelsTr/10077*': No such file or directory
mv: cannot stat '../../../data/nnUNet_ra

In [None]:
len(test_df)

23

In [None]:
test_df.sort_index()

Unnamed: 0_level_0,scan_type,two_scans
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1
10009,U,False
10010,U,False
10015,U,False
10024,U,False
10028,U,False
10029,U,False
10035,U,False
10037,U,False
10039,U,False
10047,B,True


In [22]:
len(os.listdir('../../../data/nnUNet_raw/Dataset005_ADAMBinaryAneurysmOnly/imagesTs/'))

23

In [23]:
len(os.listdir('../../../data/nnUNet_raw/Dataset005_ADAMBinaryAneurysmOnly/labelsTs/'))

23

### Let's verify images were distributed correctly in each set 

#### trainset

In [40]:
full_image_ids = train_df.reset_index().apply(lambda row: str(row['patient_id']) + str(row['scan_type'] if row['scan_type'] != 'U' else ''), axis=1)

In [45]:
for image_id in full_image_ids:
    # Check they are in the trainset
    assert os.path.exists(os.path.join('../../../data/nnUNet_raw/Dataset005_ADAMBinaryAneurysmOnly/imagesTr', f'{image_id}_0000.nii.gz'))
    assert os.path.exists(os.path.join('../../../data/nnUNet_raw/Dataset005_ADAMBinaryAneurysmOnly/labelsTr', f'{image_id}.nii.gz'))
    
    # Check they are not in the testset
    assert not os.path.exists(os.path.join('../../../data/nnUNet_raw/Dataset005_ADAMBinaryAneurysmOnly/imagesTs', f'{image_id}_0000.nii.gz'))
    assert not os.path.exists(os.path.join('../../../data/nnUNet_raw/Dataset005_ADAMBinaryAneurysmOnly/labelsTs', f'{image_id}.nii.gz'))

#### Testset

In [48]:
full_image_ids = test_df.reset_index().apply(lambda row: str(row['patient_id']) + str(row['scan_type'] if row['scan_type'] != 'U' else ''), axis=1)
print(len(full_image_ids))

23


In [49]:
for image_id in full_image_ids:
    # Check they are in the not trainset
    assert not os.path.exists(os.path.join('../../../data/nnUNet_raw/Dataset005_ADAMBinaryAneurysmOnly/imagesTr', f'{image_id}_0000.nii.gz'))
    assert not os.path.exists(os.path.join('../../../data/nnUNet_raw/Dataset005_ADAMBinaryAneurysmOnly/labelsTr', f'{image_id}.nii.gz'))
    
    # Check they are in the testset
    assert os.path.exists(os.path.join('../../../data/nnUNet_raw/Dataset005_ADAMBinaryAneurysmOnly/imagesTs', f'{image_id}_0000.nii.gz'))
    assert os.path.exists(os.path.join('../../../data/nnUNet_raw/Dataset005_ADAMBinaryAneurysmOnly/labelsTs', f'{image_id}.nii.gz'))

If you reach this point, it's all good!