In [9]:
import os, string 
from pathlib import Path
import pydicom as dicom
from tqdm import tqdm_notebook
from pydicom.errors import InvalidDicomError
import numpy as np
import sys

import logging
logger = logging.getLogger('importer')

In [10]:
from mammograpy_dicom_parser import find_dicoms, find_mammograms, make_patient_mapping

In [11]:
path = '/data/archives/DCIS/'
dicoms = find_dicoms(path)
mammograms, patient_ids, failed_to_parse = find_mammograms(dicoms)

with open('failed_to_parse.log', 'a') as f:
    for line in failed_to_parse:
        f.write(line + '\n')


/data/archives/DCIS/DICOMS does not contain dicom files, but 5 other files.


HBox(children=(IntProgress(value=0, max=4220), HTML(value='')))

Failed to import /data/archives/DCIS/DICOMS/DICOMS_2006-2010/Pt 520 (Mw) A.30.0000520/19300603 (Hosp 1-MASTERPAGE)/DICOM/IM_000002.dcm with error: 'FileDataset' object has no attribute 'PixelData'
Failed to import /data/archives/DCIS/DICOMS/DICOMS_2006-2010/Pt 520 (Mw) A.30.0000520/19300603 (Hosp 1-MASTERPAGE)/DICOM/IM_000004.dcm with error: 'FileDataset' object has no attribute 'PixelData'
Failed to import /data/archives/DCIS/DICOMS/DICOMS_2006-2010/Pt 520 (Mw) A.30.0000520/19300603 (Hosp 1-MASTERPAGE)/DICOM/IM_000001.dcm with error: 'FileDataset' object has no attribute 'PixelData'
Failed to import /data/archives/DCIS/DICOMS/DICOMS_2006-2010/Pt 520 (Mw) A.30.0000520/19300603 (Hosp 1-MASTERPAGE)/DICOM/IM_000003.dcm with error: 'FileDataset' object has no attribute 'PixelData'
Failed to import /data/archives/DCIS/DICOMS/DICOMS_rareoriginalrange/Pt_125/IM_000002.dcm with error: File is missing DICOM File Meta Information header or the 'DICM' prefix is missing from the header. Use force=




In [15]:
mapping = make_patient_mapping(patient_ids)

In [56]:
from collections import defaultdict

def rewrite_structure(mammograms_dict, mapping, metadata_directory='/home/jonas/DCIS_metadata/'):
    """
    Returns a dictionary of patient ids mapping to studyinstance uids, and a studyinstanceuids map to integers. The patient 
    itself is cached, and when new files are added it is checked against this metadata if these files actually exist
    """
    
    studies_per_patient = defaultdict(list)
    uid_mapping = {}
    for fn in mammograms_dict:
        study_instance_uid = mammograms_dict[fn]['StudyInstanceUID']
        patient_id = mammograms_dict[fn]['PatientID']
        if study_instance_uid not in studies_per_patient[patient_id]:
            studies_per_patient[patient_id].append(study_instance_uid)
            
        metadata_file = Path(metadata_directory) / f'{mapping[patient_id]}.txt'
        if not os.path.exists(metadata_file):      
            with open(metadata_file, 'w') as f:
                f.write('StudyInstanceUIDs\n')

    for patient_id in studies_per_patient:
        with open(Path(metadata_directory) / f'{mapping[patient_id]}.txt', 'r') as f:
            study_instance_uids = [_.strip() for _ in f.readlines()[1:]]

        new_study_instance_uids = list(study_instance_uids)
        # Add new study instance UIDs to the list if they are not yet there.
        new_study_instance_uids.extend(x for x in studies_per_patient[patient_id]
                                       if x not in new_study_instance_uids)

        for idx, study_instance_uid in enumerate(new_study_instance_uids):
            if not study_instance_uid in study_instance_uids:
                with open(Path(metadata_directory) / f'{mapping[patient_id]}.txt', 'a') as f: 
                    f.write(study_instance_uid + '\n')
            uid_mapping[study_instance_uid] = '{:2d}'.format(idx + 1).replace(' ', '0')  
        
    return dict(studies_per_patient), uid_mapping


def create_temporary_file_structure(mammograms, mapping, new_path='/home/jonas/DCIS'):
    new_mammograms = {}
    
    output = defaultdict(list)
    
    for fn in mammograms:
        patient_id = mammograms[fn]['PatientID']
        study_instance_uid = mammograms[fn]['StudyInstanceUID']
        folder_name = Path(mapping[patient_id] + uid_mapping[study_instance_uid])

        f = new_path / folder_name
        f.mkdir(exist_ok=True)
        fn = Path(fn)
        new_fn = f / Path(fn.name)
        try:
            os.symlink(fn, new_fn)
        except FileExistsError as e:
             logger.info(f'Symlinking for {fn} already exists.')
        new_mammograms[str(new_fn)] = mammograms[str(fn)].copy()
        new_mammograms[str(new_fn)]['Original_PatientID'] = new_mammograms[str(new_fn)]['PatientID']
        new_mammograms[str(new_fn)]['PatientID'] = mapping[new_mammograms[str(new_fn)]['Original_PatientID']]
        
        curr_dict = mammograms[str(fn)].copy()
        patient_id = curr_dict['PatientID']
        curr_dict['Original_PatientID'] = patient_id
        curr_dict['filename'] = str(new_fn)
        curr_dict['PatientID'] = mapping[patient_id]
        
        output[str(f)].append(curr_dict)
        
        
    return dict(output)

In [68]:
studies_per_patient, uid_mapping = rewrite_structure(mammograms, mapping)
new_mammograms = create_temporary_file_structure(mammograms, mapping)

In [69]:
new_mammograms

{'/home/jonas/DCIS/10000017701': [{'PatientID': '100000177',
   'StudyInstanceUID': '1.3.6.1.4.1.30071.8.79232738901141.5651047080104635',
   'SeriesInstanceUID': '1.3.6.1.4.1.30071.8.79232738901141.5651047082084635',
   'InstitutionName': 'Hosp 63',
   'ViewPosition': 'MLO',
   'Laterality': 'L',
   'ImageType': ['DERIVED', 'SECONDARY', 'ENHANCED'],
   'Original_PatientID': 'A.30.0000566',
   'filename': '/home/jonas/DCIS/10000017701/IM_000002.dcm'},
  {'PatientID': '100000177',
   'StudyInstanceUID': '1.3.6.1.4.1.30071.8.79232738901141.5651047080104635',
   'SeriesInstanceUID': '1.3.6.1.4.1.30071.8.79232738901141.5651047179144635',
   'InstitutionName': 'Hosp 63',
   'ViewPosition': 'CC',
   'Laterality': 'L',
   'ImageType': ['DERIVED', 'SECONDARY'],
   'Original_PatientID': 'A.30.0000566',
   'filename': '/home/jonas/DCIS/10000017701/IM_000004.dcm'},
  {'PatientID': '100000177',
   'StudyInstanceUID': '1.3.6.1.4.1.30071.8.79232738901141.5651047080104635',
   'SeriesInstanceUID': '1

In [106]:
# Now we can find duplicate views.
def get_duplicate_index(input_list):
    oc_set = set() 
    res = [] 
    for idx, val in enumerate(input_list): 
        if val not in oc_set: 
            oc_set.add(val)          
        else: 
            res.append(input_list[idx])   

    return res
            
renamed_files = {}

cannot_rename = []
for patient_folder in tqdm_notebook(new_mammograms):
    list_of_images = new_mammograms[patient_folder]
    filename = Path(image['filename'])
    
    image_names = []
    for image in list_of_images:
        laterality_name = (image['Laterality'] + image['ViewPosition'])
        image_names.append(laterality_name)
    
    if len(set(image_names)) < len(image_names):
        duplicated_views = get_duplicate_index(image_names)
        # There are several reasons there could be duplicates, in some cases there are derived images.
        
        sorted_duplicates = defaultdict(list)
        for image in list_of_images:
            laterality_name = image['Laterality'] + image['ViewPosition']
            if laterality_name in duplicated_views:
                sorted_duplicates[laterality_name].append(image)
            
    else:
        for image in list_of_images:
            laterality_name = (image['Laterality'] + image['ViewPosition']).lower()
        
            # There are no duplicates, and images can safely be renamed.
            # TODO: also copy the labels!
            to_name = str(Path(filename.parent) / f'image_{laterality_name}.dcm')
            renamed_files[image['filename']] = to_name
            
            
    # Now we can sort through the sorted_duplicates to see what we can fix.
    for list_of_duplicates in sorted_duplicates.values():
        # In case there are too many, we really need to check manually
        if len(list_of_duplicates) > 2:
            cannot_rename.append(('Too many', patient_folder))
            continue
            
        
        arrays = [dicom.read_file(_['filename']).pixel_array for _ in list_of_duplicates]
        shapes = [arr.shape for arr in arrays]
        if len(set(shapes)) != 1:
            cannot_rename.append(('Different shapes', patient_folder, shapes))
            continue
            
    

HBox(children=(IntProgress(value=0, max=478), HTML(value='')))

In [107]:
cannot_rename

[('Too many', '/home/jonas/DCIS/10000033501'),
 ('Different shapes',
  '/home/jonas/DCIS/10000033501',
  [(1065, 640), (4096, 3328)]),
 ('Too many', '/home/jonas/DCIS/10000039901'),
 ('Different shapes',
  '/home/jonas/DCIS/10000039901',
  [(1065, 640), (4096, 3328)]),
 ('Different shapes',
  '/home/jonas/DCIS/10000010901',
  [(3328, 2560), (4096, 3328)]),
 ('Too many', '/home/jonas/DCIS/10000039801'),
 ('Too many', '/home/jonas/DCIS/10000039802'),
 ('Too many', '/home/jonas/DCIS/10000028701'),
 ('Too many', '/home/jonas/DCIS/10000032801'),
 ('Too many', '/home/jonas/DCIS/10000043001'),
 ('Too many', '/home/jonas/DCIS/10000033801'),
 ('Too many', '/home/jonas/DCIS/10000042201'),
 ('Different shapes',
  '/home/jonas/DCIS/10000008701',
  [(4096, 3328), (3328, 2560)]),
 ('Too many', '/home/jonas/DCIS/10000035801'),
 ('Too many', '/home/jonas/DCIS/10000035801'),
 ('Too many', '/home/jonas/DCIS/10000027301'),
 ('Too many', '/home/jonas/DCIS/10000027301'),
 ('Too many', '/home/jonas/DCIS/100

In [105]:
len(cannot_rename)

348