In [1]:
import os, string 
from pathlib import Path
import pydicom as dicom
from tqdm import tqdm_notebook
from pydicom.errors import InvalidDicomError
import numpy as np

In [2]:
# Root of the DICOM archive, normalised (drops the trailing separator).
path = os.path.normpath('/data/archives/DCIS/')

# Every sub-directory found anywhere below the archive root, as a full path.
# The root directory itself is not part of the list.
res = [
    os.path.join(parent, subdir)
    for parent, subdirs, _ in os.walk(path, topdown=True)
    for subdir in subdirs
]


In [3]:
# Map each directory that directly contains '*.dcm' files to the list of
# those files (as strings). Directories without such files are left out.
dicoms = {}
for directory in res:
    dcm_paths = [str(found) for found in Path(directory).glob('*.dcm')]
    if dcm_paths:
        dicoms[directory] = dcm_paths


In [7]:
# Scan the header of every DICOM file and sort the files into buckets:
#   mammograms / institutes / patient_ids : files with Modality 'MG'
#                                           (the three lists stay index-aligned)
#   weird_modalities : [path, modality] for modalities outside MG/PR/SR/US/CR
#   strange_dicoms   : files missing one of the attributes read below
#   invalid_dicoms   : files pydicom rejects as not being valid DICOM
institutes = []
strange_dicoms = []
invalid_dicoms = []
weird_modalities = []
mammograms = []
patient_ids = []
for key, val in tqdm_notebook(dicoms.items()):
    for line in val:
        try:
            # Header only: pixel data is not needed for this bookkeeping.
            # `dcmread` is pydicom's reader; `read_file` is its legacy alias.
            x = dicom.dcmread(line, stop_before_pixels=True)
            modality = x.Modality
            if modality == 'MG':
                # Read every required attribute BEFORE appending anything.
                # Otherwise a missing tag raises after `mammograms` was
                # already extended, leaving the lists misaligned and the file
                # registered both as a mammogram and as a strange DICOM.
                institution_name = x.InstitutionName
                patient_id = x.PatientID
                mammograms.append(line)
                institutes.append(institution_name)
                patient_ids.append(patient_id)
            elif modality not in ['PR', 'SR', 'US', 'CR']:
                weird_modalities.append([line, modality])

        except AttributeError:
            # A required header attribute is absent from this file.
            strange_dicoms.append(line)
        except InvalidDicomError:
            invalid_dicoms.append(line)

HBox(children=(IntProgress(value=0, max=1104), HTML(value='')))

In [12]:
# De-duplicate the collected patient IDs.
# The result is sorted because new identifiers are later assigned by list
# position: the iteration order of a set of strings can differ between
# interpreter sessions (hash randomisation), which would hand a patient a
# different new ID after a kernel restart. `key=str` keeps the sort usable
# even if an ID is not a plain string.
patient_ids = sorted(set(patient_ids), key=str)

In [34]:
def make_mapping(patient_ids, start_at=1, encoding='10'):
    """Map every patient ID to a new sequential identifier.

    Each new identifier is ``encoding`` followed by a counter that is
    zero-padded to seven digits; the counter starts at ``start_at`` and
    increases by one per entry of ``patient_ids``.

    Args:
        patient_ids: Iterable of original patient IDs (hashable). If an ID
            occurs more than once, the identifier of its last occurrence wins.
        start_at: Counter value given to the first patient ID.
        encoding: Prefix placed in front of the zero-padded counter.

    Returns:
        dict mapping original patient ID -> new identifier string.
    """
    prefix = f'{encoding}'
    # Numbering with enumerate guarantees exactly one counter value per
    # patient for any `start_at`; a range ending at `n_cases + 1` runs out of
    # values as soon as `start_at` is greater than 1.
    return {
        patient_id: prefix + '{:07d}'.format(number)
        for number, patient_id in enumerate(patient_ids, start=start_at)
    }

        
    

In [38]:
# Original patient ID -> new identifier, using make_mapping's defaults
# (counter starts at 1, prefix '10').
mapping = make_mapping(patient_ids)

In [74]:
from collections import defaultdict

def rewrite_structure(mammos, mapping):
    """Number the studies of each patient and return the numbering per study.

    The header of every file in ``mammos`` is read (without pixel data) to get
    its PatientID and StudyInstanceUID. For each patient, the distinct studies
    are numbered 1, 2, ... in the order in which they are first seen in
    ``mammos``.

    Args:
        mammos: Iterable of paths to DICOM files.
        mapping: Not used by this function; kept so existing calls keep working.

    Returns:
        dict mapping StudyInstanceUID -> study number within its patient, as a
        zero-padded two-character string ('01', '02', ...). Only this dict is
        returned; the per-patient study lists stay internal.
    """
    studies_per_patient = defaultdict(list)
    seen_studies = set()  # (patient_id, study_uid) pairs already recorded
    for mammo in mammos:
        # `dcmread` is pydicom's reader; `read_file` is its legacy alias.
        x = dicom.dcmread(mammo, stop_before_pixels=True)
        study_instance_uid = x.StudyInstanceUID
        patient_id = x.PatientID
        # Set lookup instead of scanning the patient's list for every file.
        if (patient_id, study_instance_uid) not in seen_studies:
            seen_studies.add((patient_id, study_instance_uid))
            studies_per_patient[patient_id].append(study_instance_uid)

    uid_mapping = {}
    for study_uids in studies_per_patient.values():
        for number, study_instance_uid in enumerate(study_uids, start=1):
            uid_mapping[study_instance_uid] = '{:02d}'.format(number)

    return uid_mapping

In [75]:
# rewrite_structure returns a single dict (StudyInstanceUID -> study number),
# so bind it to one name; unpacking it into two names would fail or bind keys.
uid_mapping = rewrite_structure(mammograms, mapping)

In [76]:
# Display the StudyInstanceUID -> two-digit study number mapping.
# NOTE(review): this prints the whole dict; a short preview may be enough.
uid_mapping

{'1.3.6.1.4.1.30071.8.79232738901141.5651047080104635': '01',
 '1.3.6.1.4.1.30071.8.79232738901141.5651050046353787': '01',
 '1.3.6.1.4.1.30071.8.79232738901141.5651073027334550': '01',
 '1.3.6.1.4.1.30071.8.79232738901141.5651062511594550': '01',
 '1.3.6.1.4.1.30071.8.79232738901141.5651057731957655': '01',
 '1.3.6.1.4.1.30071.8.79232738901141.5651043164224635': '01',
 '1.3.6.1.4.1.30071.8.79232738901141.5651046311784635': '01',
 '1.3.6.1.4.1.30071.8.79232738901141.5651079263154550': '01',
 '1.3.6.1.4.1.30071.8.79232738901141.5651061428594550': '01',
 '1.3.6.1.4.1.30071.8.79232738901141.5651075130194550': '01',
 '1.3.6.1.4.1.30071.8.79232738901141.5651084591844550': '01',
 '1.3.6.1.4.1.30071.8.79232738901141.5651040879814635': '01',
 '1.3.6.1.4.1.30071.8.79232738901141.5651041130424635': '02',
 '1.3.6.1.4.1.30071.8.79232738901141.5651046635914635': '01',
 '1.3.6.1.4.1.30071.8.79232738901141.5651072036624550': '01',
 '1.3.6.1.4.1.30071.8.79232738901141.5651056671141584': '01',
 '1.3.6.

In [90]:
# Build the new folder layout: one folder per (patient, study), named
# <new patient id><study number>, holding symlinks to the original files.
skipped_links = []  # (source file, intended link path) pairs that were not linked
for mammo in mammograms:
    # `dcmread` is pydicom's reader; `read_file` is its legacy alias.
    x = dicom.dcmread(mammo, stop_before_pixels=True)
    patient_id = x.PatientID
    study_instance_uid = x.StudyInstanceUID
    folder_name = mapping[patient_id] + uid_mapping[study_instance_uid]

    f = os.path.join('/home/jonas/DCIS/', folder_name)
    os.makedirs(f, exist_ok=True)
    link_path = os.path.join(f, Path(mammo).stem + '.dcm')
    try:
        os.symlink(mammo, link_path)
    except FileExistsError:
        # Re-running the cell is harmless when the link already points at this
        # file. Anything else is a name clash (two source files with the same
        # file name in one study folder) and must not be dropped silently.
        if not (os.path.islink(link_path) and os.readlink(link_path) == mammo):
            skipped_links.append((mammo, link_path))
    except OSError as err:
        # Stay best-effort (keep linking the remaining files) but keep a record.
        skipped_links.append((mammo, link_path))
        print(f'Could not link {mammo} -> {link_path}: {err}')

if skipped_links:
    print(f'{len(skipped_links)} file(s) were not linked; see `skipped_links`.')
    

In [82]:
# NOTE(review): `mammos` is not assigned at notebook level in any visible cell
# (it only appears as a parameter name of rewrite_structure), so this cell
# presumably relied on leftover kernel state and would raise NameError after
# Restart & Run All — confirm, or remove the cell.
mammos

['/data/archives/DCIS/DICOMS/DICOMS_2006-2010/Pt 139 (Mw) A.30.0000139/19610513 (Hosp 63-Lorad Selenia)/DICOM/IM_000002.dcm',
 '/data/archives/DCIS/DICOMS/DICOMS_2006-2010/Pt 139 (Mw) A.30.0000139/19610513 (Hosp 63-Lorad Selenia)/DICOM/IM_000004.dcm',
 '/data/archives/DCIS/DICOMS/DICOMS_2006-2010/Pt 139 (Mw) A.30.0000139/19610513 (Hosp 63-Lorad Selenia)/DICOM/IM_000001.dcm',
 '/data/archives/DCIS/DICOMS/DICOMS_2006-2010/Pt 139 (Mw) A.30.0000139/19610513 (Hosp 63-Lorad Selenia)/DICOM/IM_000003.dcm',
 '/data/archives/DCIS/DICOMS/DICOMS/Pt 139 (Mw) A.30.0000139/19650407 (Hosp 1-Selenia Dimensions)/DICOM/IM_000002.dcm',
 '/data/archives/DCIS/DICOMS/DICOMS/Pt 139 (Mw) A.30.0000139/19650407 (Hosp 1-Selenia Dimensions)/DICOM/IM_000001.dcm',
 '/data/archives/DCIS/DICOMS/DICOMS/Pt 139 (Mw) A.30.0000139/19650407 (Hosp 1-Selenia Dimensions)/DICOM/IM_000003.dcm']

In [87]:
# Scratch check: wrap one sample file path in a Path object.
# Note that this rebinds `x`, which other cells use for a DICOM header.
x = Path('/data/archives/DCIS/DICOMS/DICOMS/Pt 139 (Mw) A.30.0000139/19650407 (Hosp 1-Selenia Dimensions)/DICOM/IM_000003.dcm')

In [88]:
# Scratch check: `.stem` is the file name without its final suffix.
x.stem

'IM_000003'