## Introduction

This script iterates over all the original DICOM files exported from the PACS system, copies every image to a final folder, and names each copy with its corresponding ID.

This serves as the first step in the workflow for AI density analysis of mammography according to the project "OncoServe: Deploying Deep Learning Models for Breast Cancer Risk Assessment, and Breast Density Assessment."

Reference: https://github.com/yala/OncoServe_Public - Tool and all associated code provided for under MIT License.

In [12]:
import os
import fnmatch
import sys
import shutil
import json
from pydicom import dcmread

### Set folders and lists

In [6]:
# Root folder of the original export; it is walked recursively for DICOM files.
rootFolderPath = 'G:\\mamografiasCHTMAD'
# Folder that receives the renamed copies. The backslash is escaped explicitly:
# "\m" is not a valid escape sequence and triggers a warning on recent Python.
targetdir = "G:\\mamografiaseditadas"
# IDs collected while copying; written to ids.json at the end of the notebook.
list_of_ids = []

### Defining relevant functions:

In [7]:
def get_processo(filetoread):
    """Return the value of DICOM element (0010,1000) read from ``filetoread``.

    The file is parsed with ``dcmread(..., force=True)``.
    NOTE(review): (0010,1000) is presumably the "Other Patient IDs" attribute
    used here as the exam/patient identifier - confirm against the PACS export.

    :param filetoread: Path of the DICOM file to read.
    :return: The ``value`` of element (0010,1000).
    :raises KeyError: Presumably when the element is absent from the file
        (this is the exception the calling loop handles).
    """
    dataset = dcmread(filetoread, force=True)
    return dataset[0x0010, 0x1000].value

In [8]:
#Just one dicom file per patient
def copyandrename(source,process):
    """Copy ``source`` into ``targetdir`` under the name ``<process>.dcm``.

    :param source: Path of the file to copy.
    :param str process: ID used as the file name of the copy.
    """
    # Built from string pieces (not os.path.join) so the resulting path text
    # always uses "/" between the folder and the file name.
    destination = "".join((targetdir, "/", process, ".dcm"))
    shutil.copy(source, destination)

In [9]:
#All the 4 dicom files per patient, corresponding to both CC and MLO mammography (Left and Right)
def safe_copy(source, file_finish_copy, out_dir=None, max_copies=4):
    """Copy ``source`` into the output folder without overwriting earlier copies.

    The first file for an ID is stored as ``<id>.dcm``; further files with the
    same ID are stored as ``<id>_1.dcm``, ``<id>_2.dcm``, ... Once
    ``max_copies`` files exist for that ID, nothing is copied and the call
    returns silently.

    :param str source: Path of the file to copy.
    :param str file_finish_copy: ID used as the base name of the copy. The
        parameter name is kept for backward compatibility; callers pass the
        ID here.
    :param out_dir: Directory to copy into. Defaults to the module-level
        ``targetdir``.
    :param int max_copies: Maximum number of files kept per ID (default 4).
    :return: None
    """
    if out_dir is None:
        out_dir = targetdir

    # Use the argument itself rather than a global that happens to be set by
    # the calling loop, so the function works from any call site.
    base = file_finish_copy
    extension = ".dcm"

    for index in range(max_copies):
        if index == 0:
            name = base + extension
        else:
            name = '{}_{}{}'.format(base, index, extension)
        destination = os.path.join(out_dir, name)
        if not os.path.exists(destination):
            shutil.copy(source, destination)
            return
    # Every allowed name is already taken: deliberately skip this file.

### Iterate all the files and subfolders in the dataset source folder
#### Extract the list of exam IDs so the results can be compared with the radiologist annotation (ground truth)

In [29]:
# Suffixes of files in the export that are not DICOM images (matched
# case-sensitively, exactly as they appear in the export).
skipped_suffixes = (".ipynb", ".TXT", ".XML", ".json")

# Walk the whole source tree, copy every DICOM file under its ID and record
# the ID of each copied file.
for root, dirs, files in os.walk(rootFolderPath):
    for filename in files:
        # Skip the DICOMDIR index, "._" resource-fork files and other
        # non-image files.
        if (filename == "DICOMDIR"
                or filename.startswith("._")
                or filename.endswith(skipped_suffixes)):
            continue

        source = os.path.join(root, filename)
        try:
            process = get_processo(source)
        except KeyError:
            # The ID element is missing, so the file cannot be named. Skip it
            # instead of falling through and reusing the previous file's ID
            # (or raising NameError on the very first file).
            continue

        safe_copy(source, process)
        list_of_ids.append(process)

EEF1C9E2
EEA7087B
EE0364B8
EE68E0B3
EECE7277
EE623AF3
EECE68A0
EE24D671
EE8579C0
EE138B64
EE0FEA8D
EE1A14F1
EE8CA55B
EE6311BC
EE917EB8
EE4DA030
EE524ECF
EE6D0D55
EE8EC794
EEA0C9C4
EE9997A7
EED332E5
EE8C9423
EEF720AC
EE75628A
EE2A6854
EED86183
EE6ED6B3
EEC74D3A
EE9A2309
EEAC7699
EEA2B4B1
EE7C289E
EE88FD26
EE176843
EE5DA260
EE067BEA
EE029059
EEDBE835
EEE305AD
EE0F3DC7
EEF51252
EEC3BE95
EE4CC108
EE0FE4B3
EE362CE6
EE53AFF5
EEE101E0
EEC98EBE
EEC32549
EE512DE9
EED4967E
EED5D4F4
EEE84CB3
EEA82205
EE9ABCF9
EE810D51
EEADA50B
EE010BBD
EE5AA3D4
EE745BFA
EE8CC486
EE6161D6
EED7CE08
EE3D16AA
EE165687
EE186BCB
EEA9826B
EE261CC2
EEA87122
EE1E5FDC
EEC4CED1
EEF1DDA6
EEF78518
EEED6BBA
EE813FA7
EEFAC3AE
EE79830A
EE7664B1
EE323523
EE1E61B5
EE142C19
EE29951E
EE7E6B19
EE711763
EED2D2D7
EEA65221
EE7B075C
EE4D8F5E
EEEF5FE2
EE3EA697
EEBC3B18
EE4DC306
EE91A2F3
EEC8241A
EEA4C54E
EE62E765
EEA1B300
EE448275
EE22E9A0
EE960AE5
EEBA489E
EEF9F4F2
EE76D48C
EE6F5706
EE0A0A34
EE6234AD
EE64D123
EEE13C1D
EE1E4492
EEA5B084
E

EEBCA246
EE77ED11
EE9A5BBD
EE4A1CB6
EE5B2EC6
EEE1B1CB
EE07A8A1
EE352BF0
EE45AF1F
EEBD82EF
EE51EA7E
EEE027AE
EE92FD2D
EE094AE0
EE791180
EED8AF68
EEDB0036
EE411A13
EE7955A8
EE2C2EB4
EEBD749E
EEC86BD7
EED24625
EE291C1F
EEA0C747
EECB622D
EEB1A1F2
EEB19FA4
EED8A0C0
EEEFA315
EE22596F
EEF40CF0
EE376264
EE2E4FB8
EECAFF04
EE351535
EE2746FC
EE7ADC8E
EE73703D
EECD1BBE
EE2E89F0
EE4B0F3B
EE645D92
EE9B3A3C
EEE4A9B6
EEC7227F
EEFE2AA2
EE5494A9
EEFB3778
EE4DA70E
EE87183C
EEDC920E
EEFBD2BD
EE1F9912
EEE7CEE6
EEC38193
EE8F7900
EE0C3013
EE306A6B
EE36FBF1
EE515F57
EEC45CF3
EE046331
EE76B18A
EE884B63
EE463FFC
EE50233B
EE6E1FFC
EEC181B2
EEB76768
EE3A9C45
EEA29630
EECD209A
EEE43FDB
EE29EFC0
EED49E34
EE913E45
EEE7FBCD
EE0056C0
EEAC1F01
EE221E42
EE980881
EEB0AB84
EE62CA0C
EE992FD4
EE64ED75
EE4D760B
EEB8D972
EEBB8E42
EEC22767
EED8C943
EEFAB244
EE61AC40
EE14C357
EE2614B2
EE90A8E2
EEFC1742
EEC3B3A4
EEB6197E
EECB9B22
EE26EBB7
EE4C9793
EEC03EFF
EE682E03
EE520395
EE1EAA2D
EEF6D142
EE0EE6B3
EE06E765
EE18383E
EE3D422D
E

### Export the list of IDs as a JSON file

In [13]:
# Write the collected IDs to ids.json in the current working directory as an
# indented JSON array, keeping non-ASCII characters unescaped.
with open('ids.json', mode='w', encoding='utf-8') as ids_file:
    json.dump(
        list_of_ids,
        ids_file,
        ensure_ascii=False,
        indent=4,
    )