## Introduction

This script iterates over all the original DICOM files exported from the PACS system, copies every image to a final folder, and names each copy after its corresponding ID.

This serves as the first step in the workflow for AI density analysis of mammography according to the project "OncoServe: Deploying Deep Learning Models for Breast Cancer Risk Assessment, and Breast Density Assessment."

Reference: https://github.com/yala/OncoServe_Public - Tool and all associated code provided for under MIT License.

In [None]:
import os
import fnmatch
import sys
import shutil
import json
from pydicom import dcmread

### Set folders and lists

In [None]:
# Folder that is walked recursively for the original DICOM files.
rootFolderPath = r'G:\mamografiasCHTMAD'
# Folder that receives the renamed copies. Written as a raw string: in a
# normal string literal "\m" is an invalid escape sequence that only works by
# accident and triggers a SyntaxWarning on recent Python versions.
targetdir = r"G:\mamografiaseditadas"
# IDs read from the processed files, one entry per file, in walk order.
list_of_ids = []

### Defining relevant functions:

In [None]:
def get_processo(filetoread):
    """Return the value of DICOM element (0010,1000) stored in ``filetoread``.

    The file is parsed with ``dcmread(..., force=True)``. Looking up the
    element by tag raises ``KeyError`` when the dataset does not contain it;
    that exception is left for the caller to handle.
    """
    dataset = dcmread(filetoread, force=True)
    return dataset[0x0010, 0x1000].value

In [None]:
#Just one dicom file per patient
def copyandrename(source,process):
    """Copy ``source`` into ``targetdir`` under the name ``<process>.dcm``.

    A file that already has that name is overwritten by ``shutil.copy``, so
    only the most recently copied file is kept for each ``process`` value.
    """
    new_name = process + ".dcm"
    destination = targetdir + "/" + new_name
    shutil.copy(source, destination)

In [None]:
#All the 4 dicom files per patient, corresponding to both CC and MLO mammography (Left and Right)
def safe_copy(source, file_finish_copy, out_dir=None):
    """Copy ``source`` into the output folder without overwriting earlier copies.

    The copy is named ``<id>.dcm``. When that name is already taken, the
    names ``<id>_1.dcm``, ``<id>_2.dcm`` and ``<id>_3.dcm`` are tried in order,
    so at most four files are kept per ID. Any further file for the same ID
    is skipped without copying.

    :param str source: Path of the file to copy.
    :param str file_finish_copy: ID used as the base name of the copy. The
        parameter name is historical: callers pass the ID here, not a path.
    :param str out_dir: Directory to copy the file into. If None, use the
        module-level ``targetdir``.
    """
    if out_dir is None:
        out_dir = targetdir

    # Use the ID that was passed in rather than whatever a global named
    # ``process`` happens to hold at call time.
    extension = ".dcm"
    first_choice = os.path.join(out_dir, file_finish_copy + extension)
    if not os.path.exists(first_choice):
        shutil.copy(source, first_choice)
        return

    # Name already used: fall back to the first free numbered variant.
    for i in range(1, 4):
        candidate = os.path.join(
            out_dir, '{}_{}{}'.format(file_finish_copy, i, extension)
        )
        if not os.path.exists(candidate):
            shutil.copy(source, candidate)
            return
    # All four slots are taken: leave the existing copies untouched.

### Iterate all the files and subfolders in the dataset source folder
#### Extract the list of exam IDs to allow comparison with the radiologist's annotations (ground truth)

In [None]:
# Walk the whole source tree, copy every DICOM file into the target folder
# under its ID, and record that ID.
# Suffixes of files found in the tree that are never treated as DICOM images.
skipped_suffixes = (".ipynb", ".TXT", ".XML", ".json")

for root, dirs, files in os.walk(rootFolderPath):
    for filename in files:
        # DICOMDIR index files and "._" files are skipped as well.
        if (
            filename == "DICOMDIR"
            or filename.startswith("._")
            or filename.endswith(skipped_suffixes)
        ):
            continue

        source = os.path.join(root, filename)
        try:
            process = get_processo(source)
        except KeyError:
            # Element (0010,1000) is missing. Skip the file instead of
            # copying it under the ID left over from the previous file.
            print(
                "Skipping {}: no ID element (0010,1000) found".format(source),
                file=sys.stderr,
            )
            continue

        safe_copy(source, process)
        list_of_ids.append(process)

### Export the list of IDs as a JSON file

In [None]:
# Write every collected ID to ids.json in the current working directory as an
# indented, UTF-8 encoded JSON array (non-ASCII characters are kept as-is).
with open('ids.json', mode='w', encoding='utf-8') as ids_file:
    json.dump(
        list_of_ids,
        ids_file,
        indent=4,
        ensure_ascii=False,
    )