# Extracting data from DICOM files
### This notebook extract data from Dicom files using the dicomTags.

#### Installing required packages:

1. Create a new environment (https://docs.python.org/3/library/venv.html)

```python
python3 -m venv /path/to/new/virtual/environment
```
2. Activate the new environment

```python
source env/bin/activate
```

3. Install required packages

```python
pip install -r requirements.txt
```

4. Run the notebook.  :)

In [1]:
from pydicom import dcmread
import os
import glob
from tqdm.notebook import tqdm
import csv

## Find data
Download data from https://wiki.cancerimagingarchive.net/display/Public/Soft-tissue-Sarcoma (~10GB)  
and look if they are present in folder "data"

In [2]:
if not os.path.isdir("Soft-tissue-Sarcoma"):
    print("Downlad data before")

DICOM_files = glob.glob('Soft-tissue-Sarcoma/**/*.[dD][cC][mM]', recursive=True)


## Print some dicom file paths

In [3]:
print(DICOM_files[:10])

['Soft-tissue-Sarcoma/STS_010/12-30-2003-NA-CT PET with registered MR-74609/48044.000000-AlignedT1toPETBOX-18519/1-81.dcm', 'Soft-tissue-Sarcoma/STS_010/12-30-2003-NA-CT PET with registered MR-74609/48044.000000-AlignedT1toPETBOX-18519/1-42.dcm', 'Soft-tissue-Sarcoma/STS_010/12-30-2003-NA-CT PET with registered MR-74609/48044.000000-AlignedT1toPETBOX-18519/1-56.dcm', 'Soft-tissue-Sarcoma/STS_010/12-30-2003-NA-CT PET with registered MR-74609/48044.000000-AlignedT1toPETBOX-18519/1-57.dcm', 'Soft-tissue-Sarcoma/STS_010/12-30-2003-NA-CT PET with registered MR-74609/48044.000000-AlignedT1toPETBOX-18519/1-43.dcm', 'Soft-tissue-Sarcoma/STS_010/12-30-2003-NA-CT PET with registered MR-74609/48044.000000-AlignedT1toPETBOX-18519/1-80.dcm', 'Soft-tissue-Sarcoma/STS_010/12-30-2003-NA-CT PET with registered MR-74609/48044.000000-AlignedT1toPETBOX-18519/1-82.dcm', 'Soft-tissue-Sarcoma/STS_010/12-30-2003-NA-CT PET with registered MR-74609/48044.000000-AlignedT1toPETBOX-18519/1-55.dcm', 'Soft-tissue-Sa

### Create the dictionaries containing the dicom name and dicom tag from the pivot csv files  


In [4]:
dictTagStudy = {
                "Patient_Id": [('0x0010', '0x0020')],
                "Study_StudyInstanceUID": [('0x0020', '0x000d')],
                "Study_StudyDescription": [('0x0008', '0x1030')],
                "Study_AcquisitionDate": [('0x0008', '0x0022'), ('0x0008', '0x0032')],
                "Study_InstitutionName": [('0x0008', '0x0080')],
                "Study_ModalitiesInStudy": [('0x0008', '0x0061')],
                "Study_NbStudyRelatedSeries": [('0x0020', '0x1206')],
                }

dictTagSeries = {
                "Patient_Id": [('0x0010', '0x0020')],
                "Series_SeriesNumber": [('0x0020', '0x0011')],
                "Series_SeriesInstanceUID": [('0x0020', '0x000e')],
                "Series_Modality": [('0x0008', '0x0060')],
                "Series_Description": [('0x0008', '0x103e'), ('0x0008', '0x1030')],
                "Series_BodyPartExamined": [('0x0018', '0x0015')],
                "Series_NbSeriesRelatedInstances": [('0x0020', '0x1209')],
                "Series_SoftwareVersion": [('0x0018', '0x1020')],
                }


dictTagEquipment = {
                "Patient_Id": [('0x0010', '0x0020')],
                "Equipment_ModelName": [('0x0008', '0x1090')],
                "Equipment_Manufacturer": [('0x0008', '0x0070')],
                   }

dictTagInjection = {
                "Patient_Id": [('0x0010', '0x0020')],
                "Injection_Radiopharmaceutical": [('0x0018', '0x0031')],
                "Injection_ContrastBolusAgent": [('0x0018', '0x0010')],
                "Injection_ContrastBolusStartTime": [('0x0018', '0x1042')],
                "Injection_ContrastBolusStopTime": [('0x0018', '0x1043')],
                "Injection_RadiopharmaceuticalStartTime": [('0x0018', '0x1072')],
                "Injection_RadionuclideTotalDose": [('0x0018', '0x1074')],
                   }

dictTagROISegmentation = {
                "Patient_Id": [('0x0010', '0x0020')],
                "ROISegmentation_ReferencedSOPInstanceUID": [('0x0008', '0x1150')],
                "ROISegmentation_ROINumber": [('0x3006', '0x0084')],
                "ROISegmentation_ROIName": [('0x3006', '0x0026')],
                "ROISegmentation_ROIDescription": [('0x3006', '0x0028')],
                "ROISegmentation_ROIType": [],
                "ROISegmentation_ROIFilename": [],
                         }

dictTagCommonImage = {
                "Patient_Id": [('0x0010', '0x0020')],
                "CommonImage_SOPInstanceUID": [('0x0008', '0x0018')],
                "CommonImage_SliceThickness": [('0x0018', '0x0050')],
                "CommonImage_PixelSpacing": [('0x0028', '0x0030')],
                "CommonImage_FieldOfView": [],
                "CommonImage_Rows": [('0x0028', '0x0010')],
                "CommonImage_Columns": [('0x0028', '0x0011')],
                     }

dictTagCTImage = {
                "Patient_Id": [('0x0010', '0x0020')],
                "CTImage_KVp": [('0x0018', '0x0060')],
                "CTImage_XRayTubeCurrent": [('0x0018', '0x1151')],
                "CTImage_ExposureTime": [('0x0018', '0x1150')],
                "CTImage_SpiralPitchFactor": [('0x0018', '0x9311')],
                "CTImage_FilterType": [('0x0018', '0x1160')],
                "CTImage_ConvolutionKernel": [('0x0018', '0x1210')],
                 }

dictTagDXImage = {
                "Patient_Id": [('0x0010', '0x0020')],
                "DXImage_ImageLaterality": [('0x0020', '0x0062')],
                "DXImage_PatientOrientation": [('0x0020', '0x0020')],
                "DXImage_AnatomicRegionSequenceCodeMeaning": [('0x0008', '0x0104')],
                "DXImage_AnatomicRegionSequenceCodeValue": [('0x0008', '0x0100')],
                "DXImage_KVP": [('0x0008', '0x0060')],
                "DXImage_Exposure": [('0x0018', '0x1152')],
                "DXImage_ExposureTime": [('0x0018', '0x1150')],
                "DXImage_ContrastBolusAgent": [('0x0018', '0x0010')],
                 }

dictTagMRImage = {
                "Patient_Id": [('0x0010', '0x0020')],
                "MRImage_SequenceName": [('0x0018', '0x0024')],
                "MRImage_MagneticFieldStrength": [('0x0018', '0x0087')],
                "MRImage_MRAcquisitionType": [('0x0018', '0x0023')],
                "MRImage_RepetitionTime": [('0x0018', '0x0080')],
                "MRImage_EchoTime": [('0x0018', '0x0081')],
                "MRImage_ImagingFrequency": [('0x0018', '0x0084')],
                "MRImage_FlipAngle": [('0x0018', '0x0024')],
                "MRImage_InversionTime": [('0x0018', '0x0082')],
                "MRImage_ReceiveCoilName": [('0x0018', '0x1250')],
                 }

dictTagNMImage = {
                "Patient_Id": [('0x0010', '0x0020')],
                "NMImage_AttenuationCorrectionMethod": [('0x0054', '0x1101')],
                "NMImage_ReconstructionMethod": [('0x0054', '0x1103')],
                "NMImage_ScatterCorrectionMethod": [('0x0054', '0x1105')],
                 }

dictTagPTImage = {
                "Patient_Id": [('0x0010', '0x0020')],
                "PTImage_AttenuationCorrectionMethod": [('0x0054', '0x1101')],
                "PTImage_ReconstructionMethod": [('0x0054', '0x1103')],
                "PTImage_ScatterCorrectionMethod": [('0x0054', '0x1105')],
                 }





## Define some usefull class and functions
The class Pivot represent a pivot csv file

In [5]:
# Return the concatenation of all tag values from DICOM_file
def get_tag_value(DICOM_file, tags):
    value = ""
    for tag in tags:
        try:
            value += " " + str(DICOM_file[tag[0], tag[1]].value)
        except:
            pass
    return(value)

class Pivot:
    # Create a pivot element with:
    #  - a dictionary containing all the entries (= aline in the .csv file). The key is self.key
    #  - instanceId is an int = line of the .csv file
    #  - key is the key of the self.dict
    #  - dictTag is a dictionay containing the Header of the .csv files and the corresponding dicom tags
    #  - columns is an array containing the header of the csv file and starts with Patient_Id and Instance_Id
    def __init__(self, key, dictTag):
        self.dict = {}
        self.instanceId = 0
        self.key = key
        self.dictTag = dictTag
        self.columns = ["Patient_Id", "Instance_Id"] + list(self.dictTag.keys())[1:]

    # Return the list of element already present in the dict
    def elements(self):
        return(list(self.dict.keys()))
    
    # Fill the dict with the current DICOM_file (= add a line in the .csv file)
    def fill(self, DICOM_file):
        self.dict[DICOM_file[self.key].value] = {}
        for el in list(self.dictTag.keys()):
            self.dict[DICOM_file[self.key].value][el] = get_tag_value(DICOM_file, self.dictTag[el])
        self.dict[DICOM_file[self.key].value]["Instance_Id"] = self.instanceId
        self.instanceId += 1

    # Add reference from another pivot and add the entry after Patient_Id and Instance_Id in the header if not present
    def addRef(self, DICOM_file, name, pivot):
        self.dict[DICOM_file[self.key].value][name] = pivot.dict[DICOM_file[pivot.key].value]["Instance_Id"]
        if name not in self.columns:
            self.columns = self.columns[0:2] + [name] + self.columns[2:]
    
    # Write the pivot as .csv
    def write(self, output):
        with open(output, 'w') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=self.columns)
            writer.writeheader()
            for el in self.dict.keys():
                writer.writerow(self.dict[el])
    

## Initialize the pivots

In [6]:
pivotStudy = Pivot(('0x0020', '0x000d'), dictTagStudy)
pivotSeries = Pivot(('0x0020', '0x000e'), dictTagSeries)
pivotEquipment = Pivot(('0x0020', '0x000e'), dictTagEquipment)
pivotInjection = Pivot(('0x0020', '0x000e'), dictTagInjection)
pivotROISegmentation = Pivot(('0x0008', '0x0018'), dictTagROISegmentation)
pivotCommonImage = Pivot(('0x0008', '0x0018'), dictTagCommonImage)
pivotCTImage = Pivot(('0x0008', '0x0018'), dictTagCTImage)
pivotDXImage = Pivot(('0x0008', '0x0018'), dictTagDXImage)
pivotMRImage = Pivot(('0x0008', '0x0018'), dictTagMRImage)
pivotNMImage = Pivot(('0x0008', '0x0018'), dictTagNMImage)
pivotPTImage = Pivot(('0x0008', '0x0018'), dictTagPTImage)

## Fill the pivots
Read all dicoms files and add to the pivots if necessary

In [7]:
for dicom in tqdm(DICOM_files):
    #Read the dicom file
    DICOM_file = dcmread(dicom)

    #If the SOP Study UID is not present, I create the entry inside the pivot
    if DICOM_file[('0x0020', '0x000d')].value not in pivotStudy.elements():
        pivotStudy.fill(DICOM_file)
        pivotStudy.dict[DICOM_file[('0x0020', '0x000d')].value]["Study_Location"] = []
    pivotStudy.dict[DICOM_file[('0x0020', '0x000d')].value]["Study_Location"].append(dicom)
    
    #If the SOP Series UID is not present, I create the entry inside the pivots
    if DICOM_file[('0x0020', '0x000e')].value not in pivotSeries.elements():
        pivotSeries.fill(DICOM_file)
        pivotSeries.addRef(DICOM_file, "Study_Ref", pivotStudy)
        
        pivotEquipment.fill(DICOM_file)
        pivotEquipment.addRef(DICOM_file, "Series_Ref", pivotSeries)
        
        pivotInjection.fill(DICOM_file)
        pivotInjection.addRef(DICOM_file, "Series_Ref", pivotSeries)

    #For all dicom files, add the element to common image pivot or ROI segmentation pivot
    if DICOM_file[('0x0008', '0x0060')].value in ["CT", "MR", "PT", "DX", "NM"]:
        pivotCommonImage.fill(DICOM_file)
        pivotCommonImage.addRef(DICOM_file, "Series_Ref", pivotSeries)
        
        if DICOM_file[('0x0008', '0x0060')].value == "CT":
            pivotCTImage.fill(DICOM_file)
            pivotCTImage.addRef(DICOM_file, "CommonImage_Ref", pivotCommonImage)
        
        elif DICOM_file[('0x0008', '0x0060')].value == "DX":
            pivotDXImage.fill(DICOM_file)
            pivotDXImage.addRef(DICOM_file, "CommonImage_Ref", pivotCommonImage)
        
        elif DICOM_file[('0x0008', '0x0060')].value == "MR":
            pivotMRImage.fill(DICOM_file)
            pivotMRImage.addRef(DICOM_file, "CommonImage_Ref", pivotCommonImage)
        
        elif DICOM_file[('0x0008', '0x0060')].value == "NM":
            pivotNMImage.fill(DICOM_file)
            pivotNMImage.addRef(DICOM_file, "CommonImage_Ref", pivotCommonImage)
        
        elif DICOM_file[('0x0008', '0x0060')].value == "PT":
            pivotPTImage.fill(DICOM_file)
            pivotPTImage.addRef(DICOM_file, "CommonImage_Ref", pivotCommonImage)
            
    elif DICOM_file[('0x0008', '0x0060')].value == "RTSTRUCT":
        pivotROISegmentation.fill(DICOM_file)
        pivotROISegmentation.addRef(DICOM_file, "Series_Ref", pivotSeries)
        

  0%|          | 0/38283 [00:00<?, ?it/s]

## Write the pivot as csv files
For pivotStudy, I need to add the Study_Location to the columns

In [8]:
pivotStudy.columns = pivotStudy.columns[0:6] + ["Study_Location"] + pivotStudy.columns[6:]

pivotStudy.write("OSIRIS_pivot_Study.csv")
pivotSeries.write("OSIRIS_pivot_Series.csv")
pivotEquipment.write("OSIRIS_pivot_Equipment.csv")
pivotInjection.write("OSIRIS_pivot_Injection.csv")
pivotROISegmentation.write("OSIRIS_pivot_ROISegmentation.csv")
pivotCommonImage.write("OSIRIS_pivot_CommonImage.csv")
pivotCTImage.write("OSIRIS_pivot_CTImage.csv")
pivotDXImage.write("OSIRIS_pivot_DXImage.csv")
pivotMRImage.write("OSIRIS_pivot_MRImage.csv")
pivotNMImage.write("OSIRIS_pivot_NMImage.csv")
pivotPTImage.write("OSIRIS_pivot_PTImage.csv")