# Extracting data from DICOM files
### This notebook extracts data from DICOM files using their DICOM tags.

#### Installing required packages:

1. Create a new environment (https://docs.python.org/3/library/venv.html)

```bash
python3 -m venv /path/to/new/virtual/environment
```
2. Activate the new environment

```bash
source /path/to/new/virtual/environment/bin/activate
```

3. Install required packages

```python
pip install -r requirements.txt
```

4. Run the notebook.  :)

In [1]:

import numpy as np
from pydicom import dcmread
import re
import wget
import zipfile
import os
import glob
from tqdm.notebook import tqdm
import csv

Download an archive containing examples of different DICOM series (~4.1 GB) and unzip it.  
`DICOM_files`: list of the paths of all the DICOM files found in the archive

In [14]:
# Fetch the example archive only when it is not already on disk.
dicomArchive = "exampleOsirisImage.zip"
if not os.path.isfile(dicomArchive):
    dicomArchive = wget.download("https://www.creatis.insa-lyon.fr/~baudier/exampleOsirisImage.zip")

# Extraction is checked independently of the download: an archive that is
# already present but was never unpacked must still be extracted, otherwise
# the glob below silently returns an empty list.
if not os.path.isdir("exampleOsirisImage"):
    with zipfile.ZipFile(dicomArchive, 'r') as zip_ref:
        zip_ref.extractall(".")

# Recursively collect every file with a .dcm extension (any letter case).
DICOM_files = glob.glob('exampleOsirisImage/**/*.[dD][cC][mM]', recursive=True)


Print some DICOM file paths

In [3]:
# Preview the discovered paths, capped at the first 100 entries.
print(DICOM_files[0:100])

['exampleOsirisImage/HP/RTDOSE/(0008,1030)_LO_(no_value)_________________________________________#_0,1_Study_Description/20210621/084522/2_/Phase_#1_Dosi_DKC_DKC_(Soft_Tissues):_PT_-_5384_-_6654_(GY)_/1.3.6.1.4.1.33868.20210621084522.826595.dcm', 'exampleOsirisImage/HP/RTDOSE/(0008,1030)_LO_(no_value)_________________________________________#_0,1_Study_Description/20210621/084457/4.41806_/Phase_#1_Dosi_DKC_DKC_(Soft_Tissues):_NM_-_SHUNT_PULMONAIRE_-_3_/1.3.6.1.4.1.33868.20210621084457.432480.dcm', 'exampleOsirisImage/HP/CT/SHUNT_PULMONAIRE/20210415/160250/1.250000/CT_THERA_1.25mm_/1.2.840.113619.2.55.3.2831157704.29.1618467837.736.70.dcm', 'exampleOsirisImage/HP/CT/SHUNT_PULMONAIRE/20210415/160250/1.250000/CT_THERA_1.25mm_/1.2.840.113619.2.55.3.2831157704.29.1618467837.736.228.dcm', 'exampleOsirisImage/HP/CT/SHUNT_PULMONAIRE/20210415/160250/1.250000/CT_THERA_1.25mm_/1.2.840.113619.2.55.3.2831157704.29.1618467837.736.64.dcm', 'exampleOsirisImage/HP/CT/SHUNT_PULMONAIRE/20210415/160250/1.

### Create the dictionary `dictTags` mapping each output column name to its DICOM tag(s), as listed in the Excel file  
Start writing the output CSV (header row)

In [18]:
# Output column name -> list of (group, element) DICOM tags, written as hex strings.
# A column may list more than one tag (see "Study_acquisition-date_time").
dictTags = {
    # Patient / study level
    "Patient_Id": [('0x0010', '0x0020')],
    "Study_Study_Instance_UID": [('0x0020', '0x000d')],
    "Study_Study_Description": [('0x0008', '0x1030')],
    "Study_acquisition-date_time": [
        ('0x0008', '0x0022'),
        ('0x0008', '0x0032'),
    ],
    "Study_Institution_Name": [('0x0008', '0x0080')],
    "Study_Modalities_in_Study": [('0x0008', '0x0061')],
    "Study_Number_of_Study_Related_Series": [('0x0020', '0x1206')],
    # Series level
    "Series_Series_Number": [('0x0020', '0x0011')],
    "Series_Series_Instance_UID": [('0x0020', '0x000e')],
    "Series_Modality": [('0x0008', '0x0060')],
    "Series_Description": [('0x0008', '0x103e')],
    "Series_Body_Part_Examined": [('0x0018', '0x0015')],
    "Series_Number_of_Series_Related_Instances": [('0x0020', '0x1209')],
    "Series_Software_Version": [('0x0018', '0x1020')],
    # Image level
    "Common_Image_SOP_Instance_UID": [('0x0008', '0x0018')],
    "Common_Image_Slice_thickness": [('0x0018', '0x0050')],
    "Common_Image_Pixel_spacing": [('0x0028', '0x0030')],
    "Common_Image_Rows": [('0x0028', '0x0010')],
    "Common_Image_Columns": [('0x0028', '0x0011')],
}

# CSV columns: every tag-backed column followed by three extra columns
# that are not read from a DICOM tag.
columns = [*dictTags, "Instance_Id", "Study_location", "Common_Image_Field_of_View"]
print(columns)

# Create (or truncate) pivot.csv so that it contains only the header row.
with open('pivot.csv', 'w', newline='') as csvfile:
    csv.DictWriter(csvfile, fieldnames=columns).writeheader()


['Patient_Id', 'Study_Study_Instance_UID', 'Study_Study_Description', 'Study_acquisition-date_time', 'Study_Institution_Name', 'Study_Modalities_in_Study', 'Study_Number_of_Study_Related_Series', 'Series_Series_Number', 'Series_Series_Instance_UID', 'Series_Modality', 'Series_Description', 'Series_Body_Part_Examined', 'Series_Number_of_Series_Related_Instances', 'Series_Software_Version', 'Common_Image_SOP_Instance_UID', 'Common_Image_Slice_thickness', 'Common_Image_Pixel_spacing', 'Common_Image_Rows', 'Common_Image_Columns', 'Instance_Id', 'Study_location', 'Common_Image_Field_of_View']


Look up all the tags defined above in every DICOM file and write them to the pivot-like CSV file called "pivot.csv"

In [19]:
# Look up every tag listed in dictTags in each DICOM file and write one row
# per file to pivot.csv.
#
# The CSV is opened once, in write mode, and the header is written here, so
# re-running this cell replaces the rows instead of appending duplicates, and
# the file is not reopened for each of the (many) DICOM files.
indexFile = 0
with open('pivot.csv', 'w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=columns)
    writer.writeheader()
    for dicom in tqdm(DICOM_files):
        # All requested tags are header elements, so the pixel data is not read.
        DICOM_file = dcmread(dicom, stop_before_pixels=True)
        dictFile = {}
        for key, tags in dictTags.items():
            values = []
            for group, element in tags:
                try:
                    raw_value = DICOM_file[group, element].value
                except KeyError:
                    # Tag absent from this file: it contributes nothing to the column.
                    continue
                if raw_value is None:
                    # Element present but empty.
                    continue
                # Values are not always strings (numbers, multi-valued elements),
                # so convert explicitly; concatenating them to a str would raise
                # TypeError and the value would be lost.
                values.append(str(raw_value))
            # Several tags for one column are joined with a single space.
            dictFile[key] = " ".join(values)

        # Columns that do not come from a DICOM tag.
        dictFile["Instance_Id"] = indexFile
        dictFile["Study_location"] = dicom
        dictFile["Common_Image_Field_of_View"] = ""
        indexFile += 1
        writer.writerow(dictFile)



  0%|          | 0/18546 [00:00<?, ?it/s]