In [1]:
import sys

sys.path.append('../')

import os
import tqdm
import csv
import pydicom
from tqdm import tqdm

from pathlib import Path

In [2]:
from dcm_anonymizers.utils import ensure_dir, list_all_files

In [3]:
# Root folder of the validation dataset. Set the DEID_DATASET_ROOT environment
# variable to run this notebook on another machine; when it is unset (or empty)
# the original hard-coded location is used.
DEID_DATASET_ROOT = os.environ.get('DEID_DATASET_ROOT') or (
    '/home/r079a/Desktop/de-identification/dataset/midi-validation-data'
)

# Directory tree that the next cell walks to count the input files.
input_data_dir = Path(DEID_DATASET_ROOT, 'input_data')

In [4]:
total_dcms = 0   # running count of files returned by list_all_files
dcm_dirs = []    # directories for which list_all_files returned at least one file

# List every directory under the input root (os.walk yields the root itself too).
alldirs = [dirpath for dirpath, _subdirs, _files in os.walk(str(input_data_dir))]

# The loop variable is deliberately not called `dir`: at notebook top level it
# stays bound after the cell finishes and would shadow the builtin `dir()`.
for series_dir in alldirs:
    dcms = list_all_files(series_dir)

    if len(dcms) > 0:
        total_dcms += len(dcms)
        dcm_dirs.append(series_dir)

print(f"Total dicoms in the input data: {total_dcms}, Total series in the input data: {len(dcm_dirs)}")

Total dicoms in the input data: 23921, Total series in the input data: 280


In [5]:
def id_map_csv_to_dict(csvfile: str):
    """Load a two-column mapping CSV into a dictionary.

    The first row is treated as a header and skipped. For every following row
    the first column becomes the key and the second column the value; if a key
    occurs more than once, the last occurrence wins.

    Args:
        csvfile: Path of the CSV file to read (anything `open` accepts).

    Returns:
        dict mapping first-column values to second-column values. Empty when
        the file has no data rows.
    """
    id_map = {}
    # newline='' lets the csv module handle line endings itself, as its
    # documentation recommends (needed for quoted fields containing newlines).
    with open(csvfile, mode='r', newline='') as file:
        mapping = csv.reader(file)
        next(mapping, None)  # skip the header row; harmless on an empty file
        for row in mapping:
            # csv.reader yields [] for blank lines; rows with fewer than two
            # columns cannot form a key/value pair, so skip them instead of
            # failing with IndexError.
            if len(row) < 2:
                continue
            id_map[row[0]] = row[1]
    return id_map

def get_all_dcm_paths_from_series(anonymizer_output_path: str, series_mapping:dict, seriesUID: str):
    """Return the files found for one series inside an anonymizer output folder.

    The series folder is looked up as
    `<anonymizer_output_path>/data/<series_mapping[seriesUID]>` and listed with
    `list_all_files`.

    Args:
        anonymizer_output_path: Root of the anonymizer output, as `str` or
            `pathlib.Path`.
        series_mapping: Maps a series UID to its path relative to the `data`
            sub-folder of the output root.
        seriesUID: Series UID to look up in `series_mapping`.

    Returns:
        Whatever `list_all_files` returns for the series folder, or `None`
        (after printing the reason) when the UID has no mapped path or the
        folder yields no files. Callers must check for `None`.
    """
    series_path = series_mapping.get(seriesUID, '')
    if series_path == "":
        print(f"No path found for given series id {seriesUID}")
        return None
    # Coerce to Path so that a plain string root (as the annotation advertises)
    # works too; the `/` join below is only defined for Path objects.
    full_series_path = Path(anonymizer_output_path) / 'data' / series_path
    alldcms = list_all_files(full_series_path)
    if len(alldcms) == 0:
        print(f"No dicom found for given series id {seriesUID}")
        return None
    return alldcms

In [38]:
# Module-level tallies filled in by the two helpers below.
mismatched_dict = {}  # element name -> number of mismatches recorded
mismatched_log = {}   # "<element2val>-><element1val>" -> number of occurrences


def add_to_mismatched_dict(elementname: str):
    """Add one to the mismatch tally kept for `elementname` in `mismatched_dict`."""
    mismatched_dict[elementname] = mismatched_dict.get(elementname, 0) + 1


def add_to_mismatched_log(element1val: str, element2val: str):
    """Count one occurrence of a differing value pair in `mismatched_log`.

    The key is built as "<element2val>-><element1val>", i.e. the second
    argument appears on the left of the arrow.
    """
    pair_key = f"{element2val}->{element1val}"
    mismatched_log[pair_key] = mismatched_log.get(pair_key, 0) + 1

def compare_datasets(ds1: pydicom.Dataset, ds2: pydicom.Dataset,
                     logged_element: str = "Requested Procedure Comments"):
    """Tally the elements of `ds1` whose values differ from those in `ds2`.

    The comparison is one-directional: only elements present in `ds1` are
    visited, so elements that exist solely in `ds2` are never reported.
    Results are accumulated through `add_to_mismatched_dict` (per element
    name) and, for the element named `logged_element`, also through
    `add_to_mismatched_log` (per differing value pair).

    Rules applied per element of `ds1`:
      * missing in `ds2` with a non-empty value in `ds1` -> mismatch;
      * VR 'UI' -> never compared
        (NOTE(review): presumably because UIDs are remapped by each run - confirm);
      * VR 'SQ' -> items are compared pairwise, recursively; if `ds2` holds
        fewer items (or its element is not a sequence) the element is counted
        as one mismatch instead of raising IndexError;
      * anything else -> mismatch when the two values are not equal.

    Args:
        ds1: Dataset whose elements drive the comparison.
        ds2: Dataset the elements are looked up in.
        logged_element: Name of the element whose differing values are also
            recorded in the value log. Defaults to the element that was
            previously hard-coded.

    Returns:
        None. Results are side effects on the module-level tallies.
    """
    for element in ds1:
        element2 = ds2.get(element.tag)

        if element2 is None:
            if element.value != '':
                if element.name == logged_element:
                    add_to_mismatched_log(element.value, '')
                add_to_mismatched_dict(element.name)
                continue
            # Absent in ds2 and empty in ds1: treat the absent value as empty.
            el2val = ''
        else:
            el2val = element2.value

        if element.VR == 'UI':
            continue

        if element.VR == "SQ":
            items1 = element.value
            # Only index into ds2's value when it really is a sequence.
            if element2 is not None and element2.VR == "SQ":
                items2 = el2val
            else:
                items2 = []
            # ds1 items without a counterpart in ds2 used to crash the run with
            # IndexError; record them as a single mismatch of this element.
            if len(items2) < len(items1):
                add_to_mismatched_dict(element.name)
            # zip stops at the shorter sequence, so extra ds2 items are ignored,
            # matching the one-directional nature of the comparison.
            for sub_ds, sub_ds2 in zip(items1, items2):
                compare_datasets(sub_ds, sub_ds2, logged_element)
        elif element.value != el2val:
            if element.name == logged_element:
                add_to_mismatched_log(element.value, el2val)
            add_to_mismatched_dict(element.name)

In [39]:
# The two anonymizer output folders whose results are compared below.
output_dir_1 = Path(DEID_DATASET_ROOT) / 'output_data'
output_dir_2 = Path(DEID_DATASET_ROOT) / 'output_data_tcia_private_tags_06'

# Each output folder carries its own mappings/path_mapping.csv.
series_mapping_file_1 = output_dir_1.joinpath('mappings', 'path_mapping.csv')
series_mapping_file_2 = output_dir_2.joinpath('mappings', 'path_mapping.csv')

# Load both mapping files (first CSV column -> second CSV column), in order.
output_1_series_mapping, output_2_series_mapping = (
    id_map_csv_to_dict(mapping_file)
    for mapping_file in (series_mapping_file_1, series_mapping_file_2)
)

In [40]:
# Every series of the first output must also be mapped in the second one;
# otherwise the file-by-file comparison below cannot pair them up.
missing_series = [
    series_id
    for series_id in output_1_series_mapping
    if series_id not in output_2_series_mapping
]
assert not missing_series, (
    f"{len(missing_series)} series from output 1 are missing in output 2, "
    f"e.g. {missing_series[:5]}"
)

In [41]:
count = 0
limit = total_dcms  # lower this to compare only the first `limit` files

with tqdm(total=limit) as pbar:
    for series_id in output_1_series_mapping.keys():
        if count >= limit:
            break

        output_1_dcms = get_all_dcm_paths_from_series(output_dir_1, output_1_series_mapping, series_id)
        output_2_dcms = get_all_dcm_paths_from_series(output_dir_2, output_2_series_mapping, series_id)

        # The helper returns None (after printing why) when a series cannot be
        # resolved; skip it instead of failing on len(None).
        if output_1_dcms is None or output_2_dcms is None:
            continue

        assert len(output_1_dcms) == len(output_2_dcms), (
            f"Series {series_id}: {len(output_1_dcms)} files in output 1 "
            f"vs {len(output_2_dcms)} in output 2"
        )

        # Files are paired across the two outputs by base name, not by listing order.
        output_1_dcm_dict = {os.path.basename(dcm): dcm for dcm in output_1_dcms}
        output_2_dcm_dict = {os.path.basename(dcm): dcm for dcm in output_2_dcms}

        for dcmname, dcmpath_1 in output_1_dcm_dict.items():
            # Check the limit before reading, so exactly `limit` files are compared.
            if count >= limit:
                break

            dcmpath_2 = output_2_dcm_dict.get(dcmname)
            if dcmpath_2 is None:
                print(f"No counterpart for {dcmname} in output 2 (series {series_id})")
                continue

            ds1 = pydicom.dcmread(dcmpath_1)
            ds2 = pydicom.dcmread(dcmpath_2)

            compare_datasets(ds1, ds2)
            count += 1
            pbar.update(1)

100%|███████████████████████████████████████████████████████████████████████████████████████████████| 23921/23921 [03:10<00:00, 125.83it/s]


In [43]:
mismatched_log

{}

In [30]:
mismatched_dict

{"Patient's Name": 23921,
 'Patient ID': 23921,
 'Clinical Trial Time Point Description': 17645,
 'Image Comments': 10,
 'Additional Patient History': 618,
 'Protocol Name': 165,
 'Study Description': 1032,
 '[PET batch_description]': 547,
 'Contrast/Bolus Agent': 2,
 'Acquisition Comments': 75,
 'Series Number': 4,
 'Verifying Organization': 10,
 'Referenced Segment Number': 10,
 'Requested Procedure Description': 92,
 'Patient Comments': 15,
 'Scheduled Procedure Step Description': 11,
 'Performed Procedure Step Description': 43,
 'Series Description': 14,
 'Detector ID': 5,
 'Admitting Diagnoses Description': 55,
 'Derivation Description': 214,
 '[Actual series data time stamp]': 68,
 'Graphic Data': 4}

In [31]:
# {
#     'Clinical Trial Time Point Description': 17645,
#     'Requested Procedure Comments': 2970, # 353
#     'Image Comments': 10, # 4 -> 10
#     'Additional Patient History': 143, # 280 -> 143
#     'Comments on Radiation Dose': 18, # 8
#     'Protocol Name': 165, # 1634
#     'Study Description': 1025, # 1714
#     '[PET batch_description]': 547,
#     'Contrast/Bolus Agent': 2,
#     'Acquisition Comments': 29, #?
#     'Series Number': 4,
#     'Verifying Organization': 10,
#     'Referenced Segment Number': 10,
#     'Requested Procedure Description': 80, # 98
#     'Patient Comments': 14, #?
#     'Scheduled Procedure Step Description': 11, # 177
#     'Performed Procedure Step Description': 9, # 126
#     'Series Description': 14, # 1341
#     'Reason for the Requested Procedure': 2,
#     'Detector ID': 5,
#     'Admitting Diagnoses Description': 55, # ?
#     '[Actual series data time stamp]': 68,
#     'Gantry ID': 1,
#     'Graphic Data': 4
# }
{"Patient's Name": 23921,
 'Patient ID': 23921,
 'Clinical Trial Time Point Description': 17645,
 'Number of Time Slices': 135,
 'Comments on Radiation Dose': 18,
 'Performed Procedure Step Description': 43,
 'Additional Patient History': 618,
 'Requested Procedure Comments': 2970,
 'Protocol Name': 165,
 'KVP': 601,
 'Acquisition Number': 601,
 'Acquisition Comments': 75,
 'Graphic Data': 4,
 'Study Description': 1032,
 '[PET batch_description]': 547,
 'Image Comments': 10,
 'Verifying Organization': 10,
 'Referenced Segment Number': 10,
 'Patient Comments': 15,
 'Admitting Diagnoses Description': 55,
 'Series Description': 14,
 'Requested Procedure Description': 92,
 'Detector ID': 5,
 'Contrast/Bolus Agent': 2,
 'Derivation Description': 214,
 'Planar Configuration': 14,
 'Series Number': 4,
 '[Actual series data time stamp]': 68,
 'Reason for the Requested Procedure': 2,
 'Scheduled Procedure Step Description': 11,
 'Gantry ID': 1,
 'Referenced Patient Sequence': 45}

{"Patient's Name": 23921,
 'Patient ID': 23921,
 'Clinical Trial Time Point Description': 17645,
 'Number of Time Slices': 135,
 'Comments on Radiation Dose': 18,
 'Performed Procedure Step Description': 43,
 'Additional Patient History': 618,
 'Requested Procedure Comments': 2970,
 'Protocol Name': 165,
 'KVP': 601,
 'Acquisition Number': 601,
 'Acquisition Comments': 75,
 'Graphic Data': 4,
 'Study Description': 1032,
 '[PET batch_description]': 547,
 'Image Comments': 10,
 'Verifying Organization': 10,
 'Referenced Segment Number': 10,
 'Patient Comments': 15,
 'Admitting Diagnoses Description': 55,
 'Series Description': 14,
 'Requested Procedure Description': 92,
 'Detector ID': 5,
 'Contrast/Bolus Agent': 2,
 'Derivation Description': 214,
 'Planar Configuration': 14,
 'Series Number': 4,
 '[Actual series data time stamp]': 68,
 'Reason for the Requested Procedure': 2,
 'Scheduled Procedure Step Description': 11,
 'Gantry ID': 1,
 'Referenced Patient Sequence': 45}

In [13]:
import pandas as pd

In [14]:
# Discrepancy reports of two anonymizer runs, located under the same dataset
# root as everything else in this notebook instead of repeating the absolute path.
report_1_path = os.path.join(DEID_DATASET_ROOT, 'output_data_tcia_private_tags_06', 'discrepancy_report_participant.csv')
report_2_path = os.path.join(DEID_DATASET_ROOT, 'output_data_tcia_private_tags_05', 'discrepancy_report_participant.csv')

In [15]:
report_1_df = pd.read_csv(report_1_path)

In [16]:
# Frequency of each tag name in the first report, limited to the 50 most common.
report_1_tags = report_1_df['tag_name'].value_counts().head(50)

# Same data as a two-column table: the tag name and how often it occurs.
report_1_tags_df = (
    report_1_tags
    .rename_axis('tag_name')
    .reset_index(name='count')
)
report_1_tags_df

Unnamed: 0,tag_name,count
0,<Clinical Trial Time Point Description>,17645
1,<Study Description>,1714
2,<Protocol Name>,1634
3,<Series Description>,1341
4,<Position Reference Indicator>,878
5,<KVP>,601
6,<Acquisition Number>,601
7,<[PET batch_description]>,547
8,<Requested Procedure Comments>,353
9,<Additional Patient History>,280


In [17]:
# sample_dcm_path = "/home/r079a/Desktop/de-identification/dataset/midi-validation-data/input_data/4312559682/3.1.292.1.1.1931822.2.050.1648578733778415208/3.1.292.1.1.1931822.2.050.1829936347311263175/00000043.dcm"

# ds = pydicom.dcmread(sample_dcm_path)

In [18]:
# print(ds)

In [19]:
# output_sample_path = "/home/r079a/Desktop/de-identification/dataset/midi-validation-data/output_data_tcia_private_tags_06/data/Pseudo-PHI-152/1.2.826.0.1.3680043.8.498.38125644669066907523185955651810491497/1.2.826.0.1.3680043.8.498.11650675140155247984597992913196329429/00000043.dcm"

# ds = pydicom.dcmread(output_sample_path)

In [20]:
# print(ds)