In [1]:
import pandas as pd

In [2]:
rules_table = pd.read_html('../docs/hippa_safe_harbour_rules.html')[0]

In [3]:
rules_table

Unnamed: 0,0,1,2,3,4
0,Tag,Attribute Name,TCIA Profile,TCIA Implementation,Final CTP Script
1,"(0000,1000)",Affected SOP Instance UID,X,remove,@remove()
2,"(0000,1001)",Requested SOP Instance UID,U,hashuid,"@hashuid(@UIDROOT,this)"
3,"(0002,0003)",Media Storage SOP Instance UID,U,hashuid,"@hashuid(@UIDROOT,this)"
4,"(0004,1511)",Referenced SOP Instance UID in File,U,hashuid,"@hashuid(@UIDROOT,this)"
...,...,...,...,...,...
606,"(0074,1236)",Requesting AE,X,remove,@remove()
607,"(0008,0054)",Retrieve AE Title,X,remove,@remove()
608,"(0072,005E)",Selector AE Value,D,remove,@remove()
609,"(0064,0003)",Source Frame of Reference UID,U,hashuid,"@hashuid(@UIDROOT,this)"


In [4]:
new_header = rules_table.iloc[0] #grab the first row for the header
rules_table = rules_table[1:]
rules_table.columns = new_header #set the header row as the df header

In [5]:
rules_table

Unnamed: 0,Tag,Attribute Name,TCIA Profile,TCIA Implementation,Final CTP Script
1,"(0000,1000)",Affected SOP Instance UID,X,remove,@remove()
2,"(0000,1001)",Requested SOP Instance UID,U,hashuid,"@hashuid(@UIDROOT,this)"
3,"(0002,0003)",Media Storage SOP Instance UID,U,hashuid,"@hashuid(@UIDROOT,this)"
4,"(0004,1511)",Referenced SOP Instance UID in File,U,hashuid,"@hashuid(@UIDROOT,this)"
5,"(0008,0012)",Instance Creation Date,C,incrementdate,"@incrementdate(this,@DATEINC)"
...,...,...,...,...,...
606,"(0074,1236)",Requesting AE,X,remove,@remove()
607,"(0008,0054)",Retrieve AE Title,X,remove,@remove()
608,"(0072,005E)",Selector AE Value,D,remove,@remove()
609,"(0064,0003)",Source Frame of Reference UID,U,hashuid,"@hashuid(@UIDROOT,this)"


In [6]:
rules_table['TCIA Profile'].value_counts()

TCIA Profile
C    284
X    202
U     54
Z     32
D     29
K      9
Name: count, dtype: int64

In [7]:
rules_table['TCIA Implementation'].value_counts()

TCIA Implementation
remove           237
keep             121
incrementdate     96
hashuid           55
time              43
empty             22
replace           20
process           12
lookup             2
hashname           1
Remove Unsafe      1
Name: count, dtype: int64

In [8]:
unique_functions = rules_table['TCIA Implementation'].unique()
unique_functions

array(['remove', 'hashuid', 'incrementdate', 'time', 'empty', 'keep',
       'process', 'lookup', 'replace', 'hashname', 'Remove Unsafe'],
      dtype=object)

In [9]:
def tag_str_to_hex_str(dicomtag: str):
    # Removing parentheses and splitting the string
    dicomtag = dicomtag.strip("()").split(",")

    # Converting to a tuple of hex numbers
    hextuple = f"(0x{dicomtag[0].strip()}, 0x{dicomtag[1].strip()})"

    return hextuple

def rules_to_tag_name(rule:str):
    rule = rule.replace('/', '_')
    rule = rule.replace('*', '_STAR')
    return f"{rule}_TAGS"

def filter_list_from_list(target: list, source: list):
    filtered = []
    for item in target:
        if item in source:
            continue
        filtered.append(item)
    return filtered

In [10]:
rules = {}

for r in unique_functions:
    rule_tags = rules_table.loc[
        (rules_table['TCIA Implementation'] == r), 
        'Tag'
    ]
    rule_tags = rule_tags.tolist()
    rule_tags = [tag_str_to_hex_str(t) for t in rule_tags]
    rules[r] = rule_tags

In [11]:
rules

{'remove': ['(0x0000, 0x1000)',
  '(0x0008, 0x0080)',
  '(0x0008, 0x0081)',
  '(0x0008, 0x0082)',
  '(0x0008, 0x0092)',
  '(0x0008, 0x0094)',
  '(0x0008, 0x0096)',
  '(0x0008, 0x009D)',
  '(0x0008, 0x0201)',
  '(0x0008, 0x1010)',
  '(0x0008, 0x1040)',
  '(0x0008, 0x1041)',
  '(0x0008, 0x1048)',
  '(0x0008, 0x1049)',
  '(0x0008, 0x1050)',
  '(0x0008, 0x1052)',
  '(0x0008, 0x1060)',
  '(0x0008, 0x1062)',
  '(0x0008, 0x1070)',
  '(0x0008, 0x1072)',
  '(0x0008, 0x1120)',
  '(0x0008, 0x2111)',
  '(0x0008, 0x4000)',
  '(0x0010, 0x0021)',
  '(0x0010, 0x0032)',
  '(0x0010, 0x0050)',
  '(0x0010, 0x0101)',
  '(0x0010, 0x0102)',
  '(0x0010, 0x1000)',
  '(0x0010, 0x1001)',
  '(0x0010, 0x1002)',
  '(0x0010, 0x1005)',
  '(0x0010, 0x1040)',
  '(0x0010, 0x1050)',
  '(0x0010, 0x1060)',
  '(0x0010, 0x1080)',
  '(0x0010, 0x1081)',
  '(0x0010, 0x1090)',
  '(0x0010, 0x1100)',
  '(0x0010, 0x2110)',
  '(0x0010, 0x2150)',
  '(0x0010, 0x2152)',
  '(0x0010, 0x2154)',
  '(0x0010, 0x2155)',
  '(0x0010, 0x2180)',


In [12]:
import json

with open('../docs/tcia_supplied_deid_attrs.json', 'w', encoding='utf-8') as f:
    json.dump(rules, f, ensure_ascii=False, indent=4)

### Check the TCIA anonymizer

In [13]:
import sys
from pathlib import Path
# setting path
sys.path.append('../')

In [14]:
rootdir = '/home/r079a/Desktop/de-identification/'

raw_img_path = Path(rootdir, 'dataset/images/manifest-1617826555824', 
                    'Pseudo-PHI-DICOM-Data/3209648408/09-23-1999-NA-CT UROGRAM-31798/3.000000-PARENCHYMAL PHASE Sep1999-95798')
deid_img_path = Path(rootdir, 'dataset/images-2/manifest-1617826161202', 
                     'Pseudo-PHI-DICOM-Data/Pseudo-PHI-004/09-05-1989-NA-CT UROGRAM-19189/3.000000-PARENCHYMAL PHASE-31670')
output_path = Path(rootdir, 'dicom-output/v2_outputs')

first_dcm_name = '1-001.dcm'

sample_img_path = Path(raw_img_path, first_dcm_name)
deid_img_path = Path(deid_img_path, first_dcm_name)
output_file = Path(output_path, first_dcm_name)

print(sample_img_path)

/home/r079a/Desktop/de-identification/dataset/images/manifest-1617826555824/Pseudo-PHI-DICOM-Data/3209648408/09-23-1999-NA-CT UROGRAM-31798/3.000000-PARENCHYMAL PHASE Sep1999-95798/1-001.dcm


In [15]:
from dcm_anonymizers.tcia_deid import DCMTCIAAnonymizer
from dcm_anonymizers.phi_detectors import DcmPHIDetector
from dcm_anonymizers.ps_3_3 import format_action_dict, replace_with_value

In [16]:
phi_detector = DcmPHIDetector()
anonymizer = DCMTCIAAnonymizer(phi_detector)

In [17]:
patient_attrs_action = {
    "(0x0010, 0x0010)": replace_with_value(['Pseudo-PHI-007']),
    "(0x0010, 0x0020)": replace_with_value(['Pseudo-PHI-007']),
}

patient_attrs_action = format_action_dict(patient_attrs_action)


history = anonymizer.anonymize(
    input_path=str(sample_img_path),
    output_path=str(output_file),
    custom_actions=patient_attrs_action,
)

In [18]:
print(history)

{(0008, 103e): 'tcia_keep', (0008, 0012): 'replace', (0008, 0020): 'replace', (0008, 0021): 'replace', (0008, 0022): 'replace', (0008, 0023): 'replace', (0040, 0244): 'replace', (0008, 0018): 'replace_UID', (0020, 000d): 'replace_UID', (0020, 000e): 'replace_UID', (0020, 0052): 'replace_UID', (0008, 0050): 'empty', (0008, 0090): 'empty', (0010, 0030): 'empty', (0010, 0010): 'apply_replace_with_value', (0010, 0020): 'apply_replace_with_value', (0013, 1013): 'apply_replace_with_value'}


In [19]:
import pydicom
from pydicom import dcmread

In [20]:
tagvalues = []
    
def extract_tags(dcm, gt_ds, annon_ds):
    elements = dcm
    gt_elements = gt_ds
    annon_elements = annon_ds
    parent_tag = None
    
    if isinstance(dcm, pydicom.dataelem.DataElement):
        parent_tag = dcm.tag
        elements = dcm.value[0]
        gt_elements = gt_ds.value[0] if gt_ds else None
        annon_elements = annon_ds.value[0] if annon_ds else None
        
    for element in elements:
        deidelem = gt_elements.get(element.tag) if gt_elements else None
        dcmannonelem = annon_elements.get(element.tag) if annon_elements else None
        
        if element.VR == 'OW':
            continue
        elif element.VR == 'SQ':
            extract_tags(element, deidelem, dcmannonelem)
            continue
        # targettags.append(element.tag)
        
        deidval = "Not available"
        if deidelem:
            deidval = str(deidelem.value)
        dcmannonval = "Not available"
        if dcmannonelem:
            dcmannonval = str(dcmannonelem.value)
        changed = False
        if dcmannonval != deidval:
            changed = True

        element_tag_str = str(element.tag)
        if parent_tag:
            element_tag_str = f"{str(parent_tag)} - {str(element.tag)}"
        values_tuple = (element_tag_str, element.VR, element.name, str(element.value), deidval, dcmannonval, changed)
        tagvalues.append(values_tuple)

In [21]:
with open(sample_img_path, 'rb') as infile:
    raw_ds = dcmread(infile)

with open(deid_img_path, 'rb') as deidfile:
    deid_ds = dcmread(deidfile)

with open(output_file, 'rb') as outfile:
    anon_ds = dcmread(outfile)

In [22]:
extract_tags(raw_ds, deid_ds, anon_ds)

valus_df = pd.DataFrame(tagvalues, columns =['Tag', 'VR', 'Name', 'Raw Metadata', 'Ground Truth Anonimization', 'TCIA deId Anonymizer', 'Changed'])

pd.set_option('display.max_rows', 75)

changed_df = valus_df[valus_df['Changed']]
changed_df

Unnamed: 0,Tag,VR,Name,Raw Metadata,Ground Truth Anonimization,TCIA deId Anonymizer,Changed
2,"(0008, 0012)",DA,Instance Creation Date,19990923,19890905,20000121,True
5,"(0008, 0018)",UI,SOP Instance UID,2.25.37474298480622017016850343221557575158,1.3.6.1.4.1.14519.5.2.1.8700.9668.257030622180...,1.2.826.0.1.3680043.8.498.69027402894912746237...,True
6,"(0008, 0020)",DA,Study Date,19990923,19890905,20000121,True
7,"(0008, 0021)",DA,Series Date,19990923,19890905,20000121,True
8,"(0008, 0022)",DA,Acquisition Date,19990923,19890905,20000121,True
9,"(0008, 0023)",DA,Content Date,19990923,19890905,20000121,True
30,"(0008, 1140) - (0008, 1150)",UI,Referenced SOP Class UID,1.2.840.10008.5.1.4.1.1.2,1.2.840.10008.5.1.4.1.1.2,2.25.187701193604464392115757753001281035547,True
31,"(0008, 1140) - (0008, 1155)",UI,Referenced SOP Instance UID,2.25.303925426203336524218866884690732236278,1.3.6.1.4.1.14519.5.2.1.8700.9668.147541751373...,2.25.213565635511937744097879252815569138034,True
34,"(0010, 0010)",PN,Patient's Name,SEXTON^CARMEN,Pseudo-PHI-004,Pseudo-PHI-007,True
35,"(0010, 0020)",LO,Patient ID,3209648408,Pseudo-PHI-004,Pseudo-PHI-007,True


In [23]:
n_mismatched = 0
mismatched_tags = []

for index, row in valus_df.iterrows():
    gt_val = row['Ground Truth Anonimization']
    target_val = row['TCIA deId Anonymizer']

    if gt_val != target_val:
        if row['VR'] == 'UI' and not (gt_val == "" or target_val == ""):
            continue
        elif row['VR'] in ('DA', 'DT', 'TM') and not (gt_val == "" or target_val == ""):
            if len(gt_val) != len(target_val):
                n_mismatched += 1
                mismatched_tags.append(row['Name'])
            continue
        elif row['Tag'] in ('(0010, 0010)', '(0010, 0020)'):
            continue
        n_mismatched += 1
        mismatched_tags.append(row['Name'])

print(n_mismatched)
print(mismatched_tags)

2
['Occupation', 'Private tag data']


In [24]:
def str_element_val(element):
    if isinstance(element.value, bytes):
        return element.value.decode("utf-8")
    return str(element.value)


for element in raw_ds:
    if element.VR == 'OW':
            continue
    if element.tag.is_private:
        target_val = 'Not Availble'
        deid_el = deid_ds.get(element.tag)
        if deid_el:
            target_val = str_element_val(deid_el)
        print(f"{element.tag} {element.VR} {element.name}: {element.value} -> {target_val}")

(0009, 0010) LO Private Creator: GEMS_IDEN_01 -> GEMS_IDEN_01
(0009, 0011) LO Private Creator: GEIIS -> GEIIS
(0013, 0010) LO Private Creator: CTP -> CTP
(0013, 1010) LO Private tag data: Pseudo-PHI-DICOM-Data -> Pseudo-PHI-DICOM-Data
(0013, 1013) LO Private tag data: 87009668 -> 87009668
(0019, 0010) LO Private Creator: GEMS_ACQU_01 -> GEMS_ACQU_01
(0021, 0010) LO Private Creator: GEMS_RELA_01 -> GEMS_RELA_01
(0023, 0010) LO Private Creator: GEMS_STDY_01 -> GEMS_STDY_01
(0027, 0010) LO Private Creator: GEMS_IMAG_01 -> GEMS_IMAG_01
(0043, 0010) LO Private Creator: GEMS_PARM_01 -> GEMS_PARM_01
(0045, 0010) LO Private Creator: GEMS_HELIOS_01 -> GEMS_HELIOS_01
(0903, 0010) LO Private Creator: GEIIS PACS -> GEIIS PACS
(0905, 0010) LO Private Creator: GEIIS -> GEIIS
(7fd1, 0010) LO Private Creator: GEIIS -> GEIIS


In [25]:
for element in raw_ds:
    if element.VR == 'OW':
            continue
    if element.tag.is_private:
        entities = phi_detector.detect_entities_from_element(element)
        if len(entities) > 0:
            print(element)
            print(entities)

(0009, 0010) Private Creator                     LO: 'GEMS_IDEN_01'
[('GEMS_', 'STAFF', 17), ('IDEN_', 'STAFF', 22)]
(0009, 0011) Private Creator                     LO: 'GEIIS'
[('GE', 'STAFF', 17), ('IIS', 'STAFF', 19)]
(0013, 1013) Private tag data                    LO: '87009668'
[('87009', 'ID', 18)]
(0019, 0010) Private Creator                     LO: 'GEMS_ACQU_01'
[('G', 'STAFF', 17), ('EMS_ACQU_', 'STAFF', 18)]
(0023, 0010) Private Creator                     LO: 'GEMS_STDY_01'
[('G', 'STAFF', 17), ('EMS_STDY_', 'STAFF', 18)]
(0027, 0010) Private Creator                     LO: 'GEMS_IMAG_01'
[('GEMS', 'HOSP', 17)]
(0043, 0010) Private Creator                     LO: 'GEMS_PARM_01'
[('G', 'STAFF', 17), ('EMS_PARM_01', 'STAFF', 18)]
(0045, 0010) Private Creator                     LO: 'GEMS_HELIOS_01'
[('G', 'STAFF', 17), ('EMS', 'STAFF', 18), ('HELIOS_', 'STAFF', 22)]
(0903, 0010) Private Creator                     LO: 'GEIIS PACS'
[('GE', 'STAFF', 17), ('IIS', 'STAFF', 19),

In [26]:
all_texts = ''
tag_position_map = {}

for element in raw_ds:
    if element.VR == 'OW':
            continue
    if element.tag.is_private:
        if element.VR in ("LO", "LT", "SH", "PN", "CS", "ST", "UT", "UN") and element.value != "":
            element_val = phi_detector.process_element_val(element)
            element_name = phi_detector.processed_element_name(element.name)
            element_text = f"{element_name}: {element_val}"
            start = len(all_texts) + len(element_name) + 1
            end = start + len(element_name)
            tag_position_map[element.tag] = (start, end)
            if all_texts == '':
                all_texts += f"{element_text}"
            else:
                all_texts += f", {element_text}"

print(all_texts)

print(tag_position_map)

Private Creator: GEMS_IDEN_01, Private Creator: GEIIS, Private Creator: CTP, Private tag data: Pseudo-PHI-DICOM-Data, Private tag data: 87009668, Private Creator: GEMS_ACQU_01, Private Creator: GEMS_RELA_01, Private Creator: GEMS_STDY_01, Private Creator: GEMS_IMAG_01, Private Creator: GEMS_PARM_01, Private Creator: GEMS_HELIOS_01, Private Creator: GEIIS PACS, Private Creator: GEIIS, Private Creator: GEIIS
{(0009, 0010): (16, 31), (0009, 0011): (45, 60), (0013, 0010): (69, 84), (0013, 1010): (92, 108), (0013, 1013): (133, 149), (0019, 0010): (160, 175), (0021, 0010): (191, 206), (0023, 0010): (222, 237), (0027, 0010): (253, 268), (0043, 0010): (284, 299), (0045, 0010): (315, 330), (0903, 0010): (348, 363), (0905, 0010): (377, 392), (7fd1, 0010): (401, 416)}


In [27]:
entities = phi_detector.detect_entities(all_texts)
# print(entities)
element_target = {}

for e in entities:
    e_start = e[2]
    for t in tag_position_map:
        if e_start >= tag_position_map[t][0] and e_start <= tag_position_map[t][1]:
            if t in element_target:
                element_target[t].append(e[0])
            else:
                element_target[t] = [e[0]]

print(element_target)

{(0009, 0010): ['G', 'ID'], (0013, 1013): ['87009']}


In [28]:
def replace_element_values_from_entity_values(elemval, entity_values: list):
    elemval = str(elemval)
    n_words = len(elemval.split())
    deid_val = elemval[:]
    if n_words == 1:
        elem_n_chars = len(elemval)
        entity_n_chars = len(''.join(entity_values))
        if elem_n_chars < 2*entity_n_chars:
            deid_val = ""
    else:
        for entity_val in entity_values:
            deid_val = deid_val.replace(entity_val, '', 1)
            
        remaining_value_prcnt = len(deid_val) / len(elemval)
        # return empty string in case of value almost stripped by anonymizer
        if remaining_value_prcnt <= 0.2:
            deid_val = ""

    return deid_val

In [29]:
for tag in element_target:
    element = raw_ds.get(tag)
    deid_val = replace_element_values_from_entity_values(element.value, element_target[tag])

    print(deid_val)

GEMS_IDEN_01



In [30]:
# import re

In [31]:
# empty_tags_substr = ["id", "uid", "date"]

# empty_tags_substr = [f"\\b{ts}\\b" for ts in empty_tags_substr]

# empty_tags_pattern = '|'.join(empty_tags_substr)
# empty_tags_pattern = r'{}'.format(empty_tags_pattern)

# for element in raw_ds:
#     if element.VR == 'OW':
#             continue
#     if element.tag.is_private:
#         if re.search(empty_tags_pattern, str(element.name.lower())):
#             print(element)