In [1]:
import pandas as pd

In [2]:
rules_table = pd.read_html('../docs/rules.html')[0]

In [3]:
rules_table

Unnamed: 0,Attribute Name,Tag,Retired (from PS3.6),In Std. Comp. IOD (from PS3.3),Basic Profile,Retain Safe Private Option,Retain UIDs Option,Retain Device Ident. Option,Retain Patient Chars. Option,Retain Long. Full Dates Option,Retain Long. Modif. Dates Option,Clean Desc. Option,Clean Struct. Cont. Option,Clean Graph. Option
0,Accession Number,"(0008,0050)",N,Y,Z,,,,,,,,,
1,Acquisition Comments,"(0018,4000)",Y,N,X,,,,,,,C,,
2,Acquisition Context Sequence,"(0040,0555)",N,Y,X,,,,,,,,C,
3,Acquisition Date,"(0008,0022)",N,Y,X/Z,,,,,K,C,,,
4,Acquisition DateTime,"(0008,002A)",N,Y,X/D,,,,,K,C,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
244,Verifying Observer Identification Code Sequence,"(0040,A088)",N,Y,Z,,,,,,,,,
245,Verifying Observer Name,"(0040,A075)",N,Y,D,,,,,,,,,
246,Verifying Observer Sequence,"(0040,A073)",N,Y,D,,,,,,,,,
247,Verifying Organization,"(0040,A027)",N,Y,X,,,,,,,,,


The following rules need to be applied to each selected tag:

* D - replace with a non-zero length value that may be a dummy value and consistent with the VR ([Value Representation](https://dicom.nema.org/dicom/2013/output/chtml/part05/sect_6.2.html))
* Z - replace with a zero length value, or a non-zero length value that may be a dummy value and consistent with the VR
* X - remove
* K - keep (unchanged for non-sequence attributes, cleaned for sequences)
* C - clean, that is replace with values of similar meaning known not to contain identifying information and consistent with the VR
* U - replace with a non-zero length UID that is internally consistent within a set of Instances
* Z/D - Z unless D is required to maintain IOD conformance (Type 2 versus Type 1)
* X/Z - X unless Z is required to maintain IOD conformance (Type 3 versus Type 2)
* X/D - X unless D is required to maintain IOD conformance (Type 3 versus Type 1)
* X/Z/D - X unless Z or D is required to maintain IOD conformance (Type 3 versus Type 2 versus Type 1)
* X/Z/U* - X unless Z or replacement of contained instance UIDs (U) is required to maintain IOD conformance (Type 3 versus Type 2 versus Type 1 sequences containing UID references)

In [4]:
basic_profile_rules = rules_table[['Attribute Name', 'Tag', 'Basic Profile']]

In [5]:
basic_profile_rules

Unnamed: 0,Attribute Name,Tag,Basic Profile
0,Accession Number,"(0008,0050)",Z
1,Acquisition Comments,"(0018,4000)",X
2,Acquisition Context Sequence,"(0040,0555)",X
3,Acquisition Date,"(0008,0022)",X/Z
4,Acquisition DateTime,"(0008,002A)",X/D
...,...,...,...
244,Verifying Observer Identification Code Sequence,"(0040,A088)",Z
245,Verifying Observer Name,"(0040,A075)",D
246,Verifying Observer Sequence,"(0040,A073)",D
247,Verifying Organization,"(0040,A027)",X


In [6]:
basic_profile_rules['Basic Profile'].value_counts()

Basic Profile
X         178
U          29
Z          13
X/D         7
X/Z         6
X/Z/D       6
D           5
Z/D         3
X/Z/U*      2
Name: count, dtype: int64

### Get all the UID tags

In [7]:
basic_profile_rules[basic_profile_rules['Basic Profile'] == 'U']

Unnamed: 0,Attribute Name,Tag,Basic Profile
22,Concatenation UID,"(0020,9161)",U
29,Context Group Extension Creator UID,"(0008,010D)",U
33,Creator Version UID,"(0008,9123)",U
43,Device UID,"(0018,1002)",U
46,Dimension Organization UID,"(0020,9164)",U
50,Dose Reference UID,"(300A,0013)",U
52,Failed SOP Instance UID List,"(0008,0058)",U
53,Fiducial UID,"(0070,031A)",U
56,Frame of Reference UID,"(0020,0052)",U
68,Instance Creator UID,"(0008,0014)",U


In [47]:
alluid_rows = basic_profile_rules[basic_profile_rules['Attribute Name'].str.contains('UID')]
alluid_rows

Unnamed: 0,Attribute Name,Tag,Basic Profile
15,Affected SOP Instance UID,"(0000,1000)",X
22,Concatenation UID,"(0020,9161)",U
29,Context Group Extension Creator UID,"(0008,010D)",U
33,Creator Version UID,"(0008,9123)",U
43,Device UID,"(0018,1002)",U
44,Digital Signature UID,"(0400,0100)",X
46,Dimension Organization UID,"(0020,9164)",U
50,Dose Reference UID,"(300A,0013)",U
52,Failed SOP Instance UID List,"(0008,0058)",U
53,Fiducial UID,"(0070,031A)",U


### Get the Dates

In [48]:
alldate_rows = basic_profile_rules[basic_profile_rules['Attribute Name'].str.contains('Date')]
alldate_rows

Unnamed: 0,Attribute Name,Tag,Basic Profile
3,Acquisition Date,"(0008,0022)",X/Z
4,Acquisition DateTime,"(0008,002A)",X/D
11,Admitting Date,"(0038,0020)",X
26,Content Date,"(0008,0023)",Z/D
36,Curve Date,"(0008,0025)",X
87,Last Menstrual Date,"(0010,21D0)",X
111,Overlay Date,"(0008,0024)",X
122,Patient's Birth Date,"(0010,0030)",Z
138,Performed Procedure Step End Date,"(0040,0250)",X
141,Performed Procedure Step Start Date,"(0040,0244)",X


### Get the ID tags

In [49]:
allid_rows = basic_profile_rules[basic_profile_rules['Attribute Name'].str.contains(' ID$')]
allid_rows

Unnamed: 0,Attribute Name,Tag,Basic Profile
10,Admission ID,"(0038,0010)",X
20,Cassette ID,"(0018,1007)",X
41,Detector ID,"(0018,700A)",X/D
57,Gantry ID,"(0018,1008)",X
58,Generator ID,"(0018,1005)",X
83,Issuer of Admission ID,"(0038,0011)",X
84,Issuer of Patient ID,"(0010,0021)",X
85,Issuer of Service Episode ID,"(0038,0061)",X
95,Modifying Device ID,"(0020,3401)",X
117,Patient ID,"(0010,0020)",Z


### Extract Rules JSON

In [56]:
def tag_str_to_hex_str(dicomtag: str):
    """Render a DICOM tag such as ``"(0008,0050)"`` as ``"(0x0008, 0x0050)"``.

    Args:
        dicomtag: Tag text holding a group and an element separated by a
            comma, optionally wrapped in parentheses.

    Returns:
        A string of the form ``"(0x<group>, 0x<element>)"``.

    Raises:
        IndexError: If ``dicomtag`` contains no comma.
    """
    # Drop the enclosing parentheses, then separate the group from the element.
    parts = dicomtag.strip("()").split(",")
    group = parts[0].strip()
    element = parts[1].strip()

    return "(0x" + group + ", 0x" + element + ")"

def rules_to_tag_name(rule:str):
    """Turn a Basic Profile action code into a key name for the rules mapping.

    ``/`` becomes ``_`` and ``*`` becomes ``_STAR``; the result is suffixed
    with ``_TAGS`` (for example ``"X/Z/U*"`` -> ``"X_Z_U_STAR_TAGS"``).

    Args:
        rule: Action code text such as ``"X"``, ``"Z/D"`` or ``"X/Z/U*"``.

    Returns:
        The derived key name.
    """
    # A single character-level translation covers both substitutions.
    substitutions = str.maketrans({'/': '_', '*': '_STAR'})
    return rule.translate(substitutions) + "_TAGS"

def filter_list_from_list(target: list, source: list):
    """Return the items of ``target`` that do not occur in ``source``.

    The relative order of ``target`` is preserved and duplicates in
    ``target`` are kept; neither input list is modified.

    Args:
        target: Items to filter.
        source: Items to exclude.

    Returns:
        A new list with the surviving items.
    """
    return [item for item in target if item not in source]

In [59]:
# Tag lists for the three name-pattern groups selected in the cells above.
alluid_tags = alluid_rows['Tag'].tolist()
alldate_tags = alldate_rows['Tag'].tolist()
allid_tags = allid_rows['Tag'].tolist()

# Seed the mapping with the pattern-based groups, tags rendered as hex-pair strings.
rules = {
    'UID_TAGS': [tag_str_to_hex_str(tag) for tag in alluid_tags],
    'DATES_TAGS': [tag_str_to_hex_str(tag) for tag in alldate_tags],
    'ID_TAGS': [tag_str_to_hex_str(tag) for tag in allid_tags],
}

# Add one group per distinct value of the 'Basic Profile' column.
unique_rules = basic_profile_rules['Basic Profile'].unique()
for r in unique_rules:
    has_rule = basic_profile_rules['Basic Profile'] == r
    is_private = basic_profile_rules['Attribute Name'] == 'Private attributes'
    rule_tags = [
        tag_str_to_hex_str(tag)
        for tag in basic_profile_rules.loc[has_rule & ~is_private, 'Tag'].tolist()
    ]
    # A tag already placed in a pattern-based group is left out of its per-rule group.
    for group_key in ('UID_TAGS', 'DATES_TAGS', 'ID_TAGS'):
        rule_tags = filter_list_from_list(rule_tags, rules[group_key])
    rules[rules_to_tag_name(r)] = rule_tags

In [60]:
import json

# Persist the grouped tag lists as indented UTF-8 JSON (non-ASCII characters kept as-is).
with open('../docs/ps3.3_profile_attrs.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(rules, ensure_ascii=False, indent=4))

### Load a Sample Dicom

In [12]:
import sys
 
# setting path
sys.path.append('../')

from utils.dataloaders import MIDIEvalDataLoader
from utils.display import display_dicom

from pathlib import Path
import pydicom

In [13]:
import os

# Dataset root. The MIDI_DATA_DIR environment variable overrides it so the notebook
# can run on machines other than the original author's; when the variable is unset,
# the original location is used unchanged.
root_data_dir = os.environ.get(
    'MIDI_DATA_DIR', '/home/r079a/Desktop/de-identification/dataset'
)

# The three inputs handed to the loader all live under the dataset root.
loader = MIDIEvalDataLoader(
    rawimagespath=Path(root_data_dir, 'images/manifest-1617826555824'),
    deidimagespath=Path(root_data_dir, 'images-2/manifest-1617826161202'),
    uidsmappath=Path(root_data_dir, 'Pseudo-PHI-DICOM-Dataset-uid_crosswalk.csv'),
)

In [14]:
(rawdcm, metadata), (deiddcm, _) = loader.get_raw_n_deid_patient(0, include_metadata=True)

{'Series UID': '2.25.22373700284337223907674770690654453238', 'Collection': 'Pseudo-PHI-DICOM-Data', '3rd Party Analysis': 'NO', 'Data Description URI': 'https://doi.org/10.7937/s17zr072', 'Subject ID': 292821506, 'Study UID': '2.25.106461954783291641048254423668956446198', 'Study Description': 'XR CHEST AP PORTABLE for Douglas Davidson', 'Study Date': '07-13-2013', 'Series Description': nan, 'Manufacturer': 'FUJIFILM Corporation', 'Modality': 'CR', 'SOP Class Name': 'Computed Radiography Image Storage', 'SOP Class UID': '1.2.840.10008.5.1.4.1.1.1', 'Number of Images': 1, 'File Size': '7.53 MB', 'File Location': './Pseudo-PHI-DICOM-Data/292821506/07-13-2013-NA-XR CHEST AP PORTABLE for Douglas Davidson-46198/1002.000000-NA-53238', 'Download Timestamp': '2024-06-03T18:20:37.027'}


In [15]:
import re
from os import listdir
from os.path import isfile, join
from pydicom.tag import BaseTag
from pydicom.uid import generate_uid
from dicomanonymizer import simpledicomanonymizer

def extract_hex_chars(input_string):
    """Return only the hexadecimal digits of ``input_string``, in order.

    Every character outside ``0-9``, ``a-f`` and ``A-F`` is dropped.

    Args:
        input_string: Text to scan, e.g. ``"(0008,0050)"``.

    Returns:
        The concatenated hexadecimal characters (``""`` if there are none).
    """
    # Walk over each run of consecutive hex digits and glue the runs together.
    hex_runs = (match.group() for match in re.finditer(r'[0-9a-fA-F]+', input_string))
    return ''.join(hex_runs)

def str_to_basetag(tagstr: str):
    """Convert tag text such as ``"(0008,0050)"`` into a ``BaseTag``.

    Args:
        tagstr: Tag text; every non-hexadecimal character is ignored.

    Returns:
        ``BaseTag`` built from the eight hexadecimal digits read as one integer.

    Raises:
        ValueError: If ``tagstr`` does not contain exactly eight hex digits.
    """
    hex_digits = extract_hex_chars(tagstr)
    if len(hex_digits) == 8:
        return BaseTag(int(hex_digits, 16))
    raise ValueError(f"Invalid value for BaseTag provided {tagstr}")


In [16]:
print(simpledicomanonymizer.dictionary)

{}


In [17]:
# Directory of the selected series, resolved relative to the raw-images root.
target_dcm_path = Path(loader.rawimagespath, metadata['File Location'])
# os.listdir returns entries in arbitrary order; sorting makes "the first file"
# the same file on every run and on every platform.
alldicompaths = sorted(f for f in listdir(target_dcm_path) if isfile(join(target_dcm_path, f)))
rawdcm_custom = ds = pydicom.dcmread(Path(target_dcm_path, alldicompaths[0]))

## Anonymize the UIDs

In [18]:
uidtags = basic_profile_rules[basic_profile_rules['Attribute Name'].str.contains('UID')]['Tag'].tolist()
print(len(uidtags))

31


In [19]:
convrt_uidtags = [str_to_basetag(t) for t in uidtags]
print(len(convrt_uidtags))

31


In [20]:
from dicomanonymizer.format_tag import tag_to_hex_strings

print(len(simpledicomanonymizer.U_TAGS))

54


In [21]:
def elem_callback(dataset, data_element):
    """Print UID-related elements; used as the callback passed to ``.walk(...)``.

    An element whose tag is in ``convrt_uidtags`` is printed as-is. Any other
    element with VR ``'UI'`` is printed after a ``Not Mentioned`` line. All
    remaining elements are ignored.

    Args:
        dataset: Dataset being walked (unused).
        data_element: Element currently visited.
    """
    listed = data_element.tag in convrt_uidtags
    # Nothing to report for elements that are neither listed nor of VR 'UI'.
    if not listed and data_element.VR != 'UI':
        return
    if not listed:
        print('Not Mentioned')
    print(data_element)

for item in rawdcm:
    item.walk(elem_callback)

Not Mentioned
(0008, 0016) SOP Class UID                       UI: Computed Radiography Image Storage
(0008, 0018) SOP Instance UID                    UI: 2.25.112784503178059210578740147414000844278
(0020, 000d) Study Instance UID                  UI: 2.25.106461954783291641048254423668956446198
(0020, 000e) Series Instance UID                 UI: 2.25.22373700284337223907674770690654453238


In [22]:
print(simpledicomanonymizer.dictionary)

{}


In [23]:
for item in deiddcm:
    item.walk(elem_callback)


Not Mentioned
(0008, 0016) SOP Class UID                       UI: Computed Radiography Image Storage
(0008, 0018) SOP Instance UID                    UI: 1.3.6.1.4.1.14519.5.2.1.8700.9668.265811167095300800838686837893
(0020, 000d) Study Instance UID                  UI: 1.3.6.1.4.1.14519.5.2.1.8700.9668.289834788775974734227766096544
(0020, 000e) Series Instance UID                 UI: 1.3.6.1.4.1.14519.5.2.1.8700.9668.334418705549879145080284996023


## Anonymize the dates

In [24]:
from datetime import datetime, timedelta

In [25]:
datetags = basic_profile_rules[basic_profile_rules['Attribute Name'].str.contains('Date')]['Tag'].tolist()
convrt_datetags = [str_to_basetag(t) for t in datetags]

In [26]:
def parse_date_string(date_string):
    """Parse a DICOM DA/DT style string into a ``datetime``.

    Well-formed values are parsed with a format chosen from the number of
    digits, so reduced-precision DT values are read correctly instead of being
    squeezed into ``%Y%m%d%H%M%S`` (which turned ``"201307131427"`` into
    14:02:07). Supported shapes are ``YYYY``, ``YYYYMM``, ``YYYYMMDD``,
    ``YYYYMMDDHH``, ``YYYYMMDDHHMM`` and ``YYYYMMDDHHMMSS``; the last may carry
    a fractional part (``.F`` to ``.FFFFFF``), and any shape may carry a
    trailing UTC offset (``+HHMM`` / ``-HHMM``). Surrounding whitespace is
    ignored for these shapes.

    Any other input falls back to the two formats this function always tried
    (``%Y%m%d%H%M%S`` then ``%Y%m%d``), so strings that parsed before still
    parse the same way.

    Args:
        date_string: Text to parse.

    Returns:
        The parsed ``datetime``. It is timezone-aware only when the input
        carried a UTC offset.

    Raises:
        ValueError: If the string matches none of the supported formats.
    """
    # Fallback formats for input that is not a well-formed DICOM value.
    date_formats = [
        "%Y%m%d%H%M%S",  # Full format with hours, minutes, and seconds
        "%Y%m%d"         # Format with only date
    ]
    # Exact format per precision, keyed by the number of digits.
    formats_by_length = {
        4: "%Y",
        6: "%Y%m",
        8: "%Y%m%d",
        10: "%Y%m%d%H",
        12: "%Y%m%d%H%M",
        14: "%Y%m%d%H%M%S",
    }

    text = date_string.strip()

    # Peel off an optional trailing UTC offset (sign followed by four digits).
    utc_offset = ""
    if len(text) > 5 and text[-5] in "+-" and text[-4:].isdigit():
        text, utc_offset = text[:-5], text[-5:]

    # Peel off an optional fractional-seconds part.
    digits, dot, fraction = text.partition(".")

    well_formed = (
        digits.isascii()
        and digits.isdigit()
        and len(digits) in formats_by_length
        # A fraction only makes sense when seconds are present.
        and (not dot or len(digits) == 14)
    )
    if well_formed:
        candidate = digits + dot + fraction + utc_offset
        exact_format = formats_by_length[len(digits)]
        if dot:
            exact_format += ".%f"
        if utc_offset:
            exact_format += "%z"
        # Only the exact format is tried: a looser one could mis-assign digits.
        date_formats = [exact_format]
    else:
        candidate = date_string

    # Try to parse the date string using the appropriate format
    for date_format in date_formats:
        try:
            return datetime.strptime(candidate, date_format)
        except ValueError:
            continue

    # If no format matches, raise an error
    raise ValueError(f"Date string '{date_string}' does not match any known format")


def shift_date(date_string, days=0, hours=0, minutes=0, seconds=0, date_only=True):
    """Move a date string back in time by the given offset.

    Args:
        date_string: Date text understood by ``parse_date_string``. An empty
            string is returned unchanged.
        days: Days to subtract.
        hours: Hours to subtract.
        minutes: Minutes to subtract.
        seconds: Seconds to subtract.
        date_only: When true the result is formatted as ``%Y%m%d``; otherwise
            as ``%Y%m%d%H%M%S``.

    Returns:
        The shifted date as a formatted string.
    """
    # Empty values carry nothing to shift.
    if date_string == '':
        return date_string

    # The offset is subtracted, so positive arguments move the date into the past.
    shifted = parse_date_string(date_string) - timedelta(
        days=days, hours=hours, minutes=minutes, seconds=seconds
    )

    output_format = "%Y%m%d" if date_only else "%Y%m%d%H%M%S"
    return shifted.strftime(output_format)


def datetag_callback(dataset, data_element):
    """Print date-like elements with their shifted value; a ``.walk(...)`` callback.

    An element whose tag is in ``convrt_datetags`` is printed followed by its
    value shifted back 2780 days, formatted as a date only. Any other element
    with VR ``'DT'`` or ``'DA'`` is announced with ``Not Mentioned`` and its
    shifted value is printed in full date-time form. Other elements are ignored.

    Args:
        dataset: Dataset being walked (unused).
        data_element: Element currently visited.
    """
    if data_element.tag in convrt_datetags:
        keep_date_only = True
    elif data_element.VR in ('DT', 'DA'):
        print('Not Mentioned')
        keep_date_only = False
    else:
        return

    print(data_element)
    print(shift_date(data_element.value, days=2780, date_only=keep_date_only))

for item in rawdcm:
    item.walk(datetag_callback)

(0008, 0020) Study Date                          DA: '20130713'
20051202
(0008, 0021) Series Date                         DA: '20130713'
20051202
(0008, 0022) Acquisition Date                    DA: '20130713'
20051202
(0008, 0023) Content Date                        DA: '20130713'
20051202
(0010, 0030) Patient's Birth Date                DA: '19760616'
19681105
Not Mentioned
(0018, a002) Contribution DateTime               DT: '20120224142721'
20040715142721


In [27]:
for item in deiddcm:
    item.walk(datetag_callback)

(0008, 0020) Study Date                          DA: '20030626'
19951115
(0008, 0021) Series Date                         DA: '20030626'
19951115
(0008, 0022) Acquisition Date                    DA: '20030626'
19951115
(0008, 0023) Content Date                        DA: '20030626'
19951115
(0010, 0030) Patient's Birth Date                DA: ''

Not Mentioned
(0018, a002) Contribution DateTime               DT: '20030626'
19951115000000
