In [1]:
import pandas as pd
from pprint import pprint

In [2]:
# Load the private-tag CSV into a DataFrame for the exploratory cells below.
private_tags_table = pd.read_csv('../docs/TCIAPrivateTagKB-02-01-2024-formatted.csv')

In [3]:
# Show the loaded table; columns are element_sig_pattern, tag_name, vr and private_disposition.
private_tags_table

Unnamed: 0,element_sig_pattern,tag_name,vr,private_disposition
0,"(0008,1084)[<0>](312f,""Ramsoft Diagnosis Datet...",Admitting Diagnoses Code Sequence:Unknown,DA,d
1,"(0008,1084)[<0>](312f,Ramsoft Diagnosis Dateti...",Admitting Diagnoses Code Sequence:Unknown,TM,d
2,"(0008,1084)[<0>](3131,Ramsoft Diagnosis Modifi...",Admitting Diagnoses Code Sequence:Unknown,DA,d
3,"(0008,1084)[<0>](3131,Ramsoft Diagnosis Modifi...",Admitting Diagnoses Code Sequence:Unknown,TM,d
4,"(0008,1084)[<0>](3133,Ramsoft Diagnosis Status...",Admitting Diagnoses Code Sequence:Unknown,IS,d
...,...,...,...,...
8782,"(8053,Unnamed Private Block - 10"",00)",Unknown,OB,d
8783,"(f215,PB group A"",17)",Unknown,DA,d
8784,"(f215,PB group A"",17)",Unknown,UN,d
8785,"(f215,PB group A"",18)",Unknown,TM,d


In [4]:
# Inspect the element signature strings; the pattern lookups below match regexes against this column.
private_tags_table['element_sig_pattern']

0       (0008,1084)[<0>](312f,"Ramsoft Diagnosis Datet...
1       (0008,1084)[<0>](312f,Ramsoft Diagnosis Dateti...
2       (0008,1084)[<0>](3131,Ramsoft Diagnosis Modifi...
3       (0008,1084)[<0>](3131,Ramsoft Diagnosis Modifi...
4       (0008,1084)[<0>](3133,Ramsoft Diagnosis Status...
                              ...                        
8782                (8053,Unnamed Private Block - 10",00)
8783                                (f215,PB group A",17)
8784                                (f215,PB group A",17)
8785                                (f215,PB group A",18)
8786                                (f215,PB group A",18)
Name: element_sig_pattern, Length: 8787, dtype: object

In [5]:
# Count rows per disposition code. The raw column is not normalised: the counts
# include case/whitespace variants such as ' k', 'K' and 'D', so a value must be
# lower-cased and stripped before it is compared.
private_tags_table['private_disposition'].value_counts()

private_disposition
d     4585
k     3811
na     210
h       84
o       46
 k      23
oi       9
K        2
D        2
Name: count, dtype: int64

In [6]:
# Example lookup: rows whose signature starts with group 0021 and contains ",36",
# restricted to VR 'SS'.
private_tags_table.loc[
    lambda table: table['element_sig_pattern'].str.match('\\(0021,.*,36')
    & (table['vr'] == 'SS')
]

Unnamed: 0,element_sig_pattern,tag_name,vr,private_disposition
2294,"(0021,GEMS_RELA_01"",36)",Image From Which Prescribed,SS,k


In [7]:
def similar(a, b):
    """Return the ``difflib`` similarity ratio between sequences ``a`` and ``b``.

    The value is ``SequenceMatcher.ratio()`` computed without a junk filter:
    1.0 for identical inputs, 0.0 when nothing matches.
    """
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()
    
def replace_non_alphanumeric_with_space(input_string):
    """Return ``input_string`` with every character outside a-z, A-Z, 0-9 replaced by one space.

    Replacement is one-for-one, so the result has the same length as the input.
    """
    non_alphanumeric = re.compile(r'[^a-zA-Z0-9]')
    return non_alphanumeric.sub(' ', input_string)

class PrivateTagsAnonymizer:
    """Look up dispositions for private DICOM elements in a private-tag CSV table.

    The CSV must provide the columns ``element_sig_pattern``, ``tag_name``,
    ``vr`` and ``private_disposition``; these are the columns read below.
    """

    def __init__(self, private_tags_dict_path: str):
        """Store the CSV path and load the table immediately.

        Args:
            private_tags_dict_path: Path of the private-tag CSV file.
        """
        self.private_tag_dict_path = private_tags_dict_path
        self.private_tag_df = None
        # Optional narrowed view of ``private_tag_df``. While it is not None,
        # ``filter_by_pattern_n_vr`` searches it instead of the full table.
        self.filtered_private_tag_df = None

        self._load_private_tag_dict()

    def _load_private_tag_dict(self):
        """Read the CSV into ``private_tag_df`` and derive the ``tag_group`` column."""
        self.private_tag_df = pd.read_csv(self.private_tag_dict_path)
        self.private_tag_df['vr'] = self.private_tag_df['vr'].astype('category')
        self.private_tag_df['private_disposition'] = self.private_tag_df['private_disposition'].astype('category')

        # Characters 1..4 of the signature, i.e. what follows the opening "(".
        self.private_tag_df['tag_group'] = self.private_tag_df['element_sig_pattern'].str[1:5]
        self.private_tag_df['tag_group'] = self.private_tag_df['tag_group'].astype('category')

    def filter_by_tag_group(self, tag_groups: list):
        """Return the rows of the full table whose ``tag_group`` is in ``tag_groups``."""
        return self.private_tag_df[self.private_tag_df['tag_group'].isin(tag_groups)]

    def filter_by_pattern_n_vr(self, filter_pattern, vr):
        """Return rows whose signature matches ``filter_pattern`` and whose VR equals ``vr``.

        The regex is applied with ``Series.str.match`` and is therefore anchored
        at the start of the signature. The search runs on
        ``filtered_private_tag_df`` when that is set, otherwise on the full table.
        """
        df = self.private_tag_df
        if self.filtered_private_tag_df is not None:
            df = self.filtered_private_tag_df
        return df[
            # na=False: a missing signature counts as "no match" rather than
            # putting a NaN into the boolean mask.
            (df['element_sig_pattern'].str.match(filter_pattern, na=False)) &
            (df['vr'] == vr)
        ]

    def search_patterns_from_element(self, element, creators: list = None, include_base_tag_pattrn: bool = False):
        """Build candidate signature regexes for ``element``, most specific first.

        Args:
            element: Object exposing ``tag.group`` and ``tag.element`` as integers.
            creators: Private creator strings; one creator-specific pattern is
                produced per entry. ``None`` (the default) means no creators.
            include_base_tag_pattrn: Also append a looser pattern that does not
                require a comma before the element suffix.

        Returns:
            List of regex strings: creator-specific patterns first, then the
            creator-agnostic one, then (optionally) the loose one.
        """
        # Imported locally so the method does not depend on a notebook-level import.
        import re

        if creators is None:
            creators = []

        group_str = f"{element.tag.group:04x}"
        element_str = f"{element.tag.element:04x}"
        # Only the last two hex digits of the element number are used in the patterns.
        element_suffix = element_str[2:]

        # Creator text is matched literally, so regex metacharacters in it are escaped.
        all_patterns = [
            fr'\({group_str},{re.escape(str(creator))}.*,{element_suffix}\)'
            for creator in creators
        ]

        all_patterns.append(fr'\({group_str},.*,{element_suffix}\)')
        if include_base_tag_pattrn:
            all_patterns.append(fr'\({group_str},.*{element_suffix}\)')

        return all_patterns

    def get_filtered_rows_from_patterns(self, patterns: list, element, strict: bool = True):
        """Try ``patterns`` in order and return the matching table rows.

        The first pattern that yields exactly one row (for ``element.VR``) wins.
        With ``strict=False``, if no pattern is unique, the result of the last
        pattern tried is returned instead.

        Returns:
            A DataFrame of matching rows, or an empty list when nothing qualified.
        """
        filtered = []
        for pttrn in patterns:
            # Use this instance, not a module-level global, for the lookup.
            filtered_rules = self.filter_by_pattern_n_vr(pttrn, element.VR)
            if not strict:
                filtered = filtered_rules
            if len(filtered_rules) == 1:
                filtered = filtered_rules
                break
        return filtered

    def get_private_disposition_from_rows(self, filtered_rows, element, match_names: bool = True):
        """Return the normalised disposition code for ``element``.

        Only the first row of ``filtered_rows`` is consulted. The result is
        ``'k'`` when there are no rows, or when ``match_names`` is set and the
        row's ``tag_name`` is not similar enough (ratio > 0.5) to
        ``element.name``. Otherwise it is the row's ``private_disposition``.
        The returned value is lower-cased and stripped.
        """
        if len(filtered_rows) == 0:
            return 'k'
        disposition_val = 'k'

        first_row = filtered_rows.iloc[0]

        if match_names:
            # Compare the element name with the table's tag name (punctuation
            # in the latter replaced by spaces) before trusting the row.
            row_name = first_row['tag_name']
            similarity = similar(
                element.name.lower(),
                replace_non_alphanumeric_with_space(row_name.lower())
            )
            if similarity > 0.5:
                disposition_val = first_row['private_disposition']
        else:
            disposition_val = first_row['private_disposition']

        # The table contains variants such as ' k' or 'K'; normalise them.
        return disposition_val.lower().strip()

In [8]:
# Build the lookup helper; the constructor reads the CSV straight away.
# NOTE(review): same CSV path as the exploratory read_csv cell near the top.
anonymizer = PrivateTagsAnonymizer(
    private_tags_dict_path='../docs/TCIAPrivateTagKB-02-01-2024-formatted.csv'
)

In [9]:
import sys
import re
 
# setting path
sys.path.append('../')
import csv
from difflib import SequenceMatcher

from utils.dataloaders import MIDIEvalDataLoader
from utils.display import display_dicom
from dcm_anonymizers.utils import list_all_files
from IPython.display import display, HTML

from pathlib import Path

In [10]:
# NOTE(review): absolute, machine-specific path; the notebook will not run
# elsewhere without editing this line. Consider a configurable data directory.
root_data_dir = '/home/r079a/Desktop/de-identification/dataset'

# Point the evaluation loader at the raw images, the de-identified images and
# the UID crosswalk CSV, all located under root_data_dir.
loader = MIDIEvalDataLoader(
    rawimagespath=Path(root_data_dir, 'images/manifest-1617826555824'),
    deidimagespath=Path(root_data_dir, 'images-2/manifest-1617826161202'),
    uidsmappath=Path(root_data_dir, 'Pseudo-PHI-DICOM-Dataset-uid_crosswalk.csv'),
)

In [11]:
# Fetch the raw and de-identified data (each with its metadata) for one patient.
# NOTE(review): presumably 2 is a patient index into the loader; confirm against MIDIEvalDataLoader.
(rawdcm, metadata), (deiddcm, deiddcm_metadata) = loader.get_raw_n_deid_patient(2, include_metadata=True)

In [12]:
# Work with the first raw item; the cells below iterate over its elements.
dataset = rawdcm[0]

In [13]:
def extract_private_groups_n_creators(dataset):
    """Collect the private tag groups and private creator values found in ``dataset``.

    Elements with VR 'OW' are ignored. Every remaining private element
    contributes its group, formatted as four lower-case hex digits. Elements
    named 'Private Creator' contribute their value to the creator list.

    Returns:
        ``(groups, creators)``: ``groups`` is de-duplicated in arbitrary order,
        ``creators`` is de-duplicated in first-seen order.
    """
    seen_groups = []
    creator_names = []

    private_elements = (
        elem for elem in dataset
        if elem.VR != 'OW' and elem.tag.is_private
    )
    for elem in private_elements:
        seen_groups.append(f"{elem.tag.group:04x}")
        if elem.name != 'Private Creator':
            continue
        if elem.value not in creator_names:
            creator_names.append(elem.value)

    return list(set(seen_groups)), creator_names

In [17]:
# Print the looked-up disposition for every private element of `dataset`.
count = 0
groups, creators = extract_private_groups_n_creators(dataset)

# Narrow the lookup table to the private groups present in this dataset so the
# per-element regex matching scans fewer rows.
groups_df = anonymizer.filter_by_tag_group(groups)
anonymizer.filtered_private_tag_df = groups_df

try:
    for element in dataset:
        if element.VR == 'OW':
            continue
        if element.tag.is_private:
            all_patterns = anonymizer.search_patterns_from_element(element, creators)
            filtered = anonymizer.get_filtered_rows_from_patterns(all_patterns, element)

            disposition_val = anonymizer.get_private_disposition_from_rows(filtered, element)
            print(element.tag, element.name, disposition_val)

            count += 1
finally:
    # Always drop the temporary narrowing, even if an element raises, so later
    # lookups on `anonymizer` see the full table again.
    anonymizer.filtered_private_tag_df = None

(0009, 0010) Private Creator k
(0009, 1001) [GE Discovery PET Implementation Version Name] k
(0009, 1003) [PET compatible_version] k
(0009, 1004) [GE Advance Patient.software_version] k
(0009, 1005) [PET patient_datetime] o
(0009, 1007) [PET exam_id] h
(0009, 1008) [PET compatible_version] k
(0009, 1009) [PET software_version] k
(0009, 100a) [PET scan_id] h
(0009, 100b) [PET compatible_version] k
(0009, 100c) [PET software_version] k
(0009, 100d) [PET scan_datetime] o
(0009, 100e) [PET scan_ready] o
(0009, 100f) [PET scan_description] d
(0009, 1011) [PET scanner_desc] k
(0009, 1012) [PET manufacturer] k
(0009, 1013) [PET for_identifier] h
(0009, 1014) [PET landmark_name] k
(0009, 1015) [PET landmark_abbrev] k
(0009, 1016) [PET patient_position] k
(0009, 1017) [PET scan_perspective] k
(0009, 1018) [PET scan_type] k
(0009, 1019) [PET scan_mode] k
(0009, 101a) [PET start_condition] k
(0009, 101b) [PET start_cond_data] k
(0009, 101c) [PET sel_stop_cond] k
(0009, 101d) [PET sel_stop_cond_da

In [15]:
# First de-identified item; compared element by element with `dataset` in the next cell.
deiddataset = deiddcm[0]

In [16]:
# For every private element of the raw dataset, print its value next to the
# value found under the same tag in the de-identified dataset, followed by
# "k" when the two are equal or "Changed" plus a separator line otherwise.
for element in dataset:
    if element.VR == 'OW' or not element.tag.is_private:
        continue

    deidelem = deiddataset.get(element.tag)
    # A tag missing from the de-identified dataset is reported with a placeholder.
    deidval = deidelem.value if deidelem is not None else 'Not Available'

    print(element.tag, element.name, element.VR)
    print(f"{element.value} -> {deidval}")
    if element.value == deidval:
        print("k")
        continue
    print("Changed")
    print("======================")

(0009, 0010) Private Creator LO
GEMS_PETD_01 -> GEMS_PETD_01
k
(0009, 1001) [GE Discovery PET Implementation Version Name] LO
GE Advance -> GE Advance
k
(0009, 1003) [PET compatible_version] SH
05.00 -> 05.00
k
(0009, 1004) [GE Advance Patient.software_version] SH
16.01 -> 16.01
k
(0009, 1005) [PET patient_datetime] DT
 -> 
k
(0009, 1007) [PET exam_id] UI
2.25.274128889631512605961383205193865870838 -> 1.3.6.1.4.1.14519.5.2.1.8700.9668.295597077400414441954848071527
Changed
(0009, 1008) [PET compatible_version] SH
05.00 -> 05.00
k
(0009, 1009) [PET software_version] SH
16.01 -> 16.01
k
(0009, 100a) [PET scan_id] UI
2.25.173228199981161944442560079805241998838 -> 1.3.6.1.4.1.14519.5.2.1.8700.9668.380972221829913586024126040548
Changed
(0009, 100b) [PET compatible_version] SH
05.00 -> 05.00
k
(0009, 100c) [PET software_version] SH
16.01 -> 16.01
k
(0009, 100d) [PET scan_datetime] DT
 -> 
k
(0009, 100e) [PET scan_ready] DT
 -> 
k
(0009, 100f) [PET scan_description] ST
PET CT INFERIOR TO S