![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Medical_Language_Models/April_2025/Notebooks/Dicom_Metadata_Only.ipynb)

<!-- ========================================================= -->
<!--        John Snow Labs - Package Installation Guide         -->
<!--        JupyterLab Single Markdown Cell (HTML Content)      -->
<!-- ========================================================= -->

<h1>Package Installation</h1>

<!-- Link to official GitHub repository -->
<p>
  Official Repository:
  <a href="https://github.com/JohnSnowLabs/johnsnowlabs" target="_blank">
    https://github.com/JohnSnowLabs/johnsnowlabs
  </a>
</p>

<!-- License setup instructions -->
<p>
  Keep your <strong>license keys</strong> in a JSON file and point to it using the
  <code>json_license_path</code> argument when starting the Spark session.
</p>

<!-- Visual NLP configuration note -->
<p>
  Set <code>visual=True</code> while starting the Spark session to install and make
  <strong>Visual NLP libraries</strong> available.
</p>

<!-- Restart note -->
<p>
  ⚠️ <strong>Important:</strong> After installing the library, make sure to
  <strong>RESTART your session</strong> before running Spark again.
</p>

<!-- End of notebook cell -->

In [None]:
!pip install -q johnsnowlabs

In [None]:
from johnsnowlabs import nlp, visual, medical

# Install the licensed libraries. The license keys are read from the JSON file,
# and visual=True also installs the Visual NLP libraries.
nlp.install(
    json_license_path="./spark_nlp_for_healthcare_spark_ocr_10538.json",
    visual=True,
    refresh_install=True,
)

In [None]:
# Restart the session (kernel/runtime) now, then continue from the next cell.

In [2]:
from johnsnowlabs import visual, nlp

# Start the Spark session with Visual NLP enabled, pointing at the same
# license file that was used for installation.
spark = nlp.start(
    json_license_path="./spark_nlp_for_healthcare_spark_ocr_10538.json",
    hardware_target="gpu",
    visual=True,
)

spark

👌 License info detected in file ./spark_nlp_for_healthcare_spark_ocr_10538.json


25/10/21 09:51:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/21 09:51:50 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


👌 Launched [92mgpu optimized[39m session with with: 🚀Spark-NLP==6.1.3, 💊Spark-Healthcare==6.1.1, 🕶Spark-OCR==6.1.0, running on ⚡ PySpark==3.4.0


<h1>DICOM Metadata De-Identification</h1>

<!-- Notebook purpose -->
<p>
  This notebook focuses on the <strong>DICOM Metadata De-Identification</strong> process — the anonymization of
  Protected Health Information (PHI) stored within DICOM headers and nested tag structures.
  It demonstrates how <strong>Visual NLP</strong> transformers can be applied
  to sanitize sensitive metadata fields across large DICOM datasets while maintaining UID consistency
  and referential integrity.
</p>

<!-- Stages list -->
<h2>Concepts and Components Covered in this Notebook</h2>
<ul>
  <li>DicomToMetadata</li>
  <li>DicomMetadataDeidentifier</li>
</ul>

In [3]:
# 📦 Spark OCR Imports
from sparkocr.transformers import *
from sparkocr.enums import *
from sparkocr.utils import *
from sparkocr.schemas import BinarySchema

# ⚡ Spark NLP Core
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.pretrained import PretrainedPipeline

# 🔗 Spark ML
from pyspark.ml import Pipeline, PipelineModel
import pyspark.sql.functions as F

# 🧩 Spark NLP for Healthcare (JSL)
import sparknlp_jsl
from sparknlp_jsl.annotator import *
from sparkocr.base import LightPipeline

from pyspark.sql.functions import *
from pyspark.sql.types import *
from urllib.parse import urlparse
from IPython.display import display, Markdown
from PIL import Image, ImageDraw, ImageFont
# Imported explicitly: json.loads is used below and must not depend on a
# star import happening to re-export the module.
import json
import pkg_resources
import pandas as pd
import time

In [4]:
def compare_dicom_metadata(dicom_original_metadata: dict, dicom_redacted_metadata: dict) -> pd.DataFrame:
    """
    Compare original and redacted DICOM metadata key by key.

    Parameters
    ----------
    dicom_original_metadata : dict
        Metadata of the original file (tag name -> value).
    dicom_redacted_metadata : dict
        Metadata of the de-identified file (tag name -> value).

    Returns
    -------
    pandas.DataFrame
        One row per key of the original metadata, in the original's iteration
        order, with columns "Key", "Original Value", "Redacted Value" and
        "Value Changed". A key that is absent from the redacted metadata is
        reported with a redacted value of None and "Value Changed" set to True
        instead of raising KeyError.
    """
    rows = []

    for key, original_value in dicom_original_metadata.items():
        # .get() keeps the comparison working when the tag was dropped entirely
        # from the redacted metadata.
        redacted_value = dicom_redacted_metadata.get(key)
        changed = key not in dicom_redacted_metadata or original_value != redacted_value
        rows.append([key, original_value, redacted_value, changed])

    return pd.DataFrame(rows, columns=["Key", "Original Value", "Redacted Value", "Value Changed"])

<h2>Extract Metadata from a DICOM Object</h2>

In [5]:
# Metadata extractor: reads the "content" column and writes its result to the
# "metadata" column.
dicom_to_metadata = (
    DicomToMetadata()
    .setInputCol("content")
    .setOutputCol("metadata")
)

In [None]:
# Load the sample DICOM file with Spark's binaryFile reader.
df = (
    spark.read
    .format("binaryFile")
    .load("./data/visual/dicom/TCIA-MIDI-B-Synthetic-Test_20250502___1.1.582.0.3.8604111.4.040.1865311550435039158___1-1.dcm")
)

# Run the extractor and take the "metadata" value of the first row.
result = (
    dicom_to_metadata.transform(df)
    .select("metadata")
    .collect()[0]
    .asDict()["metadata"]
)

# The metadata value is a JSON string; parse it into Python objects.
json_result = json.loads(result)

json_result

<h2>De-Identify DICOM Metadata and Save the Results</h2>

In [10]:
# Resolve the path of the default strategy CSV that is packaged inside sparkocr.
csv_path = pkg_resources.resource_filename(
    "sparkocr",
    "resources/ocr/dicom/default_dicom_strategy.csv",
)

# Load the strategy table into pandas and preview its first rows.
strategy_csv = pd.read_csv(csv_path)

strategy_csv.head(10)

Unnamed: 0,Tags,VR,Name,Status,Action,Repeatable
0,"(0002,0100)",UI,Private Information Creator UID,,hashId,
1,"(0002,0102)",OB,Private Information,,hashId,
2,"(0004,1130)",CS,File-set ID,,hashId,
3,"(0004,1141)",CS,File-set Descriptor File ID,,hashId,
4,"(0004,1432)",UI,Private Record UID,,hashId,
5,"(0004,1500)",CS,Referenced File ID,,hashId,
6,"(0008,0012)",DA,Instance Creation Date,,shiftDateByRandomNbOfDays,
7,"(0008,0014)",UI,Instance Creator UID,,hashId,
8,"(0008,0020)",DA,Study Date,,shiftDateByFixedNbOfDays,112.0
9,"(0008,0021)",DA,Series Date,,shiftDateByRandomNbOfDays,


In [11]:
# De-identifier configured with the strategy CSV loaded above. It reads the
# "content" column and writes its output to "dicom_metadata_cleaned".
# NOTE(review): "<hidden>" appears in the de-identified name fields in the
# results further down; presumably it is the replacement text for redacted
# values — confirm against the DicomMetadataDeidentifier docs.
dicom_deidentifier = (
    DicomMetadataDeidentifier()
    .setInputCols(["content"])
    .setOutputCol("dicom_metadata_cleaned")
    .setRemovePrivateTags(False)
    .setPlaceholderText("<hidden>")
    .setStrategyFile(csv_path)
)

In [19]:
df = spark.read.format("binaryFile").load("./data/visual/dicom/TCIA-MIDI-B-Synthetic-Test_20250502___1.1.582.0.3.8604111.4.040.1865311550435039158___1-1.dcm")

# De-identify the metadata and write the resulting DICOM files to disk.
# DataFrameWriter.save() returns None, so its result is deliberately not bound
# to a name; binding it to `result` would replace the metadata string produced
# by the extraction cell with None.
(
    dicom_deidentifier.transform(df)
    .write
    .format("binaryFormat")
    .option("type", "dicom")
    .option("field", "dicom_metadata_cleaned")
    .option("nameField", "fileName")
    .option("extension", "dcm")
    .option("prefix", "de-id-")
    .mode("overwrite")
    .save("./data/result_metadata/")
)

25/10/21 10:40:37 WARN SparkContext: The path /usr/local/lib/python3.12/dist-packages/sparkocr/resources/ocr/dicom/first_names.all.txt has been added already. Overwriting of added paths is not supported in the current version.
25/10/21 10:40:37 WARN SparkContext: The path /usr/local/lib/python3.12/dist-packages/sparkocr/resources/ocr/dicom/last_names.all.txt has been added already. Overwriting of added paths is not supported in the current version.
  import pkg_resources
10:40:40, INFO Run DicomMetadataDeidentifier
10:40:41, INFO DEBUG: tag:(2, 256), vr:UI, action:hashId, option:None
10:40:41, INFO DEBUG: tag:(2, 258), vr:OB, action:hashId, option:None
10:40:41, INFO DEBUG: tag:(4, 4400), vr:CS, action:hashId, option:None
10:40:41, INFO DEBUG: tag:(4, 4417), vr:CS, action:hashId, option:None
10:40:41, INFO DEBUG: tag:(4, 5170), vr:UI, action:hashId, option:None
10:40:41, INFO DEBUG: tag:(4, 5376), vr:CS, action:hashId, option:None
10:40:41, INFO DEBUG: tag:(8, 18), vr:DA, action:shiftD

<h2>Run DicomToMetadata and Extract Results</h2>

In [20]:
# Read back the de-identified DICOM files written by the previous step.
df = (
    spark.read
    .format("binaryFile")
    .load("./data/result_metadata/*.dcm")
)

# Extract the metadata of the first file returned.
deid_result = (
    dicom_to_metadata.transform(df)
    .select("metadata")
    .collect()[0]
    .asDict()["metadata"]
)

# Parse the JSON metadata string into Python objects.
json_deid_result = json.loads(deid_result)

json_deid_result

  warn_and_log(msg)
                                                                                

{'SpecificCharacterSet': 'ISO_IR 100',
 'SOPClassUID': '1.2.840.10008.5.1.4.1.1.7',
 'SOPInstanceUID': '1.1.582.0.3.8604111.4.040.1906345282118337300',
 'StudyDate': '20131109',
 'ContentDate': '20130905',
 'StudyTime': '110007',
 'ContentTime': '110007',
 'AccessionNumber': '2.25.104661169016722362372224050231930259815',
 'Modality': 'MG',
 'ConversionType': 'WSD',
 "ReferringPhysician'sName": '<hidden>',
 'AdmittingDiagnosesDescription': 'Suspected breast cancer; Ordered by Dr. Burke',
 "Patient'sName": '<hidden>',
 'PatientID': '2.25.104661169016722362372224050231930259815',
 "Patient'sBirthDate": '19410504',
 "Patient'sSex": 'U',
 "Patient'sAge": '056M',
 'PrivateCreator': 'CTP',
 'Privatetagdata': '29059656',
 'BodyPartExamined': 'Chest',
 'SecondaryCaptureDeviceManufacturer': 'MathWorks',
 "SecondaryCaptureDeviceManufacturer'sModelName": 'MATLAB',
 'ProtocolName': 'Performed 20130720',
 'StudyInstanceUID': '2.25.104661169016722362372224050231930259815',
 'SeriesInstanceUID': '2.2

<h3>Compare the Tags with Changes</h3>

In [21]:
# Build the key-by-key comparison of original vs. de-identified metadata.
pd_df = compare_dicom_metadata(json_result, json_deid_result)

# Display only the rows whose value changed (at most 50 of them).
pd_df.loc[pd_df["Value Changed"].eq(True)].head(50)

Unnamed: 0,Key,Original Value,Redacted Value,Value Changed
3,StudyDate,20130720,20131109,True
4,ContentDate,20130720,20130905,True
7,AccessionNumber,20130720E771698,2.25.104661169016722362372224050231930259815,True
10,ReferringPhysician'sName,BURKE^KAYLA,<hidden>,True
12,Patient'sName,MENDOZA^BRUCE,<hidden>,True
13,PatientID,930219598,2.25.104661169016722362372224050231930259815,True
14,Patient'sBirthDate,19410527,19410504,True
16,Patient'sAge,072Y,056M,True
19,BodyPartExamined,Left Breast,Chest,True
23,StudyInstanceUID,1.1.582.0.3.8604111.4.040.2461550950283810514,2.25.104661169016722362372224050231930259815,True


In [22]:
import os
import shutil

# Remove the de-identified output directory created earlier in the notebook.
# The existence check makes this cell safe to re-run: shutil.rmtree raises
# FileNotFoundError when the directory is already gone.
result_dir = "./data/result_metadata/"
if os.path.isdir(result_dir):
    shutil.rmtree(result_dir)