In [2]:
from sparkocr import start
import os
import json
import time
import shutil

# Java installation that the Spark JVM is launched from
os.environ["JAVA_HOME"] = "/home/linuxbrew/.linuxbrew/Cellar/openjdk@21/21.0.9"

# License JSON; every entry in it is exported as an environment variable below
license = "/workspace/spark_nlp_for_healthcare_spark_ocr_10538.json"

# Fail early when no JSON license path has been configured
if not (license and "json" in license):
    raise Exception("License JSON File is not specified")

with open(license, "r") as creds_in:
    creds = json.load(creds_in)

# Later cells read SECRET, JSL_VERSION and PUBLIC_VERSION back from os.environ
for key, value in creds.items():
    os.environ[key] = value


# Spark settings forwarded to the session through `extra_conf`
extra_configurations = {
    # --- required entries ---
    "spark.extraListeners": "com.johnsnowlabs.license.LicenseLifeCycleManager",
    "spark.sql.legacy.allowUntypedScalaUDF": "true",
    # --- cluster sizing ---
    "spark.executor.instances": "7",
    "spark.executor.cores": "16",
    "spark.executor.memory": "100G",
    "spark.driver.memory": "100G",
    "spark.sql.shuffle.partitions": "896",
    # --- pretrained model cache location ---
    "spark.jsl.settings.pretrained.cache_folder": "/workspace/cache_pretrained",
}

# Start Spark Session With Visual NLP, Healthcare NLP and GPU Runtime.
# Secret and version values come from the environment variables exported
# from the license JSON.
spark = start(
    jar_path="./",
    nlp_secret=os.environ.get("SECRET"),
    nlp_internal=os.environ.get("JSL_VERSION"),
    nlp_version=os.environ.get("PUBLIC_VERSION"),
    apple_silicon=False,
    use_gpu=True,
    logLevel="ERROR",
    extra_conf=extra_configurations,
)

spark

Spark version: 3.5.0
Spark NLP version: 6.2.0
Spark NLP for Healthcare version: 6.2.0
Spark OCR version: 6.1.2rc1



In [3]:
# 📦 Spark OCR Imports
from sparkocr.transformers import *
from sparkocr.enums import *
from sparkocr.utils import *

# ⚡ Spark NLP Core
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.pretrained import PretrainedPipeline

# 🔗 Spark ML
from pyspark.ml import Pipeline, PipelineModel
import pyspark.sql.functions as F

# 🧩 Spark NLP for Healthcare (JSL)
import sparknlp_jsl
from sparknlp_jsl.annotator import *

from IPython.display import display, Markdown

import pkg_resources
import glob 
import math
import pandas as pd

  import pkg_resources


## Define Metadata & Pixel Level Components

In [5]:
# Strategy CSV passed to the DICOM metadata stages via setStrategyFile
strategy_file_path = "/workspace/midi_b_strategy_ner.csv"

# DicomToImageV3 stage: reads "content", writes "image_raw"
dicom_to_image = (
    DicomToImageV3()
    .setInputCols(["content"])
    .setOutputCol("image_raw")
    .setKeepInput(False)
)

# Pretrained text detector: reads "image_raw", writes "text_regions"
text_detector = (
    ImageTextDetector.pretrained("image_text_detector_mem_opt", "en", "clinical/ocr")
    .setInputCol("image_raw")
    .setOutputCol("text_regions")
    .setScoreThreshold(0.7)
    .setWithRefiner(True)
    .setUseGPU(True)
    .setWidth(0)
)

# Pretrained OCR model: reads "image_raw" restricted to "text_regions",
# writes "text" in the "text_with_positions" output format
ocr = (
    ImageToTextV2.pretrained("ocr_large_printed_v2_opt", "en", "clinical/ocr")
    .setRegionsColumn("text_regions")
    .setInputCols(["image_raw"])
    .setOutputCol("text")
    .setOutputFormat("text_with_positions")
    .setGroupImages(False)
    .setKeepInput(False)
    .setUseGPU(True)
    .setUseCaching(True)
    .setBatchSize(4)
)

# ---- Stages prefixed "p_": NLP over the OCR output column "text" ----

p_document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("p_document")
    .setCleanupMode("disabled")
)

# Newline is registered as an additional (not exclusive) sentence boundary
p_sentencer = (
    SentenceDetector()
    .setInputCols(["p_document"])
    .setOutputCol("p_sentence")
    .setCustomBounds(["\n"])
    .setUseCustomBoundsOnly(False)
)

p_tokenizer = (
    Tokenizer()
    .setInputCols(["p_sentence"])
    .setOutputCol("p_token")
)

# Entity labels requested from the zero-shot NER model below
labels = ["DATE", "DOCTOR", "PATIENT"]
p_zeroshot_ner_deid_subentity_docwise_medium = (
    PretrainedZeroShotNER().pretrained("zeroshot_ner_deid_subentity_docwise_medium", "en", "clinical/models")
    .setInputCols("p_sentence", "p_token")
    .setOutputCol("p_zeroshot_ner_deid_subentity_docwise_medium")
    .setPredictionThreshold(0.5)
    .setLabels(labels)
)

p_zeroshot_ner_deid_subentity_docwise_medium_ner_converter = (
    NerConverterInternal()
    .setInputCols("p_sentence", "p_token", "p_zeroshot_ner_deid_subentity_docwise_medium")
    .setOutputCol("p_zeroshot_ner_deid_subentity_docwise_medium_ner_chunk")
    .setThreshold(0.80)
)

# Each rule is "<regex>;<label>", split on the ";" delimiter set below
p_regex_rules = [
    r"\[\s*([MFU])\s*\](?:\s|\b|$);GENDER",
    r"\b(?:JT|SWU|JKR|MWF|ICG|NKF|YH|TJN|LEITO|ACO|CEF|CMS|JGR|MSS|MHS|ROC|LM|RCN|FTA|MGO|LACI|VV|HA|TR|CJA)\b;CODE",
]
p_regex_matcher = (
    RegexMatcher()
    .setInputCols("p_document")
    .setOutputCol("p_regex")
    .setRules(p_regex_rules)
    .setDelimiter(";")
)

p_chunk_converter = (
    ChunkConverter()
    .setInputCols("p_regex")
    .setOutputCol("p_regex_chunk")
)

# Merge regex chunks with the zero-shot NER chunks
p_chunk_merger = (
    ChunkMergeApproach()
    .setInputCols("p_regex_chunk", "p_zeroshot_ner_deid_subentity_docwise_medium_ner_chunk")
    .setOutputCol("p_merged_ner_chunk")
    .setMergeOverlapping(True)
)

# Reads the merged chunks plus the "positions" page matrix, writes "coordinates"
position_finder = (
    PositionFinder()
    .setInputCols("p_merged_ner_chunk")
    .setOutputCol("coordinates")
    .setPageMatrixCol("positions")
    .setSmoothCoordinates(True)
)

# ---- Stages prefixed "t_": NLP over the DICOM tag text in "t_document" ----

# Reads "path"; writes "metadata", "tag_mapping" and "t_document"
dicom_to_metadata = (
    DicomToMetadata()
    .setInputCol("path")
    .setOutputCol("metadata")
    .setKeepInput(True)
    .setExtractTagForNer(True)
    .setTagMappingCol("tag_mapping")
    .setDocumentCol("t_document")
    .setStrategyFile(strategy_file_path)
)

# Sentences are split only on the literal "<dicom>" marker
t_sentencer = (
    SentenceDetector()
    .setInputCols(["t_document"])
    .setOutputCol("t_sentence")
    .setCustomBounds(["<dicom>"])
    .setUseCustomBoundsOnly(True)
)

t_tokenizer = (
    Tokenizer()
    .setInputCols(["t_sentence"])
    .setOutputCol("t_token")
    .setContextChars(["_", "^", "."])
)

t_word_embeddings = (
    WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
    .setInputCols(["t_sentence", "t_token"])
    .setOutputCol("t_embeddings")
)

t_ner_deidentify_dl = (
    MedicalNerModel.pretrained("ner_deidentify_dl", "en", "clinical/models")
    .setInputCols("t_sentence", "t_token", "t_embeddings")
    .setOutputCol("t_ner_deidentify_dl")
)

# Only the whitelisted entity types are passed on as chunks
t_ner_deidentify_dl_ner_converter = (
    NerConverterInternal()
    .setInputCols(["t_sentence", "t_token", "t_ner_deidentify_dl"])
    .setOutputCol("t_ner_deidentify_dl_ner_chunk")
    .setWhiteList(["DOCTOR", "PATIENT", "ORGANIZATION", "LOCATION-OTHER", "COUNTRY", "CITY", "STATE", "STREET", "ZIP"])
    .setThreshold(0.82)
)

# Entity labels requested from the large zero-shot NER model below
labels = ["CITY", "COUNTRY", "DATE", "DOCTOR", "HOSPITAL", "ORGANIZATION", "PATIENT", "PHONE", "PROFESSION", "STATE", "STREET", "ZIP"]
t_zeroshot_ner_deid_subentity_docwise_large = (
    PretrainedZeroShotNER().pretrained("zeroshot_ner_deid_subentity_docwise_large", "en", "clinical/models")
    .setInputCols("t_sentence", "t_token")
    .setOutputCol("t_zeroshot_ner_deid_subentity_docwise_large")
    .setPredictionThreshold(0.90)
    .setLabels(labels)
)

t_zeroshot_ner_deid_subentity_docwise_large_ner_converter = (
    NerConverterInternal()
    .setInputCols("t_sentence", "t_token", "t_zeroshot_ner_deid_subentity_docwise_large")
    .setOutputCol("t_zeroshot_ner_deid_subentity_docwise_large_ner_chunk")
)

# Known short codes to match as whole words in the DICOM tag text
# (whitespace-separated here; split() yields the list in the same order).
codes = """
    AK TSARH BH FGH PH WGMC GVH MCH MJM SS CWASH RASC CMCH FMCC
    TLM RMABCH EBMC MM SCH WM AAARM HM WHAYCC BGH NH MDCH DSAPCC
    MBARCH CMC MLASGH FWARMC GFASCC JCH MKAAH WNGH PAAHMC WBARCC HG
    WG HMCC CHC HCCH CHAJH PQACH MSCH MJH HMAJMC DHCH GGAMH LGMC
    HRG HGAWC WJADC DC LCH WSC BJAHGH SSABM HSG HHAMM BGCC JASGH
    HWGH AMC KM RS DL CL LS KP CV AR CB SE ES JD CCH DBAMCH
    MSAMM SBAJH RTAJCC BASM VHG HPCC HPAOM AL RD CW DTAMG SCASM
    HFACG YPH MWM SL SW VAGH SJ AH CH JCASCH LBAWCH ECC NRALG
    MWG WBAACH FBH CAAFMC BYARMC
""".split()

# Single alternation, anchored on word boundaries, with one capture group
pattern = r"\b({})\b".format("|".join(codes))

# Rule-based matcher over "t_sentence". Each rule is "<regex>;<label>",
# split on the ";" delimiter configured at the end of the chain.
t_regex_matcher = RegexMatcher() \
    .setInputCols("t_sentence") \
    .setOutputCol("t_regex") \
    .setRules([
        # Match known hospital or facility codes (from predefined list)
        pattern + ";CODES",
        # Match professional titles like Dr., DR., dr
        r"\b[Dd][Rr]\b\.?;PROFESSION",
        # Match numeric IDs in format: 123.456.7890x123
        r"\d{3}\.\d{3}\.\d{4}x\d{3};ID",
        # Match compact six-digit date-like patterns: two digits, then a
        # month (01-12), then a day (01-31), e.g. YYMMDD
        r"\b(?:\d{2}(0[1-9]|1[0-2])(0[1-9]|[12]\d|3[01]))\b;DATE",
        # Match phone-like numbers with optional parentheses and an optional
        # extension, e.g. (151)265-4464x20; this rule is labeled PHONE
        r"\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}(?:x\d{1,5})?;PHONE",
        # Match SSN-style numeric pattern (e.g. 300-45-3175); labeled as DATE
        # NOTE(review): the DATE label for this pattern looks deliberate - confirm
        r"\b\d{3}-\d{2}-\d{4}\b;DATE"]) \
    .setDelimiter(";") 

# Converts the "t_regex" matches into "t_regex_ner_chunk"
t_chunk_converter = ChunkConverter()\
    .setInputCols("t_regex")\
    .setOutputCol("t_regex_ner_chunk")

# Merges the regex, ner_deidentify_dl and zero-shot chunk columns into one
t_chunk_merger = ChunkMergeApproach()\
    .setInputCols('t_regex_ner_chunk', 't_ner_deidentify_dl_ner_chunk', 't_zeroshot_ner_deid_subentity_docwise_large_ner_chunk')\
    .setOutputCol('t_merged_ner_chunk')\
    .setMergeOverlapping(True)

# Chunk texts handed to the ChunkFilterer blacklist below
filter_chunk = [
    "US", "MR BREAS", "MR BREAST BILAT", "MR BREASTUNI", "MIEDNICA", "WWO", "THINS",
    "3D Rendering", "MAMMO", "CT", "DCE", "CA", "MC prostaat kliniek detectie-mc",
]

# Filters merged chunks on their "result" text using the blacklist
# NOTE(review): presumably drops chunks whose text is in filter_chunk - confirm
t_chunk_filterer = (
    ChunkFilterer()
    .setInputCols(["t_sentence", "t_merged_ner_chunk"])
    .setOutputCol("t_filtered")
    .setCriteria("isin")
    .setFilterValue("result")
    .setBlackList(filter_chunk)
)

deid_obfuscated = (
    DeIdentification()
    .setInputCols(["t_sentence", "t_token", "t_filtered"])
    .setOutputCol("deid_documents")
    .setMode("deid")
)

# Consumes "deid_documents" and "tag_mapping"; writes "dicom_metadata_cleaned"
dicom_deidentifier = (
    DicomMetadataDeidentifier()
    .setInputCols(["path", "metadata"])
    .setOutputCol("dicom_metadata_cleaned")
    .setTagMappingCol("tag_mapping")
    .setKeepInput(True)
    .setTagCleanedCol("deid_documents")
    .setStrategyFile(strategy_file_path)
)

# Applies the regions in "coordinates" to the cleaned DICOM; writes "dicom_final"
draw_regions = (
    DicomDrawRegions()
    .setInputCol("dicom_metadata_cleaned")
    .setInputRegionsCol("coordinates")
    .setOutputCol("dicom_final")
    .setAggCols(["path"])
    .setKeepInput(False)
)

# Metadata of the source file, kept for the before/after comparison
extract_original_tag = (
    DicomToMetadata()
    .setInputCol("path")
    .setOutputCol("metadata_original")
    .setKeepInput(True)
    .setExtractTagForNer(False)
    .setStrategyFile(strategy_file_path)
)

# Metadata of the final de-identified DICOM
extract_final_tag = (
    DicomToMetadata()
    .setInputCol("dicom_final")
    .setOutputCol("metadata_cleaned")
    .setKeepInput(True)
    .setExtractTagForNer(False)
    .setStrategyFile(strategy_file_path)
)

# Image -> text regions -> OCR text -> entity chunks -> pixel coordinates
pixel_stages = [
    dicom_to_image,
    text_detector,
    ocr,
    p_document_assembler,
    p_sentencer,
    p_tokenizer,
    p_zeroshot_ner_deid_subentity_docwise_medium,
    p_zeroshot_ner_deid_subentity_docwise_medium_ner_converter,
    p_regex_matcher,
    p_chunk_converter,
    p_chunk_merger,
    position_finder,
]

# DICOM tags -> sentences/tokens -> entity chunks -> de-identified text
metadata_stages = [
    dicom_to_metadata,
    t_sentencer,
    t_tokenizer,
    t_word_embeddings,
    t_ner_deidentify_dl,
    t_ner_deidentify_dl_ner_converter,
    t_zeroshot_ner_deid_subentity_docwise_large,
    t_zeroshot_ner_deid_subentity_docwise_large_ner_converter,
    t_regex_matcher,
    t_chunk_converter,
    t_chunk_merger,
    t_chunk_filterer,
    deid_obfuscated,
]

# Write back cleaned metadata, draw regions, then extract before/after tags
output_stages = [
    dicom_deidentifier,
    draw_regions,
    extract_original_tag,
    extract_final_tag,
]

# Stage order is unchanged: pixel stages, then metadata stages, then outputs
pipeline = Pipeline(stages=[*pixel_stages, *metadata_stages, *output_stages])

image_text_detector_mem_opt download started this may take some time.
Approximate size to download 77.5 MB
zeroshot_ner_deid_subentity_docwise_medium download started this may take some time.
Approximate size to download 678.7 MB
[OK!]
embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[OK!]
ner_deidentify_dl download started this may take some time.
Approximate size to download 14.1 MB
[OK!]
zeroshot_ner_deid_subentity_docwise_large download started this may take some time.
Approximate size to download 1.5 GB
[OK!]


## Download MIDI-B Pixel Subset

In [7]:
!pip install gdown --quiet

IOStream.flush timed out


In [10]:
# Fetch the archive by its Google Drive file id and save it as /workspace/midib.zip.
!gdown 1ndhUYKgUqmhD2N4ulJs0rX2GytqhT28D -O /workspace/midib.zip

IOStream.flush timed out
Downloading...
From (original): https://drive.google.com/uc?id=1ndhUYKgUqmhD2N4ulJs0rX2GytqhT28D
From (redirected): https://drive.google.com/uc?id=1ndhUYKgUqmhD2N4ulJs0rX2GytqhT28D&confirm=t&uuid=bdcfbdd3-b875-4e1f-9427-80df9d521df6
To: /workspace/midib.zip
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 300M/300M [00:09<00:00, 33.1MB/s]


In [11]:
!unzip -q /workspace/midib.zip -d /workspace/midib

IOStream.flush timed out


## Load DICOM Files

In [14]:
import glob

# glob returns matches in arbitrary order; sort them so the file list
# (and anything derived from it) is the same on every run.
dicom_files = sorted(glob.glob("/workspace/midib/midib/*.dcm"))

print(f"Total Files : {len(dicom_files)}")

Total Files : 25


## Running Pixel & Metadata DeIdentification

In [15]:
# Read every DICOM file as one binaryFile row and collapse to a single partition
dicom_df = (
    spark.read
    .format("binaryFile")
    .load(dicom_files)
    .repartition(1)
)

print(f"Total Dicom Files : {dicom_df.count()}")
print(f"Total Number of Partitions : {dicom_df.rdd.getNumPartitions()}")

                                                                                

Total Dicom Files : 25




Total Number of Partitions : 1


In [18]:
# Fit the pipeline on the input frame and apply it to the same frame
result = (
    pipeline
    .fit(dicom_df)
    .transform(dicom_df)
)

# Persist the output as parquet, replacing any previous run's results
(
    result
    .repartition(25)
    .write
    .format("parquet")
    .mode("overwrite")
    .save("./temp_deid_ocr_results")
)

result.columns



['dicom_final', 'exception', 'path', 'metadata_original', 'metadata_cleaned']

## Load Results

In [43]:
# Reload the persisted results
df = (
    spark.read
    .format("parquet")
    .load("./temp_deid_ocr_results")
)

# List the distinct values of the "exception" column
df.select("exception").distinct().show(10, False)

+---------+
|exception|
+---------+
|         |
+---------+



## Pixel Level Results

In [31]:
def normalize_to_uint8(arr):
    """
    Normalise any numeric array to the 0–255 range as uint8.

    The input is min-max scaled: its minimum maps to 0 and its maximum to
    255. Fractional results are truncated by the final uint8 cast. An array
    whose values are all equal maps to all zeros.

    Parameters
    ----------
    arr : numpy.ndarray or array-like of numbers
        Values to rescale. The input is not modified.

    Returns
    -------
    numpy.ndarray
        uint8 array with the same shape as the input. An empty input yields
        an empty uint8 array instead of raising.
    """
    import numpy as np

    # astype(float) always copies, so the in-place arithmetic below never
    # touches the caller's data.
    scaled = np.asarray(arr).astype(float)

    # min()/max() raise on zero-size arrays; there is nothing to scale anyway.
    if scaled.size == 0:
        return scaled.astype(np.uint8)

    scaled -= scaled.min()

    peak = scaled.max()
    # Skip the division for constant input to avoid dividing by zero.
    if peak > 0:
        scaled /= peak

    return (scaled * 255).astype(np.uint8)

def combine_side_by_side(left, right):
    """Combine two images horizontally using Pillow.

    Parameters
    ----------
    left, right : array-like accepted by ``PIL.Image.fromarray``
        Pixel data of the two images.

    Returns
    -------
    PIL.Image.Image
        RGB image with ``left`` pasted at the origin and ``right`` pasted
        immediately to its right, on a white canvas. When the heights
        differ, ``right`` is resized to ``left``'s height while keeping its
        aspect ratio.
    """
    from PIL import Image

    left_img = Image.fromarray(left)
    right_img = Image.fromarray(right)

    # Match heights
    if left_img.height != right_img.height:
        aspect = right_img.width / right_img.height
        target_height = left_img.height
        # A very narrow right image can truncate to width 0, which resize()
        # rejects; clamp to at least one pixel.
        target_width = max(1, int(aspect * target_height))
        right_img = right_img.resize((target_width, target_height), Image.LANCZOS)

    # Create canvas
    canvas_size = (
        left_img.width + right_img.width,
        max(left_img.height, right_img.height),
    )
    combined = Image.new("RGB", canvas_size, color=(255, 255, 255))

    # Paste
    combined.paste(left_img, (0, 0))
    combined.paste(right_img, (left_img.width, 0))

    return combined

In [25]:
import pydicom 
import io 

# Show each original image next to its de-identified counterpart
for item in df.select("path", "dicom_final").toLocalIterator():
    data = item.asDict()

    # Paths are stored as "file:/..." URIs. Strip only the leading scheme so
    # that a "file:" substring elsewhere in the path is left untouched.
    source_path = data["path"]
    if source_path.startswith("file:"):
        source_path = source_path[len("file:"):]

    # Read Source DICOM File
    original_dcm = pydicom.dcmread(source_path)

    # Read De-Identified DICOM File from the bytes stored in "dicom_final"
    redacted_dcm = pydicom.dcmread(io.BytesIO(data["dicom_final"]))

    original_image = normalize_to_uint8(original_dcm.pixel_array)
    redacted_image = normalize_to_uint8(redacted_dcm.pixel_array)

    display(combine_side_by_side(original_image, redacted_image))

[Stage 54:>                                                         (0 + 1) / 1]

## Metadata Level Results

In [44]:
import json 

# One row per (file, tag): [path, tag, VR, original value, cleaned value, changed?]
collect_result = []

for item in df.select("path", "metadata_original", "metadata_cleaned").toLocalIterator():

    data = item.asDict()

    metadata_original = json.loads(data["metadata_original"])
    metadata_cleaned = json.loads(data["metadata_cleaned"])

    # A separate loop variable keeps the outer row `item` from being overwritten.
    for tag, original_entry in metadata_original.items():
        original_value = original_entry["value"]
        # A tag absent from the cleaned metadata is reported with a cleaned
        # value of None instead of aborting the whole loop with a KeyError.
        cleaned_value = metadata_cleaned.get(tag, {}).get("value")

        value_changed = original_value != cleaned_value

        collect_result.append([data["path"], tag, original_entry["vr"], original_value, cleaned_value, value_changed])

In [47]:
import pandas as pd

# Column names, in the order each row of collect_result was built
columns = [
    "File",
    "Tag",
    "VR",
    "Original_Value",
    "Cleaned_Value",
    "Is_Changed",
]

metadata_df = pd.DataFrame(data=collect_result, columns=columns)
metadata_df.head(n=20)

Unnamed: 0,File,Tag,VR,Original_Value,Cleaned_Value,Is_Changed
0,file:/workspace/midib/midib/TCIA-MIDI-B-Synthe...,00080005,CS,ISO_IR 100,ISO_IR 100,False
1,file:/workspace/midib/midib/TCIA-MIDI-B-Synthe...,00080008,CS,"[ORIGINAL, PRIMARY, ]","[ORIGINAL, PRIMARY, ]",False
2,file:/workspace/midib/midib/TCIA-MIDI-B-Synthe...,00080016,UI,1.2.840.10008.5.1.4.1.1.1.1,1.2.840.10008.5.1.4.1.1.1.1,False
3,file:/workspace/midib/midib/TCIA-MIDI-B-Synthe...,00080018,UI,1.2.714.0.0.7426654.9.695.5152792103536035321,2.25.113682148897637558461565132184829650959,True
4,file:/workspace/midib/midib/TCIA-MIDI-B-Synthe...,00080020,DA,20120413,20120224,True
5,file:/workspace/midib/midib/TCIA-MIDI-B-Synthe...,00080021,DA,20120413,20120321,True
6,file:/workspace/midib/midib/TCIA-MIDI-B-Synthe...,00080022,DA,20120413,20120405,True
7,file:/workspace/midib/midib/TCIA-MIDI-B-Synthe...,00080023,DA,20120413,20120222,True
8,file:/workspace/midib/midib/TCIA-MIDI-B-Synthe...,00080024,DA,20120413,20120408,True
9,file:/workspace/midib/midib/TCIA-MIDI-B-Synthe...,00080025,DA,20120413,20120320,True


In [51]:
# Only the tags whose value was altered, without the long file path column
(
    metadata_df
    .loc[metadata_df["Is_Changed"].eq(True)]
    .drop(columns=["File"])
    .head(50)
)

Unnamed: 0,Tag,VR,Original_Value,Cleaned_Value,Is_Changed
3,00080018,UI,1.2.714.0.0.7426654.9.695.5152792103536035321,2.25.113682148897637558461565132184829650959,True
4,00080020,DA,20120413,20120224,True
5,00080021,DA,20120413,20120321,True
6,00080022,DA,20120413,20120405,True
7,00080023,DA,20120413,20120222,True
8,00080024,DA,20120413,20120408,True
9,00080025,DA,20120413,20120320,True
10,0008002A,DT,20120413171319,20120317171319,True
13,00080050,SH,20120413E150359,<EMPTY>,True
16,00080090,PN,PARKS^LINDSEY,tillio clarkston,True


## Extract UID, Patient ID Mapping

In [54]:
from sparkocr.utils import generate_dicom_mapping

# Print the helper's signature and documentation before calling it.
help(generate_dicom_mapping)

Help on function generate_dicom_mapping in module sparkocr.utils:

generate_dicom_mapping(df, original_col, final_col)



In [58]:
# Build the old -> new identifier mappings from the before/after metadata columns
mapping = generate_dicom_mapping(
    df=df,
    original_col="metadata_original",
    final_col="metadata_cleaned",
)

mapping.keys()

dict_keys(['uid_mapping', 'patient_mapping'])

In [61]:
columns = ["id_old", "id_new"]

# One (old, new) row per entry of the UID mapping
uid_mapping = pd.DataFrame(
    [(old_id, new_id) for old_id, new_id in mapping["uid_mapping"].items()],
    columns=columns,
)

uid_mapping.head(n=20)

Unnamed: 0,id_old,id_new
0,1.2.840.10008.5.1.4.1.1.1.1,1.2.840.10008.5.1.4.1.1.1.1
1,1.2.714.0.0.7426654.9.695.5152792103536035321,2.25.113682148897637558461565132184829650959
2,1.2.714.0.0.7426654.9.695.1530417802811153775,2.25.14772223078375165741074291410435429768
3,1.2.714.0.0.7426654.9.695.7173207380656716992,2.25.46000258992575277194697886707035584209
4,1.2.714.0.0.7426654.9.695.1102170333010371365,2.25.322153990049235191949770267525079453201
5,1.2.714.0.0.7426654.9.695.1941648348585858575,2.25.168327107346992328687144303305304457760
6,1.2.840.10008.5.1.4.1.1.1,1.2.840.10008.5.1.4.1.1.1
7,1.4.776.1.3.4191796.3.569.1357167488769591508,2.25.308787424999921713098441032168435419356
8,1.4.776.1.3.4191796.3.569.1580326231403656368,2.25.269777154801955276641730628531103824047
9,1.4.776.1.3.4191796.3.569.9002808596826843594,2.25.171130700536356812562644183497110571449


In [62]:
columns = ["id_old", "id_new"]

# One (old, new) row per entry of the patient ID mapping
patient_mapping = pd.DataFrame(
    [(old_id, new_id) for old_id, new_id in mapping["patient_mapping"].items()],
    columns=columns,
)

patient_mapping.head(n=20)

Unnamed: 0,id_old,id_new
0,913829086,131119190
1,8529046531,2111110101
2,8326487951,7515991291
3,5496874289,7111410731
4,4025940893,8374149610
5,751685205,110579410
6,5454430957,1010857947
7,356377258,630115015
8,4790680224,5151313155
9,2414418897,6139131064
