![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

<!-- ========================================================= -->
<!--        John Snow Labs - Package Installation Guide         -->
<!--        JupyterLab Single Markdown Cell (HTML Content)      -->
<!-- ========================================================= -->

<h1>Package Installation</h1>

<!-- Link to official GitHub repository -->
<p>
  Official Repository:
  <a href="https://github.com/JohnSnowLabs/johnsnowlabs" target="_blank">
    https://github.com/JohnSnowLabs/johnsnowlabs
  </a>
</p>

<!-- License setup instructions -->
<p>
  Keep your <strong>license keys</strong> in a JSON file and point to it using the
  <code>json_license_path</code> argument when starting the Spark session.
</p>

<!-- Visual NLP configuration note -->
<p>
  Set <code>visual=True</code> while starting the Spark session to install and make
  <strong>Visual NLP libraries</strong> available.
</p>

<!-- Restart note -->
<p>
  ⚠️ <strong>Important:</strong> After installing the library, make sure to
  <strong>RESTART your session</strong> before running Spark again.
</p>

<!-- End of notebook cell -->

In [None]:
!pip install -q johnsnowlabs

In [None]:
from johnsnowlabs import nlp, visual, medical

# Install the licensed libraries: the license keys are read from the JSON file,
# and visual=True requests the Visual NLP libraries as well.
nlp.install(
    refresh_install=True,
    visual=True,
    json_license_path="./spark_nlp_for_healthcare_spark_ocr_10538.json",
)

In [None]:
### RESTART SESSION !!!
# Restart the session here: after the installation cell has finished and
# before running any of the Spark cells below.

In [1]:
from johnsnowlabs import visual, nlp

# Start the Spark session targeting GPU hardware, with the Visual NLP libraries
# enabled and the license keys taken from the JSON file.
spark = nlp.start(
    visual=True,
    hardware_target="gpu",
    json_license_path="./spark_nlp_for_healthcare_spark_ocr_10538.json",
)

# Show the session object as the cell output.
spark

👌 License info detected in file ./spark_nlp_for_healthcare_spark_ocr_10538.json


25/10/21 09:36:28 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


👌 Launched [92mgpu optimized[39m session with with: 🚀Spark-NLP==6.1.3, 💊Spark-Healthcare==6.1.1, 🕶Spark-OCR==6.1.0, running on ⚡ PySpark==3.4.0


<h1>Spark Streaming Overview</h1>

<!-- Notebook purpose -->
<p>
  This notebook introduces <strong>Spark Streaming</strong> — the real-time and continuous data processing framework in
  <strong>Apache Spark</strong>. It demonstrates how <strong>Visual NLP</strong> and <strong>Healthcare NLP</strong> pipelines
  can be integrated with streaming data sources to perform large-scale, real-time document and image processing tasks
  such as OCR, De-identification, and Visual Question Answering.
</p>

<!-- Aggregation note -->
<p>
  In streaming mode, <strong>stages that aggregate must be executed inside</strong> the
  <code>foreachBatch</code> callback (<code>foreach_batch_function()</code> in this notebook).
  This is because stages such as <strong>DicomDrawRegions</strong> perform internal aggregations
  (e.g., collecting coordinates, exceptions, or pixel regions) that are not compatible with the default append mode
  of Structured Streaming. Inside <code>foreach_batch_function()</code> each micro-batch is handled as a regular
  (non-streaming) DataFrame, which ensures that all per-batch computations are properly handled and results are
  written deterministically.
</p>

<!-- Stages list -->
<h2>Concepts and Components Covered in this Notebook</h2>
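<ul>
  <li>clinical_deidentification_docwise_benchmark_large Pretrained Pipeline</li>
  <li>DICOM stages: DicomToMetadata, DicomToImageV3, DicomDrawRegions, DicomMetadataDeidentifier</li>
  <li>OCR and positioning stages: ImageTextDetector, ImageToTextV2, PositionFinder</li>
  <li>Spark Streaming</li>
</ul>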

In [2]:
# 📦 Spark OCR Imports
from sparkocr.transformers import *
from sparkocr.enums import *
from sparkocr.utils import *
from sparkocr.schemas import BinarySchema

# ⚡ Spark NLP Core
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.pretrained import PretrainedPipeline

# 🔗 Spark ML
from pyspark.ml import Pipeline, PipelineModel
import pyspark.sql.functions as F

# 🧩 Spark NLP for Healthcare (JSL)
import sparknlp_jsl
from sparknlp_jsl.annotator import *
from sparkocr.base import LightPipeline

from pyspark.sql.functions import *
from pyspark.sql.types import *
from urllib.parse import urlparse
from IPython.display import display, Markdown
from PIL import Image, ImageDraw, ImageFont
import pkg_resources
import pandas as pd
import time

In [None]:
# NOTE(review): PretrainedPipeline is imported again here, presumably so the name is bound to
# Spark NLP's class regardless of what the wildcard imports in the import cell export — confirm
# before removing.
from sparknlp.pretrained import PretrainedPipeline

# Load the pretrained document-wise clinical de-identification pipeline.
deid_pipeline = PretrainedPipeline(
    "clinical_deidentification_docwise_benchmark_large",
    "en",
    "clinical/models",
)

In [6]:
# ---------------------------------------------------------------------------
# DICOM ingestion: `content` -> `metadata` and `content` -> `image_raw`
# ---------------------------------------------------------------------------
dicom_to_metadata = (
    DicomToMetadata()
    .setInputCol("content")
    .setOutputCol("metadata")
    .setKeepInput(True)
)

dicom_to_image = (
    DicomToImageV3()
    .setInputCols(["content"])
    .setOutputCol("image_raw")
    .setKeepInput(False)
)

# ---------------------------------------------------------------------------
# Text detection and OCR on `image_raw` (both configured to use the GPU)
# ---------------------------------------------------------------------------
text_detector = (
    ImageTextDetector.pretrained("image_text_detector_mem_opt", "en", "clinical/ocr")
    .setInputCol("image_raw")
    .setOutputCol("text_regions")
    .setScoreThreshold(0.7)
    .setWithRefiner(True)
    .setUseGPU(True)
    .setWidth(0)
)

ocr = (
    ImageToTextV2.pretrained("ocr_large_printed_v2_opt", "en", "clinical/ocr")
    .setRegionsColumn("text_regions")  # regions produced by `text_detector`
    .setInputCols(["image_raw"])
    .setOutputCol("text")
    .setOutputFormat("text_with_positions")
    .setGroupImages(False)
    .setKeepInput(False)
    .setUseGPU(True)
    .setUseCaching(True)
    .setBatchSize(4)
)

# ---------------------------------------------------------------------------
# Extra rule-based entities: each rule is "<regex>;<label>" (delimiter ";")
# ---------------------------------------------------------------------------
regex_rules = [
    r"(?:\s[MFU]|\b[MFU])(?:\s|\b|$);GENDER",
    r"\b(?:JT|SWU|JKR|MWF|ICG|NKF|YH|TJN|LEITO|ACO|CEF|CMS|JGR|MSS|MHS|ROC|LM|RCN|FTA|MGO|LACI|VV|HA|TR|CJA)\b;CODE",
]

regex_matcher = (
    RegexMatcher()
    .setInputCols("document")
    .setOutputCol("regex")
    .setRules(regex_rules)
    .setDelimiter(";")
)

chunkConverter = (
    ChunkConverter()
    .setInputCols("regex")
    .setOutputCol("regex_chunks")
)

# Merge the regex chunks with the `ner_chunk` column into `merged_ner_chunk`.
chunk_merger = (
    ChunkMergeApproach()
    .setInputCols("regex_chunks", "ner_chunk")
    .setOutputCol("merged_ner_chunk")
    .setMergeOverlapping(True)
)

# ---------------------------------------------------------------------------
# Map merged chunks to image coordinates
# ---------------------------------------------------------------------------
position_finder = (
    PositionFinder()
    .setInputCols("merged_ner_chunk")
    .setOutputCol("coordinates")
    .setPageMatrixCol("positions")
    .setSmoothCoordinates(True)
)

# ---------------------------------------------------------------------------
# Stages kept OUT of the pipeline below; they are applied later, per batch,
# by the streaming cell.
# ---------------------------------------------------------------------------
draw_regions = (
    DicomDrawRegions()
    .setInputCol("path")
    .setInputRegionsCol("coordinates")
    .setOutputCol("dicom")
    .setAggCols(["path"])
    .setKeepInput(True)
)

dicom_deidentifier = (
    DicomMetadataDeidentifier()
    .setInputCols(["dicom"])
    .setOutputCol("dicom_cleaned")
)

# ---------------------------------------------------------------------------
# Final stage order: DICOM/OCR stages first, then every stage of the pretrained
# de-identification pipeline except its last two, then the regex/merge/position
# stages.
# ---------------------------------------------------------------------------
stages = [
    dicom_to_metadata,
    dicom_to_image,
    text_detector,
    ocr,
    *deid_pipeline.model.stages[:-2],
    regex_matcher,
    chunkConverter,
    chunk_merger,
    position_finder,
]

dicom_phi_deid_pipeline = Pipeline(stages=stages)

image_text_detector_mem_opt download started this may take some time.
Approximate size to download 77.5 MB


25/10/21 09:38:45 WARN S3AbortableInputStream: Not all bytes were read from the S3ObjectInputStream, aborting HTTP connection. This is likely an error and may result in sub-optimal behavior. Request only the bytes you need via a ranged GET or drain the input stream after use.
25/10/21 09:38:45 WARN S3AbortableInputStream: Not all bytes were read from the S3ObjectInputStream, aborting HTTP connection. This is likely an error and may result in sub-optimal behavior. Request only the bytes you need via a ranged GET or drain the input stream after use.


image_text_detector_mem_opt download started this may take some time.
Approximate size to download 77.5 MB
Download done! Loading the resource.


25/10/21 09:38:47 WARN S3AbortableInputStream: Not all bytes were read from the S3ObjectInputStream, aborting HTTP connection. This is likely an error and may result in sub-optimal behavior. Request only the bytes you need via a ranged GET or drain the input stream after use.
25/10/21 09:38:47 WARN S3AbortableInputStream: Not all bytes were read from the S3ObjectInputStream, aborting HTTP connection. This is likely an error and may result in sub-optimal behavior. Request only the bytes you need via a ranged GET or drain the input stream after use.


ocr_large_printed_v2_opt download started this may take some time.
Approximate size to download 931 MB
Download done! Loading the resource.


In [7]:
# Explicit import: this cell calls os.makedirs, and `os` is not imported explicitly anywhere
# else in the notebook, so it must not rely on a wildcard import happening to expose it.
import os

# Streaming configuration.
maxFilesPerTrigger = 2                     # value for the file source's "maxFilesPerTrigger" option
dicom_path = "./data/visual/dicom/midib/"  # directory the stream reads its input files from
outputPath = "./deid_stream_output/"       # directory the per-batch results are saved to

# Create the output directory up front; exist_ok=True keeps the cell safe to re-run.
os.makedirs(outputPath, exist_ok=True)

In [None]:
# Streaming source: read the files under `dicom_path` (recursively) as binary records,
# with the "maxFilesPerTrigger" option set from the config cell, and add a `timestamp` column.
stream_reader = (
    spark.readStream
    .format("binaryFile")
    .schema(BinarySchema)
    .option("maxFilesPerTrigger", maxFilesPerTrigger)
    .option("recursiveFileLookup", True)
)
dicom_stream_df = stream_reader.load(dicom_path).withColumn("timestamp", current_timestamp())

# Fit the pipeline on the streaming DataFrame and apply it to the same stream.
fitted_pipeline = dicom_phi_deid_pipeline.fit(dicom_stream_df)
result = fitted_pipeline.transform(dicom_stream_df)


def _base_name(path):
    """Return the last component of `path` with every ".dcm" occurrence removed."""
    return os.path.basename(path).replace(".dcm", "")


get_base_name = F.udf(_base_name, StringType())


def foreach_batch_function(df, epoch_id):
    """Handle one micro-batch: draw regions, de-identify, and save the result.

    Parameters
    ----------
    df : the DataFrame passed in by `foreachBatch` for the current micro-batch.
    epoch_id : the batch identifier passed in by `foreachBatch` (unused here).
    """
    # Apply `draw_regions` here, per batch, rather than in the streaming pipeline.
    regions_drawn = draw_regions.transform(df)
    with_file_name = regions_drawn.withColumn("fileName", get_base_name(col("path")))
    per_file = with_file_name.repartition("fileName")

    cleaned = dicom_deidentifier.transform(per_file)

    # Save the `dicom_cleaned` field, naming files "de-id-<fileName>.dcm".
    writer = (
        cleaned.write.format("binaryFormat")
        .option("type", "dicom")
        .option("field", "dicom_cleaned")
        .option("nameField", "fileName")
        .option("extension", "dcm")
        .option("prefix", "de-id-")
        .mode("append")
    )
    writer.save(outputPath)


# Start the streaming query; every micro-batch is handed to `foreach_batch_function`.
query = (
    result.writeStream
    .foreachBatch(foreach_batch_function)
    .queryName('dicom_result')
    .start()
)

query.lastProgress

In [9]:
# Display the streaming query's most recent progress report as the cell output.
query.lastProgress

{'id': 'ceb50421-613a-4bba-9157-a2c1d4e5ef53',
 'runId': '0c1323fa-16e6-49e4-abb2-c93cb2e791e9',
 'name': 'dicom_result',
 'timestamp': '2025-10-21T09:48:11.630Z',
 'batchId': 13,
 'numInputRows': 0,
 'inputRowsPerSecond': 0.0,
 'processedRowsPerSecond': 0.0,
 'durationMs': {'latestOffset': 137, 'triggerExecution': 137},
 'stateOperators': [],
 'sources': [{'description': 'FileStreamSource[file:/workspace/data/visual/dicom]',
   'startOffset': {'logOffset': 12},
   'endOffset': {'logOffset': 12},
   'latestOffset': None,
   'numInputRows': 0,
   'inputRowsPerSecond': 0.0,
   'processedRowsPerSecond': 0.0}],
 'sink': {'description': 'ForeachBatchSink', 'numOutputRows': -1}}

In [10]:
# Stop the streaming query started above.
query.stop()