In [3]:
# Python Script used to extract the Ground truth, dicom files 
# Requires a Zip File in same directory with name "dicom_files.zip"
!python prepare_data.py

In [1]:
def calculate_precision(gt, pred):
    tp = [i for i in pred if i in gt]
    try:
        precision = len(tp) / len(pred)
    except ZeroDivisionError:
        precision = 0

    return precision

def calculate_recall(gt, pred):
    tp = [i for i in pred if i in gt]
    try:
        recall = len(tp) / len(gt)
    except ZeroDivisionError:
        recall = 0

    return recall

def calculate_metrics(gt_path, df_path, save_result_path, ner_chunk="merged_ner_chunk"):

    """
    gt_path - The location of json file containing ground truth. Generate using prepare_data.py 
    df_path - The location of parquet file containing result from Visual NLP pipeline.
    save_result_path - The location of the output json file containing result 
    ner_chunk - The column name containing final chunks used for metrics.
    """

    total_precision = 0.0
    total_recall = 0.0

    with open(gt_path, "r") as file_in:
        gt = json.loads(file_in.read())

    df = spark.read.format("parquet").load(df_path).select("path", ner_chunk)

    total_files = df.count()
    print(f"Total Files : {total_files}")

    save_result = []

    for item_row in df.toLocalIterator():
        data = item_row.asDict()

        img_path = os.path.basename(data["path"])
        chunks = data[ner_chunk]

        collect_pred = []
        for chunk in chunks:
            pred = chunk.asDict()["result"].strip().lower()
            collect_pred.append(pred)

        lowered_gt = [i.lower() for i in gt[img_path]]

        precision = calculate_precision(lowered_gt, collect_pred)
        recall = calculate_recall(lowered_gt, collect_pred)

        total_precision += precision
        total_recall += recall

        save_result.append({"Filename" : img_path,
                             "Ground Truth" : lowered_gt,
                             "Predicted PHI" : collect_pred,
                             "Precision" : precision,
                             "Recall" : recall})

    total_precision = round(total_precision / total_files, 1)
    total_recall = round(total_recall / total_files, 1)
    f1_score = round(2 * ((total_precision * total_recall) / (total_precision + total_recall)), 1)

    print(f"Precision : {total_precision}")
    print(f"Recall : {total_recall}")
    print(f"F1-Score : {f1_score}")

    with open(save_result_path, "w") as result_file_out:
        json.dump(save_result, result_file_out, indent=4)

In [3]:
from sparkocr import start
import os
import json
import time
import shutil

import json

with open("./creds.json", "r") as creds_in:
    creds = json.loads(creds_in.read())["Credentials"]

os.environ['AWS_ACCESS_KEY_ID'] = creds["AccessKeyId"]
os.environ['AWS_SECRET_ACCESS_KEY'] = creds["SecretAccessKey"]
os.environ['AWS_SESSION_TOKEN'] = creds["SessionToken"]
os.environ['SPARK_OCR_LICENSE'] = creds["SPARK_OCR_LICENSE"]

colab_max = {
    "spark.driver.memory": "10g",
    "spark.executor.memory": "12g",
    "spark.executor.memoryOverhead": "3g",
    "spark.driver.memoryOverhead": "2g",
    "spark.extraListeners": "com.johnsnowlabs.license.LicenseLifeCycleManager",
    "spark.cores.max": "12",
    "spark.executor.cores": "4"
}

spark = start(jar_path="./",
              nlp_secret="5.5.2.PR-2579.6fce19f4d0e8cce1d61a808968c833c932bf2dd1",
              nlp_internal=True,
              nlp_jsl=True,
              nlp_version="5.5.2",
              extra_conf=colab_max,
              use_gpu=True)

spark

Spark version: 3.5.0
Spark NLP version: 5.5.2
Spark NLP for Healthcare version: 5.5.2
Spark OCR version: 5.5.1rc1



In [4]:
import os
import sys
import time
import pydicom
from PIL import Image
import io

from sparknlp.annotator import *
from sparknlp.base import *
import sparknlp_jsl
from sparknlp_jsl.annotator import *

import sparkocr
from sparkocr.transformers import *
from sparkocr.utils import *
from sparkocr.enums import *

from pyspark.ml import PipelineModel, Pipeline
from sparknlp.pretrained import PretrainedPipeline
from pyspark.ml import PipelineModel, Pipeline

import pyspark.sql.functions as F

print(f"Spark NLP version: {sparknlp.version()}")
print(f"Spark NLP internal version: {sparknlp_jsl.version()}")
print(f"Spark OCR version: {sparkocr.version()}")

Spark NLP version: 5.5.2
Spark NLP internal version: 5.5.2
Spark OCR version: 5.5.1rc1


In [5]:
os.makedirs("./results/", exist_ok=True)
os.makedirs("./results/deid_image_result/", exist_ok=True)
os.makedirs("./results/ner_result/", exist_ok=True)

In [43]:
dicom_to_image = DicomToImageV3() \
    .setInputCols(["content"]) \
    .setOutputCol("image_raw") \
    .setKeepInput(False)

documentAssembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document_raw") \
    .setCleanupMode("shrink")

cleanUpPatterns = ["<[^>]*>", r"\^ ?", r"\^"]
documentNormalizer = DocumentNormalizer() \
  .setInputCols("document_raw") \
  .setOutputCol("document") \
  .setAction("clean") \
  .setPatterns(cleanUpPatterns) \
  .setReplacement(" ") \
  .setPolicy("pretty_all")

sentencerDL = SentenceDetectorDLModel\
    .pretrained("sentence_detector_dl", "en") \
    .setInputCols(["document"]) \
    .setOutputCol("sentence")

tokenizer = Tokenizer() \
    .setInputCols(["sentence"]) \
    .setOutputCol("token")

word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") \
    .setInputCols(["sentence", "token"]) \
    .setOutputCol("embeddings")

clinical_ner = MedicalNerModel.pretrained("ner_deid_large", "en", "clinical/models") \
    .setInputCols(["sentence", "token", "embeddings"]) \
    .setOutputCol("ner_subentity")

regex_matcher = RegexMatcher()\
    .setInputCols("sentence")\
    .setOutputCol("regex")\
    .setRules(["[0-9]{4} (JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC) [0-9]{1,2};DATE",
               "(0[1-9]|1[0-2])/(0[1-9]|[12][0-9]|3[01])/\d{4};DATENUM",
               "(?:\s[MF]|\b[MF])(?:\s|\b|$);GENDER",
               "(?<!\d)\d{9,10}(?!\d);ID"])\
    .setDelimiter(";")

chunkConverter = ChunkConverter()\
    .setInputCols("regex")\
    .setOutputCol("regex_chunks")

custom_ner_converter_internal = NerConverterInternalModel() \
    .setInputCols(["sentence","token", "ner_subentity"]) \
    .setOutputCol("ner_chunk") \
    .setThreshold(0.4) \
    .setWhiteList(['NAME', 'AGE', 'LOCATION', 'PERSON', 'DOCTOR', 'PATIENT'])

chunk_merger = ChunkMergeApproach()\
    .setInputCols('regex_chunks', "ner_chunk")\
    .setOutputCol('merged_ner_chunk')\
    .setMergeOverlapping(True)

base_stages = [
    dicom_to_image,
    documentAssembler,
    documentNormalizer,
    sentencerDL,
    tokenizer,
    word_embeddings,
    clinical_ner,
    regex_matcher,
    chunkConverter,
    custom_ner_converter_internal,
    chunk_merger
]

sentence_detector_dl download started this may take some time.
Approximate size to download 354.6 KB
[OK!]
embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[OK!]
ner_deid_large download started this may take some time.
[OK!]


In [44]:
text_detector = ImageTextDetector.pretrained("image_text_detector_mem_opt", "en", "clinical/ocr") \
    .setInputCol("image_raw") \
    .setOutputCol("text_regions") \
    .setScoreThreshold(0.7) \
    .setLinkThreshold(0.5) \
    .setWithRefiner(True) \
    .setTextThreshold(0.4) \
    .setSizeThreshold(-1) \
    .setUseGPU(True) \
    .setWidth(0)

ocr = ImageToTextV2.pretrained("ocr_base_printed_v2_opt", "en", "clinical/ocr") \
    .setRegionsColumn("text_regions") \
    .setInputCols(["image_raw"]) \
    .setOutputCol("text") \
    .setOutputFormat("text_with_positions") \
    .setGroupImages(False) \
    .setKeepInput(False) \
    .setUseGPU(True) \
    .setUseCaching(True) \
    .setBatchSize(4)

new_stages = base_stages.copy()

new_stages.insert(1,text_detector)
new_stages.insert(2,ocr)

detector_v2_base_v2 = Pipeline(stages=new_stages)

image_text_detector_mem_opt download started this may take some time.
Approximate size to download 77.5 MB


In [45]:
df = spark.read.format("binaryFile").load("./dicom_files/*.dcm")

result_base = detector_v2_base_v2.fit(df).transform(df)

start = time.time()

result_base.write.format("parquet").mode("overwrite").save("./result_base/")

end = time.time()

print(f"Time Taken : {round((end - start) / df.count(), 2)}")



Time Taken : 3.63


In [46]:
gt_path = "./gt_ocr.json"
df_path = "./result_base/"
result_path = "./results/ner_result/result_v2_base.json"

calculate_metrics(gt_path, df_path, result_path)

Total Files : 15
Precision : 0.8
Recall : 0.8
F1-Score : 0.8


In [48]:
text_detector = ImageTextDetector.pretrained("image_text_detector_mem_opt", "en", "clinical/ocr") \
    .setInputCol("image_raw") \
    .setOutputCol("text_regions") \
    .setScoreThreshold(0.7) \
    .setLinkThreshold(0.5) \
    .setWithRefiner(True) \
    .setTextThreshold(0.4) \
    .setSizeThreshold(-1) \
    .setUseGPU(True) \
    .setWidth(0)

ocr = ImageToTextV2.pretrained("ocr_large_printed_v2_opt", "en", "clinical/ocr") \
    .setRegionsColumn("text_regions") \
    .setInputCols(["image_raw"]) \
    .setOutputCol("text") \
    .setOutputFormat("text_with_positions") \
    .setGroupImages(False) \
    .setKeepInput(False) \
    .setUseGPU(True) \
    .setUseCaching(True) \
    .setBatchSize(4)

new_stages = base_stages.copy()

new_stages.insert(1,text_detector)
new_stages.insert(2,ocr)

detector_v2_large_v2 = Pipeline(stages=new_stages)

image_text_detector_mem_opt download started this may take some time.
Approximate size to download 77.5 MB


In [49]:
df = spark.read.format("binaryFile").load("./dicom_files/*.dcm")

result_large = detector_v2_large_v2.fit(df).transform(df)

start = time.time()

result_large.write.format("parquet").mode("overwrite").save("./result_large/")

end = time.time()

print(f"Time Taken : {round((end - start) / df.count(), 2)}")

Time Taken : 4.06


In [50]:
gt_path = "./gt_ocr.json"
df_path = "./result_large/"
result_path = "./results/ner_result/result_v2_large.json"

calculate_metrics(gt_path, df_path, result_path)

Total Files : 15
Precision : 0.9
Recall : 0.8
F1-Score : 0.8


In [51]:
text_detector = ImageTextDetector.pretrained("image_text_detector_mem_opt", "en", "clinical/ocr") \
    .setInputCol("image_raw") \
    .setOutputCol("text_regions") \
    .setScoreThreshold(0.7) \
    .setLinkThreshold(0.5) \
    .setWithRefiner(True) \
    .setTextThreshold(0.4) \
    .setSizeThreshold(-1) \
    .setUseGPU(True) \
    .setWidth(0)

ocr = ImageToTextV3() \
    .setInputCols(["image_raw", "text_regions"]) \
    .setOutputCol("text")

new_stages = base_stages.copy()

new_stages.insert(1,text_detector)
new_stages.insert(2,ocr)

detector_v3 = Pipeline(stages=new_stages)

image_text_detector_mem_opt download started this may take some time.
Approximate size to download 77.5 MB


In [52]:
df = spark.read.format("binaryFile").load("./dicom_files/*.dcm")

result_v3 = detector_v3.fit(df).transform(df)

start = time.time()

result_v3.write.format("parquet").mode("overwrite").save("./result_v3/")

end = time.time()

print(f"Time Taken : {round((end - start) / df.count(), 2)}")

Time Taken : 0.68


In [53]:
gt_path = "./gt_ocr.json"
df_path = "./result_v3/"
result_path = "./results/ner_result/result_v3.json"

calculate_metrics(gt_path, df_path, result_path)

Total Files : 15
Precision : 0.7
Recall : 0.4
F1-Score : 0.5


In [54]:
ocr = ImageToText() \
    .setInputCol("image_raw") \
    .setOutputCol("text") \
    .setIgnoreResolution(False) \
    .setPageIteratorLevel(PageIteratorLevel.SYMBOL) \
    .setPageSegMode(PageSegmentationMode.SPARSE_TEXT) \
    .setWithSpaces(True) \
    .setConfidenceThreshold(70)

new_stages = base_stages.copy()

new_stages.insert(1,ocr)

tesseract_pipe = Pipeline(stages=new_stages)

In [55]:
df = spark.read.format("binaryFile").load("./dicom_files/*.dcm")

result_tesseract = tesseract_pipe.fit(df).transform(df)

start = time.time()

result_tesseract.write.format("parquet").mode("overwrite").save("./result_tesseract/")

end = time.time()

print(f"Time Taken : {round((end - start) / df.count(), 2)}")

Time Taken : 0.31


In [56]:
gt_path = "./gt_ocr.json"
df_path = "./result_tesseract/"
result_path = "./results/ner_result/result_tesseract.json"

calculate_metrics(gt_path, df_path, result_path)

Total Files : 15
Precision : 0.5
Recall : 0.3
F1-Score : 0.4


In [62]:
dicom_to_image = DicomToImageV3() \
    .setInputCols(["content"]) \
    .setOutputCol("image_raw") \
    .setKeepInput(False)

documentAssembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document_raw") \
    .setCleanupMode("shrink")

cleanUpPatterns = ["<[^>]*>", r"\^ ?", r"\^"]
documentNormalizer = DocumentNormalizer() \
  .setInputCols("document_raw") \
  .setOutputCol("document") \
  .setAction("clean") \
  .setPatterns(cleanUpPatterns) \
  .setReplacement(" ") \
  .setPolicy("pretty_all")

sentencerDL = SentenceDetectorDLModel\
    .pretrained("sentence_detector_dl", "en") \
    .setInputCols(["document"]) \
    .setOutputCol("sentence")

tokenizer = Tokenizer() \
    .setInputCols(["sentence"]) \
    .setOutputCol("token")

word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") \
    .setInputCols(["sentence", "token"]) \
    .setOutputCol("embeddings")

clinical_ner = MedicalNerModel.pretrained("ner_deid_large", "en", "clinical/models") \
    .setInputCols(["sentence", "token", "embeddings"]) \
    .setOutputCol("ner_subentity")

regex_matcher = RegexMatcher()\
    .setInputCols("sentence")\
    .setOutputCol("regex")\
    .setRules(["[0-9]{4} (JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC) [0-9]{1,2};DATE",
               "(0[1-9]|1[0-2])/(0[1-9]|[12][0-9]|3[01])/\d{4};DATENUM",
               "(?:\s[MF]|\b[MF])(?:\s|\b|$);GENDER",
               "(?<!\d)\d{9,10}(?!\d);ID"])\
    .setDelimiter(";")

chunkConverter = ChunkConverter()\
    .setInputCols("regex")\
    .setOutputCol("regex_chunks")

custom_ner_converter_internal = NerConverterInternalModel() \
    .setInputCols(["sentence","token", "ner_subentity"]) \
    .setOutputCol("ner_chunk") \
    .setThreshold(0.4) \
    .setWhiteList(['NAME', 'AGE', 'LOCATION', 'PERSON', 'DOCTOR', 'PATIENT'])

chunk_merger = ChunkMergeApproach()\
    .setInputCols('regex_chunks', "ner_chunk")\
    .setOutputCol('merged_ner_chunk')\
    .setMergeOverlapping(True)

position_finder = PositionFinder() \
    .setInputCols("merged_ner_chunk") \
    .setOutputCol("coordinates") \
    .setPageMatrixCol("positions") \
    .setOcrScaleFactor(0.9)

draw_regions = DicomDrawRegions() \
    .setInputCol("path") \
    .setInputRegionsCol("coordinates") \
    .setOutputCol("dicom_cleaned") \
    .setAggCols(["path"]) \
    .setKeepInput(True)

base_stages = [
    dicom_to_image,
    documentAssembler,
    documentNormalizer,
    sentencerDL,
    tokenizer,
    word_embeddings,
    clinical_ner,
    regex_matcher,
    chunkConverter,
    custom_ner_converter_internal,
    chunk_merger,
    position_finder,
    draw_regions
]

sentence_detector_dl download started this may take some time.
Approximate size to download 354.6 KB
[OK!]
embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[OK!]
ner_deid_large download started this may take some time.
[OK!]


In [67]:
text_detector = ImageTextDetector.pretrained("image_text_detector_mem_opt", "en", "clinical/ocr") \
    .setInputCol("image_raw") \
    .setOutputCol("text_regions") \
    .setScoreThreshold(0.7) \
    .setLinkThreshold(0.5) \
    .setWithRefiner(True) \
    .setTextThreshold(0.4) \
    .setSizeThreshold(-1) \
    .setUseGPU(True) \
    .setWidth(0)

ocr = ImageToTextV2.pretrained("ocr_large_printed_v2_opt", "en", "clinical/ocr") \
    .setRegionsColumn("text_regions") \
    .setInputCols(["image_raw"]) \
    .setOutputCol("text") \
    .setOutputFormat("text_with_positions") \
    .setGroupImages(False) \
    .setKeepInput(False) \
    .setUseGPU(True) \
    .setUseCaching(True) \
    .setBatchSize(4)

new_stages = base_stages.copy()

new_stages.insert(1,text_detector)
new_stages.insert(2,ocr)

detector_v2_large_v2 = Pipeline(stages=new_stages)

image_text_detector_mem_opt download started this may take some time.
Approximate size to download 77.5 MB


In [68]:
df = spark.read.format("binaryFile").load("./dicom_files/*.dcm")

result_large = detector_v2_large_v2.fit(df).transform(df)

result_base.write.format("parquet").mode("overwrite").save("./result_large_dicom/")



In [70]:
df = spark.read.format("parquet").load("./result_large_dicom/")

for row in df.select("path", "dicom_cleaned").toLocalIterator():
    data = row.asDict()
    dicom_file = data["dicom_cleaned"]
    path = data["path"]

    base_name = os.path.basename(path).replace(".dcm", ".jpg")

    ds = pydicom.dcmread(io.BytesIO(dicom_file))
    image = Image.fromarray(ds.pixel_array).convert("L")

    file_out = os.path.join("./results/deid_image_result/", base_name)
    image.save(file_out)