![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/7.Clinical_NER_Chunk_Merger.ipynb)

# 7. Clinical NER Chunk Merger

In [None]:
import os

import sparknlp
import sparknlp_jsl

# Licence secret for the licensed library, read from the environment so it is
# never hard-coded in the notebook.
jsl_secret = os.getenv('SECRET')

# Library versions, kept as strings for building artifact coordinates later on.
sparknlp_version = sparknlp.version()
jsl_version = sparknlp_jsl.version()

# Do not print the secret itself: cell outputs are stored inside the notebook
# file and would leak the credential when the notebook is shared or committed.
if jsl_secret:
    print("SECRET is set")
else:
    print("WARNING: environment variable SECRET is not set")

In [None]:
import json
import os
from pyspark.ml import Pipeline,PipelineModel
from pyspark.sql import SparkSession

from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from sparknlp.base import *
import sparknlp_jsl
import sparknlp

# Spark configuration overrides handed to the licensed session starter.
params = {
    "spark.driver.memory": "16G",
    "spark.kryoserializer.buffer.max": "2000M",
    "spark.driver.maxResultSize": "2000M",
}

# Start the Spark session with the licence secret and the overrides above.
spark = sparknlp_jsl.start(jsl_secret, params=params)

# Report the open-source and licensed library versions, one per line.
print(sparknlp.version(), sparknlp_jsl.version(), sep="\n")

3.0.1
3.0.0


In [None]:
# Alternative: build the SparkSession by hand with custom parameters, instead of calling sparknlp_jsl.start(...) as in the cell above.

def start(secret):
    """Build (or reuse) a local SparkSession configured for licensed Spark NLP.

    Args:
        secret: Licence secret; it is interpolated into the download URL of
            the licensed jar.

    Returns:
        The session returned by ``SparkSession.builder.getOrCreate()``.

    The open-source and licensed artifact versions are taken from the
    notebook-level ``sparknlp_version`` and ``jsl_version`` variables.
    """
    # NOTE(review): Spark NLP 3.x publishes its default artifact for Scala 2.12;
    # confirm that the `_2.11` coordinate below resolves for the version in use.
    builder = (
        SparkSession.builder
        .appName("Spark NLP Licensed")
        .master("local[*]")
        .config("spark.driver.memory", "16G")
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        .config("spark.kryoserializer.buffer.max", "2000M")
        # Use `sparknlp_version`: the name `version` is not defined in this
        # notebook, so the original expression raised NameError.
        .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:" + sparknlp_version)
        .config("spark.jars", "https://pypi.johnsnowlabs.com/" + secret + "/spark-nlp-jsl-" + jsl_version + ".jar")
    )

    return builder.getOrCreate()


# spark = start(jsl_secret)

In [None]:
# Bare expression: the notebook renders the active Spark session object as the cell output.
spark

In [None]:
# Sample data: a single clinical note, keyed by an integer id.
sample_rows = [
    (1, """A 63 years old man presents to the hospital with a history of recurrent infections that include cellulitis, pneumonias, and upper respiratory tract infections. He reports subjective fevers at home along with unintentional weight loss and occasional night sweats. The patient has a remote history of arthritis, which was diagnosed approximately 20 years ago and treated intermittently with methotrexate (MTX) and prednisone. On physical exam, he is found to be febrile at 102°F, rather cachectic, pale, and have hepatosplenomegaly. Several swollen joints that are tender to palpation and have decreased range of motion are also present. His laboratory values show pancytopenia with the most severe deficiency in neutrophils.
"""),
]

data_chunk_merge = spark.createDataFrame(sample_rows).toDF("id", "text")

# Preview the frame, cutting each cell off at 50 characters.
data_chunk_merge.show(truncate=50)

+---+--------------------------------------------------+
| id|                                              text|
+---+--------------------------------------------------+
|  1|A 63 years old man presents to the hospital wit...|
+---+--------------------------------------------------+



In [None]:
# Wrap the raw "text" column as "document" annotations, the entry point for the NLP stages.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Split each document into sentences.
sentenceDetector = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# Split each sentence into tokens.
tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Pretrained "embeddings_clinical" word embeddings (English, clinical/models repository).
word_embeddings = (
    WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
)

# First NER model: pretrained "ner_deid_large" (English, clinical/models repository).
clinical_ner = (
    MedicalNerModel.pretrained("ner_deid_large", "en", "clinical/models")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("clinical_ner")
)

# Convert the first model's token-level tags into chunks.
clinical_ner_converter = (
    NerConverter()
    .setInputCols(["sentence", "token", "clinical_ner"])
    .setOutputCol("clinical_ner_chunk")
)

# Second NER model: pretrained "ner_bionlp" (English, clinical/models repository).
bionlp_ner = (
    MedicalNerModel.pretrained("ner_bionlp", "en", "clinical/models")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("bionlp_ner")
)

# Convert the second model's token-level tags into chunks.
bionlp_ner_converter = (
    NerConverter()
    .setInputCols(["sentence", "token", "bionlp_ner"])
    .setOutputCol("bionlp_ner_chunk")
)

# Merge the two chunk columns into one. Where chunks overlap, the merger
# prioritises among them (longer chunks are kept, per the original author's note).
chunk_merger_1 = (
    ChunkMergeApproach()
    .setInputCols("clinical_ner_chunk", "bionlp_ner_chunk")
    .setOutputCol("clinical_bionlp_ner_chunk")
)

# Third NER model: pretrained "ner_jsl" for general clinical terms.
jsl_ner = (
    MedicalNerModel.pretrained("ner_jsl", "en", "clinical/models")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("jsl_ner")
)

# Convert the third model's token-level tags into chunks.
jsl_ner_converter = (
    NerConverter()
    .setInputCols(["sentence", "token", "jsl_ner"])
    .setOutputCol("jsl_ner_chunk")
)

# Merge the already-merged chunks with the ner_jsl chunks. Where chunks overlap,
# the merger prioritises among them (longer chunks are kept, per the original
# author's note).
chunk_merger_2 = (
    ChunkMergeApproach()
    .setInputCols("clinical_bionlp_ner_chunk", "jsl_ner_chunk")
    .setOutputCol("final_ner_chunk")
)

# Merge the same two inputs regardless of overlapping indices, so chunks from
# both inputs are retained. Per the original author's note, this only works
# with version 2.7 and later.
chunk_merger_NonOverlapped = (
    ChunkMergeApproach()
    .setInputCols("clinical_bionlp_ner_chunk", "jsl_ner_chunk")
    .setOutputCol("nonOverlapped_ner_chunk")
    .setMergeOverlapping(False)
)


# Stages in execution order: each stage consumes columns produced by earlier ones.
pipeline_stages = [
    documentAssembler,
    sentenceDetector,
    tokenizer,
    word_embeddings,
    clinical_ner,
    clinical_ner_converter,
    bionlp_ner,
    bionlp_ner_converter,
    chunk_merger_1,
    jsl_ner,
    jsl_ner_converter,
    chunk_merger_2,
    chunk_merger_NonOverlapped,
]
nlpPipeline = Pipeline(stages=pipeline_stages)

# Fit on a one-row frame holding an empty string in its "text" column.
empty_data = spark.createDataFrame([[""]]).toDF("text")

model = nlpPipeline.fit(empty_data)


embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[OK!]
ner_deid_large download started this may take some time.
Approximate size to download 14.1 MB
[OK!]
ner_bionlp download started this may take some time.
Approximate size to download 13.9 MB
[OK!]
ner_jsl download started this may take some time.
Approximate size to download 14 MB
[OK!]


In [None]:
# Run the fitted pipeline over the sample note and cache the resulting frame.
merged_data = (
    model
    .transform(data_chunk_merge)
    .cache()
)

In [None]:
from pyspark.sql import functions as F

# One row per merged chunk: explode the "final_ner_chunk" annotations and pull
# out each annotation's span, text and entity label.
result_df = (
    merged_data
    .select("id", F.explode("final_ner_chunk").alias("cols"))
    .select(
        "id",
        *[
            F.expr("cols." + field).alias(label)
            for field, label in [
                ("begin", "begin"),
                ("end", "end"),
                ("result", "chunk"),
                ("metadata.entity", "entity"),
            ]
        ]
    )
)

result_df.show(50, truncate=100)


+---+-----+---+----------------------------------+----------------------+
| id|begin|end|                             chunk|                entity|
+---+-----+---+----------------------------------+----------------------+
|  1|    2|  3|                                63|                   AGE|
|  1|   15| 17|                               man|              Organism|
|  1|   62| 70|                         recurrent|              Modifier|
|  1|   96|105|                        cellulitis|             Diagnosis|
|  1|  108|117|                        pneumonias|             Diagnosis|
|  1|  124|157|upper respiratory tract infections|             Diagnosis|
|  1|  160|161|                                He|                Gender|
|  1|  171|180|                        subjective|              Modifier|
|  1|  182|187|                            fevers|          Symptom_Name|
|  1|  238|247|                        occasional|              Modifier|
|  1|  249|260|                      n

## NonOverlapped Chunk

All the entities from each NER model are returned one by one, even when their spans overlap.

In [None]:
from pyspark.sql import functions as F

# One row per chunk from the non-overlap-merging stage: explode the
# "nonOverlapped_ner_chunk" annotations and pull out span, text and entity label.
result_df2 = (
    merged_data
    .select("id", F.explode("nonOverlapped_ner_chunk").alias("cols"))
    .select(
        "id",
        *[
            F.expr("cols." + field).alias(label)
            for field, label in [
                ("begin", "begin"),
                ("end", "end"),
                ("result", "chunk"),
                ("metadata.entity", "entity"),
            ]
        ]
    )
)

result_df2.show(50, truncate=100)


+---+-----+---+----------------------------------+----------------------+
| id|begin|end|                             chunk|                entity|
+---+-----+---+----------------------------------+----------------------+
|  1|    2|  3|                                63|                   AGE|
|  1|    2|  3|                                63|                   Age|
|  1|   15| 17|                               man|              Organism|
|  1|   15| 17|                               man|                Gender|
|  1|   62| 70|                         recurrent|              Modifier|
|  1|   96|105|                        cellulitis|             Diagnosis|
|  1|  108|117|                        pneumonias|             Diagnosis|
|  1|  124|157|upper respiratory tract infections|             Diagnosis|
|  1|  124|128|                             upper|  Organism_subdivision|
|  1|  142|146|                             tract|  Organism_subdivision|
|  1|  160|161|                       