![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/7.Clinical_NER_Chunk_Merger.ipynb)

# Clinical NER Chunk Merger

In [3]:
import json

# Load the workshop license bundle from the JSON file in the working directory
with open('workshop_license_keys_Aug2020.json') as keys_file:
    license_keys = json.loads(keys_file.read())

# Show only the key names, not the secret values
license_keys.keys()


dict_keys(['JSL_VERSION', 'PUBLIC_VERSION', 'SECRET', 'SPARK_NLP_LICENSE', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY', 'JSL_OCR_SECRET', 'SPARK_OCR_LICENSE'])

In [4]:
# Version string that the setup cell later uses to install spark-nlp-jsl
license_keys['JSL_VERSION']

'2.5.4rc3'

In [5]:
# template for license_key.json
# Key names must match what the notebook reads from the loaded file; the setup
# cell looks up license_keys['SPARK_OCR_LICENSE'], so the OCR license entry is
# named SPARK_OCR_LICENSE here (a 'JSL_OCR_LICENSE' entry would raise KeyError).

{'JSL_VERSION':'jjj',
'PUBLIC_VERSION':'vvv',
'SECRET':"xxx",
'SPARK_NLP_LICENSE': 'aaa',
'SPARK_OCR_LICENSE': 'bbb',
'AWS_ACCESS_KEY_ID':"ccc",
'AWS_SECRET_ACCESS_KEY':"ddd",
'JSL_OCR_SECRET':"eee"}

{'JSL_VERSION': 'jjj',
 'PUBLIC_VERSION': 'vvv',
 'SECRET': 'xxx',
 'SPARK_NLP_LICENSE': 'aaa',
 'SPARK_OCR_LICENSE': 'bbb',
 'AWS_ACCESS_KEY_ID': 'ccc',
 'AWS_SECRET_ACCESS_KEY': 'ddd',
 'JSL_OCR_SECRET': 'eee'}

In [None]:
# Environment setup cell: installs Java and the Spark NLP packages, exports the
# license / AWS values as environment variables, imports the libraries and
# starts the Spark session. Statement order matters (install -> import -> start).
import os

# Install Java 8 (headless JDK) and expose it through JAVA_HOME / PATH
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
! java -version

# Secret used below in the private package index URL and passed to sparknlp_jsl.start
secret = license_keys['SECRET']

# Export license and AWS credentials as environment variables
# (presumably read by the licensed library when it downloads models - TODO confirm)
os.environ['SPARK_NLP_LICENSE'] = license_keys['SPARK_NLP_LICENSE']
os.environ['SPARK_OCR_LICENSE'] = license_keys['SPARK_OCR_LICENSE']
os.environ['AWS_ACCESS_KEY_ID']= license_keys['AWS_ACCESS_KEY_ID']
os.environ['AWS_SECRET_ACCESS_KEY'] = license_keys['AWS_SECRET_ACCESS_KEY']
# jsl_version pins spark-nlp-jsl, version pins the public spark-nlp package
jsl_version = license_keys['JSL_VERSION']
version = license_keys['PUBLIC_VERSION']

# Pinned PySpark release
! pip install --ignore-installed -q pyspark==2.4.4

# Licensed package from the private index; $jsl_version and $secret are the
# Python variables above, interpolated into the shell command by IPython.
# NOTE(review): the secret is part of the index URL, which pip may echo in its
# output - clear this cell's output before sharing the notebook.
! python -m pip install --upgrade spark-nlp-jsl==$jsl_version  --extra-index-url https://pypi.johnsnowlabs.com/$secret

# Public Spark NLP package at the version named in the license file
! pip install --ignore-installed -q spark-nlp==$version

# Imports come after the installs above
import sparknlp

# Report which Spark NLP version was actually imported
print (sparknlp.version())

# json and os were already imported earlier; re-importing them is harmless
import json
import os
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession


# Star imports populate the notebook namespace; presumably the source of
# DocumentAssembler, NerDLModel, ChunkMergeApproach, etc. used in later cells
from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from sparknlp.base import *
import sparknlp_jsl

# Start the Spark session through the licensed library's helper
spark = sparknlp_jsl.start(secret)

In [None]:
# Use this start() instead if you want to launch the session with custom params, rather than calling sparknlp_jsl.start(secret) as in the cell above

def start(secret):
    """Build (or reuse) a local Spark session configured for licensed Spark NLP.

    Reads the notebook-level globals ``version`` (public spark-nlp version) and
    ``jsl_version`` (licensed spark-nlp-jsl version) to choose the artifacts.

    Args:
        secret: string inserted into the private repository URL of the
            licensed jar.

    Returns:
        Whatever ``SparkSession.builder....getOrCreate()`` returns.
    """
    session_builder = SparkSession.builder.appName("Spark NLP Licensed").master("local[*]")

    # Fixed driver-memory and Kryo serializer settings
    for option, value in (
        ("spark.driver.memory", "16G"),
        ("spark.serializer", "org.apache.spark.serializer.KryoSerializer"),
        ("spark.kryoserializer.buffer.max", "2000M"),
    ):
        session_builder = session_builder.config(option, value)

    # Version-dependent artifacts: public package coordinates, then the licensed jar URL
    session_builder = session_builder.config(
        "spark.jars.packages",
        "com.johnsnowlabs.nlp:spark-nlp_2.11:" + version,
    )
    session_builder = session_builder.config(
        "spark.jars",
        "https://pypi.johnsnowlabs.com/" + secret + "/spark-nlp-jsl-" + jsl_version + ".jar",
    )

    return session_builder.getOrCreate()


#spark = start(secret)

In [None]:
# Display the session object created by the setup cell
spark

In [None]:
# Clinical note used as the single demo input for the chunk-merging pipeline
clinical_note = """A 63-year-old man presents to the hospital with a history of recurrent infections that include cellulitis, pneumonias, and upper respiratory tract infections. He reports subjective fevers at home along with unintentional weight loss and occasional night sweats. The patient has a remote history of arthritis, which was diagnosed approximately 20 years ago and treated intermittently with methotrexate (MTX) and prednisone. On physical exam, he is found to be febrile at 102°F, rather cachectic, pale, and have hepatosplenomegaly. Several swollen joints that are tender to palpation and have decreased range of motion are also present. His laboratory values show pancytopenia with the most severe deficiency in neutrophils.
"""

# One-row DataFrame with columns (id, text)
sample_rows = [(1, clinical_note)]
data_chunk_merge = spark.createDataFrame(sample_rows).toDF("id", "text")

# Preview the frame, cutting cell values at 50 characters
data_chunk_merge.show(truncate=50)

+---+--------------------------------------------------+
| id|                                              text|
+---+--------------------------------------------------+
|  1|A 63-year-old man presents to the hospital with...|
+---+--------------------------------------------------+



In [None]:
# Converts the "text" column of the DataFrame into document annotations
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Splits each document into sentences
sentenceDetector = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# Splits each sentence into tokens
tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Clinical word embeddings (trained on PubMED), shared by all three NER models
word_embeddings = (
    WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
)

# --- NER 1: model trained on i2b2 (sampled from MIMIC) dataset ---
clinical_ner = (
    NerDLModel.pretrained("ner_deid_large", "en", "clinical/models")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("clinical_ner")
)

clinical_ner_converter = (
    NerConverter()
    .setInputCols(["sentence", "token", "clinical_ner"])
    .setOutputCol("clinical_ner_chunk")
)

# --- NER 2: Cancer Genetics NER ---
bionlp_ner = (
    NerDLModel.pretrained("ner_bionlp", "en", "clinical/models")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("bionlp_ner")
)

bionlp_ner_converter = (
    NerConverter()
    .setInputCols(["sentence", "token", "bionlp_ner"])
    .setOutputCol("bionlp_ner_chunk")
)

# First merge: chunks of NER 1 + NER 2.
# Note the output column reuses the name 'bionlp_ner_chunk' (same as one of the inputs).
chunk_merger_1 = (
    ChunkMergeApproach()
    .setInputCols('clinical_ner_chunk', "bionlp_ner_chunk")
    .setOutputCol('bionlp_ner_chunk')
)

# --- NER 3: internal clinical NER (general terms) ---
jsl_ner = (
    NerDLModel.pretrained("ner_jsl", "en", "clinical/models")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("jsl_ner")
)

jsl_ner_converter = (
    NerConverter()
    .setInputCols(["sentence", "token", "jsl_ner"])
    .setOutputCol("jsl_ner_chunk")
)

# Second merge: result of the first merge + chunks of NER 3 -> final column
chunk_merger_2 = (
    ChunkMergeApproach()
    .setInputCols('bionlp_ner_chunk', "jsl_ner_chunk")
    .setOutputCol('final_ner_chunk')
)

# Stage order follows the column dependencies: each merger comes after both of its inputs
pipeline_stages = [
    documentAssembler,
    sentenceDetector,
    tokenizer,
    word_embeddings,
    clinical_ner,
    clinical_ner_converter,
    bionlp_ner,
    bionlp_ner_converter,
    chunk_merger_1,
    jsl_ner,
    jsl_ner_converter,
    chunk_merger_2,
]
nlpPipeline = Pipeline(stages=pipeline_stages)

# Fit on a one-row placeholder frame (empty text) to obtain the fitted pipeline model
empty_data = spark.createDataFrame([[""]]).toDF("text")

model = nlpPipeline.fit(empty_data)


embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[OK!]
ner_deid_large download started this may take some time.
Approximate size to download 13.9 MB
[OK!]
ner_bionlp download started this may take some time.
Approximate size to download 13.9 MB
[OK!]
ner_jsl download started this may take some time.
Approximate size to download 14 MB
[OK!]


In [None]:
from collections import OrderedDict

# Pretrained NER model name -> prefix used for that model's output columns.
# Built from a list of pairs so the merge order is guaranteed by the OrderedDict
# itself instead of depending on the ordering of an intermediate dict literal.
ners_to_merge = OrderedDict([
    ("ner_deid_large", "deid"),
    ("ner_bionlp", "bio"),
    ("ner_jsl", "jsl"),
])

# Build the graph: for every NER add the model and its chunk converter; from the
# second NER on, also add a merger that combines the previous "<prefix>_chunk"
# column with the current one and writes the result under the current chunk name.
# NOTE(review): the NER stages read an embeddings column named "embs", while the
# pipeline cell earlier in this notebook names its embeddings column "embeddings";
# confirm a stage producing "embs" exists before fitting these stages.
ner_pl = []
prev = None  # output prefix of the previously added NER; None until the first one is added
for ner, out in ners_to_merge.items():
    ner_pl.append(NerDLModel.pretrained(ner, "en", "clinical/models").setInputCols("sentence", "token", "embs").setOutputCol(out))
    ner_pl.append(NerConverter().setInputCols("sentence", "token", out).setOutputCol(out + "_chunk"))
    if prev is not None:
        # Trace which columns are merged: (left input, right input, output)
        print(prev + "_chunk", out + "_chunk", out + "_chunk")
        ner_pl.append(ChunkMergeApproach().setInputCols(prev + "_chunk", out + "_chunk").setOutputCol(out + "_chunk"))
    prev = out


ner_deid_large download started this may take some time.
Approximate size to download 13.9 MB
[OK!]
ner_bionlp download started this may take some time.
Approximate size to download 13.9 MB
[OK!]
deid_chunk bio_chunk bio_chunk
ner_jsl download started this may take some time.
Approximate size to download 14 MB
[OK!]
bio_chunk jsl_chunk jsl_chunk


In [None]:
# Run the fitted pipeline on the sample note and cache the resulting DataFrame
merged_data = model.transform(data_chunk_merge).cache()

In [None]:
from pyspark.sql import functions as F

# One row per annotation in the final merged chunk column
chunk_rows = merged_data.select('id', F.explode('final_ner_chunk').alias("cols"))

# Flatten each annotation into begin / end / chunk text / entity label columns
result_df = chunk_rows.select(
    'id',
    F.expr("cols.begin").alias("begin"),
    F.expr("cols.end").alias("end"),
    F.expr("cols.result").alias("chunk"),
    F.expr("cols.metadata.entity").alias("entity"),
)

# Show up to 50 rows, cutting cell values at 100 characters
result_df.show(50, truncate=100)


+---+-----+---+----------------------------------+----------------------+
| id|begin|end|                             chunk|                entity|
+---+-----+---+----------------------------------+----------------------+
|  1|    2| 12|                       63-year-old|                   AGE|
|  1|   14| 16|                               man|              Organism|
|  1|   61| 69|                         recurrent|              Modifier|
|  1|   95|104|                        cellulitis|             Diagnosis|
|  1|  107|116|                        pneumonias|             Diagnosis|
|  1|  123|156|upper respiratory tract infections|             Diagnosis|
|  1|  159|160|                                He|                Gender|
|  1|  170|179|                        subjective|              Modifier|
|  1|  181|186|                            fevers|          Symptom_Name|
|  1|  237|246|                        occasional|              Modifier|
|  1|  248|259|                      n