[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/enterprise/healthcare/ChunkMergeClinicalMultiple.ipynb)

In [1]:
import json

# Read the license/credential bundle from keys.json in the working directory.
with open('keys.json') as keys_file:
    license_keys = json.loads(keys_file.read())

# Display only the key names so the secret values never reach the cell output.
license_keys.keys()


dict_keys(['version', 'secret', 'SPARK_NLP_LICENSE', 'JSL_OCR_LICENSE', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY', 'JSL_OCR_SECRET'])

In [2]:
import os

# Install java
! apt-get update -qq
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null

secret = license_keys.get("secret",license_keys.get('SPARK_NLP_SECRET', ""))
spark_version = os.environ.get("SPARK_VERSION", license_keys.get("SPARK_VERSION","2.4"))
version = license_keys.get("version",license_keys.get('SPARK_NLP_PUBLIC_VERSION', ""))
jsl_version = license_keys.get("jsl_version",license_keys.get('SPARK_NLP_VERSION', ""))

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
! java -version

os.environ['SPARK_NLP_LICENSE'] = license_keys['SPARK_NLP_LICENSE']
os.environ['JSL_OCR_LICENSE'] = license_keys['JSL_OCR_LICENSE']
os.environ['AWS_ACCESS_KEY_ID']= license_keys['AWS_ACCESS_KEY_ID']
os.environ['AWS_SECRET_ACCESS_KEY'] = license_keys['AWS_SECRET_ACCESS_KEY']

print(spark_version, version, jsl_version)

! python -m pip install "pyspark==$spark_version".*
! python -m pip install --upgrade spark-nlp-jsl==$jsl_version  --extra-index-url https://pypi.johnsnowlabs.com/$secret

import sparknlp
import sparknlp_jsl
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession

# Report the versions of the two libraries that were just imported.
print(sparknlp.version())
print(sparknlp_jsl.version())

# Start the licensed session without GPU; the spark23 flag is raised only when
# the selected Spark version string begins with "2.3".
use_spark23 = spark_version.startswith("2.3")
spark = sparknlp_jsl.start(secret, gpu=False, spark23=use_spark23)

openjdk version "1.8.0_252"
OpenJDK Runtime Environment (build 1.8.0_252-8u252-b09-1~18.04-b09)
OpenJDK 64-Bit Server VM (build 25.252-b09, mixed mode)
Looking in indexes: https://pypi.org/simple, https://pypi.johnsnowlabs.com/8zvTuUjWPt
Collecting spark-nlp-jsl==2.5.2
Collecting spark-nlp==2.5.2
[?25l  Downloading https://files.pythonhosted.org/packages/3e/b0/c272273674b5810c0909b369c57669197907a15d84bbdf058007bb909c99/spark_nlp-2.5.2-py2.py3-none-any.whl (122kB)
[K     |████████████████████████████████| 133kB 3.4MB/s 
[?25hCollecting pyspark==2.4.4
[?25l  Downloading https://files.pythonhosted.org/packages/87/21/f05c186f4ddb01d15d0ddc36ef4b7e3cedbeb6412274a41f26b55a650ee5/pyspark-2.4.4.tar.gz (215.7MB)
[K     |████████████████████████████████| 215.7MB 66kB/s 
[?25hCollecting py4j==0.10.7
[?25l  Downloading https://files.pythonhosted.org/packages/e3/53/c737818eb9a7dc32a7cd4f1396e787bd94200c3997c72c1dbe028587bd76/py4j-0.10.7-py2.py3-none-any.whl (197kB)
[K     |████████████████

In [0]:
# Sample data: a single clinical note. When really training a new NER from the
# output of multiple pretrained NERs, this should be thousands of documents.
sample_note = """A 63-year-old man presents to the hospital with a history of recurrent infections that include cellulitis, pneumonias, and upper respiratory tract infections. He reports subjective fevers at home along with unintentional weight loss and occasional night sweats. The patient has a remote history of arthritis, which was diagnosed approximately 20 years ago and treated intermittently with methotrexate (MTX) and prednisone. On physical exam, he is found to be febrile at 102°F, rather cachectic, pale, and have hepatosplenomegaly. Several swollen joints that are tender to palpation and have decreased range of motion are also present. His laboratory values show pancytopenia with the most severe deficiency in neutrophils.
"""
sample_rows = [(1, sample_note)]
data_chunk_merge = spark.createDataFrame(sample_rows).toDF("id", "text")

In [4]:
# Preprocessing stages shared by every NER model:
# text -> document -> sentence -> token, plus clinical word embeddings.
da = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)
sd = (
    SentenceDetector()
    .setInputCols("document")
    .setOutputCol("sentence")
)
tk = (
    Tokenizer()
    .setInputCols("sentence")
    .setOutputCol("token")
)
# NOTE(review): no input columns are set on the embeddings stage here —
# presumably the pretrained model ships with them configured; confirm.
emb = (
    WordEmbeddingsModel
    .pretrained("embeddings_clinical", "en", "clinical/models")
    .setOutputCol("embs")
)


embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[OK!]


In [0]:
# Pretrained NER model names mapped to the output column prefix used for each.
# Built from a list of pairs so the merge order is explicit and does not rely
# on the iteration order of a plain dict literal (only guaranteed by the
# language from Python 3.7 onwards).
from collections import OrderedDict
ners_to_merge = OrderedDict([
    ("ner_deid_large", "deid"),
    ("ner_bionlp", "bio"),
    ("ner_jsl", "jsl"),
])

In [6]:
# Build the NER stages. Every model contributes an NerDLModel plus an
# NerConverter; from the second model onward a ChunkMergeApproach combines the
# previous model's chunk column with the current one and writes the result
# back into the current chunk column.
ner_pl = []
previous_col = None
for model_name, col in ners_to_merge.items():
    chunk_col = col + "_chunk"
    tagger = NerDLModel.pretrained(model_name, "en", "clinical/models")
    tagger = tagger.setInputCols("sentence", "token", "embs").setOutputCol(col)
    converter = NerConverter().setInputCols("sentence", "token", col).setOutputCol(chunk_col)
    ner_pl += [tagger, converter]
    if previous_col is not None:
        merger = ChunkMergeApproach().setInputCols(previous_col + "_chunk", chunk_col).setOutputCol(chunk_col)
        ner_pl.append(merger)
    previous_col = col

# The chunk column of the last model is the one later stages read from.
out_col = list(ners_to_merge.values())[-1] + "_chunk"

ner_deid_large download started this may take some time.
Approximate size to download 14 MB
[OK!]
ner_bionlp download started this may take some time.
Approximate size to download 13.9 MB
[OK!]
ner_jsl download started this may take some time.
Approximate size to download 14 MB
[OK!]


In [0]:
# Tag each token from the final merged chunk column; output goes to "ner_label".
iob_tagger = (
    IOBTagger()
    .setInputCols("token", out_col)
    .setOutputCol("ner_label")
)

In [0]:

# Full pipeline: preprocessing, then the NER/merge stages, then the IOB tagger.
pipeline_stages = [da, sd, tk, emb, *ner_pl, iob_tagger]
pl = Pipeline().setStages(pipeline_stages)

In [0]:
# Fit the pipeline on the sample data, annotate that same data, and cache the
# result because the cells below query it more than once.
fitted_pl = pl.fit(data_chunk_merge)
merged_data = fitted_pl.transform(data_chunk_merge).cache()

In [12]:
# One row per chunk in the final merged column, with its span and entity label.
merged_chunks = merged_data.selectExpr("id", f"explode({out_col}) as a")
merged_chunks.selectExpr(
    "id", "a.begin", "a.end", "a.result as chunk", "a.metadata.entity as entity"
).show(100, False)

+---+-----+---+----------------------------------+----------------------+
|id |begin|end|chunk                             |entity                |
+---+-----+---+----------------------------------+----------------------+
|1  |2    |12 |63-year-old                       |Age                   |
|1  |14   |16 |man                               |Organism              |
|1  |61   |69 |recurrent                         |Modifier              |
|1  |95   |104|cellulitis                        |Diagnosis             |
|1  |107  |116|pneumonias                        |Diagnosis             |
|1  |123  |156|upper respiratory tract infections|Diagnosis             |
|1  |159  |160|He                                |Gender                |
|1  |170  |179|subjective                        |Modifier              |
|1  |181  |186|fevers                            |Symptom_Name          |
|1  |237  |246|occasional                        |Modifier              |
|1  |248  |259|night sweats           

In [17]:
# As we can see, the ner_label column is ready to train a NerDLApproach.
# One row per token annotation; rows whose label is 'O' are filtered out.
token_labels = merged_data.selectExpr("id", "explode(ner_label) as a")
labelled_tokens = token_labels.selectExpr(
    "id", "a.begin", "a.end", "a.result as chunk", "a.metadata.word as word"
).where("chunk!='O'")
labelled_tokens.show(1000, False)

+---+-----+---+------------------------+------------------+
|id |begin|end|chunk                   |word              |
+---+-----+---+------------------------+------------------+
|1  |2    |12 |B-Age                   |63-year-old       |
|1  |14   |16 |B-Organism              |man               |
|1  |61   |69 |B-Modifier              |recurrent         |
|1  |95   |104|B-Diagnosis             |cellulitis        |
|1  |107  |116|B-Diagnosis             |pneumonias        |
|1  |123  |127|B-Diagnosis             |upper             |
|1  |129  |139|I-Diagnosis             |respiratory       |
|1  |141  |145|I-Diagnosis             |tract             |
|1  |147  |156|I-Diagnosis             |infections        |
|1  |159  |160|B-Gender                |He                |
|1  |170  |179|B-Modifier              |subjective        |
|1  |181  |186|B-Symptom_Name          |fevers            |
|1  |237  |246|B-Modifier              |occasional        |
|1  |248  |252|B-Symptom_Name          |