
![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/healthcare/NER_JSL.ipynb)

# **Detect Clinical Entities**

To run this yourself, you will need to upload your license keys to the notebook. Just run the cell below to do that. Alternatively, you can open the file explorer on the left side of the screen and upload your license file, named `spark_jsl.json`, to the folder that opens. Otherwise, you can look at the example outputs at the bottom of the notebook.

# **Colab Setup**

In [None]:
import json, os
from google.colab import files

# Ask for the license file through the Colab upload dialog only when it is not
# already present in the working directory.
if 'spark_jsl.json' not in os.listdir():
    uploaded = files.upload()
    if not uploaded:
        # Nothing came back from the dialog (e.g. it was dismissed); fail with
        # a clear message instead of an IndexError on the lookup below.
        raise FileNotFoundError(
            "No license file was uploaded; upload your license JSON and re-run this cell."
        )
    # Store the first uploaded file under the name the rest of the notebook reads.
    os.rename(next(iter(uploaded)), 'spark_jsl.json')

with open('spark_jsl.json') as f:
    license_keys = json.load(f)

# Expose every license key-value pair both as a notebook variable and as an
# environment variable; the install cell's `$NAME` references rely on this.
# At cell top level locals() and globals() are the same namespace, so
# globals() is used to make the intent explicit.
globals().update(license_keys)
os.environ.update(license_keys)

In [None]:
# The `$PUBLIC_VERSION`, `$JSL_VERSION` and `$SECRET` references below are
# resolved from the license key-value pairs that the setup cell copies into the
# notebook namespace and into os.environ, so that cell must run first.
# NOTE(review): presumably the license JSON contains exactly these keys — confirm.

# Installing pyspark and spark-nlp
! pip install --upgrade -q pyspark==3.1.2 spark-nlp==$PUBLIC_VERSION

# Installing Spark NLP Healthcare
# (served from the John Snow Labs package index; the URL path carries the license secret)
! pip install --upgrade -q spark-nlp-jsl==$JSL_VERSION  --extra-index-url https://pypi.johnsnowlabs.com/$SECRET

# Installing Spark NLP Display Library for visualization
# NOTE(review): this package is installed without a version pin, unlike the two above.
! pip install -q spark-nlp-display

In [3]:
import sparknlp
import sparknlp_jsl

from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp_jsl.annotator import *

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml import Pipeline,PipelineModel
from pyspark.sql.types import StringType, IntegerType

import pandas as pd
pd.set_option('display.max_colwidth', 200)

import warnings
warnings.filterwarnings('ignore')

# Spark configuration handed to the session builder: driver memory plus the
# Kryo serializer buffer and driver result-size limits.
params = {
    "spark.driver.memory": "16G",
    "spark.kryoserializer.buffer.max": "2000M",
    "spark.driver.maxResultSize": "2000M",
}

# Start the licensed Spark NLP for Healthcare session using the secret read
# from the license file.
spark = sparknlp_jsl.start(
    license_keys['SECRET'],
    params=params,
)

# Report the versions of the two libraries that were actually loaded.
print("Spark NLP Version :", sparknlp.version())
print("Spark NLP_JSL Version :", sparknlp_jsl.version())

# Last expression of the cell: display the session object.
spark

Spark NLP Version : 4.2.8
Spark NLP_JSL Version : 4.2.8


# **Models:**


```
ner_jsl
ner_jsl_slim
ner_jsl_enriched
ner_jsl_greedy


bert_token_classifier_ner_jsl
bert_token_classifier_ner_jsl_slim

```



# **🔎Sample Text**

In [4]:
# Clinical note used as the input for every model in this notebook. It is
# written as adjacent string literals (one per sentence or clause) that Python
# joins into a single line of text.
sample_text = (
    'The patient is a 40-year-old white male who presents with a chief complaint of "chest pain". '
    "The patient is diabetic and has a prior history of coronary artery disease. "
    "The patient presents today stating that his chest pain started yesterday evening "
    "and has been somewhat intermittent. "
    "He has been advised Aspirin 81 milligrams QDay, insulin 50 units in a.m. HCTZ 50 mg QDay. "
    "Nitroglycerin 1/150 sublingually PRN chest pain."
)




# ***🔎For models:***

- ***ner_jsl***
- ***ner_jsl_slim***
- ***ner_jsl_enriched***
- ***ner_jsl_greedy***


### **Define Spark NLP pipeline**

In [5]:
# Names of the pretrained clinical NER models that the next cell loads and
# visualises, one after another, in this order.
jsl_model_list = [
    "ner_jsl",
    "ner_jsl_slim",
    "ner_jsl_enriched",
    "ner_jsl_greedy",
]

In [6]:
# Imported once at the top of the cell instead of on every loop iteration.
from sparknlp_display import NerVisualizer

# --- Stages shared by every model in `jsl_model_list` -----------------------

# Wraps the raw "text" column into a document annotation.
documentAssembler = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")

# Pretrained clinical sentence splitter.
sentenceDetector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare", "en", "clinical/models")\
    .setInputCols(["document"])\
    .setOutputCol("sentence")

tokenizer = Tokenizer()\
    .setInputCols(["sentence"])\
    .setOutputCol("token")

# Pretrained clinical word embeddings consumed by each NER model below.
embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\
    .setInputCols(["sentence", "token"])\
    .setOutputCol("embeddings")

# Turns the token-level tags in "jsl_ner" into the "ner_chunk" column that is
# visualised below.
jsl_ner_converter = NerConverterInternal()\
    .setInputCols(["sentence", "token", "jsl_ner"])\
    .setOutputCol("ner_chunk")

# Loop-invariant helpers, created once: the single empty-string row used to
# fit each pipeline, and the visualiser.
empty_df = spark.createDataFrame([['']]).toDF("text")
visualiser = NerVisualizer()

# --- One pipeline per model: load, fit, annotate the sample, visualise ------
for model_name in jsl_model_list:

    jsl_ner = MedicalNerModel.pretrained(model_name, "en", "clinical/models")\
        .setInputCols(["sentence", "token", "embeddings"])\
        .setOutputCol("jsl_ner")

    jsl_ner_pipeline = Pipeline(stages=[documentAssembler,
                                        sentenceDetector,
                                        tokenizer,
                                        embeddings,
                                        jsl_ner,
                                        jsl_ner_converter])

    # The weight-carrying annotators are loaded via `pretrained(...)`; the
    # pipeline is fitted on the empty row only to obtain a PipelineModel.
    jsl_ner_model = jsl_ner_pipeline.fit(empty_df)

    # LightPipeline annotates the in-memory string directly.
    light_model = LightPipeline(jsl_ner_model)
    light_result = light_model.fullAnnotate(sample_text)

    print("\n\n\n")
    print(f"***************  The visualization results for {model_name} ***************")
    print("\n\n\n")

    # fullAnnotate returns one result per input text; only one text was passed.
    visualiser.display(light_result[0], label_col='ner_chunk', document_col='document')
    print("\n\n\n")

sentence_detector_dl_healthcare download started this may take some time.
Approximate size to download 367.3 KB
[OK!]
embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[OK!]
ner_jsl download started this may take some time.
[OK!]




***************  The visualization results for ner_jsl ***************










ner_jsl_slim download started this may take some time.
[OK!]




***************  The visualization results for ner_jsl_slim ***************










ner_jsl_enriched download started this may take some time.
[OK!]




***************  The visualization results for ner_jsl_enriched ***************










ner_jsl_greedy download started this may take some time.
[OK!]




***************  The visualization results for ner_jsl_greedy ***************














# ***🔎For models:***

- ***bert_token_classifier_ner_jsl***
- ***bert_token_classifier_ner_jsl_slim***



### **Define Spark NLP pipeline**

In [7]:
# Names of the pretrained BERT token-classification NER models that the next
# cell loads and visualises, in this order.
bert_jsl_ner_model_list = [
    "bert_token_classifier_ner_jsl",
    "bert_token_classifier_ner_jsl_slim",
]

In [8]:
# Imported once at the top of the cell instead of on every loop iteration.
from sparknlp_display import NerVisualizer

# --- Stages shared by every model in `bert_jsl_ner_model_list` --------------

# Wraps the raw "text" column into a document annotation.
documentAssembler = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")

# NOTE(review): this loads the library's default sentence detector, whereas the
# MedicalNerModel pipeline earlier in the notebook uses
# "sentence_detector_dl_healthcare" — confirm the difference is intentional.
sentenceDetector = SentenceDetectorDLModel.pretrained()\
    .setInputCols(["document"])\
    .setOutputCol("sentence")

tokenizer = Tokenizer()\
    .setInputCols(["sentence"])\
    .setOutputCol("token")

# Turns the token-level tags in "ner" into the "ner_chunk" column that is
# visualised below.
ner_converter = NerConverterInternal()\
    .setInputCols(["sentence", "token", "ner"])\
    .setOutputCol("ner_chunk")

# Loop-invariant helpers, created once: the single empty-string row used to
# fit each pipeline, and the visualiser.
empty_df = spark.createDataFrame([['']]).toDF("text")
visualiser = NerVisualizer()

# --- One pipeline per model: load, fit, annotate the sample, visualise ------
for model_name in bert_jsl_ner_model_list:

    # The token classifier reads tokens and sentences directly; this pipeline
    # has no separate embeddings stage.
    tokenClassifier = MedicalBertForTokenClassifier.pretrained(model_name, "en", "clinical/models")\
        .setInputCols(["token", "sentence"])\
        .setOutputCol("ner")\
        .setCaseSensitive(True)

    bert_jsl_ner_pipeline = Pipeline(stages=[documentAssembler,
                                             sentenceDetector,
                                             tokenizer,
                                             tokenClassifier,
                                             ner_converter])

    # The weight-carrying annotators are loaded via `pretrained(...)`; the
    # pipeline is fitted on the empty row only to obtain a PipelineModel.
    bert_jsl_ner_model = bert_jsl_ner_pipeline.fit(empty_df)

    # LightPipeline annotates the in-memory string directly.
    light_model = LightPipeline(bert_jsl_ner_model)
    light_result = light_model.fullAnnotate(sample_text)

    print("\n\n\n")
    print(f"***************  The visualization results for {model_name} ***************")
    print("\n\n\n")

    # fullAnnotate returns one result per input text; only one text was passed.
    visualiser.display(light_result[0], label_col='ner_chunk', document_col='document')
    print("\n\n\n")

sentence_detector_dl download started this may take some time.
Approximate size to download 354.6 KB
[OK!]
bert_token_classifier_ner_jsl download started this may take some time.
[OK!]




***************  The visualization results for bert_token_classifier_ner_jsl ***************










bert_token_classifier_ner_jsl_slim download started this may take some time.
[OK!]




***************  The visualization results for bert_token_classifier_ner_jsl_slim ***************










