![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/healthcare/PUBLIC_HEALTH_MB4TC.ipynb)

# `Medical BERT For Token Classification` for **Public Health Models**

# **Colab Setup**

In [None]:
import json
import os

from google.colab import files

# Ask the user to upload the license JSON file through the Colab upload widget.
uploaded_files = files.upload()

# The first key of the upload result is used as the path of the file to read;
# any additional uploaded files are ignored.
license_path = list(uploaded_files)[0]
with open(license_path) as license_file:
    license_keys = json.load(license_file)

# Defining license key-value pairs as notebook-level variables.
# globals() is used instead of locals(): the Python documentation warns that the
# mapping returned by locals() should not be modified, whereas updating globals()
# is the supported way to define module-level names dynamically.
globals().update(license_keys)

# Adding license key-value pairs to environment variables
os.environ.update(license_keys)

# **Install dependencies**

In [None]:
# Installing pyspark and spark-nlp
# pyspark is pinned to 3.1.2; the spark-nlp version is taken from $PUBLIC_VERSION.
# NOTE(review): $PUBLIC_VERSION is presumably one of the license key-value pairs
# exposed as variables / environment variables by the Colab setup cell — confirm.
! pip install --upgrade -q pyspark==3.1.2 spark-nlp==$PUBLIC_VERSION

# Installing Spark NLP Healthcare
# The package is resolved through an extra index URL whose path contains $SECRET,
# so the rendered command line includes the secret value.
# NOTE(review): $JSL_VERSION and $SECRET are presumably license keys as well — confirm.
! pip install --upgrade -q spark-nlp-jsl==$JSL_VERSION  --extra-index-url https://pypi.johnsnowlabs.com/$SECRET

# Installing Spark NLP Display Library for visualization
# (no version is pinned for this package)
! pip install -q spark-nlp-display

# **Import dependencies into Python and start the Spark session**

In [24]:
import json
import os

from pyspark.ml import Pipeline, PipelineModel
from pyspark.sql import SparkSession

import sparknlp
import sparknlp_jsl

from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from sparknlp.base import *
from sparknlp.util import *
from sparknlp.pretrained import ResourceDownloader
from pyspark.sql import functions as F
from pyspark.sql.types import StringType, IntegerType
from sparknlp_display import NerVisualizer


import pandas as pd

# Configure pandas display: no limit on the number of columns or on column width,
# and do not wrap wide frames across multiple lines.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.expand_frame_repr', False),
    ('max_colwidth', None),
):
    pd.set_option(option_name, option_value)

import string
import numpy as np

# Spark configuration passed to the session: driver memory, Kryo serializer
# buffer ceiling and the maximum size of results returned to the driver.
params = dict([
    ("spark.driver.memory", "16G"),
    ("spark.kryoserializer.buffer.max", "2000M"),
    ("spark.driver.maxResultSize", "2000M"),
])

# Start the licensed Spark NLP session; SECRET must already be defined in the notebook.
spark = sparknlp_jsl.start(secret=SECRET, params=params)

# Report the library versions that are actually loaded.
print("Spark NLP Version :", sparknlp.version())
print("Spark NLP_JSL Version :", sparknlp_jsl.version())

# Last expression of the cell: display the session object.
spark

Spark NLP Version : 4.0.2
Spark NLP_JSL Version : 4.0.2


# **General Function for MedicalBertForTokenClassifier Pipeline**

In [34]:
def run_pipeline(model, text):
  """Run a pretrained MedicalBertForTokenClassifier pipeline and display its entities.

  The pipeline is: DocumentAssembler -> SentenceDetectorDLModel -> Tokenizer ->
  MedicalBertForTokenClassifier -> NerConverter. The function prints the model
  name, shows a table of the extracted chunks with their entity labels, and
  renders every input document with NerVisualizer.

  Relies on the notebook-level ``spark`` session.

  Args:
    model: Name of the pretrained model, loaded with language "en" from
      "clinical/models".
    text: A list of strings (one document per element). A single string is
      also accepted and treated as a one-document list.

  Returns:
    None. All results are printed / displayed.
  """
  # A bare string would otherwise be rejected by createDataFrame (and iterating
  # it would walk over characters), so wrap it into a one-element list.
  texts = [text] if isinstance(text, str) else list(text)

  documentAssembler = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")

  sentenceDetector = SentenceDetectorDLModel.pretrained()\
    .setInputCols(["document"])\
    .setOutputCol("sentence")

  tokenizer = Tokenizer()\
    .setInputCols("sentence")\
    .setOutputCol("token")

  tokenClassifier = MedicalBertForTokenClassifier.pretrained(model, "en", "clinical/models")\
    .setInputCols("token", "sentence")\
    .setOutputCol("ner")\
    .setCaseSensitive(True)

  ner_converter = NerConverter()\
    .setInputCols(["sentence","token","ner"])\
    .setOutputCol("ner_chunk")

  pipeline = Pipeline(stages=[
                        documentAssembler,
                        sentenceDetector,
                        tokenizer,
                        tokenClassifier,
                        ner_converter])

  df = spark.createDataFrame(texts, StringType()).toDF("text")
  result = pipeline.fit(df).transform(df)

  print("\n")
  print("<----------------- MODEL NAME:","\033[1m" + model + "\033[0m"," ----------------- >")
  print("\n")

  # One row per extracted chunk: the chunk text and the entity label stored in its metadata.
  result.select(F.explode(F.arrays_zip('ner_chunk.result', 'ner_chunk.metadata')).alias("cols")) \
        .select(F.expr("cols['0']").alias("chunk"),
              F.expr("cols['1']['entity']").alias("ner_label")).show(truncate=False)

  print("\n")

  # Collect once: calling result.collect() inside the loop would re-execute the
  # whole pipeline (including model inference) for every document.
  rows = result.collect()
  for row in rows:
    NerVisualizer().display(
      result = row,
      label_col = 'ner_chunk',
      document_col = 'document'
    )

# **MODELS**

## **bert_token_classifier_ade_tweet_binary**

In [25]:
# Name of the pretrained model used in this section.
model = "bert_token_classifier_ade_tweet_binary"

In [38]:
# Example tweet-style texts, one document per list element.
sample_texts = [
    "I understand you very well. :( just got 1st urgh ! humira worked for me for just 3months then got painful reactions.",
    "This vyvanse got me sweating right now and i dont even know why!",
    "Wonder which drug is doing this memory lapse thing. My guess the Duloxetine.",
    "I used to be on paxil but that made me more depressed and prozac made me angry.",
    "Maybe it's because of the effect of seroquel, but when I eat fast carbohydrates, I feel the sugar drop.",
]

In [39]:
# Run the selected model over the sample texts and display the detected entities.
run_pipeline(model=model, text=sample_texts)

sentence_detector_dl download started this may take some time.
Approximate size to download 354.6 KB
[OK!]
bert_token_classifier_ade_tweet_binary download started this may take some time.
[OK!]


<----------------- MODEL NAME: [1mbert_token_classifier_ade_tweet_binary[0m  ----------------- >


+-----------------+---------+
|chunk            |ner_label|
+-----------------+---------+
|painful reactions|ADE      |
|sweating         |ADE      |
|memory lapse     |ADE      |
|depressed        |ADE      |
|angry            |ADE      |
|sugar drop       |ADE      |
+-----------------+---------+



