![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/healthcare/PUBLIC_HEALTH_MENTION.ipynb)




# **Public Health Models**

# **Colab Setup**

In [None]:
import json
import os

from google.colab import files

# Ask the user to upload the license file through the Colab upload widget.
# The returned mapping is keyed by uploaded file name; it is kept under its
# own name so `license_keys` only ever refers to the parsed license content.
uploaded = files.upload()
if not uploaded:
    raise ValueError("No license file was uploaded.")

# Only the first uploaded file is read and parsed as JSON.
license_path = list(uploaded.keys())[0]
with open(license_path) as f:
    license_keys = json.load(f)

# Defining license key-value pairs as notebook-level variables.
# `globals()` is used explicitly: mutating `locals()` is only reliable when
# the code happens to run at module scope.
globals().update(license_keys)

# Adding license key-value pairs to environment variables
os.environ.update(license_keys)

# **Install dependencies**

In [None]:
# Installing pyspark and spark-nlp
# NOTE(review): $PUBLIC_VERSION is not defined in this cell; presumably it is
# one of the key-value pairs loaded from the uploaded license JSON — confirm.
! pip install --upgrade -q pyspark==3.1.2 spark-nlp==$PUBLIC_VERSION

# Installing Spark NLP Healthcare
# The package is fetched through an extra index URL that embeds $SECRET.
# NOTE(review): $JSL_VERSION and $SECRET are presumably license JSON keys as
# well — confirm.
! pip install --upgrade -q spark-nlp-jsl==$JSL_VERSION  --extra-index-url https://pypi.johnsnowlabs.com/$SECRET

# Installing Spark NLP Display Library for visualization
# NOTE(review): unlike the installs above, this one is not pinned to a version.
! pip install -q spark-nlp-display

# **Import dependencies into Python and start the Spark session**

In [2]:
import json
import os

import sparknlp
import sparknlp_jsl

from sparknlp.base import *
from sparknlp.util import *
from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from sparknlp.pretrained import ResourceDownloader

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StringType, IntegerType
from pyspark.ml import Pipeline, PipelineModel

import pandas as pd
# Pandas display settings: no limit on the number of columns shown, no
# wrapping of wide frames across several lines, no limit on column width.
pd.options.display.max_columns = None
pd.options.display.expand_frame_repr = False
pd.options.display.max_colwidth = None

import string
import numpy as np

# Spark configuration values passed to the session start call below.
params = {
    "spark.driver.memory": "16G",
    "spark.kryoserializer.buffer.max": "2000M",
    "spark.driver.maxResultSize": "2000M",
}

# Start the Spark session through sparknlp_jsl, using the license SECRET.
spark = sparknlp_jsl.start(secret=SECRET, params=params)

# Report the versions of both libraries.
print("Spark NLP Version :", sparknlp.version())
print("Spark NLP_JSL Version :", sparknlp_jsl.version())

# Bare trailing expression: the notebook renders the session object.
spark

Spark NLP Version : 4.1.0
Spark NLP_JSL Version : 4.1.0


## **Sample Text**

In [3]:
# Sample input texts. Each string becomes one row of the single-column
# DataFrame (`spark.createDataFrame(text_list, StringType())`) that is fed to
# the classifier pipelines.
text_list = [
"""Got sumatriptan for this week long migraine  I thought these were over as a kid  but I guess not  Thought I could just baby it a couple days with water and pedialyte and excedrin  Nope    Why am I so stubborn about continuing to  DO THINGA when I m obviously not well""",
"""That Thunder trade alert just now from  almost gave me a heart attack  Was bracing myself to see THUNDER SENDS RUSS TO Wow That was the briefest emotional roller coaster ever""",
"""The sickle cell clinic of  still goes on  It is even more for the next 2 months  Ever heard of childhood stroke  One of the leading causes of childhood stroke is sickle cell  Amidst cancer and other""",
"""In 2015 I suffered a stroke  This prevented me from playing Rugby League again  It was difficult physically and mentally to recover from  bit four years later  here I am  about to embark on this   Read my story and please donate if you can""",
"""i respect the fuck out of the lesbian who was in the bar bathroom  one holer  directly before me and dropped a gnarly rank dookie like make a raccoon cough type shit""",  
"""Many of us have witnessed the sad  steady march of Alzheimer s disease as it destroys memory and thinking ability  But what about the physical effects of Alzheimer s which also are significant  but which we tend not to think about as much"""
]

# **MODELS**

## **bert_sequence_classifier_health_mentions**

In [9]:
# bert_sequence_classifier_health_mentions
# Stages: "text" column -> "document" -> "token" -> "class".

# Reads the "text" column and writes the "document" column.
document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Tokenizes the "document" column into "token".
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Pretrained sequence classifier; consumes documents and tokens and writes
# its prediction to "class".
sequenceClassifier = (
    MedicalBertForSequenceClassification
    .pretrained("bert_sequence_classifier_health_mentions", "en", "clinical/models")
    .setInputCols(["document", "token"])
    .setOutputCol("class")
)

pipeline = Pipeline(stages=[document_assembler, tokenizer, sequenceClassifier])

# One row per sample text, in a single string column named "text".
df = spark.createDataFrame(text_list, StringType()).toDF("text")

# Fit the pipeline on the samples and annotate those same samples.
result = pipeline.fit(df).transform(df)

bert_sequence_classifier_health_mentions download started this may take some time.
[OK!]


In [11]:
# Zip the document text, predicted label and prediction metadata arrays
# element-wise, explode the zipped array into one row per entry, then expose
# the three fields as the Text / Class / Confidence columns.
# NOTE: `result` is overwritten with the flattened frame, so the pipeline
# cell must be re-run before this cell can be executed again.
result = (
    result
    .select(
        F.explode(
            F.arrays_zip("document.result", "class.result", "class.metadata")
        ).alias("col")
    )
    .select(
        F.expr("col['0']").alias("Text"),
        F.expr("col['1']").alias("Class"),
        F.expr("col['2']").alias("Confidence"),
    )
)

In [13]:
model = "bert_sequence_classifier_health_mentions"

# Show the table whenever there is at least one classified row. The previous
# `result.count() > 1` guard printed nothing for a single input text and ran
# a full count over the frame; `head(1)` returns a list holding at most one
# row, which is empty (falsy) only when the frame has no rows.
if result.head(1):
    # The Confidence column holds the prediction metadata map. The lookup key
    # is built as "Some(<Class>)" so that the value stored for the predicted
    # class is what gets displayed.
    udf_func = F.udf(lambda x, y: x["Some(" + str(y) + ")"])
    print("\n", model, "\n")
    result.withColumn('Confidence', udf_func(result.Confidence, result.Class)).show(truncate=50)


 bert_sequence_classifier_health_mentions 

+--------------------------------------------------+------------------+----------+
|                                              Text|             Class|Confidence|
+--------------------------------------------------+------------------+----------+
|Got sumatriptan for this week long migraine  I ...|    health_mention|0.99974453|
|That Thunder trade alert just now from  almost ...|figurative_mention|   0.99972|
|The sickle cell clinic of  still goes on  It is...|     other_mention|  0.999114|
|In 2015 I suffered a stroke  This prevented me ...|    health_mention| 0.9994201|
|i respect the fuck out of the lesbian who was i...|figurative_mention|0.99958795|
|Many of us have witnessed the sad  steady march...|     other_mention| 0.9727036|
+--------------------------------------------------+------------------+----------+



## **classifierdl_health_mentions**

In [4]:
# classifierdl_health_mentions
# Stages: "text" column -> "sentence" -> "token" -> "embeddings"
#         -> "sentence_embeddings" -> "class".
# Method chains are wrapped in parentheses instead of backslash
# continuations; the original left a dangling `\` after
# `.setOutputCol("embeddings")` that only parsed because a blank line
# happened to follow it.

# NOTE: the assembled text is written to a column named "sentence"; this
# pipeline contains no separate sentence-splitting stage.
document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("sentence")
)

tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Pretrained BERT embeddings computed from the "sentence" and "token" columns.
bert_embeddings = (
    BertEmbeddings
    .pretrained("bert_embeddings_phs_bert", "en", "public/models")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
)

# Combine the embeddings into "sentence_embeddings" using the AVERAGE
# pooling strategy.
embeddingsSentence = (
    SentenceEmbeddings()
    .setInputCols(["sentence", "embeddings"])
    .setOutputCol("sentence_embeddings")
    .setPoolingStrategy("AVERAGE")
)

# Pretrained classifier; writes its prediction to "class".
classifierdl = (
    ClassifierDLModel
    .pretrained('classifierdl_health_mentions', 'en', 'clinical/models')
    .setInputCols(['sentence', 'token', 'sentence_embeddings'])
    .setOutputCol('class')
)

clf_pipeline = Pipeline(stages=[
    document_assembler,
    tokenizer,
    bert_embeddings,
    embeddingsSentence,
    classifierdl,
])

# One row per sample text, in a single string column named "text".
df = spark.createDataFrame(text_list, StringType()).toDF("text")

# Fit the pipeline on the samples and annotate those same samples.
result = clf_pipeline.fit(df).transform(df)

bert_embeddings_phs_bert download started this may take some time.
Approximate size to download 1.2 GB
[OK!]
classifierdl_health_mentions download started this may take some time.
Approximate size to download 22.9 MB
[OK!]


In [5]:
# Zip the sentence text, predicted label and prediction metadata arrays
# element-wise, explode the zipped array into one row per entry, then expose
# the three fields as the Text / Class / Confidence columns.
# NOTE: `result` is overwritten with the flattened frame, so the pipeline
# cell must be re-run before this cell can be executed again.
result = (
    result
    .select(
        F.explode(
            F.arrays_zip("sentence.result", "class.result", "class.metadata")
        ).alias("col")
    )
    .select(
        F.expr("col['0']").alias("Text"),
        F.expr("col['1']").alias("Class"),
        F.expr("col['2']").alias("Confidence"),
    )
)

In [9]:
# Display the flattened Text / Class / Confidence frame without truncating
# column values, so the whole Confidence metadata entry is visible.
result.show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------+----------------------------------------------------------------------------------------------------------------+
|Text                                                                                                                                                                                                                                                                       |Class             |Confidence                                                                                                      |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [11]:
model = "classifierdl_health_mentions"

# Show the table whenever there is at least one classified row. The previous
# `result.count() > 1` guard printed nothing for a single input text and ran
# a full count over the frame; `head(1)` returns a list holding at most one
# row, which is empty (falsy) only when the frame has no rows.
if result.head(1):
    # The Confidence column holds the prediction metadata map; it is indexed
    # directly by the predicted Class to display that class's value.
    udf_func = F.udf(lambda x, y: x[y])
    print("\n", model, "\n")
    result.withColumn('Confidence', udf_func(result.Confidence, result.Class)).show(truncate=50)


 classifierdl_health_mentions 

+--------------------------------------------------+------------------+----------+
|                                              Text|             Class|Confidence|
+--------------------------------------------------+------------------+----------+
|Got sumatriptan for this week long migraine  I ...|    health_mention|  0.999987|
|That Thunder trade alert just now from  almost ...|figurative_mention| 0.9999999|
|The sickle cell clinic of  still goes on  It is...|     other_mention| 0.9992162|
|In 2015 I suffered a stroke  This prevented me ...|    health_mention| 0.9999814|
|i respect the fuck out of the lesbian who was i...|figurative_mention|0.60502684|
|Many of us have witnessed the sad  steady march...|     other_mention|0.99998915|
+--------------------------------------------------+------------------+----------+

