![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/healthcare_jsl/PUBLIC_HEALTH_NER_DISEASE_ES.ipynb)

## Colab Setup

In [None]:
# Install the johnsnowlabs library to access Spark-OCR and Spark-NLP for Healthcare, Finance, and Legal.
! pip install -q johnsnowlabs

In [None]:
from google.colab import files
# Prompt for the license file, then open the Colab upload widget and keep
# whatever files.upload() returns in license_keys.
print("Please Upload your John Snow Labs License using the button below")
license_keys = files.upload()

In [None]:
from johnsnowlabs import *

# After uploading your license, run this cell to install all licensed Python wheels
# and pre-download the jars for the Spark session JVM.
jsl.install()

In [None]:
from johnsnowlabs import *

# Load the license data automatically and start a session with all jars the
# user has access to (started here with exclude_ocr=True).
spark = jsl.start(exclude_ocr=True)

## **MODELS**

## `bert_token_classifier_disease_mentions_tweet`

In [None]:
# Sample Spanish texts mentioning diseases; both NER pipelines below run on this list.
text_list = [
    "La ansiedad, la depresión, son dos trastornos emocionales graves, muy graves, a todos nos pueden llegar en cualquier momento de nuestras vidas y por muchas",
    "Sinusitis de caballo, Faringitis aguda e infección de orina, también elevada.",
    "El tabaquismo está detrás de un alto porcentaje de casos de cáncer y enfermedades cardiovasculares",
    "Muchos pacientes vivimos sin tiroides por diferentes patologías Bocio, Hipertiroidismo, Carcinomas (papilar, folicular, medular) Tumores neuroendocrinos Laringectomizados Tomamos levotiroxina sódica.",
    "El diagnóstico fueron varios. Principal: Neumonía en el pulmón derecho. Sinusitis de caballo, Faringitis aguda e infección de orina, también elevada. Gripe No.",
]

In [None]:
# Build and fit the NER pipeline around the
# "bert_token_classifier_disease_mentions_tweet" token classifier.

# Turn the raw "text" column into "document" annotations.
documentAssembler = nlp.DocumentAssembler()\
   .setInputCol("text")\
   .setOutputCol("document")

# The sample texts are Spanish, so load the multilingual ("xx") sentence detector
# explicitly. A bare pretrained() call resolves to the library's default (English)
# model; the other pipeline in this notebook already uses the "xx" model.
sentenceDetector = nlp.SentenceDetectorDLModel.pretrained("sentence_detector_dl", "xx")\
   .setInputCols(["document"])\
   .setOutputCol("sentence")

# Split each detected sentence into tokens.
tokenizer = nlp.Tokenizer()\
   .setInputCols(["sentence"])\
   .setOutputCol("token")

# Pretrained token classifier; writes its per-token output to the "ner" column.
tokenClassifier = medical.BertForTokenClassifier.pretrained("bert_token_classifier_disease_mentions_tweet", "es", "clinical/models")\
   .setInputCols(["token", "sentence"])\
   .setOutputCol("ner")\
   .setCaseSensitive(True)

# Convert the "ner" output into "ner_chunk" annotations for display.
ner_converter = nlp.NerConverter()\
   .setInputCols(["sentence", "token", "ner"])\
   .setOutputCol("ner_chunk")

pipeline = Pipeline(
    stages=[
        documentAssembler,
        sentenceDetector,
        tokenizer,
        tokenClassifier,
        ner_converter])

# One row per input text, in a single string column named "text".
df = spark.createDataFrame(text_list, StringType()).toDF("text")
ner_model = pipeline.fit(df)

sentence_detector_dl download started this may take some time.
Approximate size to download 354.6 KB
[OK!]
bert_token_classifier_disease_mentions_tweet download started this may take some time.
[OK!]


In [None]:
# Run the fitted pipeline over the input DataFrame.
result = ner_model.transform(df)

# Zip each row's chunk texts with their metadata and explode to one row per chunk.
# The zipped struct fields are addressed positionally as '0' and '1' below.
chunk_with_meta = F.explode(
    F.arrays_zip(result.ner_chunk.result, result.ner_chunk.metadata)
).alias("cols")

(
    result
    .select(chunk_with_meta)
    .select(
        F.expr("cols['0']").alias("chunk"),
        F.expr("cols['1']['entity']").alias("ner_label"),
    )
    .show(truncate=False)
)


+---------------------------------------+----------+
|chunk                                  |ner_label |
+---------------------------------------+----------+
|ansiedad                               |ENFERMEDAD|
|depresión                              |ENFERMEDAD|
|trastornos emocionales graves          |ENFERMEDAD|
|Sinusitis                              |ENFERMEDAD|
|Faringitis aguda                       |ENFERMEDAD|
|infección de orina                     |ENFERMEDAD|
|tabaquismo                             |ENFERMEDAD|
|cáncer                                 |ENFERMEDAD|
|enfermedades cardiovasculares          |ENFERMEDAD|
|Bocio                                  |ENFERMEDAD|
|Hipertiroidismo                        |ENFERMEDAD|
|Carcinomas (papilar, folicular, medular|ENFERMEDAD|
|Tumores neuroendocrinos                |ENFERMEDAD|
|Neumonía en el pulmón                  |ENFERMEDAD|
|Sinusitis                              |ENFERMEDAD|
|Faringitis aguda                       |ENFER

In [None]:
from sparknlp_display import NerVisualizer

# Collect the rows once up front instead of calling result.collect() on every
# loop iteration, and reuse a single visualizer for all documents.
collected_rows = result.collect()
visualizer = NerVisualizer()

# Render the highlighted entities for each collected row.
for row in collected_rows:
  visualizer.display(
      result = row,
      label_col = 'ner_chunk',
      document_col = 'document'
  )

## `disease_mentions_tweet`

In [None]:
# Build and fit the NER pipeline around the "disease_mentions_tweet" model,
# which consumes word embeddings in addition to sentences and tokens.

# Turn the raw "text" column into "document" annotations.
document_assembler = (
    nlp.DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Multilingual ("xx") sentence detector.
sentenceDetectorDL = (
    nlp.SentenceDetectorDLModel.pretrained("sentence_detector_dl", "xx")
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# Split each detected sentence into tokens.
tokenizer = (
    nlp.Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Pretrained "embeddings_scielo_300d" word embeddings for "es".
word_embeddings = (
    nlp.WordEmbeddingsModel.pretrained("embeddings_scielo_300d","es","clinical/models")
    .setInputCols(["sentence","token"])
    .setOutputCol("embeddings")
)

# Pretrained NER model; writes its per-token output to the "ner" column.
ner = (
    medical.NerModel.pretrained('disease_mentions_tweet', "es", "clinical/models")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner")
)

# Convert the "ner" output into "ner_chunk" annotations for display.
ner_converter = (
    nlp.NerConverter()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("ner_chunk")
)

pipeline = Pipeline(
    stages=[
        document_assembler,
        sentenceDetectorDL,
        tokenizer,
        word_embeddings,
        ner,
        ner_converter,
    ]
)

# One row per input text, in a single string column named "text".
df = spark.createDataFrame(text_list, StringType()).toDF("text")
ner_model = pipeline.fit(df)


sentence_detector_dl download started this may take some time.
Approximate size to download 514.9 KB
[OK!]
embeddings_scielo_300d download started this may take some time.
Approximate size to download 204.8 MB
[OK!]
disease_mentions_tweet download started this may take some time.
[OK!]


In [None]:
# Run the fitted pipeline over the input DataFrame.
result = ner_model.transform(df)

# Zip each row's chunk texts with their metadata and explode to one row per chunk.
# The zipped struct fields are addressed positionally as '0' and '1' below.
chunk_with_meta = F.explode(
    F.arrays_zip(result.ner_chunk.result, result.ner_chunk.metadata)
).alias("cols")

(
    result
    .select(chunk_with_meta)
    .select(
        F.expr("cols['0']").alias("chunk"),
        F.expr("cols['1']['entity']").alias("ner_label"),
    )
    .show(truncate=False)
)

+---------------------------------------+----------+
|chunk                                  |ner_label |
+---------------------------------------+----------+
|ansiedad                               |ENFERMEDAD|
|depresión                              |ENFERMEDAD|
|trastornos emocionales graves          |ENFERMEDAD|
|Sinusitis de caballo                   |ENFERMEDAD|
|Faringitis aguda                       |ENFERMEDAD|
|infección de orina                     |ENFERMEDAD|
|tabaquismo                             |ENFERMEDAD|
|cáncer                                 |ENFERMEDAD|
|enfermedades cardiovasculares          |ENFERMEDAD|
|Bocio                                  |ENFERMEDAD|
|Hipertiroidismo                        |ENFERMEDAD|
|Carcinomas (papilar, folicular, medular|ENFERMEDAD|
|Tumores neuroendocrinos                |ENFERMEDAD|
|Neumonía en el pulmón                  |ENFERMEDAD|
|Sinusitis de caballo                   |ENFERMEDAD|
|Faringitis aguda                       |ENFER

In [None]:
from sparknlp_display import NerVisualizer

# Collect the rows once up front instead of calling result.collect() on every
# loop iteration, and reuse a single visualizer for all documents.
collected_rows = result.collect()
visualizer = NerVisualizer()

# Render the highlighted entities for each collected row.
for row in collected_rows:
  visualizer.display(
      result = row,
      label_col = 'ner_chunk',
      document_col = 'document'
  )