![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/healthcare/PUBLIC_HEALTH_NER_DISEASE_ES.ipynb)

## Colab Setup

In [None]:
import json, os
from google.colab import files

if 'spark_jsl.json' not in os.listdir():
  license_keys = files.upload()
  os.rename(list(license_keys.keys())[0], 'spark_jsl.json')

with open('spark_jsl.json') as f:
    license_keys = json.load(f)

# Defining license key-value pairs as local variables
locals().update(license_keys)
os.environ.update(license_keys)

In [None]:
# Installing pyspark and spark-nlp
! pip install --upgrade -q pyspark==3.1.2 spark-nlp==$PUBLIC_VERSION

# Installing Spark NLP Healthcare
! pip install --upgrade -q spark-nlp-jsl==$JSL_VERSION  --extra-index-url https://pypi.johnsnowlabs.com/$SECRET

# Installing Spark NLP Display Library for visualization
! pip install -q spark-nlp-display

In [3]:
import sparknlp
import sparknlp_jsl

from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp_jsl.annotator import *

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml import Pipeline,PipelineModel
from pyspark.sql.types import StringType, IntegerType

import pandas as pd
pd.set_option('display.max_colwidth', 200)

import warnings
warnings.filterwarnings('ignore')

params = {"spark.driver.memory":"16G", 
          "spark.kryoserializer.buffer.max":"2000M", 
          "spark.driver.maxResultSize":"2000M"} 

spark = sparknlp_jsl.start(license_keys['SECRET'],params=params)

print("Spark NLP Version :", sparknlp.version())
print("Spark NLP_JSL Version :", sparknlp_jsl.version())

spark

Spark NLP Version : 4.2.8
Spark NLP_JSL Version : 4.2.8


## **MODELS**

## `bert_token_classifier_disease_mentions_tweet`

In [4]:
text_list = ["""La ansiedad, la depresión, son dos trastornos emocionales graves, muy graves, a todos nos pueden llegar en cualquier momento de nuestras vidas y por muchas""","""Sinusitis de caballo, Faringitis aguda e infección de orina, también elevada.""",
"""El tabaquismo está detrás de un alto porcentaje de casos de cáncer y enfermedades cardiovasculares""",
"""Muchos pacientes vivimos sin tiroides por diferentes patologías Bocio, Hipertiroidismo, Carcinomas (papilar, folicular, medular) Tumores neuroendocrinos Laringectomizados Tomamos levotiroxina sódica.""",
"""El diagnóstico fueron varios. Principal: Neumonía en el pulmón derecho. Sinusitis de caballo, Faringitis aguda e infección de orina, también elevada. Gripe No."""]

In [5]:
documentAssembler = DocumentAssembler()\
   .setInputCol("text")\
   .setOutputCol("document")
   
sentenceDetector = SentenceDetectorDLModel.pretrained()\
   .setInputCols(["document"])\
   .setOutputCol("sentence")
   
tokenizer = Tokenizer()\
   .setInputCols("sentence")\
   .setOutputCol("token")
   
tokenClassifier = MedicalBertForTokenClassifier.pretrained("bert_token_classifier_disease_mentions_tweet", "es", "clinical/models")\
   .setInputCols("token", "sentence")\
   .setOutputCol("ner")\
   .setCaseSensitive(True)
   
ner_converter = NerConverter()\
   .setInputCols(["sentence","token","ner"])\
   .setOutputCol("ner_chunk")
   
pipeline =  Pipeline(
    stages=[
        documentAssembler,
        sentenceDetector,
        tokenizer,
        tokenClassifier,
        ner_converter])

df = spark.createDataFrame(text_list, StringType()).toDF("text")
ner_model = pipeline.fit(df)

sentence_detector_dl download started this may take some time.
Approximate size to download 354.6 KB
[OK!]
bert_token_classifier_disease_mentions_tweet download started this may take some time.
[OK!]


In [6]:
result = ner_model.transform(df)

result.select(F.explode(F.arrays_zip(result.ner_chunk.result, 
                                     result.ner_chunk.metadata)).alias("cols")) \
      .select(F.expr("cols['0']").alias("chunk"),
              F.expr("cols['1']['entity']").alias("ner_label")).show(truncate=False)


+---------------------------------------+----------+
|chunk                                  |ner_label |
+---------------------------------------+----------+
|ansiedad                               |ENFERMEDAD|
|depresión                              |ENFERMEDAD|
|trastornos emocionales graves          |ENFERMEDAD|
|Sinusitis                              |ENFERMEDAD|
|Faringitis aguda                       |ENFERMEDAD|
|infección de orina                     |ENFERMEDAD|
|tabaquismo                             |ENFERMEDAD|
|cáncer                                 |ENFERMEDAD|
|enfermedades cardiovasculares          |ENFERMEDAD|
|Bocio                                  |ENFERMEDAD|
|Hipertiroidismo                        |ENFERMEDAD|
|Carcinomas (papilar, folicular, medular|ENFERMEDAD|
|Tumores neuroendocrinos                |ENFERMEDAD|
|Neumonía en el pulmón                  |ENFERMEDAD|
|Sinusitis                              |ENFERMEDAD|
|Faringitis aguda                       |ENFER

In [7]:
from sparknlp_display import NerVisualizer

for i in range(len(text_list)):
  NerVisualizer().display(
      result = result.collect()[i],
      label_col = 'ner_chunk',
      document_col = 'document'
  )

## `disease_mentions_tweet`

In [8]:
document_assembler = DocumentAssembler()\
	.setInputCol("text")\
	.setOutputCol("document")
 
sentenceDetectorDL = SentenceDetectorDLModel.pretrained("sentence_detector_dl", "xx")\
	.setInputCols(["document"])\
	.setOutputCol("sentence")

tokenizer = Tokenizer()\
	.setInputCols(["sentence"])\
	.setOutputCol("token")

word_embeddings = WordEmbeddingsModel.pretrained("embeddings_scielo_300d","es","clinical/models")\
	.setInputCols(["sentence","token"])\
	.setOutputCol("embeddings")

ner = MedicalNerModel.pretrained('disease_mentions_tweet', "es", "clinical/models") \
	.setInputCols(["sentence", "token", "embeddings"]) \
	.setOutputCol("ner")
 
ner_converter = NerConverter()\
	.setInputCols(["sentence", "token", "ner"])\
	.setOutputCol("ner_chunk")

pipeline = Pipeline(stages=[
	document_assembler,
	sentenceDetectorDL,
	tokenizer,
	word_embeddings,
	ner,
	ner_converter])

df = spark.createDataFrame(text_list, StringType()).toDF("text")
ner_model = pipeline.fit(df)


sentence_detector_dl download started this may take some time.
Approximate size to download 514.9 KB
[OK!]
embeddings_scielo_300d download started this may take some time.
Approximate size to download 204.8 MB
[OK!]
disease_mentions_tweet download started this may take some time.
[OK!]


In [9]:
result = ner_model.transform(df)

result.select(F.explode(F.arrays_zip(result.ner_chunk.result, 
                                     result.ner_chunk.metadata)).alias("cols")) \
      .select(F.expr("cols['0']").alias("chunk"),
              F.expr("cols['1']['entity']").alias("ner_label")).show(truncate=False)

+---------------------------------------+----------+
|chunk                                  |ner_label |
+---------------------------------------+----------+
|ansiedad                               |ENFERMEDAD|
|depresión                              |ENFERMEDAD|
|trastornos emocionales graves          |ENFERMEDAD|
|Sinusitis de caballo                   |ENFERMEDAD|
|Faringitis aguda                       |ENFERMEDAD|
|infección de orina                     |ENFERMEDAD|
|tabaquismo                             |ENFERMEDAD|
|cáncer                                 |ENFERMEDAD|
|enfermedades cardiovasculares          |ENFERMEDAD|
|Bocio                                  |ENFERMEDAD|
|Hipertiroidismo                        |ENFERMEDAD|
|Carcinomas (papilar, folicular, medular|ENFERMEDAD|
|Tumores neuroendocrinos                |ENFERMEDAD|
|Neumonía en el pulmón                  |ENFERMEDAD|
|Sinusitis de caballo                   |ENFERMEDAD|
|Faringitis aguda                       |ENFER

In [10]:
from sparknlp_display import NerVisualizer

for i in range(len(text_list)):
  NerVisualizer().display(
      result = result.collect()[i],
      label_col = 'ner_chunk',
      document_col = 'document'
  )