![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/healthcare/ER_MEDDRA.ipynb)

## Setup

In [21]:
import json
import os

from google.colab import files

license_keys = files.upload()

with open(list(license_keys.keys())[0]) as f:
    license_keys = json.load(f)

locals().update(license_keys)

os.environ.update(license_keys)

In [22]:
# Installing pyspark and spark-nlp
! pip install --upgrade -q pyspark==3.4.1 spark-nlp==$PUBLIC_VERSION

# Installing Spark NLP Healthcare
! pip install --upgrade -q spark-nlp-jsl==$JSL_VERSION  --extra-index-url https://pypi.johnsnowlabs.com/$SECRET

# Installing Spark NLP Display Library for visualization
! pip install -q spark-nlp-display

In [23]:
import sparknlp
import sparknlp_jsl

from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from sparknlp_jsl.pretrained import InternalResourceDownloader

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml import Pipeline,PipelineModel

import pandas as pd
pd.set_option('display.max_colwidth', 200)

import warnings
warnings.filterwarnings('ignore')

params = {"spark.driver.memory":"16G",
          "spark.kryoserializer.buffer.max":"2000M",
          "spark.driver.maxResultSize":"2000M"}

print("Spark NLP Version :", sparknlp.version())
print("Spark NLP_JSL Version :", sparknlp_jsl.version())

spark = sparknlp_jsl.start(license_keys['SECRET'],params=params)

spark

Spark NLP Version : 5.3.1
Spark NLP_JSL Version : 5.3.1


## MedDRA Models/ Mappers
|index|model|
|-----:|:-----|
| 1| [sbiobertresolve_meddra_preferred_term](https://nlp.johnsnowlabs.com/2024/03/24/sbiobertresolve_meddra_preferred_term_en.html)
| 2| [sbiobertresolve_meddra_lowest_level_term](https://nlp.johnsnowlabs.com/2024/03/24/sbiobertresolve_meddra_lowest_level_term_en.html)
| 3| [meddra_pt_hlt_mapper](https://nlp.johnsnowlabs.com/2024/03/28/meddra_pt_hlt_mapper_en.html)  
| 4| [meddra_hlt_pt_mapper](https://nlp.johnsnowlabs.com/2024/03/28/meddra_hlt_pt_mapper_en.html)
| 5| [icd10_meddra_llt_mapper](https://nlp.johnsnowlabs.com/2024/03/14/icd10_meddra_llt_mapper_en.html)
| 6| [meddra_llt_icd10_mapper](https://nlp.johnsnowlabs.com/2024/03/14/meddra_llt_icd10_mapper_en.html)
| 7| [meddra_llt_pt_mapper](https://nlp.johnsnowlabs.com/2024/03/18/meddra_llt_pt_mapper_en.html)
| 8| [meddra_pt_llt_mapper](https://nlp.johnsnowlabs.com/2024/03/18/meddra_pt_llt_mapper_en.html)
| 9| [meddra_pt_icd10_mapper](https://nlp.johnsnowlabs.com/2024/03/15/meddra_pt_icd10_mapper_en.html)
| 10| [icd10_meddra_pt_mapper](https://nlp.johnsnowlabs.com/2024/03/15/icd10_meddra_pt_mapper_en.html)  


# sbiobertresolve_meddra_preferred_term

In [78]:
documentAssembler = DocumentAssembler()\
      .setInputCol("text")\
      .setOutputCol("document")

sentenceDetector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models")\
      .setInputCols(["document"])\
      .setOutputCol("sentence")

tokenizer = Tokenizer()\
      .setInputCols(["sentence"])\
      .setOutputCol("token")

word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical","en", "clinical/models")\
      .setInputCols(["sentence", "token"])\
      .setOutputCol("embeddings")

ner_jsl = MedicalNerModel.pretrained("ner_jsl", "en", "clinical/models")\
      .setInputCols(["sentence", "token", "embeddings"])\
      .setOutputCol("ner_jsl")

ner_jsl_converter = NerConverter()\
      .setInputCols(["sentence", "token", "ner_jsl"])\
      .setOutputCol("ner_jsl_chunk")\
      .setWhiteList(["Procedure","Kidney_Disease","Cerebrovascular_Disease","Heart_Disease",
                     "Disease_Syndrome_Disorder", "Symptom", "VS_Finding",
                     "EKG_Findings", "Communicable_Disease",
                     "Internal_organ_or_component","External_body_part_or_region",
                     "Triglycerides","Alcohol","Smoking","Pregnancy","Hypertension","Obesity",
                     "Injury_or_Poisoning","Test","Hyperlipidemia","Oncological",
                     "Psychological_Condition","LDL","Diabetes"])


ner_ade_clinical = MedicalNerModel.pretrained("ner_ade_clinical", "en", "clinical/models")\
    .setInputCols(["sentence", "token", "embeddings"])\
    .setOutputCol("ade_clinica_ner")

ner_ade_clinical_converter = NerConverterInternal()\
      .setInputCols(["sentence", "token", "ade_clinica_ner"])\
      .setOutputCol("ner_ade_clinical_chunk")\
      .setWhiteList(["ADE"])

chunk_merger = ChunkMergeApproach()\
    .setInputCols('ner_ade_clinical_chunk',"ner_jsl_chunk")\
    .setOutputCol('merged_ner_chunk')

chunk2doc = Chunk2Doc() \
      .setInputCols("merged_ner_chunk") \
      .setOutputCol("ner_chunk_doc")

sbert_embedder = BertSentenceEmbeddings.pretrained("sbiobert_base_cased_mli","en","clinical/models")\
     .setInputCols(["ner_chunk_doc"])\
     .setOutputCol("sbert_embeddings")\
     .setCaseSensitive(False)

meddra_resolver = SentenceEntityResolverModel.load("/content/sbiobertresolve_meddra_preferred_term") \
     .setInputCols(["sbert_embeddings"]) \
     .setOutputCol("meddra_pt_code")\
     .setDistanceFunction("EUCLIDEAN")

nlpPipeline= Pipeline(stages=[
                              documentAssembler,
                              sentenceDetector,
                              tokenizer,
                              word_embeddings,
                              ner_jsl,
                              ner_jsl_converter,
                              ner_ade_clinical,
                              ner_ade_clinical_converter,
                              chunk_merger,
                              chunk2doc,
                              sbert_embedder,
                              meddra_resolver
])

sentence_detector_dl_healthcare download started this may take some time.
Approximate size to download 367.3 KB
[OK!]
embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[OK!]
ner_jsl download started this may take some time.
[OK!]
ner_ade_clinical download started this may take some time.
[OK!]
sbiobert_base_cased_mli download started this may take some time.
Approximate size to download 384.3 MB
[OK!]


In [85]:
text_list = [
    """This is an 82-year-old male with a history of prior tobacco use, benign hypertension, chronic renal insufficiency, chronic bronchitis, gastritis, and ischemic attack. He initially presented to Braintree with ST elevation and was transferred to St. Margaret’s Center. He underwent cardiac catheterization because of the left main coronary artery stenosis, which was complicated by hypotension and bradycardia. We describe the side effects of 5-FU in a colon cancer patient who suffered mucositis and dermatitis.""",
    """We describe the side effects of 5-FU in a colon cancer patient who suffered mucositis, dermatitis, prolonged myelosuppression, and neurologic toxicity that required admission to the intensive care unit. Anterior lumbosacral radiculopathy after intrathecal methotrexate treatment and acute erythroid leukemia after cyclophosphamide therapy for plasma cell myeloma: report of two cases.""",
    """ I've been using Arthrotec 50 on and off for over 10 years, only taking it when necessary. However, I have begun experiencing dizziness and tinnitus due to this medication. Due to worsening arthritis, I started taking it twice a day at age 50, which led to gastritis. I had to take it daily for the next month to assess its effects, and that's been the extent of my usage.""",
    ]

In [86]:
empty_df = spark.createDataFrame([['']]).toDF("text")

pipelineModel = nlpPipeline.fit(empty_df)

In [87]:
df = spark.createDataFrame(pd.DataFrame({"text": text_list}))
result = pipelineModel.transform(df)
result.select(F.explode(F.arrays_zip(result.merged_ner_chunk.result,
                                     result.merged_ner_chunk.begin,
                                     result.merged_ner_chunk.end,
                                     result.merged_ner_chunk.metadata,
                                     result.meddra_pt_code.result,
                                     result.meddra_pt_code.metadata)).alias("cols"))\
        .select(F.expr("cols['0']").alias("ner_chunk"),
                F.expr("cols['1']").alias("begin"),
                F.expr("cols['2']").alias("end"),
                F.expr("cols['3'].entity").alias("entity"),
                F.expr("cols['4']").alias("meddra_pt_code"),
                F.expr("cols['5']['resolved_text']").alias("resolved_text"),
                F.expr("cols['5']['all_k_results']").alias("all_k_results"),
                F.expr("cols['5']['all_k_resolutions']").alias("all_k_resolutions")).show(60,truncate=False)

+----------------------------------+-----+---+-------------------------+--------------+-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [88]:
from sparknlp_display import EntityResolverVisualizer

er_vis = EntityResolverVisualizer()

light_model = LightPipeline(pipelineModel)
lresult = light_model.fullAnnotate(text_list[0])

er_vis.display(lresult[0],
               label_col='merged_ner_chunk',
               resolution_col = 'meddra_pt_code',
               document_col='document'
               )

# sbiobertresolve_meddra_lowest_level_term

In [50]:
documentAssembler = DocumentAssembler()\
      .setInputCol("text")\
      .setOutputCol("document")

sentenceDetector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models")\
      .setInputCols(["document"])\
      .setOutputCol("sentence")

tokenizer = Tokenizer()\
      .setInputCols(["sentence"])\
      .setOutputCol("token")

word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical","en", "clinical/models")\
      .setInputCols(["sentence", "token"])\
      .setOutputCol("embeddings")

ner_jsl = MedicalNerModel.pretrained("ner_jsl", "en", "clinical/models")\
      .setInputCols(["sentence", "token", "embeddings"])\
      .setOutputCol("ner_jsl")

ner_jsl_converter = NerConverter()\
      .setInputCols(["sentence", "token", "ner_jsl"])\
      .setOutputCol("ner_jsl_chunk")\
      .setWhiteList(["Procedure","Kidney_Disease","Cerebrovascular_Disease","Heart_Disease",
                     "Disease_Syndrome_Disorder", "Symptom", "VS_Finding",
                     "EKG_Findings", "Communicable_Disease",
                     "Internal_organ_or_component","External_body_part_or_region",
                     "Triglycerides","Alcohol","Smoking","Pregnancy","Hypertension","Obesity",
                     "Injury_or_Poisoning","Test","Hyperlipidemia","Oncological",
                     "Psychological_Condition","LDL","Diabetes"])

ner_ade_clinical = MedicalNerModel.pretrained("ner_ade_clinical", "en", "clinical/models")\
    .setInputCols(["sentence", "token", "embeddings"])\
    .setOutputCol("ade_clinica_ner")

ner_ade_clinical_converter = NerConverterInternal()\
      .setInputCols(["sentence", "token", "ade_clinica_ner"])\
      .setOutputCol("ner_ade_clinical_chunk")\
      .setWhiteList(["ADE"])

chunk_merger = ChunkMergeApproach()\
    .setInputCols('ner_ade_clinical_chunk',"ner_jsl_chunk")\
    .setOutputCol('merged_ner_chunk')

chunk2doc = Chunk2Doc() \
      .setInputCols("merged_ner_chunk") \
      .setOutputCol("ner_chunk_doc")

sbert_embedder = BertSentenceEmbeddings.pretrained("sbiobert_base_cased_mli","en","clinical/models")\
     .setInputCols(["ner_chunk_doc"])\
     .setOutputCol("sbert_embeddings")\
     .setCaseSensitive(False)

meddra_resolver = SentenceEntityResolverModel.load("/content/sbiobertresolve_meddra_lowest_level_term") \
     .setInputCols(["sbert_embeddings"]) \
     .setOutputCol("meddra_llt_code")\
     .setDistanceFunction("EUCLIDEAN")

nlpPipeline= Pipeline(stages=[
                              documentAssembler,
                              sentenceDetector,
                              tokenizer,
                              word_embeddings,
                              ner_jsl,
                              ner_jsl_converter,
                              ner_ade_clinical,
                              ner_ade_clinical_converter,
                              chunk_merger,
                              chunk2doc,
                              sbert_embedder,
                              meddra_resolver
])

text= """This is an 82-year-old male with a history of prior tobacco use, benign hypertension, chronic renal insufficiency, chronic bronchitis, gastritis, and ischemic attack. He initially presented to Braintree with ST elevation and was transferred to St. Margaret’s Center. He underwent cardiac catheterization because of the left main coronary artery stenosis, which was complicated by hypotension and bradycardia. We describe the side effects of 5-FU in a colon cancer patient who suffered mucositis and dermatitis."""

df= spark.createDataFrame([[text]]).toDF("text")

resolver_pipeline= nlpPipeline.fit(df)
result = resolver_pipeline.transform(df)

sentence_detector_dl_healthcare download started this may take some time.
Approximate size to download 367.3 KB
[OK!]
embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[OK!]
ner_jsl download started this may take some time.
[OK!]
ner_ade_clinical download started this may take some time.
[OK!]
sbiobert_base_cased_mli download started this may take some time.
Approximate size to download 384.3 MB
[OK!]


In [51]:
result.select(F.explode(F.arrays_zip(result.merged_ner_chunk.result,
                                     result.merged_ner_chunk.begin,
                                     result.merged_ner_chunk.end,
                                     result.merged_ner_chunk.metadata,
                                     result.meddra_llt_code.result,
                                     result.meddra_llt_code.metadata)).alias("cols"))\
        .select(F.expr("cols['0']").alias("ner_chunk"),
                F.expr("cols['1']").alias("begin"),
                F.expr("cols['2']").alias("end"),
                F.expr("cols['3'].entity").alias("entity"),
                F.expr("cols['4']").alias("meddra_llt_code"),
                F.expr("cols['5']['resolved_text']").alias("resolution"),
                F.expr("cols['5']['all_k_results']").alias("all_k_results"),
                F.expr("cols['5']['all_k_resolutions']").alias("all_k_resolutions")).show(truncate=60)

+----------------------------------+-----+---+-------------------------+---------------+----------------------------------+------------------------------------------------------------+------------------------------------------------------------+
|                         ner_chunk|begin|end|                   entity|meddra_llt_code|                        resolution|                                               all_k_results|                                           all_k_resolutions|
+----------------------------------+-----+---+-------------------------+---------------+----------------------------------+------------------------------------------------------------+------------------------------------------------------------+
|                           tobacco|   52| 58|                  Smoking|       10067622|               tobacco interaction|10067622:::10086359:::10057581:::10082288:::10009180:::10...|tobacco interaction:::tobaccoism:::tobacco user:::exposur...|
|               

# meddra_llt_pt_mapper

In [None]:
document_assembler = DocumentAssembler()\
      .setInputCol('text')\
      .setOutputCol('doc')

chunk_assembler = Doc2Chunk()\
      .setInputCols(['doc'])\
      .setOutputCol('chunk')

mapperModel = ChunkMapperModel.load('/content/meddra_llt_pt_mapper')\
    .setInputCols(["chunk"])\
    .setOutputCol("mappings")\
    .setRels(["pt_code"])


pipeline = Pipeline(stages=[
    document_assembler,
    chunk_assembler,
    mapperModel
])

In [None]:
from pyspark.sql.types import StringType, IntegerType

texts=["""10002442""", """10000007""", """10003696"""]

data = spark.createDataFrame(texts, StringType()).toDF("text")

mapper_model = pipeline.fit(data)

result = mapper_model.transform(data)

In [None]:
result.select(F.explode(F.arrays_zip(result.chunk.result,
                                            result.mappings.result,
                                            result.mappings.metadata)).alias("cols"))\
    .select(F.expr("cols['0']").alias("llt_code"),
            F.expr("cols['1']").alias("pt_code"),
            F.expr("cols['2']['all_k_resolutions']").alias("all_k_resolutions")
           ).show()


+--------+--------------------+--------------------+
|llt_code|             pt_code|   all_k_resolutions|
+--------+--------------------+--------------------+
|10002442|10002442:Angiogra...|10002442:Angiogra...|
|10000007|10000007:17 ketos...|10000007:17 ketos...|
|10003696|10001324:Adrenal ...|10001324:Adrenal ...|
+--------+--------------------+--------------------+

