In [22]:
import json
import os

from google.colab import files

# Obtain the license file: reuse a copy already present in the working
# directory, otherwise ask for one through the Colab upload widget.
if 'spark_jsl.json' not in os.listdir():
    uploaded = files.upload()
    if not uploaded:
        # An upload that yields no files would otherwise surface as an opaque
        # IndexError on the first-key lookup below.
        raise FileNotFoundError(
            "No license file was uploaded; a license JSON is required to continue."
        )
    # Normalise whatever name the file was uploaded under.
    os.rename(next(iter(uploaded)), 'spark_jsl.json')

with open('spark_jsl.json') as f:
    license_keys = json.load(f)

# Expose every license entry as a notebook-level variable and as an environment
# variable; later shell lines reference $PUBLIC_VERSION, $JSL_VERSION and $SECRET.
# globals() is used explicitly: at cell (module) scope it is the same namespace
# locals() returns, but writing through locals() is not a documented contract.
globals().update(license_keys)
os.environ.update(license_keys)

In [23]:
# Install pyspark (pinned to 3.5.1) and the spark-nlp release named by
# PUBLIC_VERSION (presumably supplied by the license JSON loaded earlier).
! pip install --upgrade -q pyspark==3.5.1 spark-nlp==$PUBLIC_VERSION


# Install Spark NLP Healthcare (version JSL_VERSION) from the John Snow Labs
# package index. SECRET is part of the index URL, so do not echo this command
# or share its output.
! pip install --upgrade -q spark-nlp-jsl==$JSL_VERSION  --extra-index-url https://pypi.johnsnowlabs.com/$SECRET

# Install the Spark NLP Display library used for visualization.
# NOTE(review): unlike the installs above, this package is not version-pinned.
! pip install -q spark-nlp-display

In [24]:
import json
import os

import sparknlp
import sparknlp_jsl

from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from sparknlp_jsl.pipeline_tracer import PipelineTracer
from sparknlp_jsl.pipeline_output_parser import PipelineOutputParser

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml import Pipeline,PipelineModel


import pandas as pd
pd.set_option('display.max_colwidth', 200)

import warnings
warnings.filterwarnings('ignore')

# Report the installed library versions before the session is created.
print("Spark NLP Version :", sparknlp.version())
print("Spark NLP_JSL Version :", sparknlp_jsl.version())

# Spark configuration overrides handed to the licensed session starter.
params = {
    "spark.driver.memory": "16G",
    "spark.kryoserializer.buffer.max": "2000M",
    "spark.driver.maxResultSize": "2000M",
}

# Start the session using the SECRET entry of the license file.
spark = sparknlp_jsl.start(license_keys['SECRET'], params=params)

# Bare expression so the notebook displays the session object.
spark

Spark NLP Version : 6.0.2
Spark NLP_JSL Version : 6.0.2


In [4]:
# Mount Google Drive into the Colab filesystem at /content/drive so files
# stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Load a MedicalNerModel from the mounted Drive folder.
# NOTE(review): the saved model's input/output column names and the embeddings
# it expects are not visible in this notebook; confirm before reusing it.
loaded_ner_model = MedicalNerModel.load("/content/drive/MyDrive/saved_ner_model")

In [50]:
# Download mt_samples.csv from the spark-nlp-workshop repository into the
# working directory.
# NOTE(review): the URL tracks the master branch, so the file may change between runs.
!wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/refs/heads/master/tutorials/Certification_Trainings/Healthcare/data/mt_samples.csv


In [51]:
# Read the downloaded sample file into a pandas DataFrame.
mt_samples_pandas = pd.read_csv('mt_samples.csv')

In [52]:

# Convert the pandas frame into a Spark DataFrame for the pipeline below.
my_data_spark = spark.createDataFrame(mt_samples_pandas)

In [54]:
# Preview the Spark DataFrame; with no arguments show() prints the top 20 rows
# and truncates long cell values.
my_data_spark.show()

+--------------------+
|                text|
+--------------------+
|Sample Type / Med...|
|Sample Type / Med...|
|Sample Type / Med...|
|Sample Type / Med...|
|Sample Type / Med...|
|Sample Type / Med...|
|Sample Type / Med...|
|Sample Type / Med...|
|Sample Type / Med...|
|Sample Type / Med...|
|Sample Type / Med...|
|Sample Type / Med...|
|Sample Type / Med...|
|Sample Type / Med...|
|Sample Type / Med...|
|Sample Type / Med...|
|Sample Type / Med...|
|Sample Type / Med...|
|Sample Type / Med...|
|Sample Type / Med...|
+--------------------+
only showing top 20 rows



In [56]:
# Clinical NER pipeline: text -> document -> sentence -> token -> embeddings
# -> token-level NER tags -> entity chunks.
# Every chain is wrapped in parentheses instead of using backslash
# continuations; the original left a dangling backslash after
# `.setOutputCol("ner")` that only parsed because a blank line followed it.

# Annotator that transforms a text column from dataframe into an Annotation ready for NLP
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Pretrained sentence splitter loaded from the clinical/models repository
sentenceDetector = (
    SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare", "en", "clinical/models")
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# Tokenizer splits words in a relevant format for NLP
tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Clinical word embeddings trained on PubMED dataset
word_embeddings = (
    WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
)

# Pretrained clinical NER tagger; writes its tags to the "ner" column
clinical_ner = (
    MedicalNerModel.pretrained("ner_clinical", "en", "clinical/models")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner")
)

# Converts the tags in "ner" into entity chunks
clinical_ner_converter = (
    NerConverterInternal()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("clinical_ner_chunk")
)

# NOTE(review): loaded_ner_model runs last, with whatever input/output columns
# it was saved with. No converter in this pipeline consumes its output, and if
# its output column is also "ner" it may shadow clinical_ner's column.
# Confirm the intent; it likely needs its own output column and converter.
nlpPipeline = Pipeline(
    stages=[
        documentAssembler,
        sentenceDetector,
        tokenizer,
        word_embeddings,
        clinical_ner,
        clinical_ner_converter,
        loaded_ner_model,
    ]
)

model = nlpPipeline.fit(my_data_spark)






sentence_detector_dl_healthcare download started this may take some time.
Approximate size to download 367.3 KB
[OK!]
embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[OK!]
ner_clinical download started this may take some time.
[OK!]


In [62]:
# Annotate a single clinical note with LightPipeline.fullAnnotate and tabulate
# the chunks found in the 'clinical_ner_chunk' output.

text = '''
A 45-year-old male with a history of hypertension diagnosed five years prior to presentation and subsequent chronic kidney disease (CKD), one prior episode of myocardial infarction two years prior to presentation, and associated with an acute bronchitis, presented with a two-day history of chest pain, shortness of breath, and dizziness.
He was on lisinopril, amlodipine, and furosemide for CKD and atorvastatin and clopidogrel for heart disease. He had been on amlodipine for one year at the time of presentation.
Physical examination on presentation was significant for elevated blood pressure; notably, his cardiac examination revealed irregular heartbeat with no murmurs or gallops. Pertinent laboratory findings on admission were: serum creatinine 1.8 mg/dL, hemoglobin 10.2 g/dL, potassium 5.1 mmol/L, ECG showing ST elevation, and troponin I 0.9 ng/mL.
'''

print(text)

light_model = LightPipeline(model)
light_result = light_model.fullAnnotate(text)

# Chunk annotations for the first (and only) annotated text.
ner_chunks = light_result[0]['clinical_ner_chunk']

# One list per output column, each built in a single pass over the chunks.
chunks = [ann.result for ann in ner_chunks]
begin = [ann.begin for ann in ner_chunks]
end = [ann.end for ann in ner_chunks]
entities = [ann.metadata['entity'] for ann in ner_chunks]
sentence = [ann.metadata['sentence'] for ann in ner_chunks]
confidence = [ann.metadata["confidence"] for ann in ner_chunks]

df_clinical = pd.DataFrame(
    {
        'chunks': chunks,
        'begin': begin,
        'end': end,
        'sentence_id': sentence,
        'entities': entities,
        'confidence': confidence,
    }
)

df_clinical.head(20)


A 45-year-old male with a history of hypertension diagnosed five years prior to presentation and subsequent chronic kidney disease (CKD), one prior episode of myocardial infarction two years prior to presentation, and associated with an acute bronchitis, presented with a two-day history of chest pain, shortness of breath, and dizziness.
He was on lisinopril, amlodipine, and furosemide for CKD and atorvastatin and clopidogrel for heart disease. He had been on amlodipine for one year at the time of presentation.
Physical examination on presentation was significant for elevated blood pressure; notably, his cardiac examination revealed irregular heartbeat with no murmurs or gallops. Pertinent laboratory findings on admission were: serum creatinine 1.8 mg/dL, hemoglobin 10.2 g/dL, potassium 5.1 mmol/L, ECG showing ST elevation, and troponin I 0.9 ng/mL.



Unnamed: 0,chunks,begin,end,sentence_id,entities,confidence
0,hypertension,38,49,0,PROBLEM,0.9852
1,subsequent chronic kidney disease,98,130,0,PROBLEM,0.86320007
2,CKD,133,135,0,PROBLEM,0.9874
3,myocardial infarction,160,180,0,PROBLEM,0.98899996
4,an acute bronchitis,235,253,0,PROBLEM,0.94696665
5,chest pain,292,301,0,PROBLEM,0.98749995
6,shortness of breath,304,322,0,PROBLEM,0.8389
7,dizziness,329,337,0,PROBLEM,0.9946
8,lisinopril,350,359,1,TREATMENT,0.9999
9,amlodipine,362,371,1,TREATMENT,0.9999


In [63]:
from sparknlp_display import NerVisualizer

# Visualise the chunks from the annotated result over the document text.
# NOTE(review): save_path presumably also writes the rendered HTML to that file;
# confirm against the sparknlp_display documentation.
visualiser = NerVisualizer()

visualiser.display(
    light_result[0],
    label_col='clinical_ner_chunk',
    document_col='document',
    save_path="display_result.html",
)