In [2]:
import json
import os

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import sparknlp
import sparknlp_jsl

from sparknlp.base import *
from sparknlp.common import *
from sparknlp.training import *
from sparknlp.annotator import *

from sparknlp_jsl.base import *
from sparknlp_jsl.annotator import *

from pyspark.ml import Pipeline
import pyspark.sql.types as T
import pyspark.sql.functions as F

import pandas as pd

# Spark session settings; adjust these values to suit your machine.
params = {
    "spark.executor.cores": 12,
    "spark.driver.memory": "40G",
    "spark.driver.maxResultSize": "5G",
    "spark.kryoserializer.buffer.max": "2000M",
    "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
}

# Prerequisite: the "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY",
# "SPARK_NLP_LICENSE" and "SECRET" variables must already be set in your
# environment. Only "SECRET" is read explicitly in this cell; if it is
# missing, the os.environ lookup raises KeyError.
spark = sparknlp_jsl.start(os.environ["SECRET"], params=params)

# Show which library versions this session is running.
print("Spark NLP Version :", sparknlp.version())
print("Spark NLP_JSL Version :", sparknlp_jsl.version())

# Keep Spark's own logging down to errors for the rest of the notebook.
spark.sparkContext.setLogLevel("ERROR")

# Bare expression so the notebook renders the session summary as cell output.
spark

Spark NLP Version : 5.1.2
Spark NLP_JSL Version : 5.1.2


In [3]:
# Entry stage: converts the dataframe's "text" column into document annotations
# that the downstream NLP stages can consume.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Pretrained healthcare sentence detector: document -> sentence
sentenceDetector = (
    SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare", "en", "clinical/models")
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# Tokenizer: splits each sentence into tokens
tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Clinical word embeddings (trained on the PubMED dataset)
word_embeddings = (
    WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
)

# Clinical NER model (trained on the i2b2 dataset, sampled from MIMIC).
# setLabelCasing picks whether the predicted tags come back in upper or lower case.
clinical_ner = (
    MedicalNerModel.pretrained("ner_clinical_large", "en", "clinical/models")
    .setInputCols(["sentence", "token", "ner"[:0] + "embeddings"])
    .setOutputCol("ner")
    .setLabelCasing("upper")
)

# Converts the per-token "ner" tags into entity chunks in "ner_chunk"
ner_converter = (
    NerConverterInternal()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("ner_chunk")
)

# Assemble the stages in dependency order: every stage reads columns
# produced by the stages listed before it.
nlpPipeline = Pipeline(stages=[
    documentAssembler,
    sentenceDetector,
    tokenizer,
    word_embeddings,
    clinical_ner,
    ner_converter,
])

# Fit on a one-row dataframe holding an empty string to obtain a usable model.
# NOTE(review): presumably no stage needs real training data because the
# learned stages are loaded pretrained -- confirm before adding trainable stages.
empty_data = spark.createDataFrame([[""]]).toDF("text")

model = nlpPipeline.fit(empty_data)

sentence_detector_dl_healthcare download started this may take some time.
Approximate size to download 367.3 KB
[ / ]sentence_detector_dl_healthcare download started this may take some time.
Approximate size to download 367.3 KB
Download done! Loading the resource.
[OK!]
embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[ / ]embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
Download done! Loading the resource.
[OK!]
ner_clinical_large download started this may take some time.
[ / ]ner_clinical_large download started this may take some time.
Approximate size to download 13.9 MB
[ \ ]Download done! Loading the resource.
[OK!]


In [4]:
# fullAnnotate in LightPipeline
# Annotate one in-memory string with the fitted pipeline (`model`) by wrapping
# it in a LightPipeline and calling fullAnnotate on it.

# Sample clinical note used as the input text. The literal begins and ends
# with a newline; the begin/end offsets in the recorded output of this cell
# count that leading newline (the first chunk starts at 40, not 39).
text = '''
A 28-year-old female with a history of gestational diabetes mellitus diagnosed eight years prior to presentation and subsequent type two diabetes mellitus (T2DM), one prior episode of HTG-induced pancreatitis three years prior to presentation, and associated with an acute hepatitis, presented with a one-week history of polyuria, poor appetite, and vomiting.
She was on metformin, glipizide, and dapagliflozin for T2DM and atorvastatin and gemfibrozil for HTG. She had been on dapagliflozin for six months at the time of presentation.
Physical examination on presentation was significant for dry oral mucosa ; significantly , her abdominal examination was benign with no tenderness, guarding, or rigidity. Pertinent laboratory findings on admission were: serum glucose 111 mg/dl,  creatinine 0.4 mg/dL, triglycerides 508 mg/dL, total cholesterol 122 mg/dL, and venous pH 7.27.
'''

# Echo the input so it appears next to the extracted entities in the output
print (text)

# Wrap the fitted pipeline model so it can be applied directly to Python strings
light_model = LightPipeline(model)

# The result is indexed below as light_result[0]['ner_chunk']: entry 0 is
# looked up by pipeline output column name.
light_result = light_model.fullAnnotate(text)


# Entity chunk annotations found in the first entry of the fullAnnotate result
ner_chunks = light_result[0]['ner_chunk']

# Pull each field of interest out of the annotations into its own list;
# all six lists share the annotation order, so row i describes chunk i.
chunks = [chunk.result for chunk in ner_chunks]
entities = [chunk.metadata['entity'] for chunk in ner_chunks]
sentence = [chunk.metadata['sentence'] for chunk in ner_chunks]
begin = [chunk.begin for chunk in ner_chunks]
end = [chunk.end for chunk in ner_chunks]
confidence = [chunk.metadata["confidence"] for chunk in ner_chunks]

# One row per chunk: its text, character span, sentence id, label and confidence
df_clinical = pd.DataFrame({
    'chunks': chunks,
    'begin': begin,
    'end': end,
    'sentence_id': sentence,
    'entities': entities,
    'confidence': confidence,
})

# Preview at most the first 20 rows as the cell's output
df_clinical.head(20)


A 28-year-old female with a history of gestational diabetes mellitus diagnosed eight years prior to presentation and subsequent type two diabetes mellitus (T2DM), one prior episode of HTG-induced pancreatitis three years prior to presentation, and associated with an acute hepatitis, presented with a one-week history of polyuria, poor appetite, and vomiting.
She was on metformin, glipizide, and dapagliflozin for T2DM and atorvastatin and gemfibrozil for HTG. She had been on dapagliflozin for six months at the time of presentation.
Physical examination on presentation was significant for dry oral mucosa ; significantly , her abdominal examination was benign with no tenderness, guarding, or rigidity. Pertinent laboratory findings on admission were: serum glucose 111 mg/dl,  creatinine 0.4 mg/dL, triglycerides 508 mg/dL, total cholesterol 122 mg/dL, and venous pH 7.27.



Unnamed: 0,chunks,begin,end,sentence_id,entities,confidence
0,gestational diabetes mellitus,40,68,0,PROBLEM,0.91976666
1,subsequent type two diabetes mellitus,118,154,0,PROBLEM,0.75924003
2,T2DM,157,160,0,PROBLEM,0.9917
3,HTG-induced pancreatitis,185,208,0,PROBLEM,0.97535
4,an acute hepatitis,265,282,0,PROBLEM,0.9440667
5,polyuria,322,329,0,PROBLEM,0.9728
6,poor appetite,332,344,0,PROBLEM,0.9934
7,vomiting,351,358,0,PROBLEM,0.9854
8,metformin,372,380,1,TREATMENT,0.9998
9,glipizide,383,391,1,TREATMENT,0.9999
