![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

# Serving Spark NLP via API: SynapseML

# SynapseML

## Installation

In [None]:
import json
import os
from google.colab import files

# Prompt for the license JSON file through the Colab upload widget.
uploaded_files = files.upload()

# The upload result is keyed by file name; the first uploaded file is read
# as the license JSON.
license_filename = list(uploaded_files.keys())[0]
with open(license_filename) as license_file:
    license_keys = json.load(license_file)

# Defining license key-value pairs as notebook-level variables
# (later cells read SECRET, JSL_VERSION and PUBLIC_VERSION).
# globals() is used explicitly: updating the mapping returned by locals()
# is only reliable when it happens to be the module namespace.
globals().update(license_keys)

# Adding license key-value pairs to environment variables
os.environ.update(license_keys)

In [None]:
# Installing pyspark and spark-nlp
# pyspark is pinned to 3.2.0; the spark-nlp version is taken from PUBLIC_VERSION
# (presumably one of the keys of the license JSON loaded earlier in the notebook).
! pip install --upgrade -q pyspark==3.2.0 spark-nlp==$PUBLIC_VERSION

# Installing Spark NLP Healthcare
# The package is fetched from the John Snow Labs index; SECRET is part of the index URL,
# so avoid sharing any output that echoes this command.
! pip install --upgrade -q spark-nlp-jsl==$JSL_VERSION  --extra-index-url https://pypi.johnsnowlabs.com/$SECRET

# requests is the HTTP client used later in the notebook to call the served pipeline.
! pip -q install requests


## Imports and Spark Session

In [3]:
import pandas as pd
import pyspark
import sparknlp
import sparknlp_jsl
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline, PipelineModel
import pyspark.sql.functions as F
from pyspark.sql.types import *
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from sparknlp.training import *
from sparknlp.training import CoNLL
import time
import requests
import uuid
import json
import requests
from concurrent.futures import ThreadPoolExecutor


In [4]:
# Spark settings, applied in the order listed. The Maven coordinates and the
# licensed jar URL are assembled from PUBLIC_VERSION, SECRET and JSL_VERSION.
spark_settings = {
    "spark.driver.memory": "16G",
    "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
    "spark.kryoserializer.buffer.max": "2000M",
    "spark.jars.packages": "com.microsoft.azure:synapseml_2.12:0.9.5,com.johnsnowlabs.nlp:spark-nlp-spark32_2.12:" + PUBLIC_VERSION,
    "spark.jars": "https://pypi.johnsnowlabs.com/" + SECRET + "/spark-nlp-jsl-" + JSL_VERSION + "-spark32.jar",
    "spark.jars.repositories": "https://mmlspark.azureedge.net/maven",
}

# Local session named "Spark" that uses every available core.
session_builder = SparkSession.builder.appName("Spark").master("local[*]")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.getOrCreate()

In [5]:
# Print the version reported by each library, one per line:
# first sparknlp, then sparknlp_jsl.
for nlp_library in (sparknlp, sparknlp_jsl):
    print(nlp_library.version())

3.4.2
3.5.0


In [6]:
# Leave the session object as the cell's last expression so the notebook renders it.
spark

In [7]:
import synapse.ml
from synapse.ml.io import *

## Preparing a pipeline with Entity Resolution

In [8]:
# Entry stage: turns the raw "text" column into a "document" annotation
document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Pretrained healthcare sentence detector: reads "document", writes "sentence"
sentenceDetectorDL = (
    SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare", "en", 'clinical/models')
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# Tokenizer: reads "sentence", writes "token"
tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Pretrained "embeddings_clinical" word embeddings; a large model that has to be downloaded
word_embeddings = (
    WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
    .setInputCols(["sentence", "token"])
    .setOutputCol("word_embeddings")
)

# Clinical named entity recognition over sentences, tokens and their embeddings
clinical_ner = (
    MedicalNerModel.pretrained("ner_clinical", "en", "clinical/models")
    .setInputCols(["sentence", "token", "word_embeddings"])
    .setOutputCol("ner")
)

# Converts NER output into "ner_chunk", whitelisting only the PROBLEM label
ner_converter_icd = (
    NerConverterInternal()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("ner_chunk")
    .setWhiteList(['PROBLEM'])
    .setPreservePosition(False)
)

# Converts "ner_chunk" into "ner_chunk_doc", the input column of the sentence embedder
c2doc = (
    Chunk2Doc()
    .setInputCols("ner_chunk")
    .setOutputCol("ner_chunk_doc")
)

# Pretrained BERT sentence embeddings computed over "ner_chunk_doc"
sbert_embedder = (
    BertSentenceEmbeddings.pretrained('sbiobert_base_cased_mli', 'en','clinical/models')
    .setInputCols(["ner_chunk_doc"])
    .setOutputCol("sentence_embeddings")
    .setCaseSensitive(False)
)

# Pretrained ICD-10-CM resolver over chunks and their embeddings (Euclidean distance)
icd_resolver = (
    SentenceEntityResolverModel.pretrained("sbiobertresolve_icd10cm_augmented_billable_hcc","en", "clinical/models")
    .setInputCols(["ner_chunk", "sentence_embeddings"])
    .setOutputCol("icd10cm_code")
    .setDistanceFunction("EUCLIDEAN")
)
    

# Stages in execution order: each one consumes columns produced by earlier stages
resolver_stages = [
    document_assembler,
    sentenceDetectorDL,
    tokenizer,
    word_embeddings,
    clinical_ner,
    ner_converter_icd,
    c2doc,
    sbert_embedder,
    icd_resolver,
]
resolver_pipeline = Pipeline(stages=resolver_stages)

# Fit on a one-row DataFrame holding an empty string in "text"
# (presumably enough because the models above are pretrained - TODO confirm)
empty_data = spark.createDataFrame([['']]).toDF("text")

resolver_p_model = resolver_pipeline.fit(empty_data)

sentence_detector_dl_healthcare download started this may take some time.
Approximate size to download 367.3 KB
[OK!]
embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[OK!]
ner_clinical download started this may take some time.
Approximate size to download 13.9 MB
[OK!]
sbiobert_base_cased_mli download started this may take some time.
Approximate size to download 384.3 MB
[OK!]
sbiobertresolve_icd10cm_augmented_billable_hcc download started this may take some time.
Approximate size to download 1.1 GB
[OK!]


## Adding a clinical note as a text example

In [9]:
# Example clinical note used as the request text. The literal is kept verbatim,
# including its line breaks and indentation, because it is sent to the pipeline as-is.
clinical_note = """A 28-year-old female with a history of gestational diabetes mellitus diagnosed eight years 
    prior to presentation and subsequent type two diabetes mellitus (T2DM), one prior 
    episode of HTG-induced pancreatitis three years prior to presentation, associated 
    with an acute hepatitis, and obesity with a body mass index (BMI) of 33.5 kg/m2, 
    presented with a one-week history of polyuria, polydipsia, poor appetite, and vomiting. 
    Two weeks prior to presentation, she was treated with a five-day course of amoxicillin 
    for a respiratory tract infection. She was on metformin, glipizide, and dapagliflozin 
    for T2DM and atorvastatin and gemfibrozil for HTG. She had been on dapagliflozin for six months 
    at the time of presentation. Physical examination on presentation was significant for dry oral mucosa; 
    significantly, her abdominal examination was benign with no tenderness, guarding, or rigidity. Pertinent 
    laboratory findings on admission were: serum glucose 111 mg/dl, bicarbonate 18 mmol/l, anion gap 20, 
    creatinine 0.4 mg/dL, triglycerides 508 mg/dL, total cholesterol 122 mg/dL, glycated hemoglobin (HbA1c) 
    10%, and venous pH 7.27. Serum lipase was normal at 43 U/L. Serum acetone levels could not be assessed 
    as blood samples kept hemolyzing due to significant lipemia. The patient was initially admitted for 
    starvation ketosis, as she reported poor oral intake for three days prior to admission. However, 
    serum chemistry obtained six hours after presentation revealed her glucose was 186 mg/dL, the anion gap 
    was still elevated at 21, serum bicarbonate was 16 mmol/L, triglyceride level peaked at 2050 mg/dL, and 
    lipase was 52 U/L. The β-hydroxybutyrate level was obtained and found to be elevated at 5.29 mmol/L - 
    the original sample was centrifuged and the chylomicron layer removed prior to analysis due to 
    interference from turbidity caused by lipemia again. The patient was treated with an insulin drip 
    for euDKA and HTG with a reduction in the anion gap to 13 and triglycerides to 1400 mg/dL, within 
    24 hours. Her euDKA was thought to be precipitated by her respiratory tract infection in the setting 
    of SGLT2 inhibitor use. The patient was seen by the endocrinology service and she was discharged on 
    40 units of insulin glargine at night, 12 units of insulin lispro with meals, and metformin 1000 mg 
    two times a day. It was determined that all SGLT2 inhibitors should be discontinued indefinitely. She 
    had close follow-up with endocrinology post discharge."""


# One-row DataFrame with a single "text" column holding the note; its schema is
# what the serving cell passes to parseRequest to decode incoming requests.
data = spark.createDataFrame([[clinical_note]]).toDF("text")

## Creating a JSON payload with the clinical note
SynapseML runs a web service that accepts HTTP calls with a JSON body, so the clinical note is wrapped in a dictionary that can be serialized to JSON.

In [10]:
# Request payload: a single "text" field carrying the clinical note.
data_json = dict(text=clinical_note)

## Running a Synapse server

In [11]:
# Streaming source: HTTP server on localhost:9999 registered as "benchmark_api";
# request bodies are parsed with the schema of the example DataFrame.
serving_input = (
    spark.readStream.server()
    .address("localhost", 9999, "benchmark_api")
    .option("name", "benchmark_api")
    .load()
    .parseRequest("benchmark_api", data.schema)
)

# Run the fitted resolver pipeline on each request and reply with the "icd10cm_code" column.
resolved_requests = resolver_p_model.transform(serving_input)
serving_output = resolved_requests.makeReply("icd10cm_code")

# Each start gets its own checkpoint directory under /tmp.
checkpoint_location = "file:///tmp/checkpoints-{}".format(uuid.uuid1())

# Streaming sink: send the replies back through "benchmark_api" and start the query.
server = (
    serving_output.writeStream
    .server()
    .replyTo("benchmark_api")
    .queryName("benchmark_query")
    .option("checkpointLocation", checkpoint_location)
    .start()
)



In [None]:
# Upper bound, in seconds, on how long a single request may wait for the server.
# Generous on purpose: the pipeline may be slow to answer (tune as needed).
POST_TIMEOUT_SECONDS = 600


def post_url(args):
    """Send one POST request and return the response.

    Args:
        args: Indexable ``(url, body, call_number)``. ``url`` is the endpoint,
            ``body`` is the already-serialized request body, and
            ``call_number`` is only used to label the progress messages.

    Returns:
        The object returned by ``requests.post`` for this call.

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            ``POST_TIMEOUT_SECONDS``. Without a timeout a stalled server
            would block the calling thread forever.
    """
    url, body, call_number = args[0], args[1], args[2]
    print(f"- Request {str(call_number)} launched!")
    res = requests.post(url, data=body, timeout=POST_TIMEOUT_SECONDS)
    print(f"**Response {str(call_number)} received**")
    return res

# One tuple per call: (endpoint URL, JSON-encoded body, call number).
# Append more tuples to list_of_urls to send several calls in parallel.
endpoint_url = "http://localhost:9999/benchmark_api"
request_body = json.dumps(data_json)
list_of_urls = [(endpoint_url, request_body, 0)]

# Dispatch the calls on a thread pool and collect the responses in input order.
with ThreadPoolExecutor() as executor:
    response_list = [response for response in executor.map(post_url, list_of_urls)]

## Checking Results

In [14]:
# Decode the first response once (the original re-parsed the JSON body on every
# iteration) and print the 'result' field of each returned item.
first_response_items = response_list[0].json()
for item in first_response_items:
  print(item['result'])

O2441
O2411
E11
K8520
B15
E669
Z6841
R35
R631
R630
R111
J988
E11
G600
K130
R52
M6283
R4689
O046
E785
E872
E639
H5330
R799
R829
E785
A832
G600
J988
