

![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/healthcare/SMOKING_STATUS.ipynb)

# **Colab Setup**

In [None]:
import json
import os

from google.colab import files

# Ask for a license upload only when 'spark_jsl.json' is not already present in
# the current working directory. The first key of the upload result is used as
# the source path for the rename (presumably the uploaded file name -- confirm
# against the google.colab `files.upload` docs).
if 'spark_jsl.json' not in os.listdir():
  license_keys = files.upload()
  os.rename(list(license_keys.keys())[0], 'spark_jsl.json')

# Re-bind `license_keys` to the parsed JSON content of the license file.
with open('spark_jsl.json') as f:
    license_keys = json.load(f)

# Defining license key-value pairs as local variables.
# At the top level of a cell, locals() is the notebook namespace, so every key
# of the JSON becomes a variable; the same pairs are also copied into the
# process environment.
locals().update(license_keys)
os.environ.update(license_keys)

In [None]:
# Installing pyspark and spark-nlp
# (pyspark is pinned to 3.1.2; $PUBLIC_VERSION is presumably one of the license
# keys exported by the setup cell -- TODO confirm the license JSON provides it)
! pip install --upgrade -q pyspark==3.1.2 spark-nlp==$PUBLIC_VERSION

# Installing Spark NLP Healthcare
# ($JSL_VERSION and $SECRET are presumably license keys as well; $SECRET forms
# the path of the extra package index URL)
! pip install --upgrade -q spark-nlp-jsl==$JSL_VERSION  --extra-index-url https://pypi.johnsnowlabs.com/$SECRET

In [5]:
import json
import os

from pyspark.ml import Pipeline, PipelineModel
from pyspark.sql import SparkSession

import sparknlp
import sparknlp_jsl

from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from sparknlp.base import *
from sparknlp.util import *
from sparknlp.pretrained import ResourceDownloader
from pyspark.sql import functions as F

import pandas as pd

# Show every column, keep wide frames on one line and never truncate cell text.
pd.set_option(
    'display.max_columns', None,
    'display.expand_frame_repr', False,
    'max_colwidth', None,
)

import string
import numpy as np

# Spark settings handed to the licensed Spark NLP session.
params = {
    "spark.driver.memory": "16G",
    "spark.kryoserializer.buffer.max": "2000M",
    "spark.driver.maxResultSize": "2000M",
}

# SECRET is presumably one of the license keys defined by the setup cell.
spark = sparknlp_jsl.start(secret=SECRET, params=params)

# Report the versions of both libraries that were actually loaded.
print("Spark NLP Version :", sparknlp.version())
print("Spark NLP_JSL Version :", sparknlp_jsl.version())

# Last expression of the cell: display the session object.
spark

Spark NLP Version : 4.2.4
Spark NLP_JSL Version : 4.2.3


## **Pipeline**

In [13]:
# Supported NER models mapped to (NER output column, assertion model name).
# The output column names are kept exactly as each original branch used them.
_SMOKING_NER_CONFIGS = {
    "ner_jsl_enriched": ("jsl_ner", "assertion_oncology_smoking_status_wip"),
    "ner_oncology": ("ner", "assertion_oncology_smoking_status_wip"),
    "ner_oncology_demographics": ("ner", "assertion_oncology_smoking_status_wip"),
    "jsl_ner_wip_clinical": ("ner", "assertion_jsl_large"),
    "ner_jsl": ("jsl_ner", "assertion_oncology_smoking_status_wip"),
    "jsl_ner_wip_modifier_clinical": ("jsl_ner", "assertion_jsl_large"),
    "jsl_rd_ner_wip_greedy_clinical": ("ner", "assertion_jsl_large"),
}


def model_pipeline(ner_model_name):
  """Build the unfitted smoking-status pipeline around a clinical NER model.

  Stages, in order: document assembler, sentence detector, tokenizer, clinical
  word embeddings, the requested NER model, an NER converter that keeps only
  the ``Smoking_Status`` and ``Smoking`` chunks (column ``ner_chunk``) and an
  assertion model (column ``assertion``).

  Args:
    ner_model_name: Name of the pretrained NER model; must be one of the keys
      of ``_SMOKING_NER_CONFIGS``.

  Returns:
    A ``Pipeline`` holding the seven stages; the caller is expected to fit it.

  Raises:
    ValueError: If ``ner_model_name`` is not a supported model name.
  """
  # Fail fast, before any pretrained model is loaded. Previously an unknown name
  # fell through every branch and crashed later with an UnboundLocalError.
  if ner_model_name not in _SMOKING_NER_CONFIGS:
    raise ValueError(
        "Unsupported NER model {!r}; expected one of: {}".format(
            ner_model_name, ", ".join(sorted(_SMOKING_NER_CONFIGS))))

  ner_output_col, assertion_model_name = _SMOKING_NER_CONFIGS[ner_model_name]

  documentAssembler = DocumentAssembler()\
      .setInputCol("text")\
      .setOutputCol("document")

  sentenceDetector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare", "en", "clinical/models")\
      .setInputCols(["document"])\
      .setOutputCol("sentence")

  tokenizer = Tokenizer()\
      .setInputCols(["sentence"])\
      .setOutputCol("token")

  # `pretrained` is called on the class; no throwaway instance is needed.
  embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\
      .setInputCols(["sentence", "token"])\
      .setOutputCol("embeddings")

  ner = MedicalNerModel.pretrained(ner_model_name, "en", "clinical/models")\
      .setInputCols(["sentence", "token", "embeddings"])\
      .setOutputCol(ner_output_col)

  # Only smoking-related entities are turned into chunks.
  ner_converter = NerConverter()\
      .setInputCols(["sentence", "token", ner_output_col])\
      .setOutputCol("ner_chunk")\
      .setWhiteList(["Smoking_Status", "Smoking"])

  assertion = AssertionDLModel.pretrained(assertion_model_name, "en", "clinical/models")\
      .setInputCols(["sentence", "ner_chunk", "embeddings"])\
      .setOutputCol("assertion")

  nlpPipeline = Pipeline(stages=[documentAssembler,
                                 sentenceDetector,
                                 tokenizer,
                                 embeddings,
                                 ner,
                                 ner_converter,
                                 assertion])
  return nlpPipeline

# **Sample Text**

In [14]:
# Sample clinical notes used as input for every pipeline below.
text_list = [
    "The patient is a 40-years-old black woman with Breast Cancer. She started smoking when she was 20 years old, but she quit. Her mother died of breast cancer at age 55.",
    "The patient is a 60-years-old native american woman, diagnosed with liver cancer at age 58. She is a nonsmoker and has no other medical history.",
    "The patient is a 69 y.o. man with prostatic cancer. He was diagnosed at age 65, and he quit smoking at that time.",
    "She has no family history of cancer, and none in her family is a smoker.",
]

## **Model Names**

## `ner_oncology`

In [None]:
# Build the (not yet fitted) smoking-status pipeline around the ner_oncology NER model.
pipeline = model_pipeline("ner_oncology")

In [28]:
from pyspark.sql.types import StringType

# Load the sample notes into a DataFrame with a single "text" column.
data = spark.createDataFrame(text_list, StringType()).toDF('text')

# Fit the pipeline on the samples and annotate the same DataFrame.
result = pipeline.fit(data).transform(data)

# Zip the chunk and assertion annotation arrays element-wise, explode them into
# one row per zipped element, then pull the individual fields out by position.
(
    result
    .select(
        F.explode(
            F.arrays_zip(
                result.ner_chunk.result,
                result.ner_chunk.begin,
                result.ner_chunk.end,
                result.ner_chunk.metadata,
                result.assertion.result,
                result.assertion.metadata,
            )
        ).alias("cols")
    )
    .select(
        F.expr("cols['0']").alias("chunk"),
        F.expr("cols['1']").alias("begin"),
        F.expr("cols['2']").alias("end"),
        F.expr("cols['3']['entity']").alias("ner_label"),
        F.expr("cols['3']['sentence']").alias("sent_id"),
        F.expr("cols['4']").alias("assertion"),
        F.expr("cols['5']['confidence']").alias("confidence"),
    )
    .show(truncate=False)
)

+---------+-----+---+--------------+-------+---------+----------+
|chunk    |begin|end|ner_label     |sent_id|assertion|confidence|
+---------+-----+---+--------------+-------+---------+----------+
|smoking  |74   |80 |Smoking_Status|1      |Past     |0.9865    |
|nonsmoker|101  |109|Smoking_Status|1      |Absent   |0.6854    |
|smoking  |92   |98 |Smoking_Status|1      |Past     |0.9355    |
|smoker   |65   |70 |Smoking_Status|0      |Absent   |0.9015    |
+---------+-----+---+--------------+-------+---------+----------+



## `ner_oncology_demographics`

In [None]:
# Build the (not yet fitted) smoking-status pipeline around the ner_oncology_demographics NER model.
pipeline = model_pipeline("ner_oncology_demographics")

In [24]:
from pyspark.sql.types import StringType

# Load the sample notes into a DataFrame with a single "text" column.
data = spark.createDataFrame(text_list, StringType()).toDF('text')

# Fit the pipeline on the samples and annotate the same DataFrame.
result = pipeline.fit(data).transform(data)

# Zip the chunk and assertion annotation arrays element-wise, explode them into
# one row per zipped element, then pull the individual fields out by position.
(
    result
    .select(
        F.explode(
            F.arrays_zip(
                result.ner_chunk.result,
                result.ner_chunk.begin,
                result.ner_chunk.end,
                result.ner_chunk.metadata,
                result.assertion.result,
                result.assertion.metadata,
            )
        ).alias("cols")
    )
    .select(
        F.expr("cols['0']").alias("chunk"),
        F.expr("cols['1']").alias("begin"),
        F.expr("cols['2']").alias("end"),
        F.expr("cols['3']['entity']").alias("ner_label"),
        F.expr("cols['3']['sentence']").alias("sent_id"),
        F.expr("cols['4']").alias("assertion"),
        F.expr("cols['5']['confidence']").alias("confidence"),
    )
    .show(truncate=False)
)

+---------+-----+---+--------------+-------+---------+----------+
|chunk    |begin|end|ner_label     |sent_id|assertion|confidence|
+---------+-----+---+--------------+-------+---------+----------+
|smoking  |74   |80 |Smoking_Status|1      |Past     |0.9865    |
|nonsmoker|101  |109|Smoking_Status|1      |Absent   |0.6854    |
|smoking  |92   |98 |Smoking_Status|1      |Past     |0.9355    |
|smoker   |65   |70 |Smoking_Status|0      |Absent   |0.9015    |
+---------+-----+---+--------------+-------+---------+----------+



## `ner_jsl`

In [None]:
# Build the (not yet fitted) smoking-status pipeline around the ner_jsl NER model.
pipeline = model_pipeline("ner_jsl")

In [20]:
from pyspark.sql.types import StringType

# Load the sample notes into a DataFrame with a single "text" column.
data = spark.createDataFrame(text_list, StringType()).toDF('text')

# Fit the pipeline on the samples and annotate the same DataFrame.
result = pipeline.fit(data).transform(data)

# Zip the chunk and assertion annotation arrays element-wise, explode them into
# one row per zipped element, then pull the individual fields out by position.
(
    result
    .select(
        F.explode(
            F.arrays_zip(
                result.ner_chunk.result,
                result.ner_chunk.begin,
                result.ner_chunk.end,
                result.ner_chunk.metadata,
                result.assertion.result,
                result.assertion.metadata,
            )
        ).alias("cols")
    )
    .select(
        F.expr("cols['0']").alias("chunk"),
        F.expr("cols['1']").alias("begin"),
        F.expr("cols['2']").alias("end"),
        F.expr("cols['3']['entity']").alias("ner_label"),
        F.expr("cols['3']['sentence']").alias("sent_id"),
        F.expr("cols['4']").alias("assertion"),
        F.expr("cols['5']['confidence']").alias("confidence"),
    )
    .show(truncate=False)
)

+---------+-----+---+---------+-------+---------+----------+
|chunk    |begin|end|ner_label|sent_id|assertion|confidence|
+---------+-----+---+---------+-------+---------+----------+
|smoking  |74   |80 |Smoking  |1      |Past     |0.9865    |
|nonsmoker|101  |109|Smoking  |1      |Absent   |0.6854    |
|smoking  |92   |98 |Smoking  |1      |Past     |0.9355    |
|smoker   |65   |70 |Smoking  |0      |Absent   |0.9015    |
+---------+-----+---+---------+-------+---------+----------+



## `ner_jsl_enriched`

In [None]:
# Build the (not yet fitted) smoking-status pipeline around the ner_jsl_enriched NER model.
pipeline = model_pipeline("ner_jsl_enriched")

In [26]:
from pyspark.sql.types import StringType

# Load the sample notes into a DataFrame with a single "text" column.
data = spark.createDataFrame(text_list, StringType()).toDF('text')

# Fit the pipeline on the samples and annotate the same DataFrame.
result = pipeline.fit(data).transform(data)

# Zip the chunk and assertion annotation arrays element-wise, explode them into
# one row per zipped element, then pull the individual fields out by position.
(
    result
    .select(
        F.explode(
            F.arrays_zip(
                result.ner_chunk.result,
                result.ner_chunk.begin,
                result.ner_chunk.end,
                result.ner_chunk.metadata,
                result.assertion.result,
                result.assertion.metadata,
            )
        ).alias("cols")
    )
    .select(
        F.expr("cols['0']").alias("chunk"),
        F.expr("cols['1']").alias("begin"),
        F.expr("cols['2']").alias("end"),
        F.expr("cols['3']['entity']").alias("ner_label"),
        F.expr("cols['3']['sentence']").alias("sent_id"),
        F.expr("cols['4']").alias("assertion"),
        F.expr("cols['5']['confidence']").alias("confidence"),
    )
    .show(truncate=False)
)

+---------+-----+---+---------+-------+---------+----------+
|chunk    |begin|end|ner_label|sent_id|assertion|confidence|
+---------+-----+---+---------+-------+---------+----------+
|smoking  |74   |80 |Smoking  |1      |Past     |0.9865    |
|nonsmoker|101  |109|Smoking  |1      |Absent   |0.6854    |
|smoking  |92   |98 |Smoking  |1      |Past     |0.9355    |
|smoker   |65   |70 |Smoking  |0      |Absent   |0.9015    |
+---------+-----+---+---------+-------+---------+----------+



## `jsl_ner_wip_clinical`

In [None]:
# Build the (not yet fitted) smoking-status pipeline around the jsl_ner_wip_clinical NER model.
pipeline = model_pipeline("jsl_ner_wip_clinical")

In [22]:
from pyspark.sql.types import StringType

# Load the sample notes into a DataFrame with a single "text" column.
data = spark.createDataFrame(text_list, StringType()).toDF('text')

# Fit the pipeline on the samples and annotate the same DataFrame.
result = pipeline.fit(data).transform(data)

# Zip the chunk and assertion annotation arrays element-wise, explode them into
# one row per zipped element, then pull the individual fields out by position.
# The assertion metadata is zipped as well, but no confidence column is selected here.
(
    result
    .select(
        F.explode(
            F.arrays_zip(
                result.ner_chunk.result,
                result.ner_chunk.begin,
                result.ner_chunk.end,
                result.ner_chunk.metadata,
                result.assertion.result,
                result.assertion.metadata,
            )
        ).alias("cols")
    )
    .select(
        F.expr("cols['0']").alias("chunk"),
        F.expr("cols['1']").alias("begin"),
        F.expr("cols['2']").alias("end"),
        F.expr("cols['3']['entity']").alias("ner_label"),
        F.expr("cols['3']['sentence']").alias("sent_id"),
        F.expr("cols['4']").alias("assertion"),
    )
    .show(truncate=False)
)

+---------+-----+---+---------+-------+---------+
|chunk    |begin|end|ner_label|sent_id|assertion|
+---------+-----+---+---------+-------+---------+
|smoking  |74   |80 |Smoking  |1      |past     |
|nonsmoker|101  |109|Smoking  |1      |absent   |
|smoking  |92   |98 |Smoking  |1      |past     |
|smoker   |65   |70 |Smoking  |0      |absent   |
+---------+-----+---+---------+-------+---------+



## `jsl_ner_wip_modifier_clinical`

In [None]:
# Build the (not yet fitted) smoking-status pipeline around the jsl_ner_wip_modifier_clinical NER model.
pipeline = model_pipeline("jsl_ner_wip_modifier_clinical")

In [18]:
from pyspark.sql.types import StringType

# Load the sample notes into a DataFrame with a single "text" column.
data = spark.createDataFrame(text_list, StringType()).toDF('text')

# Fit the pipeline on the samples and annotate the same DataFrame.
result = pipeline.fit(data).transform(data)

# Zip the chunk and assertion annotation arrays element-wise, explode them into
# one row per zipped element, then pull the individual fields out by position.
# The assertion metadata is zipped as well, but no confidence column is selected here.
(
    result
    .select(
        F.explode(
            F.arrays_zip(
                result.ner_chunk.result,
                result.ner_chunk.begin,
                result.ner_chunk.end,
                result.ner_chunk.metadata,
                result.assertion.result,
                result.assertion.metadata,
            )
        ).alias("cols")
    )
    .select(
        F.expr("cols['0']").alias("chunk"),
        F.expr("cols['1']").alias("begin"),
        F.expr("cols['2']").alias("end"),
        F.expr("cols['3']['entity']").alias("ner_label"),
        F.expr("cols['3']['sentence']").alias("sent_id"),
        F.expr("cols['4']").alias("assertion"),
    )
    .show(truncate=False)
)

+---------+-----+---+---------+-------+---------+
|chunk    |begin|end|ner_label|sent_id|assertion|
+---------+-----+---+---------+-------+---------+
|smoking  |74   |80 |Smoking  |1      |past     |
|nonsmoker|101  |109|Smoking  |1      |absent   |
|smoking  |92   |98 |Smoking  |1      |past     |
|smoker   |65   |70 |Smoking  |0      |absent   |
+---------+-----+---+---------+-------+---------+



## `jsl_rd_ner_wip_greedy_clinical`

In [None]:
# Build the (not yet fitted) smoking-status pipeline around the jsl_rd_ner_wip_greedy_clinical NER model.
pipeline = model_pipeline("jsl_rd_ner_wip_greedy_clinical")

In [16]:
from pyspark.sql.types import StringType

# Load the sample notes into a DataFrame with a single "text" column.
data = spark.createDataFrame(text_list, StringType()).toDF('text')

# Fit the pipeline on the samples and annotate the same DataFrame.
result = pipeline.fit(data).transform(data)

# Zip the chunk and assertion annotation arrays element-wise, explode them into
# one row per zipped element, then pull the individual fields out by position.
# The assertion metadata is zipped as well, but no confidence column is selected here.
(
    result
    .select(
        F.explode(
            F.arrays_zip(
                result.ner_chunk.result,
                result.ner_chunk.begin,
                result.ner_chunk.end,
                result.ner_chunk.metadata,
                result.assertion.result,
                result.assertion.metadata,
            )
        ).alias("cols")
    )
    .select(
        F.expr("cols['0']").alias("chunk"),
        F.expr("cols['1']").alias("begin"),
        F.expr("cols['2']").alias("end"),
        F.expr("cols['3']['entity']").alias("ner_label"),
        F.expr("cols['3']['sentence']").alias("sent_id"),
        F.expr("cols['4']").alias("assertion"),
    )
    .show(truncate=False)
)

+---------+-----+---+---------+-------+---------+
|chunk    |begin|end|ner_label|sent_id|assertion|
+---------+-----+---+---------+-------+---------+
|smoking  |74   |80 |Smoking  |1      |past     |
|nonsmoker|101  |109|Smoking  |1      |absent   |
|smoking  |92   |98 |Smoking  |1      |past     |
|smoker   |65   |70 |Smoking  |0      |absent   |
+---------+-----+---+---------+-------+---------+

