# Setup

In [7]:
import pandas as pd

In [1]:
import json
import os

from google.colab import files

# Ask the user to upload the license JSON through the Colab upload widget.
# The returned mapping is keyed by uploaded file name; it is kept under its own
# name so that `license_keys` only ever refers to the parsed license content.
uploaded_files = files.upload()

if not uploaded_files:
    raise RuntimeError("No license file was uploaded.")

# Only the first uploaded file is read; any additional uploads are ignored.
license_file_name = next(iter(uploaded_files))

with open(license_file_name) as f:
    license_keys = json.load(f)

# Defining license key-value pairs as notebook-level variables (so that e.g.
# $PUBLIC_VERSION can be interpolated into shell commands). `globals()` is
# used because writing through `locals()` is not guaranteed to take effect.
globals().update(license_keys)

# Adding license key-value pairs to environment variables.
# NOTE(review): os.environ only accepts string values — assumes every value in
# the license JSON is a string; confirm.
os.environ.update(license_keys)

Saving public344_jsl352.json to public344_jsl352.json


In [2]:
# Show the version string stored under 'JSL_VERSION' in the license file; the
# install cell uses it to pin spark-nlp-jsl.
license_keys['JSL_VERSION']

'3.5.2'

In [3]:
# Show the version string stored under 'PUBLIC_VERSION' in the license file;
# the install cell uses it to pin spark-nlp.
license_keys['PUBLIC_VERSION']

'3.4.4'

In [4]:
# Installing pyspark and spark-nlp.
# $PUBLIC_VERSION comes from the license key-value pairs loaded earlier, which
# were exposed both as notebook variables and as environment variables.
! pip install --upgrade -q pyspark==3.1.2 spark-nlp==$PUBLIC_VERSION

# Installing Spark NLP Healthcare from the private package index.
# $SECRET is part of the index URL, so be careful when sharing logs or output
# produced by this command.
! pip install --upgrade -q spark-nlp-jsl==$JSL_VERSION  --extra-index-url https://pypi.johnsnowlabs.com/$SECRET

# Installing Spark NLP Display Library for visualization
# NOTE(review): unlike the packages above, this one is not version-pinned, so a
# later re-run may install a different release — consider pinning it.
! pip install -q spark-nlp-display

[K     |████████████████████████████████| 212.4 MB 70 kB/s 
[K     |████████████████████████████████| 145 kB 69.4 MB/s 
[K     |████████████████████████████████| 198 kB 71.8 MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 151 kB 36.6 MB/s 
[K     |████████████████████████████████| 95 kB 4.3 MB/s 
[K     |████████████████████████████████| 66 kB 5.3 MB/s 
[?25h

In [5]:
import json
import os

import sparknlp
import sparknlp_jsl

from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp_jsl.annotator import *

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml import Pipeline,PipelineModel

import warnings

# Suppress every Python warning from this point on in the session.
warnings.filterwarnings('ignore')

# Spark configuration overrides passed to the session start-up call below.
params = {
    "spark.driver.memory": "16G",
    "spark.kryoserializer.buffer.max": "2000M",
    "spark.driver.maxResultSize": "2000M",
}

# Report the installed library versions before the session is created.
print("Spark NLP Version :", sparknlp.version())
print("Spark NLP_JSL Version :", sparknlp_jsl.version())

# Start the Spark session through sparknlp_jsl, authenticating with the
# secret taken from the license file.
spark = sparknlp_jsl.start(license_keys['SECRET'], params=params)

# Last expression of the cell: display the session object.
spark

Spark NLP Version : 3.4.4
Spark NLP_JSL Version : 3.5.2


# Sample data

In [13]:
# The sample texts live on Google Drive, but no other cell in the notebook
# mounts it; mount it here when it is not already available so that the cell
# works on a fresh runtime (Restart & Run All).
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
SAMPLE_TEXTS_PATH = DRIVE_MOUNT_POINT + '/MyDrive/Colab Notebooks/sample_texts.json'

if not os.path.isdir(DRIVE_MOUNT_POINT + '/MyDrive'):
    drive.mount(DRIVE_MOUNT_POINT)

# Read the JSON file into a DataFrame and keep column 0 as a plain Python list.
# NOTE(review): assumes column 0 holds one clinical text per row — confirm
# against the file.
sample_data = pd.read_json(SAMPLE_TEXTS_PATH)[0].tolist()

In [16]:
# Peek at the first sample text to check that the data loaded as expected.
sample_data[0]

"A 65-year-old Taiwanese woman had a history of debulking surgery, bilateral oophorectomy with omentectomy, total anterior hysterectomy with radical pelvic lymph nodes dissection due to ovarian carcinoma (mucinous-type carcinoma, stage Ic) 1 year ago. Patient's medical compliance was poor and failed to complete her chemotherapy (cyclophosphamide 750 mg/m2, carboplatin 300 mg/m2). Recently, she noted a palpable right breast mass, which enlarged rapidly to about 15 cm in size and nearly occupied the whole right breast in 2 months. Core needle biopsy revealed metaplastic carcinoma. Neoadjuvant chemotherapy with the regimens of Taxotere (75 mg/m2), Epirubicin (75 mg/m2), and Cyclophosphamide (500 mg/m2) was given for 6 cycles with poor response, followed by a modified radical mastectomy (MRM) with dissection of axillary lymph nodes and skin grafting. Postoperatively, radiotherapy was done with 5000 cGy in 25 fractions. The histopathologic examination revealed a metaplastic carcinoma with m

# Pipeline

In [64]:
# White lists restricting which entity types are sent to the assertion model
# and to the entity resolution model: not every entity type should receive an
# assertion status or a normalized term.

# Entity types that get an assertion status.
assertion_wl = [
    'ImagingTest',
    'Tumor_Finding',
    'Pathology_Result',
    'Chemotherapy',
    'Oncogene',
    'Metastasis',
    'PalliativeTreatment',
    'CancerDx',
    'TargetedTherapy',
    'Adenopathy',
    'PathologyTest',
    'Biomarker',
    'HormonalTherapy',
    'Invasion',
    'BenignTumor',
    'SmokingStatus',
    'Radiotherapy',
    'CancerSurgery',
    'Immunotherapy',
    'HistologicalType',
    'UnspecificTherapy',
]

# Entity types that get resolved to a normalized term.
resolution_wl = [
    'SiteBrain',
    'BenignTumor',
    'SiteLiver',
    'HistologicalType',
    'Tumor_Finding',
    'Pathology_Result',
    'Oncogene',
    'SiteOtherBodyPart',
    'Metastasis',
    'CancerDx',
    'Adenopathy',
    'Invasion',
    'SiteLung',
    'SiteBreast',
    'SiteBone',
    'SiteLymphNode',
]

In [54]:
# Stage 1: turn the raw "text" column into "document" annotations.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: split each document into sentences with a pretrained model.
sentenceDetector = (
    SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare", "en", "clinical/models")
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# Stage 3: split each sentence into tokens.
tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Stage 4: pretrained clinical word embeddings for every token.
# `pretrained` is called on the class itself, like every other pretrained
# model in this cell, rather than on a throwaway `WordEmbeddingsModel()`
# instance created only to reach the loader.
word_embeddings = (
    WordEmbeddingsModel.pretrained('embeddings_clinical', 'en', 'clinical/models')
    .setInputCols(["sentence", 'token'])
    .setOutputCol("embeddings")
)

# Stage 5: oncology NER model loaded from a saved model directory on Drive.
# NOTE(review): hard-coded Drive path — the notebook cannot run without a copy
# of the model at exactly this location.
ner_oncology = (
    MedicalNerModel.load("/content/drive/MyDrive/JSL/Models/oncology_model")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner")
)

# Three converters read the same "ner" output and differ only in their output
# column and in the entity types they keep.

# All entity types -> "chunk".
ner_converter = (
    NerConverter()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("chunk")
)

# Only entity types in `assertion_wl` -> "assertion_chunk".
ner_assertion_converter = (
    NerConverter()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("assertion_chunk")
    .setWhiteList(assertion_wl)
)

# Only entity types in `resolution_wl` -> "resolution_chunk".
ner_resolution_converter = (
    NerConverter()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("resolution_chunk")
    .setWhiteList(resolution_wl)
)

# Assertion status for the white-listed "assertion_chunk" entities.
clinical_assertion = (
    AssertionDLModel.pretrained("assertion_dl_large", "en", "clinical/models")
    .setInputCols(["sentence", "assertion_chunk", "embeddings"])
    .setOutputCol("assertion")
)

# Re-wrap each "resolution_chunk" as a document so it can be embedded.
c2doc = (
    Chunk2Doc()
    .setInputCols("resolution_chunk")
    .setOutputCol("chunk_doc")
)

# Sentence-level BERT embeddings computed over the chunk documents.
sbert_embedder = (
    BertSentenceEmbeddings.pretrained("sbiobert_base_cased_mli", "en", "clinical/models")
    .setInputCols(["chunk_doc"])
    .setOutputCol("sentence_embeddings")
)

# Entity resolver (model name suggests ICD-O codes) using Euclidean distance.
resolver = (
    SentenceEntityResolverModel.pretrained("sbiobertresolve_icdo_augmented", "en", "clinical/models")
    .setInputCols(["resolution_chunk", "sentence_embeddings"])
    .setOutputCol("resolution")
    .setDistanceFunction("EUCLIDEAN")
)


# Assemble the stages in dependency order: each stage consumes columns
# produced by the stages listed before it.
ner_pipeline = Pipeline(
    stages=[
        documentAssembler,
        sentenceDetector,
        tokenizer,
        word_embeddings,
        ner_oncology,
        ner_converter,
        ner_assertion_converter,
        ner_resolution_converter,
        clinical_assertion,
        c2doc,
        sbert_embedder,
        resolver,
    ]
)

# One-row placeholder frame holding an empty string in a "text" column.
# NOTE(review): presumably nothing is learned from this data because the
# models are already trained and fit() is only needed to obtain a
# PipelineModel — confirm.
empty_dataframe = spark.createDataFrame([[""]]).toDF("text")

ner_model = ner_pipeline.fit(empty_dataframe)

sentence_detector_dl_healthcare download started this may take some time.
Approximate size to download 367.3 KB
[OK!]
embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[OK!]
assertion_dl_large download started this may take some time.
[OK!]
sbiobert_base_cased_mli download started this may take some time.
Approximate size to download 384.3 MB
[OK!]
sbiobertresolve_icdo_augmented download started this may take some time.
[OK!]


In [55]:
# Wrap the fitted pipeline model in a LightPipeline; it is used below to
# annotate the in-memory list of sample texts.
light_model = LightPipeline(ner_model)

In [56]:
# Annotate all sample texts; the visualization cells index the result by
# sample position (annotated_samples[0], annotated_samples[1], ...).
annotated_samples = light_model.fullAnnotate(sample_data)

# Data visualization

In [25]:
from sparknlp_display import AssertionVisualizer, EntityResolverVisualizer

In [27]:
# Visualizer used for the "NER + Entity Resolution" displays.
er_vis = EntityResolverVisualizer()

# Visualizer used for the "NER + Assertion" displays.
assertion_vis = AssertionVisualizer()

In [17]:
# Entity types known to the NER model: take every class label reported by the
# model and keep only the part after its last '-' separator.
entities = [label.split('-')[-1] for label in ner_oncology.getClasses()]

# Displayed as a set so that each entity type is listed once.
set(entities)

{'Adenopathy',
 'Age',
 'BenignTumor',
 'Biomarker',
 'Biomarker_Result',
 'CancerDx',
 'CancerScore',
 'CancerScoreValue',
 'CancerSurgery',
 'Chemotherapy',
 'Cyclecount',
 'Cycleday',
 'Cycledose',
 'Cyclelength',
 'Cyclenumber',
 'Date',
 'Death_Entity',
 'Direction',
 'Dosage',
 'Duration',
 'External_body_part_or_region',
 'Female_Reproductive_Status',
 'Form',
 'Frequency',
 'Gender',
 'Grade',
 'HistologicalType',
 'HormonalTherapy',
 'ImagingTest',
 'Immunotherapy',
 'Internal_organ_or_component',
 'Invasion',
 'LineOfTherapy',
 'Metastasis',
 'O',
 'Oncogene',
 'PalliativeTreatment',
 'PathologyTest',
 'Pathology_Result',
 'PerformanceStatus',
 'Race_Ethnicity',
 'RadiationDose',
 'Radiotherapy',
 'RelativeDate',
 'RelativeTime',
 'ResponseToTreatment',
 'Route',
 'SiteBone',
 'SiteBrain',
 'SiteBreast',
 'SiteLiver',
 'SiteLung',
 'SiteLymphNode',
 'SiteOtherBodyPart',
 'SizeTrend',
 'SmokingStatus',
 'Staging',
 'Strength',
 'TargetedTherapy',
 'Time',
 'TumorSize',
 'Tumor

## NER + Entity Resolution

In [57]:
# Render entities and their resolutions for sample 0.
# NOTE(review): 'chunk' holds all entity types, while 'resolution' is computed
# from the white-listed 'resolution_chunk' column — confirm the visualizer
# matches the two correctly, or pass 'resolution_chunk' as label_col.
er_vis.display(annotated_samples[0], label_col='chunk', resolution_col = 'resolution')

In [58]:
# Render entities and their resolutions for sample 1.
# NOTE(review): 'chunk' holds all entity types, while 'resolution' is computed
# from the white-listed 'resolution_chunk' column — confirm the visualizer
# matches the two correctly, or pass 'resolution_chunk' as label_col.
er_vis.display(annotated_samples[1], label_col='chunk', resolution_col = 'resolution')

In [59]:
# Render entities and their resolutions for sample 2.
# NOTE(review): 'chunk' holds all entity types, while 'resolution' is computed
# from the white-listed 'resolution_chunk' column — confirm the visualizer
# matches the two correctly, or pass 'resolution_chunk' as label_col.
er_vis.display(annotated_samples[2], label_col='chunk', resolution_col = 'resolution')

## NER + Assertion

In [61]:
# Render entities and their assertion status for sample 3.
# NOTE(review): 'chunk' holds all entity types, while 'assertion' is computed
# from the white-listed 'assertion_chunk' column — confirm the visualizer
# matches the two correctly, or pass 'assertion_chunk' as label_col.
assertion_vis.display(annotated_samples[3], label_col = 'chunk', assertion_col = 'assertion')

In [63]:
# Render entities and their assertion status for sample 4.
# NOTE(review): 'chunk' holds all entity types, while 'assertion' is computed
# from the white-listed 'assertion_chunk' column — confirm the visualizer
# matches the two correctly, or pass 'assertion_chunk' as label_col.
assertion_vis.display(annotated_samples[4], label_col = 'chunk', assertion_col = 'assertion')

In [62]:
# Render entities and their assertion status for sample 5.
# NOTE(review): 'chunk' holds all entity types, while 'assertion' is computed
# from the white-listed 'assertion_chunk' column — confirm the visualizer
# matches the two correctly, or pass 'assertion_chunk' as label_col.
assertion_vis.display(annotated_samples[5], label_col = 'chunk', assertion_col = 'assertion')