# Setup

In [7]:
import pandas as pd

In [1]:
import json
import os

from google.colab import files

# Ask for the license JSON through the Colab upload widget. The returned
# mapping is keyed by the uploaded file name(s).
uploaded_files = files.upload()
if not uploaded_files:
    raise RuntimeError("No license file was uploaded; re-run this cell and select the license JSON.")

# Only the first uploaded file is read as the license.
license_path = next(iter(uploaded_files))
with open(license_path, encoding="utf-8") as f:
    license_keys = json.load(f)

# Defining license key-value pairs as notebook-level variables.
# globals() is used rather than locals(): writing to the locals() mapping is
# only reliable at module level, where it is the same dict as globals().
globals().update(license_keys)

# Adding license key-value pairs to environment variables
os.environ.update(license_keys)

Saving public344_jsl352.json to public344_jsl352.json


In [3]:
# Spark NLP for Healthcare (spark-nlp-jsl) version listed in the license file;
# the install cell pins spark-nlp-jsl to this value.
license_keys['JSL_VERSION']

'4.0.0'

In [4]:
# Open-source Spark NLP (spark-nlp) version listed in the license file;
# the install cell pins spark-nlp to this value.
license_keys['PUBLIC_VERSION']

'4.0.0'

In [4]:
# Installing pyspark and spark-nlp
# ($PUBLIC_VERSION is one of the license values that the license cell exposes
# both as a notebook variable and as an environment variable.)
! pip install --upgrade -q pyspark==3.1.2 spark-nlp==$PUBLIC_VERSION

# Installing Spark NLP Healthcare
# (pinned to $JSL_VERSION; $SECRET is appended to the extra package index URL.)
! pip install --upgrade -q spark-nlp-jsl==$JSL_VERSION  --extra-index-url https://pypi.johnsnowlabs.com/$SECRET

# Installing Spark NLP Display Library for visualization
# NOTE(review): unlike the packages above, this install is not version-pinned.
! pip install -q spark-nlp-display

[K     |████████████████████████████████| 212.4 MB 70 kB/s 
[K     |████████████████████████████████| 145 kB 69.4 MB/s 
[K     |████████████████████████████████| 198 kB 71.8 MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 151 kB 36.6 MB/s 
[K     |████████████████████████████████| 95 kB 4.3 MB/s 
[K     |████████████████████████████████| 66 kB 5.3 MB/s 
[?25h

In [5]:
import json
import os

import sparknlp
import sparknlp_jsl

from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp_jsl.annotator import *

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml import Pipeline,PipelineModel

import warnings
warnings.filterwarnings('ignore')

# Spark configuration overrides handed to sparknlp_jsl.start() below.
params = {
    "spark.driver.memory": "16G",
    "spark.kryoserializer.buffer.max": "2000M",
    "spark.driver.maxResultSize": "2000M",
}

# Report the installed library versions before the session is started.
print("Spark NLP Version :", sparknlp.version())
print("Spark NLP_JSL Version :", sparknlp_jsl.version())

# Start the session with the secret read from the license file.
spark = sparknlp_jsl.start(license_keys['SECRET'], params=params)

# Last expression of the cell, so the session object is displayed.
spark

Spark NLP Version : 3.4.4
Spark NLP_JSL Version : 3.5.2


# Sample data

In [8]:
sample_data_1 = '''A 65-year-old Taiwanese woman had a history of debulking surgery, bilateral oophorectomy with omentectomy, total anterior hysterectomy with radical pelvic lymph nodes dissection due to ovarian carcinoma (mucinous-type carcinoma, stage Ic) 1 year ago. Patient's medical compliance was poor and failed to complete her chemotherapy (cyclophosphamide 750 mg/m2, carboplatin 300 mg/m2). Recently, she noted a palpable right breast mass, which enlarged rapidly to about 15 cm in size and nearly occupied the whole right breast in 2 months. Core needle biopsy revealed metaplastic carcinoma. Neoadjuvant chemotherapy with the regimens of Taxotere (75 mg/m2), Epirubicin (75 mg/m2), and Cyclophosphamide (500 mg/m2) was given for 6 cycles with poor response, followed by a modified radical mastectomy (MRM) with dissection of axillary lymph nodes and skin grafting. Postoperatively, radiotherapy was done with 5000 cGy in 25 fractions. The histopathologic examination revealed a metaplastic carcinoma with myoepithelial and squamous differentiation associated with adenomyoepithelioma (Figure 1A-C). Immunohistochemistry study showed that the tumor cells are positive for epithelial markers-cytokeratin (AE1/AE3) stain, and myoepithelial markers, including cytokeratin 5/6 (CK 5/6), p63, and S100 stains (Figure 2A-E). Expressions of hormone receptors, including ER, PR, and Her-2/Neu, were all negative (Figure 3A-D). The dissected axillary lymph nodes showed metastastic carcinoma with hormone triple-negative in 3 out of 26 nodes. The patient was staged as pT3N1aM0, with histologic tumor grade III.

Seven months later, the patient complained about pain and numbness over left forearm, right lower back, and bilateral lower extremities. Fentanyl patch (2.5 mg/q72h) was used for pain control. Sonography of left forearm revealed a tumor with hypervascularity and cystic component, measuring about 6 x 5 cm in size. Positron emission tomography/computed tomography (PET/C) scan showed hypermetabolic masses in the left proximal forearm, right psoas, and quadratus lumborum muscles (Figure 4). Surgical excision of left forearm mass (Figure 5) was performed for symptomatic relief of intractable pain. However, excision for the right psoas and quadratus lumborum muscles lesion was not feasible due to high risks of morbidities. Histopathologic examination of left forearm tumor revealed metastatic carcinoma, similar to previous breast cancer with negative ER, PR, and Her-2/Neu immunostaining (Figure 3E-H), positive for epithelial markers-cytokeratin (AE1/AE3) stain, and myoepithelial markers as p63 (Figure 2F-H). Further boosted radiotherapy was added, but the metastatic tumors in the right psoas and quadratus lumborum muscles progressed in size. Eventually, lung metastasis occurred 10 months and she died 12 months later after MRM. The informed consent was obtained from the relative.'''

In [9]:
sample_data_2 = '''A chest radiograph revealed a large, well circumscribed mass projected over the right hilum, the so called hilum overlay sign. A subsequent computed tomography (CT) -thorax with intra-venous contrast confirmed a 9.0 x 5.5 cm mass arising from the apical segment of the right lower lobe. It demonstrated homogenous contrast enhancement with no evidence of calcifications or necrosis. The mass abutted the pleura with no sign of invasion. There was no evidence of lymphadenopathy, pleural effusions or osseous metastases. The liver and adrenal glands appeared normal. A positron emission tomography-CT (PET-CT) demonstrated increased fludeoxyglucose (FDG) uptake within the mass with standardized uptake value (SUV max 22.1) and no evidence of nodal involvement or distant metastases.

A CT-guided biopsy showed a non-small cell primary lung carcinoma with marked necrosis and poor cell differentiation. The subtype could not be more accurately determined. Mediastinoscopy for nodal sampling was negative for nodal metastases. A right pneumenectomy revealed a 9 cm large necrotic mass replacing most of the medial and posterior segments of the right lower lobe. It was a biphasic tumour composed of malignant glandular tissue with sub-nuclear vacuoles, associated with a pleomorphic stromal malignant blastematous component.'''

# Pipeline

In [14]:
# Reads the raw "text" column and writes the "document" column.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Pretrained healthcare sentence detector: "document" -> "sentence".
sentenceDetector = (
    SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare", "en", "clinical/models")
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# Tokenizer: "sentence" -> "token".
tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Pretrained clinical word embeddings: "sentence" + "token" -> "embeddings".
# pretrained() is called on the class, like every other pretrained stage in
# this cell, instead of on a throwaway WordEmbeddingsModel() instance.
word_embeddings = (
    WordEmbeddingsModel.pretrained('embeddings_clinical', 'en', 'clinical/models')
    .setInputCols(["sentence", 'token'])
    .setOutputCol("embeddings")
)

# Pretrained oncology NER model; its output column is "ner".
ner_oncology = (
    MedicalNerModel.pretrained('ner_oncology_wip', 'en', 'clinical/models')
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner")
)

# Unfiltered converter: "ner" -> "chunk" (no white list applied).
ner_converter = (
    NerConverter()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("chunk")
)

# White lists for assertion model (not all the entity types should have an assertion status)
assertion_wl = [
    'Imaging_Test',
    'Tumor_Finding',
    'Pathology_Result',
    'Chemotherapy',
    'Oncogene',
    'Metastasis',
    'Cancer_Dx',
    'Targeted_Therapy',
    'Pathology_Test',
    'Biomarker',
    'Hormonal_Therapy',
    'Invasion',
    'Smoking_Status',
    'Radiotherapy',
    'Cancer_Surgery',
    'Immunotherapy',
    'Histological_Type',
    'Unspecific_Therapy',
]

# Same NER output as the unfiltered converter, restricted to assertion_wl and
# written to its own "assertion_chunk" column.
ner_assertion_converter = (
    NerConverter()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("assertion_chunk")
    .setWhiteList(assertion_wl)
)

# White lists for entity resolution model (not all the entity types should have a normalized icdo term)
resolution_wl = [
    'Anatomical_Site',
    'Histological_Type',
    'Tumor_Finding',
    'Pathology_Result',
    'Oncogene',
    'Metastasis',
    'Cancer_Dx',
    'Site_Lymph_Node',
    'Invasion',
]

# Restricted to resolution_wl and written to "resolution_chunk".
ner_resolution_converter = (
    NerConverter()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("resolution_chunk")
    .setWhiteList(resolution_wl)
)

# Assertion model: runs on the white-listed "assertion_chunk" column and
# writes the "assertion" column.
clinical_assertion = (
    AssertionDLModel.pretrained("assertion_dl_large", "en", "clinical/models")
    .setInputCols(["sentence", "assertion_chunk", "embeddings"])
    .setOutputCol("assertion")
)

# Converts the white-listed "resolution_chunk" column into "chunk_doc", the
# input of the sentence-embedding stage below.
c2doc = (
    Chunk2Doc()
    .setInputCols("resolution_chunk")
    .setOutputCol("chunk_doc")
)

# Sentence-level BERT embeddings computed over "chunk_doc".
sbert_embedder = (
    BertSentenceEmbeddings.pretrained("sbiobert_base_cased_mli", "en", "clinical/models")
    .setInputCols(["chunk_doc"])
    .setOutputCol("sentence_embeddings")
)

# Entity resolver: "resolution_chunk" + "sentence_embeddings" -> "resolution",
# using the Euclidean distance function.
resolver = (
    SentenceEntityResolverModel.pretrained("sbiobertresolve_icdo_augmented", "en", "clinical/models")
    .setInputCols(["resolution_chunk", "sentence_embeddings"])
    .setOutputCol("resolution")
    .setDistanceFunction("EUCLIDEAN")
)


# Stages are listed so that every stage's input columns are produced by an
# earlier stage.
ner_pipeline = Pipeline(
    stages=[
        documentAssembler,         # text -> document
        sentenceDetector,          # document -> sentence
        tokenizer,                 # sentence -> token
        word_embeddings,           # sentence, token -> embeddings
        ner_oncology,              # sentence, token, embeddings -> ner
        ner_converter,             # ner -> chunk
        ner_assertion_converter,   # ner -> assertion_chunk (white-listed)
        ner_resolution_converter,  # ner -> resolution_chunk (white-listed)
        clinical_assertion,        # assertion_chunk -> assertion
        c2doc,                     # resolution_chunk -> chunk_doc
        sbert_embedder,            # chunk_doc -> sentence_embeddings
        resolver,                  # resolution_chunk, sentence_embeddings -> resolution
    ]
)

# One-row frame with an empty "text" value, used only as the fit() input.
# NOTE(review): presumably a placeholder, since no stage above is trained in this notebook.
empty_dataframe = spark.createDataFrame([[""]]).toDF("text")

ner_model = ner_pipeline.fit(empty_dataframe)

sentence_detector_dl_healthcare download started this may take some time.
Approximate size to download 367.3 KB
[OK!]
embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[OK!]
ner_oncology_wip download started this may take some time.
[OK!]
assertion_dl_large download started this may take some time.
[OK!]
sbiobert_base_cased_mli download started this may take some time.
Approximate size to download 384.3 MB
[OK!]
sbiobertresolve_icdo_augmented download started this may take some time.
[OK!]


In [15]:
# Wrap the fitted pipeline model in a LightPipeline; it is used below to
# annotate plain Python strings via fullAnnotate().
light_model = LightPipeline(ner_model)

In [27]:
# Annotate both sample reports in one call; the visualization cells index the
# result by position ([0] for sample_data_1, [1] for sample_data_2).
annotated_samples = light_model.fullAnnotate([sample_data_1, sample_data_2])

# Data visualization

In [18]:
from sparknlp_display import AssertionVisualizer, EntityResolverVisualizer

In [19]:
# Visualizer used in the "NER + Entity Resolution" section.
er_vis = EntityResolverVisualizer()

# Visualizer used in the "NER + Assertion" section.
assertion_vis = AssertionVisualizer()

In [20]:
# Entities included in the NER model:
# keep only the text after the last '-' of every class label the model reports.
entities = [label.split('-')[-1] for label in ner_oncology.getClasses()]

# Collapse duplicates for display.
set(entities)

{'Age',
 'Anatomical_Site',
 'Biomarker',
 'Biomarker_Result',
 'Cancer_Dx',
 'Cancer_Surgery',
 'Chemotherapy',
 'Cycle_Day',
 'Cycle_Number',
 'Date',
 'Direction',
 'Dosage',
 'Duration',
 'Frequency',
 'Gender',
 'Grade',
 'Histological_Type',
 'Hormonal_Therapy',
 'Imaging_Test',
 'Immunotherapy',
 'Invasion',
 'Line_Of_Therapy',
 'Metastasis',
 'O',
 'Oncogene',
 'Pathology_Result',
 'Pathology_Test',
 'Performance_Status',
 'Race_Ethnicity',
 'Radiation_Dose',
 'Radiotherapy',
 'Relative_Date',
 'Response_To_Treatment',
 'Route',
 'Site_Lymph_Node',
 'Smoking_Status',
 'Staging',
 'Targeted_Therapy',
 'Tumor_Finding',
 'Tumor_Size',
 'Unspecific_Therapy'}

## NER + Entity Resolution

In [28]:
# Pair each resolution with the chunk it was computed from: the "resolution"
# column is derived from the white-listed "resolution_chunk" column, not from
# the unfiltered "chunk" column, so only these two columns line up one-to-one.
er_vis.display(annotated_samples[0], label_col='resolution_chunk', resolution_col='resolution')

## NER + Assertion

In [30]:
# Pair each assertion with the chunk it was computed from: the "assertion"
# column is derived from the white-listed "assertion_chunk" column, not from
# the unfiltered "chunk" column, so only these two columns line up one-to-one.
assertion_vis.display(annotated_samples[1], label_col='assertion_chunk', assertion_col='assertion')