## Hybrid Pipeline with local models

In [6]:
# Notebook bootstrap: extend sys.path, start the Spark session, then import
# the Spark NLP Python packages.
from pyspark.sql import SparkSession

import os
import sys

# Add two local repository checkouts to the import path.
# NOTE(review): presumably the `sparknlp` / `sparknlp_jsl` imports further
# down are meant to resolve to these checkouts - confirm.
sys.path.append('../jsl-repos/spark-nlp/python')

sys.path.append('../jsl-repos/spark-nlp-internal/python')

# Directory holding the two jars passed to `spark.jars` below.
jar_path = "../jars/"

# Local session: 16 worker threads, 16G driver memory, 2G max result size,
# Kryo serialization with a 600M max buffer, and the spark-nlp and
# spark-nlp-jsl 2.3.5 jars loaded from `jar_path`.
spark = SparkSession.builder \
    .appName("Spark NLP Enterprise 2.3.5") \
    .master("local[16]") \
    .config("spark.driver.memory","16G") \
    .config("spark.driver.maxResultSize", "2G") \
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    .config("spark.kryoserializer.buffer.max", "600M")\
    .config("spark.jars", "{}spark-nlp-2.3.5.jar,{}spark-nlp-jsl-2.3.5.jar".format(jar_path,jar_path)) \
    .getOrCreate()

# These imports run after the sys.path changes above. The star-imports supply
# the annotator classes used by the rest of the notebook.
import pandas as pd
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
import pyspark.sql.functions as F
import string
import numpy as np

print ('------ Spark NLP jars loaded --------')


from sparknlp_jsl import annotators as jsl_annotators

# Directory that holds the locally cached pretrained models.
path = "../cache_pretrained/"

# text -> document
document = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# document -> sentence, with a newline registered as a custom bound.
sentence = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
    .setCustomBounds(["\n"])
)

# sentence -> raw_token; the spell-check stage below produces `token`.
token = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("raw_token")
)

# raw_token -> token, the column every downstream stage reads.
spell = (
    NorvigSweetingModel.load(path + "spellcheck_norvig_en_2.1.0_2.4_1563017660080")
    .setInputCols(["raw_token"])
    .setOutputCol("token")
)

# Clinical word embeddings, shared by the bio, clinical and custom NER stages.
embeddings = (
    WordEmbeddingsModel.load(path + "embeddings_clinical_en_2.0.2_2.4_1558454742956")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
)

# Two part-of-speech taggers run side by side: a clinical one and an ANC one.
clinical_pos = (
    PerceptronModel.load(path + "pos_clinical_en_2.0.2_2.4_1556660550177")
    .setInputCols(["sentence", "token"])
    .setOutputCol("clinical_pos")
)

pos = (
    PerceptronModel.load(path + "pos_anc_en_2.0.2_2.4_1556659930154")
    .setInputCols(["sentence", "token"])
    .setOutputCol("pos")
)

# The untyped parser reads `clinical_pos`; the typed parser below reads `pos`.
dependency_parser = (
    DependencyParserModel.load(path + "dependency_conllu_en_2.0.8_2.4_1561435004077")
    .setInputCols(["sentence", "token", "clinical_pos"])
    .setOutputCol("dependency")
)

dependency_parser_typed = (
    TypedDependencyParserModel.load(path + "dependency_typed_conllu_en_2.0.8_2.4_1561473259215")
    .setInputCols(["token", "pos", "dependency"])
    .setOutputCol("dependency_typed")
)

# Three parallel branches with the same shape: NER tagger -> NerConverter
# (chunks) -> assertion model. All three assertion stages load the same
# assertion_dl model; only the chunk column they read differs.

# --- BioNLP branch: ner -> ner_span -> bio_assertion ---
bio_ner = (
    NerDLModel.load(path + "ner_bionlp_en_2.3.4_2.4_1574889731300")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner")
)

converter = (
    NerConverter()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("ner_span")
)

bio_assertion_dl = (
    jsl_annotators.AssertionDLModel.load(path + "assertion_dl_en_2.0.2_2.4_1556655581078")
    .setInputCols(["sentence", "ner_span", "embeddings"])
    .setOutputCol("bio_assertion")
)

# --- Clinical branch: clinical_ner -> clinical_ner_span -> assertion ---
clinical_ner = (
    NerDLModel.load(path + "ner_clinical_en_2.0.2_2.4_1556659769638")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("clinical_ner")
)

clinical_converter = (
    NerConverter()
    .setInputCols(["sentence", "token", "clinical_ner"])
    .setOutputCol("clinical_ner_span")
)

assertion_dl = (
    jsl_annotators.AssertionDLModel.load(path + "assertion_dl_en_2.0.2_2.4_1556655581078")
    .setInputCols(["sentence", "clinical_ner_span", "embeddings"])
    .setOutputCol("assertion")
)

# --- Custom branch: custom_ner -> custom_ner_span -> custom_assertion ---
custom_ner_model = (
    NerDLModel.load(path + "custom_NER_model_v3")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("custom_ner")
)

custom_ner_converter = (
    NerConverter()
    .setInputCols(["sentence", "token", "custom_ner"])
    .setOutputCol("custom_ner_span")
)

custom_assertion_dl = (
    jsl_annotators.AssertionDLModel.load(path + "assertion_dl_en_2.0.2_2.4_1556655581078")
    .setInputCols(["sentence", "custom_ner_span", "embeddings"])
    .setOutputCol("custom_assertion")
)

# The OntoNotes NER model reads its own GloVe embeddings column rather than
# the clinical `embeddings` column.
glove_embeddings = (
    WordEmbeddingsModel.load(path + "glove_100d_en_2.0.2_2.4_1556534397055")
    .setInputCols(["sentence", "token"])
    .setOutputCol("glove_embeddings")
)

onto_ner = (
    NerDLModel.load(path + "onto_100_en_2.1.0_2.4_1564256329924")
    .setInputCols(["sentence", "token", "glove_embeddings"])
    .setOutputCol("onto_ner")
)

onto_ner_converter = (
    NerConverter()
    .setInputCols(["sentence", "token", "onto_ner"])
    .setOutputCol("onto_ner_chunk")
)

# onto_ner_chunk -> chunk_document, which is the column DateMatcher reads.
chunk_doc = (
    Chunk2Doc()
    .setInputCols(["onto_ner_chunk"])
    .setOutputCol("chunk_document")
)

date_matcher = (
    DateMatcher()
    .setInputCols("chunk_document")
    .setOutputCol("date")
)


# Single pipeline chaining every stage defined above. Each stage is listed
# after the stages that produce the columns it reads.
clinical_ner_pipeline = Pipeline(stages=[
    # text -> document -> sentence -> raw_token -> token
    document, sentence, token, spell,
    # clinical embeddings, POS tags and dependency parses
    embeddings, clinical_pos, pos, dependency_parser, dependency_parser_typed,
    # BioNLP NER + assertion
    bio_ner, converter, bio_assertion_dl,
    # clinical NER + assertion
    clinical_ner, clinical_converter, assertion_dl,
    # custom NER + assertion
    custom_ner_model, custom_ner_converter, custom_assertion_dl,
    # OntoNotes NER on GloVe embeddings, then date matching on its chunks
    glove_embeddings, onto_ner, onto_ner_converter, chunk_doc, date_matcher,
])

print('------ Spark NLP pipeline loaded --------')


from pyspark.sql.types import *
import pandas as pd
import string

# One-row DataFrame with an empty string in the "text" column, used only as
# the input to fit().
# NOTE(review): the stages are loaded models or parameter-only annotators, so
# presumably fit() just assembles them into a PipelineModel - confirm.
empty_data = spark.createDataFrame([[""]]).toDF("text")

model = clinical_ner_pipeline.fit(empty_data)

from sparknlp.base import LightPipeline

# Wrap the fitted pipeline in a LightPipeline; get_annotations() calls
# fullAnnotate() on it with plain Python strings.
light_model = LightPipeline(model)

print ('------ Spark NLP Light pipeline created --------')


------ Spark NLP jars loaded --------
------ Spark NLP pipeline loaded --------
------ Spark NLP Light pipeline created --------


In [19]:
def dateMatcher_mapper(date, onto_ner_chunk):
    """Pair each date annotation with the first NER chunk that matches it.

    A date and a chunk match when their ``metadata["sentence"]`` values are
    equal and their ``metadata["chunk"]`` values are equal.

    Args:
        date: sequence of annotations exposing ``result`` and ``metadata``.
        onto_ner_chunk: sequence of annotations exposing ``result``,
            ``metadata`` (including an ``"entity"`` key), ``begin`` and ``end``.

    Returns:
        A list of ``(date_result, chunk_result, entity, begin, end)`` tuples
        in the order of ``date``. Dates with no matching chunk are omitted.
    """
    pairs = []

    for date_ann in date:
        # Only the first matching chunk is used, even if several share the
        # same (sentence, chunk) metadata.
        first_match = next(
            (
                chunk
                for chunk in onto_ner_chunk
                if date_ann.metadata["sentence"] == chunk.metadata["sentence"]
                and date_ann.metadata["chunk"] == chunk.metadata["chunk"]
            ),
            None,
        )
        if first_match is None:
            continue

        pairs.append((
            date_ann.result,
            first_match.result,
            first_match.metadata["entity"],
            first_match.begin,
            first_match.end,
        ))

    return pairs

def get_annotations(text):
    """Annotate ``text`` with ``light_model`` and summarise selected outputs.

    Only the first element returned by ``light_model.fullAnnotate(text)`` is
    used.

    Returns:
        A dict with these keys:

        * ``'bioNer_output'``: chunk text -> entity, from ``ner_span``.
        * ``'custom_NER_output'``: list of ``(chunk text, entity, assertion)``
          built by zipping ``custom_ner_span`` with ``custom_assertion``.
        * ``'ontoNer_output'``: chunk text -> entity, from ``onto_ner_chunk``.
        * ``'assertion_output'``: chunk text -> ``(entity, assertion)`` for
          ``clinical_ner_span`` zipped with ``assertion``.
        * ``'bio_assertion_output'``: chunk text -> ``(entity, assertion)``
          for ``ner_span`` zipped with ``bio_assertion``.
        * ``'dateMatcher_output'``: result of ``dateMatcher_mapper`` on the
          ``date`` and ``onto_ner_chunk`` columns.

        The dict-valued entries are keyed by chunk text, so a chunk text that
        occurs more than once keeps only its last occurrence.
    """
    first = light_model.fullAnnotate(text)[0]

    bio_entities = {}
    for chunk in first['ner_span']:
        bio_entities[chunk.result] = chunk.metadata['entity']

    custom_entities = []
    for chunk, status in zip(first['custom_ner_span'], first['custom_assertion']):
        custom_entities.append((chunk.result, chunk.metadata['entity'], status.result))

    onto_entities = {}
    for chunk in first['onto_ner_chunk']:
        onto_entities[chunk.result] = chunk.metadata['entity']

    clinical_assertions = {}
    for status, chunk in zip(first['assertion'], first['clinical_ner_span']):
        clinical_assertions[chunk.result] = (chunk.metadata['entity'], status.result)

    bio_assertions = {}
    for status, chunk in zip(first['bio_assertion'], first['ner_span']):
        bio_assertions[chunk.result] = (chunk.metadata['entity'], status.result)

    matched_dates = dateMatcher_mapper(first['date'], first['onto_ner_chunk'])

    return {
        'bioNer_output': bio_entities,
        'custom_NER_output': custom_entities,
        'ontoNer_output': onto_entities,
        'assertion_output': clinical_assertions,
        'bio_assertion_output': bio_assertions,
        'dateMatcher_output': matched_dates,
    }

In [27]:
# Short sample note used as the first input. The resolver pipeline further
# down is also fitted on a DataFrame built from it.
text="""
Stage IV adenocarcinoma of lung with b/l lung nodules , probable L-2 metastasis and a negative brain MRI. Molecular testing of tumor demonstrates EGFR mutation . On Tarceva . Disease Stable . Plan Chemotherapy toxicity reviewed again . Patient agrees to proceed . Counseling time : 40 mins . 1 ) Continue Tarceva for lung cancer . CT CAP ordered before next visit 2 ) Continue anti-anxiety medication 3 ) Xgeva today 4 ) Return in 1 month with labs , after scan
"""

output = get_annotations(text)

# Bare expression: as the last statement of a notebook cell it displays the dict.
output

{'bioNer_output': {'adenocarcinoma': 'Cancer',
  'lung': 'Organ',
  'lung nodules': 'Pathological_formation',
  'L-2': 'Gene_or_gene_product',
  'brain': 'Organ',
  'tumor': 'Cancer',
  'EGFR': 'Gene_or_gene_product',
  'Tarceva': 'Simple_chemical',
  'Patient': 'Organism',
  'lung cancer': 'Cancer',
  'CT CAP': 'Multi-tissue_structure'},
 'custom_NER_output': [('Stage IV', 'MET', 'present'),
  ('lung', 'SITELUNG', 'possible'),
  ('lung', 'SITELUNG', 'possible'),
  ('L-2', 'SITEBONE', 'possible'),
  ('metastasis', 'MET', 'possible'),
  ('brain', 'SITEBRAIN', 'possible'),
  ('lung', 'SITELUNG', 'possible'),
  ('1 month', 'TEMP', 'hypothetical')],
 'ontoNer_output': {'L-2': 'PRODUCT',
  'Tarceva': 'PERSON',
  '40 mins': 'TIME',
  '1': 'CARDINAL',
  '2': 'CARDINAL',
  '3': 'CARDINAL',
  'Xgeva': 'ORG',
  'today 4': 'DATE',
  '1 month': 'DATE'},
 'assertion_output': {'Stage IV adenocarcinoma of lung': ('PROBLEM',
   'present'),
  'b/l lung nodules': ('PROBLEM', 'present'),
  'L-2 metastasi

In [25]:
# Print the raw sample note, including its leading and trailing newline.
print (text)


Stage IV adenocarcinoma of lung with b/l lung nodules , probable L-2 metastasis and a negative brain MRI. Molecular testing of tumor demonstrates EGFR mutation . On Tarceva . Disease Stable . Plan Chemotherapy toxicity reviewed again . Patient agrees to proceed . Counseling time : 40 mins . 1 ) Continue Tarceva for lung cancer . CT CAP ordered before next visit 2 ) Continue anti-anxiety medication 3 ) Xgeva today 4 ) Return in 1 month with labs , after scan



## Entity Resolver

In [28]:

# Second, smaller pipeline: clinical NER chunks resolved to ICD-10-CM codes.
# `embeddings` and `clinical_ner` are rebound here to fresh stage instances.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

sentenceDetector = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
    .setExplodeSentences(False)
)

tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

embeddings = (
    WordEmbeddingsModel.load(path + "embeddings_clinical_en_2.0.2_2.4_1558454742956")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
)

clinical_ner = (
    NerDLModel.load(path + "ner_clinical_en_2.0.2_2.4_1556659769638")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner")
)

ner_converter = (
    NerConverter()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("ner_chunk")
)

# The resolver reads tokenized chunks, not the raw `ner_chunk` column.
chunk_tokenizer = (
    ChunkTokenizer()
    .setInputCols(["ner_chunk"])
    .setOutputCol("ner_chunk_tokenized")
)

icd_resolution_cm = (
    jsl_annotators.EntityResolverModel.load(path + "resolve_icd10cm_cl_em_en_2.0.8_2.4_1561686127600")
    .setThreshold(100)
    .setMergeChunks(False)
    .setInputCols("ner_chunk_tokenized", "embeddings")
    .setOutputCol("resolution_cm")
)

# Neighbour count reported by the resolver; get_entity_resolver() uses it to
# scale the confidence column and as the fill value for missing confidences.
K = icd_resolution_cm.getNeighbours()

from pyspark.ml import Pipeline

resolver_pipeline = Pipeline(stages=[
    documentAssembler,
    sentenceDetector,
    tokenizer,
    embeddings,
    clinical_ner,
    ner_converter,
    chunk_tokenizer,
    icd_resolution_cm,
])



In [None]:

# Fit the resolver pipeline on a one-row DataFrame holding the sample `text`.
# get_entity_resolver() reuses the resulting `model_icd10` for every input.
text_data = spark.createDataFrame([[text]]).toDF("text")

model_icd10 = resolver_pipeline.fit(text_data)

In [32]:

def get_entity_resolver(text):
    """Show the ICD-10-CM resolutions of the PROBLEM chunks found in ``text``.

    The text is wrapped in a one-row DataFrame and transformed with
    ``model_icd10``. The chunk and resolution arrays are zipped position-wise
    and exploded to one row per chunk. Only rows whose entity is ``PROBLEM``
    and whose code is not null are kept. Duplicate rows are removed and the
    rest are ordered by ``relative_confidence`` descending, where that column
    is the resolver confidence multiplied by ``K`` and nulls are replaced
    with ``K``.

    Returns:
        The return value of ``DataFrame.show()``; the table itself is printed.
    """
    input_df = spark.createDataFrame([[text]]).toDF("text")
    annotated_df = model_icd10.transform(input_df)

    # One struct per chunk; arrays_zip names its fields '0'..'3' here.
    zipped = F.arrays_zip(
        "ner_chunk.result",
        "ner_chunk.metadata",
        "resolution_cm.result",
        "resolution_cm.metadata",
    )
    per_chunk_df = annotated_df.select(F.explode(zipped).alias("icd10cm"))

    projected_df = per_chunk_df.select(
        F.expr("icd10cm['0']").alias("chunk"),
        F.expr("icd10cm['1'].entity").alias("entity"),
        F.expr("icd10cm['3'].resolved_text").alias("resolved_text"),
        F.expr("icd10cm['2']").alias("code"),
        F.expr("icd10cm['3'].alternative_codes").alias("alternative_codes"),
        F.expr("icd10cm['3'].confidence * {}".format(K)).alias("relative_confidence"),
    )

    problems_df = projected_df.filter(
        F.expr("entity='PROBLEM' and code is not null").alias("entity")
    )

    deduped_df = problems_df.distinct()
    filled_df = deduped_df.na.fill(K, ["relative_confidence"])
    result = filled_df.orderBy("relative_confidence", ascending=False)

    return result.show()


In [33]:
# Print the ICD-10-CM resolution table for the short sample note.
get_entity_resolver(text)

+--------------------+-------+--------------------+------+--------------------+-------------------+
|               chunk| entity|       resolved_text|  code|   alternative_codes|relative_confidence|
+--------------------+-------+--------------------+------+--------------------+-------------------+
|         lung cancer|PROBLEM|Maple-bark-stripp...|  J676|J675,A150,D860,C4...| 1.6621133377574555|
|             Disease|PROBLEM|   Castleman disease| D47Z2|E7402,E7522,E710,...| 1.6097856681828806|
|       EGFR mutation|PROBLEM|         Perineocele| N8181|D6182,D6852,Q933,...| 1.5145645421705145|
|Plan Chemotherapy...|PROBLEM|Encounter for ant...| Z5111|Z9221,D6481,T8081...| 1.2173351990013233|
|Stage IV adenocar...|PROBLEM|Malignant neoplas...| C3490|D381,D0220,C3492,...| 1.1672446568797696|
|      L-2 metastasis|PROBLEM|Non-follicular ( ...| C8392|C8396,C775,C8378,...| 1.1298993385375564|
|    b/l lung nodules|PROBLEM|Bauxite fibrosis ...|  J631|J676,J675,D1430,D...| 1.0976144735906879|



Note 3
<div style="border:2px solid #747474; background-color: #e3e3e3; margin: 5px; padding: 10px"> 
<p>
The patient is a 38 year old Caucasian male, smoker (about 10–15 cigarettes per day for 20 years), without other relevant family or personal risk factors for neoplasic disease who had suffered from severe headaches and nausea (his first disease symptoms) since August of 2005. He did not seek any medical help until November of 2005, when he presented to his doctor, due to exacerbation of those symptoms. No abnormalities on both physical and neurological examinations were detected at that time.

On December 30th of 2005, the CT scan of his brain was remarkable for the following findings: “An expansive lesion of approximately 5 cm × 3 cm in size, located in the right temporal lobe, with nonhomogenic, post-contrast signal amplification. A large edema surrounding the lesion. A compression of the occipital corner of the right lateral ventricle. A slight enlargement of the supratentorial ventricular system, shifting to the left.”

On January 25th of 2006, the patient underwent surgical therapy, including right temporal craniotomy, with total resection of the tumor. On February 7th of 2006, a histopathology examination (identification number 475,958; Info-Pat, Poznań, Poland), confirmed a diagnosis of the GBM IV stage (according to WHO classification).

After the surgery, the patient was referred to the Institute of Radiation Oncology in Gliwice, Poland, for the post-operative radiotherapy. Although the patient was qualified for this treatment, he did not arrive to the Institute of Radiation Oncology on the day of the scheduled preparatory procedures. The reason for his absence was a simultaneous diagnostic finding of the left lung's tumor, for which the patient underwent a thoracotomy with the wedge tumor resection (for diagnostic purposes). On June 12th of 2006, based on the histopathological examination results, which showed post-tuberculosis lesions, the patient's pulmonary treatment was completed.

In February 2007, approximately 13 months after his brain tumor surgery, the patient again presented to the Institute of Oncology, and according to the follow-up diagnostic work-up, no brain tumor recurrence was found. Due to the absence of tumor, no radiotherapy was considered, and “watchful waiting” was recommended including brain imaging studies (CT or MRI) to be repeated every 3 months. In the face of the atypical disease course, an additional verification of the histopathological diagnosis was also performed, confirming the original findings of the GBM. The patient had remained under close control until February 2009 (37 months from his initial brain tumor surgery), and at that time the brain tumor recurrence was found. His recurrent tumor was located in the primary tumor's bed, and its size was 4 cm × 5.3 cm × 3.5 cm (Fig. 6). However, those findings were not associated with any particular symptoms or abnormalities on subsequent physical or neurological examinations of the patient. On March 16th 2009, the patient underwent another craniotomy with the subtotal tumor resection. (MRI scans after the second craniotomy are shown in Figs. 7 and 8.) The histopathology examination was again consistent with GBM. During the period from May 11th to June 19th of 2009, the patient received the radiotherapy dose of 60 Gy/30 fractions to the tumor lodge, including the residual tumor, with 2.5 cm of tissue margin. Due to the lack of the patient's consent, no chemotherapy was implemented. During the irradiation period, he had the first seizure episode, and was started on antiepileptic therapy (Depakine 200 mg a day). He continued this therapy for the rest of his life. After the radiotherapy, diagnostic follow-up examinations were conducted every 3 months. At the beginning of March 2010, another recurrence was found, and the tumor was localized in an upper part of the tumor bed, within the previously irradiated area (its size was 3.7 cm × 2.6 cm × 2.3 cm). 
Surprisingly, the patient had not experienced any symptoms, and his physical and neurological examinations were unremarkable. On March 13th of 2010, the stereotactic radiotherapy, using a single dose of 8 Gy applied to the area of recurrent tumor was performed. Unfortunately, on the control examination, on July 6th of 2010, further progression of the GBM was found. The patient expired on November 15th of 2010, in the local hospital (Zawiercie, Poland), due to the tumor expansion, resulting in cerebral edema, herniation, and multi-organ failure.</p>
</div>



In [34]:
# Longer sample input: a multi-paragraph case report, passed below to both
# get_annotations() and get_entity_resolver().
note3= """
The patient is a 38 year old Caucasian male, smoker (about 10–15 cigarettes per day for 20 years), without other relevant family or personal risk factors for neoplasic disease who had suffered from severe headaches and nausea (his first disease symptoms) since August of 2005. He did not seek any medical help until November of 2005, when he presented to his doctor, due to exacerbation of those symptoms. No abnormalities on both physical and neurological examinations were detected at that time.

On December 30th of 2005, the CT scan of his brain was remarkable for the following findings: “An expansive lesion of approximately 5 cm × 3 cm in size, located in the right temporal lobe, with nonhomogenic, post-contrast signal amplification. A large edema surrounding the lesion. A compression of the occipital corner of the right lateral ventricle. A slight enlargement of the supratentorial ventricular system, shifting to the left.”

On January 25th of 2006, the patient underwent surgical therapy, including right temporal craniotomy, with total resection of the tumor. On February 7th of 2006, a histopathology examination (identification number 475,958; Info-Pat, Poznań, Poland), confirmed a diagnosis of the GBM IV stage (according to WHO classification).

After the surgery, the patient was referred to the Institute of Radiation Oncology in Gliwice, Poland, for the post-operative radiotherapy. Although the patient was qualified for this treatment, he did not arrive to the Institute of Radiation Oncology on the day of the scheduled preparatory procedures. The reason for his absence was a simultaneous diagnostic finding of the left lung's tumor, for which the patient underwent a thoracotomy with the wedge tumor resection (for diagnostic purposes). On June 12th of 2006, based on the histopathological examination results, which showed post-tuberculosis lesions, the patient's pulmonary treatment was completed.

In February 2007, approximately 13 months after his brain tumor surgery, the patient again presented to the Institute of Oncology, and according to the follow-up diagnostic work-up, no brain tumor recurrence was found. Due to the absence of tumor, no radiotherapy was considered, and “watchful waiting” was recommended including brain imaging studies (CT or MRI) to be repeated every 3 months. In the face of the atypical disease course, an additional verification of the histopathological diagnosis was also performed, confirming the original findings of the GBM. The patient had remained under close control until February 2009 (37 months from his initial brain tumor surgery), and at that time the brain tumor recurrence was found. His recurrent tumor was located in the primary tumor's bed, and its size was 4 cm × 5.3 cm × 3.5 cm (Fig. 6). However, those findings were not associated with any particular symptoms or abnormalities on subsequent physical or neurological examinations of the patient. On March 16th 2009, the patient underwent another craniotomy with the subtotal tumor resection. (MRI scans after the second craniotomy are shown in Figs. 7 and 8.) The histopathology examination was again consistent with GBM. During the period from May 11th to June 19th of 2009, the patient received the radiotherapy dose of 60 Gy/30 fractions to the tumor lodge, including the residual tumor, with 2.5 cm of tissue margin. Due to the lack of the patient's consent, no chemotherapy was implemented. During the irradiation period, he had the first seizure episode, and was started on antiepileptic therapy (Depakine 200 mg a day). He continued this therapy for the rest of his life. After the radiotherapy, diagnostic follow-up examinations were conducted every 3 months. At the beginning of March 2010, another recurrence was found, and the tumor was localized in an upper part of the tumor bed, within the previously irradiated area (its size was 3.7 cm × 2.6 cm × 2.3 cm). 
Surprisingly, the patient had not experienced any symptoms, and his physical and neurological examinations were unremarkable. On March 13th of 2010, the stereotactic radiotherapy, using a single dose of 8 Gy applied to the area of recurrent tumor was performed. Unfortunately, on the control examination, on July 6th of 2010, further progression of the GBM was found. The patient expired on November 15th of 2010, in the local hospital (Zawiercie, Poland), due to the tumor expansion, resulting in cerebral edema, herniation, and multi-organ failure.
"""

In [35]:
# Run the full annotation pipeline on the longer case report.
output = get_annotations(note3)

# Bare expression: as the last statement of a notebook cell it displays the dict.
output

{'bioNer_output': {'patient': 'Organism',
  'CT': 'Organ',
  'brain': 'Organ',
  '“An': 'Gene_or_gene_product',
  'lesion': 'Pathological_formation',
  'lobe': 'Cancer',
  'edema': 'Pathological_formation',
  'occipital corner': 'Multi-tissue_structure',
  'right lateral ventricle': 'Multi-tissue_structure',
  'supratentorial ventricular system': 'Multi-tissue_structure',
  'tumor': 'Cancer',
  'wedge tumor': 'Cancer',
  'lesions': 'Pathological_formation',
  "patient's": 'Organism',
  'pulmonary': 'Organ',
  'brain tumor': 'Cancer',
  'GBM': 'Cancer',
  "tumor's": 'Cancer',
  'tissue margin': 'Pathological_formation',
  'Depakine': 'Simple_chemical',
  'tumor bed': 'Multi-tissue_structure',
  'cerebral edema': 'Pathological_formation'},
 'custom_NER_output': [('20 years', 'TEMP', 'present'),
  ('August of 2005', 'DATEMONTH', 'present'),
  ('November of 2005', 'DATEDAY', 'absent'),
  ('December 30th of 2005', 'DATEDAY', 'present'),
  ('brain', 'SITEBRAIN', 'present'),
  ('temporal lobe

In [36]:
# Print the ICD-10-CM resolution table for the longer case report.
get_entity_resolver(note3)

+--------------------+-------+--------------------+-------+--------------------+-------------------+
|               chunk| entity|       resolved_text|   code|   alternative_codes|relative_confidence|
+--------------------+-------+--------------------+-------+--------------------+-------------------+
|              nausea|PROBLEM|              Nausea|   R110|R400,R143,R1114,R...|               25.0|
| the tumor expansion|PROBLEM|      Cerebral edema|   G936|I6782,J810,P910,I...|               25.0|
|   neoplasic disease|PROBLEM|   Castleman disease|  D47Z2|E7522,E7402,E7523...| 2.0129375669263947|
|any particular sy...|PROBLEM|Abnormal echoence...|  R9081|R770,R772,Q999,R1...| 1.4825080545586333|
|       abnormalities|PROBLEM|Abnormal echoence...|  R9081|R770,R772,Q999,R1...| 1.4825080545586333|
|      those symptoms|PROBLEM|Other general sym...|  R6889|R4689,R448,N6459,...|    1.4301781247332|
|his first disease...|PROBLEM|      Krabbe disease|  E7523|G3182,H8103,K5090...| 1.39442828