In [2]:
import os

import sparknlp_jsl # licensed version of Spark NLP
import sparknlp

# The JSL secret key is read from the JSL_SECRET environment variable so that it
# is never saved inside the notebook. If you don't have a secret key, please get
# in touch with JSL.
jsl_secret = os.environ.get("JSL_SECRET")
if not jsl_secret:
    raise RuntimeError(
        "JSL_SECRET is not set. Export your Spark NLP (JSL) secret key as the "
        "JSL_SECRET environment variable before running this notebook."
    )

# Start the Spark session backed by the licensed Spark NLP build.
spark = sparknlp_jsl.start(jsl_secret)

print("Spark NLP version: ", sparknlp.version())
print("Apache Spark version: ", spark.version)

Spark NLP version:  2.3.6
Apache Spark version:  2.4.4


Load the pretrained clinical pipeline ("explain_clinical_doc_dl"). It contains the following annotators:

- Tokenizer
- Sentence Detector
- Clinical Word Embeddings (GloVe trained on the PubMed dataset)
- Clinical NER-DL (trained with a SOTA algorithm on the i2b2 dataset)
- AssertionDL model (trained with a SOTA algorithm on the i2b2 dataset)

In [4]:
from pyspark.ml import PipelineModel

# Load the saved "explain_clinical_doc_dl" pipeline from disk.
# NOTE(review): "path/" looks like a placeholder for the local download
# location — confirm and adjust before running.
pretrained_model = PipelineModel.load("path/explain_clinical_doc_dl")


## With LightPipeline

In [6]:
from sparknlp.base import LightPipeline

# Wrap the loaded pipeline in a LightPipeline so that plain Python strings can be
# passed to annotate() / fullAnnotate() in the cells that follow.
ner_lightModel = LightPipeline(pretrained_model)

In [7]:
# Sample clinical note used as input for the LightPipeline examples.
# The literal is used verbatim: its leading newline and any trailing whitespace
# count towards the character offsets (begin/end) reported by fullAnnotate(),
# so do not reformat it.
clinical_text = """
Patient with severe fever and sore throat. 
He shows no stomach pain and he maintained on an epidural and PCA for pain control.
He also became short of breath with climbing a flight of stairs.
After CT, lung tumour located at the right lower lobe. Father with Alzheimer.
"""

In [10]:
# annotate() returns a dict keyed by pipeline output name; show which keys are
# available for the sample text.
result = ner_lightModel.annotate(clinical_text)
result.keys()

dict_keys(['document', 'ner_chunk', 'assertion', 'token', 'ner', 'embeddings', 'sentence'])

In [11]:
# Pair every token with the NER tag predicted for it (B-/I- prefixed labels, or
# 'O' for tokens outside any entity).
list(zip(result['token'],result['ner']))

[('Patient', 'O'),
 ('with', 'O'),
 ('severe', 'B-PROBLEM'),
 ('fever', 'I-PROBLEM'),
 ('and', 'O'),
 ('sore', 'B-PROBLEM'),
 ('throat', 'I-PROBLEM'),
 ('.', 'O'),
 ('He', 'O'),
 ('shows', 'O'),
 ('no', 'O'),
 ('stomach', 'B-PROBLEM'),
 ('pain', 'I-PROBLEM'),
 ('and', 'O'),
 ('he', 'O'),
 ('maintained', 'O'),
 ('on', 'O'),
 ('an', 'B-TREATMENT'),
 ('epidural', 'I-TREATMENT'),
 ('and', 'O'),
 ('PCA', 'B-TREATMENT'),
 ('for', 'O'),
 ('pain', 'B-PROBLEM'),
 ('control', 'I-PROBLEM'),
 ('.', 'O'),
 ('He', 'O'),
 ('also', 'O'),
 ('became', 'O'),
 ('short', 'B-PROBLEM'),
 ('of', 'I-PROBLEM'),
 ('breath', 'I-PROBLEM'),
 ('with', 'O'),
 ('climbing', 'O'),
 ('a', 'O'),
 ('flight', 'O'),
 ('of', 'O'),
 ('stairs', 'O'),
 ('.', 'O'),
 ('After', 'O'),
 ('CT', 'B-TEST'),
 (',', 'O'),
 ('lung', 'B-PROBLEM'),
 ('tumour', 'I-PROBLEM'),
 ('located', 'O'),
 ('at', 'O'),
 ('the', 'O'),
 ('right', 'O'),
 ('lower', 'O'),
 ('lobe', 'O'),
 ('.', 'O'),
 ('Father', 'O'),
 ('with', 'O'),
 ('Alzheimer', 'B-PROBLEM'),
 ('.', 'O')]

In [9]:
# Re-run the annotation and pair each extracted NER chunk with the assertion
# status predicted for it.
result = ner_lightModel.annotate(clinical_text)
list(zip(result['ner_chunk'],result['assertion']))

[('severe fever', 'present'),
 ('sore throat', 'present'),
 ('stomach pain', 'absent'),
 ('an epidural', 'present'),
 ('PCA', 'present'),
 ('pain control', 'present'),
 ('short of breath', 'conditional'),
 ('CT', 'present'),
 ('lung tumour', 'present'),
 ('Alzheimer', 'associated_with_someone_else')]

In [12]:
%%time 

# fullAnnotate() yields an indexable result; element 0 is used here for the
# single input text. Its annotations expose .result, .metadata, .begin and .end.
result = ner_lightModel.fullAnnotate(clinical_text)

# One tuple per chunk: (phrase, entity label, assertion status, begin, end).
entity_tuples = [
    (chunk.result, chunk.metadata['entity'], status.result, chunk.begin, chunk.end)
    for chunk, status in zip(result[0]['ner_chunk'], result[0]['assertion'])
]

CPU times: user 66 ms, sys: 20.8 ms, total: 86.8 ms
Wall time: 354 ms


In [13]:
# Display the (phrase, entity, assertion, begin, end) tuples built in the previous cell.
entity_tuples

[('severe fever', 'PROBLEM', 'present', 14, 25),
 ('sore throat', 'PROBLEM', 'present', 31, 41),
 ('stomach pain', 'PROBLEM', 'absent', 57, 68),
 ('an epidural', 'TREATMENT', 'present', 91, 101),
 ('PCA', 'TREATMENT', 'present', 107, 109),
 ('pain control', 'PROBLEM', 'present', 115, 126),
 ('short of breath', 'PROBLEM', 'conditional', 144, 158),
 ('CT', 'TEST', 'present', 200, 201),
 ('lung tumour', 'PROBLEM', 'present', 204, 214),
 ('Alzheimer', 'PROBLEM', 'associated_with_someone_else', 261, 269)]

In [14]:
import pandas as pd

# Render the entity tuples as a table, naming the five tuple positions.
pd.DataFrame(entity_tuples, columns=["phrase","entity","assertion","start","end"])

Unnamed: 0,phrase,entity,assertion,start,end
0,severe fever,PROBLEM,present,14,25
1,sore throat,PROBLEM,present,31,41
2,stomach pain,PROBLEM,absent,57,68
3,an epidural,TREATMENT,present,91,101
4,PCA,TREATMENT,present,107,109
5,pain control,PROBLEM,present,115,126
6,short of breath,PROBLEM,conditional,144,158
7,CT,TEST,present,200,201
8,lung tumour,PROBLEM,present,204,214
9,Alzheimer,PROBLEM,associated_with_someone_else,261,269


## With Spark DataFrames

In [15]:
# Example sentences, one single-column row each.
rows = [
    ["Patient with severe feber and sore throat"],
    ["Patient shows no stomach pain"],
    ["She was maintained on an epidural and PCA for pain control."],
    ["He also became short of breath with climbing a flight of stairs."],
    ["Lung tumour located at the right lower lobe"],
    ["Father with Alzheimer."],
]

# Build the Spark DataFrame and name its only column "text".
data = spark.createDataFrame(rows).toDF("text")

# Print the rows without truncating the sentences.
data.show(truncate=False)

+----------------------------------------------------------------+
|text                                                            |
+----------------------------------------------------------------+
|Patient with severe feber and sore throat                       |
|Patient shows no stomach pain                                   |
|She was maintained on an epidural and PCA for pain control.     |
|He also became short of breath with climbing a flight of stairs.|
|Lung tumour located at the right lower lobe                     |
|Father with Alzheimer.                                          |
+----------------------------------------------------------------+



In [18]:
# Run the full pipeline on the DataFrame and show every output column
# (cell values are truncated by the default show() settings).
pretrained_model.transform(data).show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|          embeddings|                 ner|           ner_chunk|           assertion|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Patient with seve...|[[document, 0, 40...|[[document, 0, 40...|[[token, 0, 6, Pa...|[[word_embeddings...|[[named_entity, 0...|[[chunk, 13, 24, ...|[[assertion, 13, ...|
|Patient shows no ...|[[document, 0, 28...|[[document, 0, 28...|[[token, 0, 6, Pa...|[[word_embeddings...|[[named_entity, 0...|[[chunk, 17, 28, ...|[[assertion, 17, ...|
|She was maintaine...|[[document, 0, 58...|[[document, 0, 58...|[[token, 0, 2, Sh...|[[word_embeddings...|[[named_entity, 0...|[[chunk, 22, 32, ...|[[

In [16]:
# Show tokens next to their NER tags, one row per input sentence.
# Both selected fields are named `result`, so both output columns carry that
# header: the first is the tokens, the second the NER tags.
pretrained_model.transform(data).select("token.result","ner.result").show(truncate=False)

+-------------------------------------------------------------------------------+----------------------------------------------------------------------------------+
|result                                                                         |result                                                                            |
+-------------------------------------------------------------------------------+----------------------------------------------------------------------------------+
|[Patient, with, severe, feber, and, sore, throat]                              |[O, O, B-PROBLEM, I-PROBLEM, O, B-PROBLEM, I-PROBLEM]                             |
|[Patient, shows, no, stomach, pain]                                            |[O, O, O, B-PROBLEM, I-PROBLEM]                                                   |
|[She, was, maintained, on, an, epidural, and, PCA, for, pain, control, .]      |[O, O, O, O, B-TREATMENT, I-TREATMENT, O, B-TREATMENT, O, B-PROBLEM, I-PROBLEM, O]|
|[He, also

In [17]:
# Show the extracted NER chunks next to their assertion statuses, one row per
# input sentence. Both selected fields are named `result`, so both output
# columns carry that header: chunks first, assertion statuses second.
pretrained_model.transform(data).select("ner_chunk.result", "assertion.result").show(truncate=False)


+--------------------------------+--------------------------------+
|result                          |result                          |
+--------------------------------+--------------------------------+
|[severe feber, sore throat]     |[present, present]              |
|[stomach pain]                  |[absent]                        |
|[an epidural, PCA, pain control]|[present, present, hypothetical]|
|[short of breath]               |[conditional]                   |
|[Lung tumour]                   |[present]                       |
|[Alzheimer]                     |[associated_with_someone_else]  |
+--------------------------------+--------------------------------+

