

![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/healthcare/RE_RADIOLOGY.ipynb)




# **Extract relations between problem, test, and findings in reports**

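Find relations between diagnoses, tests, and imaging findings in radiology reports. The model assigns one of two labels to each candidate entity pair: `1` — the two entities are related; `0` — the two entities are not related. In this notebook the labels are displayed as `is_related` and `not_related`.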




## 1. Colab Setup

In [None]:
import json, os
from google.colab import files

# Ask for the license file only when the working directory does not already
# contain it; whatever file gets uploaded is stored under the fixed name
# 'spark_jsl.json' that is read right below.
if 'spark_jsl.json' not in os.listdir():
    license_keys = files.upload()
    os.rename(list(license_keys)[0], 'spark_jsl.json')

with open('spark_jsl.json') as f:
    license_keys = json.load(f)

# Publish every license entry twice: as a notebook-level variable and as an
# environment variable (the install cell refers to entries with `$NAME`).
locals().update(license_keys)
os.environ.update(license_keys)

In [None]:
# Installing pyspark and spark-nlp (open-source library).
# $PUBLIC_VERSION is presumably one of the entries loaded from spark_jsl.json
# by the setup cell -- confirm against your license file.
! pip install --upgrade -q pyspark==3.1.2 spark-nlp==$PUBLIC_VERSION

# Installing Spark NLP Healthcare from the John Snow Labs package index.
# NOTE: $SECRET becomes part of the index URL, so do not share logs or
# outputs of this command.
! pip install --upgrade -q spark-nlp-jsl==$JSL_VERSION  --extra-index-url https://pypi.johnsnowlabs.com/$SECRET

# Installing Spark NLP Display Library for visualization
# NOTE(review): unlike the two installs above, this one is not version-pinned.
! pip install -q spark-nlp-display

In [3]:
import sparknlp
import sparknlp_jsl

from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp_jsl.annotator import *

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml import Pipeline,PipelineModel
from pyspark.sql.types import StringType, IntegerType

import pandas as pd
# Show up to 200 characters per cell so long annotation columns stay readable.
pd.set_option('display.max_colwidth', 200)

import warnings
# Silences every Python warning for the rest of the session (not only
# library deprecation notices) -- remove this line when debugging.
warnings.filterwarnings('ignore')

# Spark configuration handed to the licensed session starter.
params = {
    "spark.driver.memory": "16G",
    "spark.kryoserializer.buffer.max": "2000M",
    "spark.driver.maxResultSize": "2000M",
}

# The SECRET entry of the license file is required to start the session.
spark = sparknlp_jsl.start(license_keys['SECRET'], params=params)

# Report the installed library versions for reproducibility.
print(f"Spark NLP Version : {sparknlp.version()}")
print(f"Spark NLP_JSL Version : {sparknlp_jsl.version()}")

# Last expression: render the session object as the cell output.
spark

Spark NLP Version : 4.2.8
Spark NLP_JSL Version : 4.2.8


## 2. Select the Relation Extraction model and construct the pipeline

Select the models:


* Clinical Relation Extraction models: **re_test_problem_finding**




For more details, see the [Clinical Relation Extraction training notebook](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/10.Clinical_Relation_Extraction.ipynb).

In [4]:
# Names of the pretrained models to download. Change a value to the model
# you want to use and re-run the cells below.
NER_MODEL_NAME = "jsl_ner_wip_clinical"
RE_MODEL_NAME = "re_test_problem_finding"

Create the pipeline

In [5]:
# Annotators feeding the relation-extraction model. Each chain is wrapped in
# parentheses instead of using backslash continuations, so a stray trailing
# backslash can no longer splice the next statement onto a chain.

# Reads the raw "text" column and emits the "document" annotation.
documenter = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Splits each document into sentences.
sentencer = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentences")
)

# Splits each sentence into tokens.
tokenizer = (
    Tokenizer()
    .setInputCols(["sentences"])
    .setOutputCol("tokens")
)

# Pretrained clinical word embeddings; consumed by the NER tagger here and by
# the relation-extraction model defined after this block.
words_embedder = (
    WordEmbeddingsModel()
    .pretrained("embeddings_clinical", "en", "clinical/models")
    .setInputCols(["sentences", "tokens"])
    .setOutputCol("embeddings")
)

# Pretrained clinical part-of-speech tagger; its tags feed the dependency
# parser and the relation-extraction model.
pos_tagger = (
    PerceptronModel()
    .pretrained("pos_clinical", "en", "clinical/models")
    .setInputCols(["sentences", "tokens"])
    .setOutputCol("pos_tags")
)

# Token-level NER tags from the model selected via NER_MODEL_NAME.
clinical_ner_tagger = (
    MedicalNerModel()
    .pretrained(NER_MODEL_NAME, "en", "clinical/models")
    .setInputCols(["sentences", "tokens", "embeddings"])
    .setOutputCol("ner_tags")
)

# Groups the token-level tags into entity chunks.
ner_chunker = (
    NerConverterInternal()
    .setInputCols(["sentences", "tokens", "ner_tags"])
    .setOutputCol("ner_chunks")
)

# Pretrained dependency parser; its output is an input of the
# relation-extraction model.
dependency_parser = (
    DependencyParserModel()
    .pretrained("dependency_conllu", "en")
    .setInputCols(["sentences", "pos_tags", "tokens"])
    .setOutputCol("dependencies")
)

# Relation-extraction stage, loaded from the model named in RE_MODEL_NAME.
# It reads the embeddings, POS tags, NER chunks and dependency annotations
# and writes its predictions to the "relations" column.
# NOTE(review): the distance limit (20) and threshold (0.5) are taken as-is
# from the original notebook; their tuning rationale is not documented here.
clinical_re_Model = (
    RelationExtractionModel()
    .pretrained(RE_MODEL_NAME, "en", 'clinical/models')
    .setInputCols(["embeddings", "pos_tags", "ner_chunks", "dependencies"])
    .setOutputCol("relations")
    .setMaxSyntacticDistance(20)
    .setPredictionThreshold(0.5)
    .setRelationPairs([
        "procedure-symptom",
        "test-symptom",
        "test-test_result",
        "test-injury_or_poisoning",
        "injury_or_poisoning-symptom",
        "test_result-test",
        "test-heart_disease",
        "test_result-symptom",
        "test-ekg_findings",
        "imagingfindings-test",
        "imagingfindings-symptom",
        "imagingfindings-imaging_technique",
        "test-imagingfindings",
        "procedure-imagingfindings",
        "imaging_technique-test_result",
        "test-disease_syndrome_disorder",
        "imaging_technique-symptom",
        "symptom-disease_syndrome_disorder",
    ])
)

# Replace the model's raw "1"/"0" labels with readable names in the output.
clinical_re_Model.setCustomLabels({"1": "is_related", "0": "not_related"})

# Assemble the stages in dependency order: every stage only reads columns
# produced by a stage listed before it.
pipeline = Pipeline(stages=[
    documenter,
    sentencer,
    tokenizer,
    words_embedder,
    pos_tagger,
    clinical_ner_tagger,
    ner_chunker,
    dependency_parser,
    clinical_re_Model,
])

# Fit on a one-row frame holding an empty string to obtain a PipelineModel.
empty_df = spark.createDataFrame([['']]).toDF("text")
pipeline_model = pipeline.fit(empty_df)

# LightPipeline wrapper, used later to annotate plain Python strings.
light_pipeline = LightPipeline(pipeline_model)

embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[OK!]
pos_clinical download started this may take some time.
Approximate size to download 1.5 MB
[OK!]
jsl_ner_wip_clinical download started this may take some time.
[OK!]
dependency_conllu download started this may take some time.
Approximate size to download 16.7 MB
[OK!]
re_test_problem_finding download started this may take some time.
Approximate size to download 9 MB
[OK!]


## 3. Create example inputs

In [6]:
import pandas as pd
# Example report snippets; add, remove or edit the strings in this list.

input_list = [
    """Echocardiogram showed: AV fibrosis/calcification, AV stenosis/insufficiency, MV stenosis with annular calcification and regurgitation, moderate TR, decreased LV systolic function, severe LAE.""",
    """Coronary artery CTA with cardiac function revealed codominance between the right coronary artery and circumflex artery and there is mild to moderate stenosis of the proximal LAD. CCTA reveals dense calcific plaque within the left circumflex artery without evidence of occlusion.""",
    """MRI revealed a transverse fracture of the anterior superior calcaneal process of the calcaneus, no evidence of active marrow stress phenomenon.""",
    """MRI brain revealed increased T2 signal within the periventricular white matter, bilaterally. EEG reportedly showed diffuse slowing.""",
    """Bilateral breast ultrasound was subsequently performed, which demonstrated an ovoid mass measuring approximately 0.5 x 0.5 x 0.4 cm in diameter located within the anteromedial aspect of the left shoulder. This mass demonstrates isoechoic echotexture to the adjacent muscle, with no evidence of internal color flow.""",
]

# One row per snippet, in a single string column named "text" (the column
# the pipeline's first stage reads).
df = (
    spark
    .createDataFrame(input_list, StringType())
    .toDF("text")
)
df.show(truncate=100)

+----------------------------------------------------------------------------------------------------+
|                                                                                                text|
+----------------------------------------------------------------------------------------------------+
|Echocardiogram showed: AV fibrosis/calcification, AV stenosis/insufficiency, MV stenosis with ann...|
|Coronary artery CTA with cardiac function revealed codominance between the right coronary artery ...|
|MRI revealed a transverse fracture of the anterior superior calcaneal process of the calcaneus, n...|
|MRI brain revealed increased T2 signal within the periventricular white matter, bilaterally. EEG ...|
|Bilateral breast ultrasound was subsequently performed, which demonstrated an ovoid mass measurin...|
+----------------------------------------------------------------------------------------------------+



## 4. Run the pipeline

In [7]:
# Annotate all example rows with the fitted pipeline and collect the output
# into a pandas DataFrame on the driver (fine for a handful of rows).
result = (
    pipeline_model
    .transform(df)
    .toPandas()
)
result

Unnamed: 0,text,document,sentences,tokens,embeddings,pos_tags,ner_tags,ner_chunks,dependencies,relations
0,"Echocardiogram showed: AV fibrosis/calcification, AV stenosis/insufficiency, MV stenosis with annular calcification and regurgitation, moderate TR, decreased LV systolic function, severe LAE.","[(document, 0, 190, Echocardiogram showed: AV fibrosis/calcification, AV stenosis/insufficiency, MV stenosis with annular calcification and regurgitation, moderate TR, decreased LV systolic functi...","[(document, 0, 190, Echocardiogram showed: AV fibrosis/calcification, AV stenosis/insufficiency, MV stenosis with annular calcification and regurgitation, moderate TR, decreased LV systolic functi...","[(token, 0, 13, Echocardiogram, {'sentence': '0'}, []), (token, 15, 20, showed, {'sentence': '0'}, []), (token, 21, 21, :, {'sentence': '0'}, []), (token, 23, 24, AV, {'sentence': '0'}, []), (toke...","[(word_embeddings, 0, 13, Echocardiogram, {'sentence': '0', 'isOOV': 'false', 'isWordStart': 'true', 'pieceId': '-1', 'token': 'Echocardiogram'}, [-0.07145026326179504, 0.5303494334220886, 0.66936...","[(pos, 0, 13, NN, {'sentence': '0', 'word': 'Echocardiogram'}, []), (pos, 15, 20, VVD, {'sentence': '0', 'word': 'showed'}, []), (pos, 21, 21, NN, {'sentence': '0', 'word': ':'}, []), (pos, 23, 24...","[(named_entity, 0, 13, B-Test, {'sentence': '0', 'word': 'Echocardiogram', 'confidence': '0.9941'}, []), (named_entity, 15, 20, O, {'sentence': '0', 'word': 'showed', 'confidence': '0.9968'}, []),...","[(chunk, 0, 13, Echocardiogram, {'sentence': '0', 'chunk': '0', 'ner_source': 'ner_chunks', 'entity': 'Test', 'confidence': '0.9941'}, []), (chunk, 23, 47, AV fibrosis/calcification, {'sentence': ...","[(dependency, 0, 13, showed, {'head': '2', 'head.end': '20', 'sentence': '0', 'head.begin': '15'}, []), (dependency, 15, 20, ROOT, {'head': '0', 'head.end': '-1', 'sentence': '0', 'head.begin': '-...","[(category, 0, 47, is_related, {'entity1': 'Test', 'entity1_begin': '0', 'entity2': 'Heart_Disease', 'chunk1_confidence': '0.9941', 'chunk1': 'Echocardiogram', 'chunk2': 'AV 
fibrosis/calcification..."
1,Coronary artery CTA with cardiac function revealed codominance between the right coronary artery and circumflex artery and there is mild to moderate stenosis of the proximal LAD. CCTA reveals dens...,"[(document, 0, 277, Coronary artery CTA with cardiac function revealed codominance between the right coronary artery and circumflex artery and there is mild to moderate stenosis of the proximal LA...","[(document, 0, 177, Coronary artery CTA with cardiac function revealed codominance between the right coronary artery and circumflex artery and there is mild to moderate stenosis of the proximal LA...","[(token, 0, 7, Coronary, {'sentence': '0'}, []), (token, 9, 14, artery, {'sentence': '0'}, []), (token, 16, 18, CTA, {'sentence': '0'}, []), (token, 20, 23, with, {'sentence': '0'}, []), (token, 2...","[(word_embeddings, 0, 7, Coronary, {'sentence': '0', 'isOOV': 'false', 'isWordStart': 'true', 'pieceId': '-1', 'token': 'Coronary'}, [0.3791410028934479, 0.057826247066259384, 0.04966732859611511,...","[(pos, 0, 7, JJ, {'sentence': '0', 'word': 'Coronary'}, []), (pos, 9, 14, NN, {'sentence': '0', 'word': 'artery'}, []), (pos, 16, 18, NN, {'sentence': '0', 'word': 'CTA'}, []), (pos, 20, 23, II, {...","[(named_entity, 0, 7, B-Test, {'sentence': '0', 'word': 'Coronary', 'confidence': '0.7048'}, []), (named_entity, 9, 14, I-Test, {'sentence': '0', 'word': 'artery', 'confidence': '0.3439'}, []), (n...","[(chunk, 0, 18, Coronary artery CTA, {'sentence': '0', 'chunk': '0', 'ner_source': 'ner_chunks', 'entity': 'Test', 'confidence': '0.65099996'}, []), (chunk, 25, 40, cardiac function, {'sentence': ...","[(dependency, 0, 7, ROOT, {'head': '0', 'head.end': '-1', 'sentence': '0', 'head.begin': '-1'}, []), (dependency, 9, 14, CTA, {'head': '3', 'head.end': '18', 'sentence': '0', 'head.begin': '16'}, ...","[(category, 0, 176, is_related, {'entity1': 'Test', 'entity1_begin': '0', 'entity2': 'Heart_Disease', 'chunk1_confidence': '0.65099996', 'chunk1': 'Coronary artery CTA', 'chunk2': 
'stenosis of the..."
2,"MRI revealed a transverse fracture of the anterior superior calcaneal process of the calcaneus, no evidence of active marrow stress phenomenon.","[(document, 0, 142, MRI revealed a transverse fracture of the anterior superior calcaneal process of the calcaneus, no evidence of active marrow stress phenomenon., {'sentence': '0'}, [])]","[(document, 0, 142, MRI revealed a transverse fracture of the anterior superior calcaneal process of the calcaneus, no evidence of active marrow stress phenomenon., {'sentence': '0'}, [])]","[(token, 0, 2, MRI, {'sentence': '0'}, []), (token, 4, 11, revealed, {'sentence': '0'}, []), (token, 13, 13, a, {'sentence': '0'}, []), (token, 15, 24, transverse, {'sentence': '0'}, []), (token, ...","[(word_embeddings, 0, 2, MRI, {'sentence': '0', 'isOOV': 'false', 'isWordStart': 'true', 'pieceId': '-1', 'token': 'MRI'}, [0.30669355392456055, -0.18632292747497559, 0.6304829716682434, -0.174884...","[(pos, 0, 2, NN, {'sentence': '0', 'word': 'MRI'}, []), (pos, 4, 11, VVD, {'sentence': '0', 'word': 'revealed'}, []), (pos, 13, 13, DD, {'sentence': '0', 'word': 'a'}, []), (pos, 15, 24, JJ, {'sen...","[(named_entity, 0, 2, B-Test, {'sentence': '0', 'word': 'MRI', 'confidence': '0.9993'}, []), (named_entity, 4, 11, O, {'sentence': '0', 'word': 'revealed', 'confidence': '0.9969'}, []), (named_ent...","[(chunk, 0, 2, MRI, {'sentence': '0', 'chunk': '0', 'ner_source': 'ner_chunks', 'entity': 'Test', 'confidence': '0.9993'}, []), (chunk, 15, 24, transverse, {'sentence': '0', 'chunk': '1', 'ner_sou...","[(dependency, 0, 2, revealed, {'head': '2', 'head.end': '11', 'sentence': '0', 'head.begin': '4'}, []), (dependency, 4, 11, ROOT, {'head': '0', 'head.end': '-1', 'sentence': '0', 'head.begin': '-1...","[(category, 0, 33, is_related, {'entity1': 'Test', 'entity1_begin': '0', 'entity2': 'Injury_or_Poisoning', 'chunk1_confidence': '0.9993', 'chunk1': 'MRI', 'chunk2': 'fracture', 'entity2_begin': '2..."
3,"MRI brain revealed increased T2 signal within the periventricular white matter, bilaterally. EEG reportedly showed diffuse slowing.","[(document, 0, 130, MRI brain revealed increased T2 signal within the periventricular white matter, bilaterally. EEG reportedly showed diffuse slowing., {'sentence': '0'}, [])]","[(document, 0, 91, MRI brain revealed increased T2 signal within the periventricular white matter, bilaterally., {'sentence': '0'}, []), (document, 93, 130, EEG reportedly showed diffuse slowing.,...","[(token, 0, 2, MRI, {'sentence': '0'}, []), (token, 4, 8, brain, {'sentence': '0'}, []), (token, 10, 17, revealed, {'sentence': '0'}, []), (token, 19, 27, increased, {'sentence': '0'}, []), (token...","[(word_embeddings, 0, 2, MRI, {'sentence': '0', 'isOOV': 'false', 'isWordStart': 'true', 'pieceId': '-1', 'token': 'MRI'}, [0.30669355392456055, -0.18632292747497559, 0.6304829716682434, -0.174884...","[(pos, 0, 2, NN, {'sentence': '0', 'word': 'MRI'}, []), (pos, 4, 8, NN, {'sentence': '0', 'word': 'brain'}, []), (pos, 10, 17, VVD, {'sentence': '0', 'word': 'revealed'}, []), (pos, 19, 27, VVNJ, ...","[(named_entity, 0, 2, B-Test, {'sentence': '0', 'word': 'MRI', 'confidence': '0.9983'}, []), (named_entity, 4, 8, I-Test, {'sentence': '0', 'word': 'brain', 'confidence': '0.8312'}, []), (named_en...","[(chunk, 0, 8, MRI brain, {'sentence': '0', 'chunk': '0', 'ner_source': 'ner_chunks', 'entity': 'Test', 'confidence': '0.91475'}, []), (chunk, 19, 37, increased T2 signal, {'sentence': '0', 'chunk...","[(dependency, 0, 2, ROOT, {'head': '0', 'head.end': '-1', 'sentence': '0', 'head.begin': '-1'}, []), (dependency, 4, 8, MRI, {'head': '1', 'head.end': '2', 'sentence': '0', 'head.begin': '0'}, [])...","[(category, 0, 37, is_related, {'entity1': 'Test', 'entity1_begin': '0', 'entity2': 'ImagingFindings', 'chunk1_confidence': '0.91475', 'chunk1': 'MRI brain', 'chunk2': 'increased T2 signal', 'enti..."
4,"Bilateral breast ultrasound was subsequently performed, which demonstrated an ovoid mass measuring approximately 0.5 x 0.5 x 0.4 cm in diameter located within the anteromedial aspect of the left s...","[(document, 0, 313, Bilateral breast ultrasound was subsequently performed, which demonstrated an ovoid mass measuring approximately 0.5 x 0.5 x 0.4 cm in diameter located within the anteromedial ...","[(document, 0, 203, Bilateral breast ultrasound was subsequently performed, which demonstrated an ovoid mass measuring approximately 0.5 x 0.5 x 0.4 cm in diameter located within the anteromedial ...","[(token, 0, 8, Bilateral, {'sentence': '0'}, []), (token, 10, 15, breast, {'sentence': '0'}, []), (token, 17, 26, ultrasound, {'sentence': '0'}, []), (token, 28, 30, was, {'sentence': '0'}, []), (...","[(word_embeddings, 0, 8, Bilateral, {'sentence': '0', 'isOOV': 'false', 'isWordStart': 'true', 'pieceId': '-1', 'token': 'Bilateral'}, [-0.5400417447090149, 0.24526117742061615, -0.067149899899959...","[(pos, 0, 8, JJ, {'sentence': '0', 'word': 'Bilateral'}, []), (pos, 10, 15, NN, {'sentence': '0', 'word': 'breast'}, []), (pos, 17, 26, NN, {'sentence': '0', 'word': 'ultrasound'}, []), (pos, 28, ...","[(named_entity, 0, 8, B-Direction, {'sentence': '0', 'word': 'Bilateral', 'confidence': '0.9055'}, []), (named_entity, 10, 15, B-Test, {'sentence': '0', 'word': 'breast', 'confidence': '0.7009'}, ...","[(chunk, 0, 8, Bilateral, {'sentence': '0', 'chunk': '0', 'ner_source': 'ner_chunks', 'entity': 'Direction', 'confidence': '0.9055'}, []), (chunk, 10, 26, breast ultrasound, {'sentence': '0', 'chu...","[(dependency, 0, 8, ROOT, {'head': '0', 'head.end': '-1', 'sentence': '0', 'head.begin': '-1'}, []), (dependency, 10, 15, ultrasound, {'head': '3', 'head.end': '26', 'sentence': '0', 'head.begin':...","[(category, 10, 87, is_related, {'entity1': 'Test', 'entity1_begin': '10', 'entity2': 'Symptom', 'chunk1_confidence': '0.8014', 'chunk1': 'breast ultrasound', 'chunk2': 'mass', 
'entity2_begin': '8..."


In [8]:
# Relation annotations predicted for the first example (row label 0).
result.loc[0, "relations"]

[Row(annotatorType='category', begin=0, end=47, result='is_related', metadata={'entity1': 'Test', 'entity1_begin': '0', 'entity2': 'Heart_Disease', 'chunk1_confidence': '0.9941', 'chunk1': 'Echocardiogram', 'chunk2': 'AV fibrosis/calcification', 'entity2_begin': '23', 'confidence': '1.0', 'entity1_end': '13', 'entity2_end': '47', 'chunk2_confidence': '0.62409997', 'direction': 'both'}, embeddings=[]),
 Row(annotatorType='category', begin=0, end=74, result='is_related', metadata={'entity1': 'Test', 'entity1_begin': '0', 'entity2': 'EKG_Findings', 'chunk1_confidence': '0.9941', 'chunk1': 'Echocardiogram', 'chunk2': 'AV stenosis/insufficiency', 'entity2_begin': '50', 'confidence': '0.9999999', 'entity1_end': '13', 'entity2_end': '74', 'chunk2_confidence': '0.512', 'direction': 'both'}, embeddings=[]),
 Row(annotatorType='category', begin=0, end=87, result='is_related', metadata={'entity1': 'Test', 'entity1_begin': '0', 'entity2': 'Heart_Disease', 'chunk1_confidence': '0.9941', 'chunk1': '

## 5. Visualize

In [9]:
from sparknlp_display import RelationExtractionVisualizer

vis = RelationExtractionVisualizer()

# The visualizer takes the annotations of ONE example at a time, so each text
# is annotated with the light pipeline and rendered individually.
for example_text in input_list:
    light_result = light_pipeline.fullAnnotate(example_text)[0]
    vis.display(
        light_result,               # result of a single example, not the whole dataframe
        relation_col='relations',   # column holding the relation annotations
        document_col='document',    # column holding the document annotation
        show_relations=True,        # default show_relations: True
    )