

![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/healthcare_jsl/RE_RADIOLOGY.ipynb)




# **Extract relations between problem, test, and findings in reports**

Find relations between diagnoses, tests, and imaging findings in radiology reports. The model assigns each candidate entity pair one of two labels: `1` (the two entities are related) or `0` (the two entities are not related).




## 1. Colab Setup

In [None]:
# Install the johnsnowlabs library to access Spark-OCR and Spark-NLP for Healthcare, Finance, and Legal.
! pip install -q johnsnowlabs

In [None]:
from google.colab import files
# Ask the user for the license file; whatever files.upload() returns is kept in license_keys.
print("Please Upload your John Snow Labs License using the button below")
license_keys = files.upload()

In [None]:
# NOTE(review): `jsl` is not imported by name; it is presumably provided by this wildcard import.
from johnsnowlabs import *

# After uploading your license, run this to install all licensed Python wheels and pre-download the JARs for the Spark session JVM.
# Make sure to restart your notebook afterwards for the changes to take effect.

jsl.install()

## Start Session

In [None]:
# Imported again in this cell — presumably so it still works after the notebook restart requested by the install step.
from johnsnowlabs import *
# Automatically load the license data and start a session with all the JARs the user has access to.
# The resulting `spark` object is what later cells use to create DataFrames.
spark = jsl.start()

## 2. Select the Relation Extraction model and construct the pipeline

Select the models:


* Clinical Relation Extraction models: **re_test_problem_finding**




For more details: https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/10.Clinical_Relation_Extraction.ipynb

In [None]:
# Change this to the model you want to use and re-run the cells below.
# Name handed to RelationExtractionModel.pretrained(...) when the pipeline is built.
RE_MODEL_NAME = "re_test_problem_finding"
# Name handed to NerModel.pretrained(...); its tags are converted into the chunks fed to the relation model.
NER_MODEL_NAME = "jsl_ner_wip_clinical"

Create the pipeline

In [None]:
# Build the relation-extraction pipeline stage by stage. Every chain is wrapped
# in parentheses instead of using backslash continuations, so a stray trailing
# "\" can no longer splice two statements together.

# --- Text preparation: raw text -> document -> sentences -> tokens ----------
documenter = (
    nlp.DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

sentencer = (
    nlp.SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentences")
)

tokenizer = (
    nlp.Tokenizer()
    .setInputCols(["sentences"])
    .setOutputCol("tokens")
)

# --- Token-level features consumed by the NER and relation models -----------
# `pretrained` is called on the class rather than on a throw-away instance.
words_embedder = (
    nlp.WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
    .setInputCols(["sentences", "tokens"])
    .setOutputCol("embeddings")
)

pos_tagger = (
    nlp.PerceptronModel.pretrained("pos_clinical", "en", "clinical/models")
    .setInputCols(["sentences", "tokens"])
    .setOutputCol("pos_tags")
)

# --- Entity recognition: per-token tags, then merged into chunks ------------
clinical_ner_tagger = (
    medical.NerModel.pretrained(NER_MODEL_NAME, "en", "clinical/models")
    .setInputCols(["sentences", "tokens", "embeddings"])
    .setOutputCol("ner_tags")
)

ner_chunker = (
    medical.NerConverterInternal()
    .setInputCols(["sentences", "tokens", "ner_tags"])
    .setOutputCol("ner_chunks")
)

# --- Dependency parse, required as an input column by the relation model ----
dependency_parser = (
    nlp.DependencyParserModel.pretrained("dependency_conllu", "en")
    .setInputCols(["sentences", "pos_tags", "tokens"])
    .setOutputCol("dependencies")
)

# Entity-label pairs handed to setRelationPairs below.
# NOTE(review): presumably only chunk pairs whose labels match one of these
# entries are scored — confirm against the RelationExtractionModel docs.
RELATION_PAIRS = [
    "procedure-symptom",
    "test-symptom",
    "test-test_result",
    "test-injury_or_poisoning",
    "injury_or_poisoning-symptom",
    "test_result-test",
    "test-heart_disease",
    "test_result-symptom",
    "test-ekg_findings",
    "imagingfindings-test",
    "imagingfindings-symptom",
    "imagingfindings-imaging_technique",
    "test-imagingfindings",
    "procedure-imagingfindings",
    "imaging_technique-test_result",
    "test-disease_syndrome_disorder",
    "imaging_technique-symptom",
    "symptom-disease_syndrome_disorder",
]

# --- Relation extraction ----------------------------------------------------
clinical_re_Model = (
    medical.RelationExtractionModel.pretrained(RE_MODEL_NAME, "en", "clinical/models")
    .setInputCols(["embeddings", "pos_tags", "ner_chunks", "dependencies"])
    .setOutputCol("relations")
    .setMaxSyntacticDistance(20)
    .setPredictionThreshold(0.5)
    .setRelationPairs(RELATION_PAIRS)
)

# Replace the model's raw "1"/"0" labels with readable names in the output.
clinical_re_Model.setCustomLabels({"1": "is_related", "0": "not_related"})

# `Pipeline` and `LightPipeline` are taken from the `nlp` namespace, like every
# other Spark NLP symbol in this cell, instead of relying on the wildcard import.
pipeline = nlp.Pipeline(
    stages=[
        documenter,
        sentencer,
        tokenizer,
        words_embedder,
        pos_tagger,
        clinical_ner_tagger,
        ner_chunker,
        dependency_parser,
        clinical_re_Model,
    ]
)

# All models above are loaded with `pretrained`, so `fit` is run on a one-row
# placeholder DataFrame just to obtain a PipelineModel.
empty_df = spark.createDataFrame([[""]]).toDF("text")
pipeline_model = pipeline.fit(empty_df)

# LightPipeline wrapper around the fitted model; used by the visualization
# cell to annotate plain Python strings.
light_pipeline = nlp.LightPipeline(pipeline_model)

embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[OK!]
pos_clinical download started this may take some time.
Approximate size to download 1.5 MB
[OK!]
jsl_ner_wip_clinical download started this may take some time.
[OK!]
dependency_conllu download started this may take some time.
Approximate size to download 16.7 MB
[OK!]
re_test_problem_finding download started this may take some time.
Approximate size to download 9 MB
[OK!]


## 3. Create example inputs

In [None]:
import pandas as pd

# Enter examples as strings in this list.
input_list = [
    """Echocardiogram showed: AV fibrosis/calcification, AV stenosis/insufficiency, MV stenosis with annular calcification and regurgitation, moderate TR, decreased LV systolic function, severe LAE.""",
    """Coronary artery CTA with cardiac function revealed codominance between the right coronary artery and circumflex artery and there is mild to moderate stenosis of the proximal LAD. CCTA reveals dense calcific plaque within the left circumflex artery without evidence of occlusion.""",
    """MRI revealed a transverse fracture of the anterior superior calcaneal process of the calcaneus, no evidence of active marrow stress phenomenon.""",
    """MRI brain revealed increased T2 signal within the periventricular white matter, bilaterally. EEG reportedly showed diffuse slowing.""",
    """Bilateral breast ultrasound was subsequently performed, which demonstrated an ovoid mass measuring approximately 0.5 x 0.5 x 0.4 cm in diameter located within the anteromedial aspect of the left shoulder. This mass demonstrates isoechoic echotexture to the adjacent muscle, with no evidence of internal color flow.""",
]

# One row per example in a single "text" column, the column the pipeline's
# DocumentAssembler reads. The rows are plain Python lists, the same way the
# placeholder DataFrame used for fitting is built, so this cell does not depend
# on `StringType` having been brought into scope by a wildcard import.
df = spark.createDataFrame([[text] for text in input_list]).toDF("text")

# Preview the inputs, truncating each cell to 100 characters.
df.show(truncate=100)

+----------------------------------------------------------------------------------------------------+
|                                                                                                text|
+----------------------------------------------------------------------------------------------------+
|Echocardiogram showed: AV fibrosis/calcification, AV stenosis/insufficiency, MV stenosis with ann...|
|Coronary artery CTA with cardiac function revealed codominance between the right coronary artery ...|
|MRI revealed a transverse fracture of the anterior superior calcaneal process of the calcaneus, n...|
|MRI brain revealed increased T2 signal within the periventricular white matter, bilaterally. EEG ...|
|Bilateral breast ultrasound was subsequently performed, which demonstrated an ovoid mass measurin...|
+----------------------------------------------------------------------------------------------------+



## 4. Run the pipeline

In [None]:
# Annotate every example with the fitted pipeline; the output adds one column per stage.
annotated_df = pipeline_model.transform(df)

# Convert to pandas so the notebook renders the result as a table.
result = annotated_df.toPandas()
result

Unnamed: 0,text,document,sentences,tokens,embeddings,pos_tags,ner_tags,ner_chunks,dependencies,relations
0,Echocardiogram showed: AV fibrosis/calcificati...,"[(document, 0, 190, Echocardiogram showed: AV ...","[(document, 0, 190, Echocardiogram showed: AV ...","[(token, 0, 13, Echocardiogram, {'sentence': '...","[(word_embeddings, 0, 13, Echocardiogram, {'se...","[(pos, 0, 13, NN, {'sentence': '0', 'word': 'E...","[(named_entity, 0, 13, B-Test, {'sentence': '0...","[(chunk, 0, 13, Echocardiogram, {'sentence': '...","[(dependency, 0, 13, showed, {'head': '2', 'he...","[(category, 0, 47, is_related, {'entity1': 'Te..."
1,Coronary artery CTA with cardiac function reve...,"[(document, 0, 277, Coronary artery CTA with c...","[(document, 0, 177, Coronary artery CTA with c...","[(token, 0, 7, Coronary, {'sentence': '0'}, []...","[(word_embeddings, 0, 7, Coronary, {'sentence'...","[(pos, 0, 7, JJ, {'sentence': '0', 'word': 'Co...","[(named_entity, 0, 7, B-Test, {'sentence': '0'...","[(chunk, 0, 18, Coronary artery CTA, {'sentenc...","[(dependency, 0, 7, ROOT, {'head': '0', 'head....","[(category, 0, 176, is_related, {'entity1': 'T..."
2,MRI revealed a transverse fracture of the ante...,"[(document, 0, 142, MRI revealed a transverse ...","[(document, 0, 142, MRI revealed a transverse ...","[(token, 0, 2, MRI, {'sentence': '0'}, []), (t...","[(word_embeddings, 0, 2, MRI, {'sentence': '0'...","[(pos, 0, 2, NN, {'sentence': '0', 'word': 'MR...","[(named_entity, 0, 2, B-Test, {'sentence': '0'...","[(chunk, 0, 2, MRI, {'sentence': '0', 'chunk':...","[(dependency, 0, 2, revealed, {'head': '2', 'h...","[(category, 0, 33, is_related, {'entity1': 'Te..."
3,MRI brain revealed increased T2 signal within ...,"[(document, 0, 130, MRI brain revealed increas...","[(document, 0, 91, MRI brain revealed increase...","[(token, 0, 2, MRI, {'sentence': '0'}, []), (t...","[(word_embeddings, 0, 2, MRI, {'sentence': '0'...","[(pos, 0, 2, NN, {'sentence': '0', 'word': 'MR...","[(named_entity, 0, 2, B-Test, {'sentence': '0'...","[(chunk, 0, 8, MRI brain, {'sentence': '0', 'c...","[(dependency, 0, 2, ROOT, {'head': '0', 'head....","[(category, 0, 37, is_related, {'entity1': 'Te..."
4,Bilateral breast ultrasound was subsequently p...,"[(document, 0, 313, Bilateral breast ultrasoun...","[(document, 0, 203, Bilateral breast ultrasoun...","[(token, 0, 8, Bilateral, {'sentence': '0'}, [...","[(word_embeddings, 0, 8, Bilateral, {'sentence...","[(pos, 0, 8, JJ, {'sentence': '0', 'word': 'Bi...","[(named_entity, 0, 8, B-Direction, {'sentence'...","[(chunk, 0, 8, Bilateral, {'sentence': '0', 'c...","[(dependency, 0, 8, ROOT, {'head': '0', 'head....","[(category, 10, 87, is_related, {'entity1': 'T..."


In [None]:
# Relation annotations of the first example (row label 0).
result.loc[0, "relations"]

[Row(annotatorType='category', begin=0, end=47, result='is_related', metadata={'entity1': 'Test', 'entity1_begin': '0', 'entity2': 'Heart_Disease', 'chunk1': 'Echocardiogram', 'chunk2': 'AV fibrosis/calcification', 'entity2_begin': '23', 'confidence': '1.0', 'entity1_end': '13', 'entity2_end': '47'}, embeddings=[]),
 Row(annotatorType='category', begin=0, end=74, result='is_related', metadata={'entity1': 'Test', 'entity1_begin': '0', 'entity2': 'EKG_Findings', 'chunk1': 'Echocardiogram', 'chunk2': 'AV stenosis/insufficiency', 'entity2_begin': '50', 'confidence': '0.9999999', 'entity1_end': '13', 'entity2_end': '74'}, embeddings=[]),
 Row(annotatorType='category', begin=0, end=87, result='is_related', metadata={'entity1': 'Test', 'entity1_begin': '0', 'entity2': 'Heart_Disease', 'chunk1': 'Echocardiogram', 'chunk2': 'MV stenosis', 'entity2_begin': '77', 'confidence': '1.0', 'entity1_end': '13', 'entity2_end': '87'}, embeddings=[]),
 Row(annotatorType='category', begin=0, end=114, result

## 5. Visualize

In [None]:
from sparknlp_display import RelationExtractionVisualizer

vis = RelationExtractionVisualizer()

# Draw one relation diagram per example text.
for text in input_list:
    # fullAnnotate is called with a single string; the first element of what it returns is used.
    light_result = light_pipeline.fullAnnotate(text)[0]
    vis.display(
        light_result,  # should be the results of a single example, not the complete dataframe
        relation_col="relations",  # specify relations column
        document_col="document",  # specify document column
        show_relations=True,  # default show_relations: True
    )