![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/10.1.Clinical_Relation_Extraction_BodyParts_Models.ipynb)

# 10.1 Clinical Relation Extraction BodyPart Models

(requires Spark NLP 2.7.1 and Spark NLP Healthcare 2.7.2)

In [None]:
import json

from google.colab import files

# Prompt for the license JSON through the Colab upload widget; the upload
# result is keyed by file name.
license_keys = files.upload()

# Take the first uploaded file name and parse that file as JSON.
license_path = list(license_keys.keys())[0]
with open(license_path) as license_file:
    license_keys = json.load(license_file)

# Show which keys the license file provides (values are not displayed).
license_keys.keys()

In [None]:
import os

# Install the headless OpenJDK 8 package and point JAVA_HOME / PATH at it.
! apt-get update -qq
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
! java -version

# `license_keys` is the dict parsed from the uploaded license JSON in the previous cell.
secret = license_keys['SECRET']

# Export the license and AWS credentials from the license file as environment variables.
os.environ['SPARK_NLP_LICENSE'] = license_keys['SPARK_NLP_LICENSE']
os.environ['AWS_ACCESS_KEY_ID']= license_keys['AWS_ACCESS_KEY_ID']
os.environ['AWS_SECRET_ACCESS_KEY'] = license_keys['AWS_SECRET_ACCESS_KEY']
# Library versions to install are taken from the license file as well.
version = license_keys['PUBLIC_VERSION']
jsl_version = license_keys['JSL_VERSION']

# pyspark is pinned to 2.4.4 here.
! pip install --ignore-installed -q pyspark==2.4.4

# The licensed package is installed from an extra index whose URL contains the secret.
# NOTE(review): this command is not run with -q, so the URL (including the secret) may be
# echoed in the cell output - clear the output before sharing the notebook.
! python -m pip install --upgrade spark-nlp-jsl==$jsl_version  --extra-index-url https://pypi.johnsnowlabs.com/$secret

! pip install --ignore-installed -q spark-nlp==$version

import sparknlp

# Print the installed Spark NLP version.
print (sparknlp.version())

# `json` and `os` were already imported earlier in the notebook; repeating them is harmless.
import json
import os
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession

# Star imports bring the annotator and base classes used by the cells below into the notebook namespace.
from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from sparknlp.base import *
import sparknlp_jsl

# Spark settings handed to sparknlp_jsl.start() together with the license secret.
params = {"spark.driver.memory":"16G",
"spark.kryoserializer.buffer.max":"2000M",
"spark.driver.maxResultSize":"2000M"}
spark = sparknlp_jsl.start(secret, params=params)

In [None]:
# Display the object returned by sparknlp_jsl.start(...) in the previous cell.
spark

# Prediction Pipeline for Clinical Binary Relation Models

In [None]:
import pandas as pd

# This function will be utilized to show prediction results in a dataframe
def get_relations_df(results, col='relations'):
    """Summarise the relation annotations of the first result as a DataFrame.

    Parameters
    ----------
    results : list
        Sequence whose first element maps an output column name to a list of
        annotation objects exposing ``.result`` and a ``.metadata`` mapping
        (the callers in this notebook pass the list returned by
        ``LightPipeline.fullAnnotate``). Only ``results[0]`` is read.
    col : str, default 'relations'
        Key under which the relation annotations are stored.

    Returns
    -------
    pandas.DataFrame
        One row per annotation with the columns ``relations``, ``entity1``,
        ``chunk1``, ``entity2``, ``chunk2`` and ``confidence``. An empty frame
        with those columns is returned when ``results`` is empty.

    Raises
    ------
    KeyError
        If ``col`` is missing from ``results[0]`` or an annotation lacks one
        of the expected metadata keys.
    """
    # Metadata keys read from every annotation, in output order. They double
    # as column labels, so each label is unique and matches the value it holds.
    metadata_keys = ['entity1', 'entity1_begin', 'entity1_end', 'chunk1',
                     'entity2', 'entity2_begin', 'entity2_end', 'chunk2',
                     'confidence']
    all_columns = ['relations'] + metadata_keys
    # Only entities, chunks and the prediction are shown; offsets are dropped.
    shown_columns = ['relations', 'entity1', 'chunk1',
                     'entity2', 'chunk2', 'confidence']

    # Nothing was annotated: return an empty, correctly labelled frame.
    if not results:
        return pd.DataFrame(columns=shown_columns)

    rel_pairs = [
        (rel.result, *(rel.metadata[key] for key in metadata_keys))
        for rel in results[0][col]
    ]

    rel_df = pd.DataFrame(rel_pairs, columns=all_columns)

    # Select by label rather than by position so the intent is explicit.
    return rel_df[shown_columns]

In [None]:
# run only once
# Shared pipeline stages, reused by every relation-extraction pipeline below.

# Raw text -> document annotation
documenter = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Document -> sentences
sentencer = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentences")
)

# Sentences -> tokens
tokenizer = (
    sparknlp.annotators.Tokenizer()
    .setInputCols(["sentences"])
    .setOutputCol("tokens")
)

# Pretrained clinical word embeddings
words_embedder = (
    WordEmbeddingsModel()
    .pretrained("embeddings_clinical", "en", "clinical/models")
    .setInputCols(["sentences", "tokens"])
    .setOutputCol("embeddings")
)

# Pretrained clinical part-of-speech tagger
pos_tagger = (
    PerceptronModel()
    .pretrained("pos_clinical", "en", "clinical/models")
    .setInputCols(["sentences", "tokens"])
    .setOutputCol("pos_tags")
)

# Pretrained dependency parser (consumes the POS tags)
dependency_parser = (
    sparknlp.annotators.DependencyParserModel()
    .pretrained("dependency_conllu", "en")
    .setInputCols(["sentences", "pos_tags", "tokens"])
    .setOutputCol("dependencies")
)

# Pretrained clinical NER model
clinical_ner_tagger = (
    sparknlp.annotators.NerDLModel()
    .pretrained('jsl_ner_wip_greedy_clinical', 'en', 'clinical/models')
    .setInputCols("sentences", "tokens", "embeddings")
    .setOutputCol("ner_tags")
)

# NER tags -> entity chunks
ner_chunker = (
    NerConverter()
    .setInputCols(["sentences", "tokens", "ner_tags"])
    .setOutputCol("ner_chunks")
)





embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[OK!]
pos_clinical download started this may take some time.
Approximate size to download 1.7 MB
[OK!]
dependency_conllu download started this may take some time.
Approximate size to download 16.6 MB
[OK!]
jsl_ner_wip_greedy_clinical download started this may take some time.
Approximate size to download 14.5 MB
[OK!]
re_bodypart_problem download started this may take some time.
Approximate size to download 9.2 MB
[OK!]


In [None]:
# Example entity pairs for each pretrained relation model.
# Each pair is written as '<entity1>-<entity2>' in lower case.
# bodypart entities >> ['external_body_part_or_region', 'internal_organ_or_component']

# 1. bodypart vs problem
pair1 = ['symptom-external_body_part_or_region', 'external_body_part_or_region-symptom']

# 2. bodypart vs procedure and test
pair2 = ['internal_organ_or_component-imagingtest',
 'imagingtest-internal_organ_or_component',
 'internal_organ_or_component-procedure',
 'procedure-internal_organ_or_component',
 'internal_organ_or_component-test',
 'test-internal_organ_or_component',
 'external_body_part_or_region-imagingtest',
 'imagingtest-external_body_part_or_region',
 'external_body_part_or_region-procedure',
 'procedure-external_body_part_or_region',
 'external_body_part_or_region-test',
 'test-external_body_part_or_region']

# 3. bodypart vs direction
pair3 = ['direction-external_body_part_or_region', 'external_body_part_or_region-direction',
        'internal_organ_or_component-direction','direction-internal_organ_or_component']

# 4. date vs other clinical entities
# date entities >> ['Date', 'RelativeDate', 'Duration', 'RelativeTime', 'Time']
# The last entry previously read 'delativedate-test', which names no entity
# in the list above; it is the RelativeDate entity.
pair4 = ['symptom-date', 'date-procedure', 'relativedate-test']

 **Pretrained relation model names** — use these names in `RelationExtractionModel()`:  
 
 + `re_bodypart_problem`  
 
 + `re_bodypart_directions`  
 
 + `re_bodypart_proceduretest`  
 
 + `re_date_clinical`  

In [None]:
# Load the pretrained body part / direction relation model and restrict it to
# the direction <-> body part entity pairs.
re_model = (
    RelationExtractionModel()
    .pretrained("re_bodypart_directions", "en", 'clinical/models')
    .setInputCols(["embeddings", "pos_tags", "ner_chunks", "dependencies"])
    .setOutputCol("relations")
    .setRelationPairs([
        'direction-external_body_part_or_region',
        'external_body_part_or_region-direction',
        'direction-internal_organ_or_component',
        'internal_organ_or_component-direction',
    ])
    .setMaxSyntacticDistance(3)
    .setPredictionThreshold(0.9)
)

# Assemble the shared stages with the relation model as the final stage.
pipeline_stages = [
    documenter,
    sentencer,
    tokenizer,
    words_embedder,
    pos_tagger,
    clinical_ner_tagger,
    ner_chunker,
    dependency_parser,
    re_model,
]
trained_pipeline = Pipeline(stages=pipeline_stages)

# Fit on a one-row DataFrame holding an empty string to obtain a fitted pipeline.
empty_data = spark.createDataFrame([[""]]).toDF("text")

loaded_re_model = trained_pipeline.fit(empty_data)

re_bodypart_directions download started this may take some time.
Approximate size to download 9.2 MB
[OK!]


## Sample clinical texts

In [None]:
# Sample 1: body part vs. problem
text1 = "No neurologic deficits other than some numbness in his left hand."

# Sample 2: body part vs. procedure and test
# Alternative sentences that can be swapped in for text2:
#text2 = 'Common bile duct was noted to be 10 mm in size on that ultrasound.'
#text2 = 'Biopsies of the distal duodenum, gastric antrum, distalesophagus were taken and sent for pathological evaluation.'
text2 = "TECHNIQUE IN DETAIL: After informed consent was obtained from the patient and his mother, the chest was scanned with portable ultrasound."

# Sample 3: body part vs. direction
text3 = "MRI demonstrated infarction in the upper brain stem , left cerebellum and  right basil ganglia"

# Sample 4: date vs. other clinical entities
text4 = "This 73 y/o patient had CT  on 1/12/95, with progressive memory and cognitive decline since 8/11/94."

**Get Single Prediction** with `LightPipeline()`

In [None]:
# Pick the sample text that matches the relation model loaded into the pipeline
text = text3

# Wrap the fitted pipeline in a LightPipeline and annotate the single string
loaded_re_model_light = LightPipeline(loaded_re_model)
annotations = loaded_re_model_light.fullAnnotate(text)

# get_relations_df() is the helper function defined earlier in this notebook
rel_df = get_relations_df(annotations)

print('\n', text)

# Show only the rows whose predicted label is not "0"
# (evaluate `rel_df` on its own to see every candidate pair)
rel_df.loc[rel_df["relations"] != "0"]




 MRI demonstrated infarction in the upper brain stem , left cerebellum and  right basil ganglia


Unnamed: 0,relations,entity1,chunk1,entity2,chunk2,confidence
0,1,Direction,upper,Internal_organ_or_component,brain stem,0.9999989
4,1,Direction,left,Internal_organ_or_component,cerebellum,1.0
7,1,Direction,right,Internal_organ_or_component,basil ganglia,1.0


In [None]:
# Previous cell content is merged in this function to get quick predictions, for custom cases please check parameters 
# in RelationExtractionModel()
def relation_exraction(model_name, pairs, text,
                       max_syntactic_distance=4, prediction_threshold=0.9):
    """Run one pretrained relation model on a single text and return the hits.

    Loads the named pretrained ``RelationExtractionModel``, appends it to the
    shared pipeline stages defined earlier in the notebook (``documenter``
    through ``dependency_parser``), fits the pipeline on an empty DataFrame,
    annotates ``text`` with a ``LightPipeline`` and prints the text.

    The model is loaded and the pipeline re-fitted on every call.

    Parameters
    ----------
    model_name : str
        Name of the pretrained relation model in ``clinical/models``.
    pairs : list of str
        Entity pairs passed to ``setRelationPairs``.
    text : str
        Text to annotate.
    max_syntactic_distance : int, default 4
        Value passed to ``setMaxSyntacticDistance``.
    prediction_threshold : float, default 0.9
        Value passed to ``setPredictionThreshold``.

    Returns
    -------
    pandas.DataFrame
        Rows produced by ``get_relations_df`` whose ``relations`` value is
        not the string ``"0"``.
    """
    re_model = (
        RelationExtractionModel()
        .pretrained(model_name, "en", 'clinical/models')
        .setInputCols(["embeddings", "pos_tags", "ner_chunks", "dependencies"])
        .setOutputCol("relations")
        .setRelationPairs(pairs)
        .setMaxSyntacticDistance(max_syntactic_distance)
        .setPredictionThreshold(prediction_threshold)
    )

    trained_pipeline = Pipeline(stages=[
        documenter,
        sentencer,
        tokenizer,
        words_embedder,
        pos_tagger,
        clinical_ner_tagger,
        ner_chunker,
        dependency_parser,
        re_model,
    ])

    # Fit on a one-row DataFrame holding an empty string to obtain a fitted pipeline.
    empty_data = spark.createDataFrame([[""]]).toDF("text")
    loaded_re_model = trained_pipeline.fit(empty_data)

    annotations = LightPipeline(loaded_re_model).fullAnnotate(text)

    # get_relations_df() is the helper function defined earlier in this notebook
    rel_df = get_relations_df(annotations)

    print('\n', text)

    # Keep only the rows whose predicted label is not "0".
    return rel_df[rel_df.relations != "0"]


# Correctly spelled alias; the original name is kept so existing calls keep working.
relation_extraction = relation_exraction
    



In [None]:
# Body part vs. problem: run the re_bodypart_problem model on the first sample text
model_name = 're_bodypart_problem'
pairs = pair1  # same two symptom / external body part pairs defined with the example pairs
text = text1

relation_exraction(model_name=model_name, pairs=pairs, text=text)


re_bodypart_problem download started this may take some time.
Approximate size to download 9.2 MB
[OK!]

 No neurologic deficits other than some numbness in his left hand.


Unnamed: 0,relations,entity1,chunk1,entity2,chunk2,confidence
0,1,Symptom,numbness,External_body_part_or_region,hand,1.0


In [None]:
# Body part vs. procedure and test: run the re_bodypart_proceduretest model on the second sample text
model_name = 're_bodypart_proceduretest'
pairs, text = pair2, text2

relation_exraction(model_name=model_name, pairs=pairs, text=text)


re_bodypart_proceduretest download started this may take some time.
Approximate size to download 9.2 MB
[OK!]

 TECHNIQUE IN DETAIL: After informed consent was obtained from the patient and his mother, the chest was scanned with portable ultrasound.


Unnamed: 0,relations,entity1,chunk1,entity2,chunk2,confidence
0,1,External_body_part_or_region,chest,Test,portable ultrasound,1.0


In [None]:
# Body part vs. direction: run the re_bodypart_directions model on the third sample text
model_name = 're_bodypart_directions'
pairs, text = pair3, text3

relation_exraction(model_name=model_name, pairs=pairs, text=text)


re_bodypart_directions download started this may take some time.
Approximate size to download 9.2 MB
[OK!]

 MRI demonstrated infarction in the upper brain stem , left cerebellum and  right basil ganglia


Unnamed: 0,relations,entity1,chunk1,entity2,chunk2,confidence
0,1,Direction,upper,Internal_organ_or_component,brain stem,0.9999989
4,1,Direction,left,Internal_organ_or_component,cerebellum,1.0
7,1,Direction,right,Internal_organ_or_component,basil ganglia,1.0


In [None]:
# Date vs. other clinical entities: run the re_date_clinical model on the fourth sample text
model_name = 're_date_clinical'
pairs, text = pair4, text4

relation_exraction(model_name=model_name, pairs=pairs, text=text)


re_date_clinical download started this may take some time.
Approximate size to download 9.2 MB
[OK!]

 This 73 y/o patient had CT  on 1/12/95, with progressive memory and cognitive decline since 8/11/94.


Unnamed: 0,relations,entity1,chunk1,entity2,chunk2,confidence
0,1,Date,1/12/95,Symptom,progressive memory and cognitive decline,1.0
1,1,Symptom,progressive memory and cognitive decline,Date,8/11/94,1.0
