

![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/healthcare/CONTEXTUAL_PARSER.ipynb)




# **Detect demographics and vital signs using rules**

## Environment setup

Load the license keys so that the licensed library and models can be downloaded.

In [None]:
import os
import json

# Load the Spark NLP for Healthcare license file from the Colab content
# directory.
with open('/content/spark_nlp_for_healthcare.json', 'r') as f:
    license_keys = json.load(f)

# Secret for the licensed package; kept in a variable for the install and
# session-start cells.
secret = license_keys['SECRET']

# Export the license and AWS credentials as environment variables.
os.environ['SPARK_NLP_LICENSE'] = license_keys['SPARK_NLP_LICENSE']
os.environ['AWS_ACCESS_KEY_ID'] = license_keys['AWS_ACCESS_KEY_ID']
os.environ['AWS_SECRET_ACCESS_KEY'] = license_keys['AWS_SECRET_ACCESS_KEY']

# Library versions recorded in the license file.
sparknlp_version = license_keys["PUBLIC_VERSION"]
jsl_version = license_keys["JSL_VERSION"]

print('SparkNLP Version:', sparknlp_version)
print('SparkNLP-JSL Version:', jsl_version)


Install and import necessary dependencies for Spark NLP.

In [None]:
# Install Java
! apt-get update -qq
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
! java -version

# Install pyspark
! pip install --ignore-installed -q pyspark==2.4.4

# Install Spark NLP
jsl_secret = "2.5.5-4f4b7f600f8ba3cdc5973a6baa47b901b0c8d8a3"
jsl_version = jsl_secret.split('-')[0]
! pip install --ignore-installed -q spark-nlp
! python -m pip install --upgrade spark-nlp-jsl==$jsl_version --extra-index-url https://pypi.johnsnowlabs.com/$jsl_secret

openjdk version "11.0.8" 2020-07-14
OpenJDK Runtime Environment (build 11.0.8+10-post-Ubuntu-0ubuntu118.04.1)
OpenJDK 64-Bit Server VM (build 11.0.8+10-post-Ubuntu-0ubuntu118.04.1, mixed mode, sharing)
[K     |████████████████████████████████| 215.7MB 64kB/s 
[K     |████████████████████████████████| 204kB 14.6MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 133kB 3.4MB/s 
[?25hLooking in indexes: https://pypi.org/simple, https://pypi.johnsnowlabs.com/2.5.5-4f4b7f600f8ba3cdc5973a6baa47b901b0c8d8a3
Collecting spark-nlp-jsl==2.5.5
  Downloading https://pypi.johnsnowlabs.com/2.5.5-4f4b7f600f8ba3cdc5973a6baa47b901b0c8d8a3/spark-nlp-jsl/spark_nlp_jsl-2.5.5-py3-none-any.whl
Collecting spark-nlp==2.5.5
[?25l  Downloading https://files.pythonhosted.org/packages/b5/a2/5c2e18a65784442ded6f6c58af175ca4d99649337de569fac55b04d7ed8e/spark_nlp-2.5.5-py2.py3-none-any.whl (124kB)
[K     |████████████████████████████████| 133kB 3.3M

In [None]:
# Point JAVA_HOME at the Java 8 JDK location and put its bin directory at the
# front of PATH.
os.environ['JAVA_HOME'] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ['PATH'] = os.environ['JAVA_HOME'] + "/bin:" + os.environ['PATH']

import pandas as pd
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

import sparknlp
from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from sparknlp.base import *
import sparknlp_jsl

# Start the Spark session for the licensed library, passing the secret.
spark = sparknlp_jsl.start(jsl_secret)

In [None]:
# make a directory for the rules we will create later
! mkdir rules

## HTML display of outputs

In [None]:
from IPython.display import HTML, display
import random

In [None]:
def get_color():
    """Return a random light color as a ``#rrggbb`` hex string.

    Each of the three channels is drawn independently from 128-255.
    """
    channels = tuple(random.randint(128, 255) for _ in range(3))
    return "#{:02x}{:02x}{:02x}".format(*channels)

In [None]:
def annotation_to_html(full_annotation):
    """Display the entities of one annotated text as color-highlighted HTML.

    Args:
        full_annotation: Sequence whose first element is a mapping of output
            column name to annotations. It must hold a ``document`` entry, and
            every entry whose key contains ``"entity"`` is treated as a list
            of chunks exposing ``begin``, ``end``, ``result`` and
            ``metadata['field']``.

    Returns:
        None. The HTML is rendered with ``display``.
    """
    from html import escape  # local import keeps the cell self-contained

    annotation = full_annotation[0]
    text = annotation['document'][0].result

    # Gather the chunks of every non-empty entity column, ordered by position.
    entities = []
    for entity_name, chunks in annotation.items():
        if "entity" in entity_name and len(chunks) > 0:
            entities.extend(chunks)
    entities.sort(key=lambda chunk: chunk.begin)

    def plain_span(fragment):
        # Escape the raw text so characters such as "<" or "&" are shown
        # literally instead of being interpreted as markup.
        return f'<span class="others">{escape(fragment, quote=False)}</span>'

    label_color = {}
    parts = ["<div>"]
    pos = 0

    for chunk in entities:
        label = chunk.metadata['field']
        # One color per label, assigned the first time the label is seen, so
        # any label present in the chunks always has a color.
        if label not in label_color:
            label_color[label] = get_color()

        if pos < chunk.begin and pos < len(text):
            parts.append(plain_span(text[pos:chunk.begin]))
        # Never move backwards: a chunk nested in or overlapping an earlier
        # one must not cause already-rendered text to be emitted again.
        pos = max(pos, chunk.end + 1)
        parts.append(
            f'<span class="entity-wrapper" style="color: black; '
            f'background-color: {label_color[label]}"> '
            f'<span class="entity-name">{escape(chunk.result, quote=False)}</span> '
            f'<span class="entity-type">[{escape(label, quote=False)}]</span></span>'
        )

    # Trailing text after the last entity.
    if pos < len(text):
        parts.append(plain_span(text[pos:]))

    parts.append("</div>")
    display(HTML("".join(parts)))

## Rule creation

### Vital signs

In [None]:
# regex matches any number between 90 and 109 inclusive, including decimals

# Resolve the rule file once and use the same absolute path for writing and
# for setJsonPath, so the parser loads the file written here from any working
# directory (on Colab this is still /content/rules/temperature.json).
temperature_rule_path = os.path.abspath('rules/temperature.json')

with open(temperature_rule_path, 'w') as f:
    json.dump({
        'entity': "Temperature",
        'ruleScope': "sentence",
        'matchScope': "token",
        'regex': "\\b((9[0-9])|(10[0-9]))((\\.|,)[0-9]+)?\\b",
        'prefix': ["temperature", "fever"],
        'suffix': ["Fahrenheit", "Celsius", "centigrade", "F", "C"],
        'contextLength': 30
    }, f)

temperature_contextual_parser = ContextualParserApproach() \
    .setInputCols(['sentence', 'token']) \
    .setOutputCol('entity_temperature') \
    .setJsonPath(temperature_rule_path) \
    .setCaseSensitive(False) \
    .setContextMatch(False)

In [None]:
# regex matches any number in the format S/D where S is between 40 and 199
# and D is between 30 and 149

# Resolve the rule file once and use the same absolute path for writing and
# for setJsonPath, so the parser loads the file written here from any working
# directory (on Colab this is still /content/rules/blood_pressure.json).
blood_pressure_rule_path = os.path.abspath('rules/blood_pressure.json')

with open(blood_pressure_rule_path, 'w') as f:
    json.dump({
        'entity': "Blood pressure",
        'ruleScope': "sentence",
        'matchScope': "token",
        'regex': "\\b([4-9]|1\\d)\\d\\/([3-9]|1[0-4])\\d\\b",
        'contextException': ["exam", "test", "scored", "score", "scores"],
        'exceptionDistance': 15
    }, f)

blood_pressure_contextual_parser = ContextualParserApproach() \
    .setInputCols(['sentence', 'token']) \
    .setOutputCol('entity_blood_pressure') \
    .setJsonPath(blood_pressure_rule_path) \
    .setCaseSensitive(False) \
    .setContextMatch(False)

In [None]:
# regex matches any integer between 40 and 199 inclusive

# Resolve the rule file once and use the same absolute path for writing and
# for setJsonPath, so the parser loads the file written here from any working
# directory (on Colab this is still /content/rules/pulse.json).
pulse_rule_path = os.path.abspath('rules/pulse.json')

with open(pulse_rule_path, 'w') as f:
    json.dump({
        'entity': "Pulse",
        'ruleScope': "sentence",
        'matchScope': "token",
        'regex': "\\b(([4-9]\\d)|(1\\d\\d))\\b",
        'prefix': ["pulse", "heart"],
        'suffix': ["beats"],
        'contextLength': 20
    }, f)

pulse_contextual_parser = ContextualParserApproach() \
    .setInputCols(['sentence', 'token']) \
    .setOutputCol('entity_pulse') \
    .setJsonPath(pulse_rule_path) \
    .setCaseSensitive(False) \
    .setContextMatch(False)

In [None]:
# regex matches any whole number between 1 and 79 inclusive (two-digit forms
# with a leading zero, 00-09, also match); a decimal part is not captured

# Resolve the rule file once and use the same absolute path for writing and
# for setJsonPath, so the parser loads the file written here from any working
# directory (on Colab this is still /content/rules/respiration_rate.json).
respiration_rule_path = os.path.abspath('rules/respiration_rate.json')

with open(respiration_rule_path, 'w') as f:
    json.dump({
        'entity': "Respiration rate",
        'ruleScope': "sentence",
        'matchScope': "token",
        'regex': "\\b(([1-9])|([0-7][0-9]))\\b",
        'prefix': ["respiration", "respirations", "respiratory"],
        'suffix': ["breath", "breaths"],
        'contextLength': 25,
        'contextException': ["pulse", "beats", "heart",
            "Fahrenheit", "Celsius", "centigrade", "degrees", "temperature"],
        'exceptionDistance': 15
    }, f)

respirations_contextual_parser = ContextualParserApproach() \
    .setInputCols(['sentence', 'token']) \
    .setOutputCol('entity_respirations') \
    .setJsonPath(respiration_rule_path) \
    .setCaseSensitive(False) \
    .setContextMatch(False)

In [None]:
# regex matches any number between 50 and 100 inclusive, including decimals, and
# including percent sign if present

# Resolve the rule file once and use the same absolute path for writing and
# for setJsonPath, so the parser loads the file written here from any working
# directory (on Colab this is still /content/rules/o2_saturation.json).
saturation_rule_path = os.path.abspath('rules/o2_saturation.json')

with open(saturation_rule_path, 'w') as f:
    json.dump({
        'entity': "O2 saturation",
        'ruleScope': "sentence",
        'matchScope': "token",
        # The pattern ends in "(%|\b)" rather than "%?\b": a word boundary can
        # never follow "%" when the next character is a space or the end of
        # the text, so the old form always dropped the percent sign.
        'regex': "\\b(([5-9][0-9])|(100))(\\.[0-9]+)?(%|\\b)",
        'prefix': ["saturation", "saturating", "saturated", "saturate",
                   "oxygen", "oximetry", "oximeter", "air", "O2"],
        'suffix': ["oxygen", "saturation", "air"],
        'contextLength': 25,
        'contextException': ["year", "years", "old",
            "Fahrenheit", "Celsius", "centigrade", "degrees", "temperature",
            "pressure", "nonrebreather", "pulse", "beats"],
        'exceptionDistance': 15
    }, f)

saturation_contextual_parser = ContextualParserApproach() \
    .setInputCols(['sentence', 'token']) \
    .setOutputCol('entity_saturation') \
    .setJsonPath(saturation_rule_path) \
    .setCaseSensitive(False) \
    .setContextMatch(False)

### Dates and money amounts

In [None]:
# regex matches numerical dates separated by slashes or dashes: one or two
# digits (first digit 0-3 when there are two) for each of the first two parts,
# optionally followed by a two- or four-digit (19xx/20xx) year

date_rule = {
    'entity': "Date - short",
    'ruleScope': "sentence",
    'matchScope': "token",
    'regex': "\\b[0-3]?[0-9](\\/|\\-)[0-3]?[0-9]((\\/|\\-)((19)|(20))?([0-9][0-9]))?\\b",
    'contextLength': 20,
    'contextException': ["pressure", "rate", "when",
        "score", "exam", "test", "tested", "tests", "MMSE"],
    'exceptionDistance': 20
}

# Resolve the rule file once and use the same absolute path for writing and
# for setJsonPath, so the parser loads the file written here from any working
# directory (on Colab this is still /content/rules/date_-_short.json).
date_rule_path = os.path.abspath('rules/date_-_short.json')

with open(date_rule_path, 'w') as f:
    json.dump(date_rule, f)

date_contextual_parser = ContextualParserApproach() \
    .setInputCols(['sentence', 'token']) \
    .setOutputCol('entity_date') \
    .setJsonPath(date_rule_path) \
    .setCaseSensitive(False) \
    .setContextMatch(False)

In [None]:
# regex identifies numbers from 0 to 39, optionally ending in "st", "nd", "rd",
# or "th" (the "st" suffix is required for days such as "1st", "21st", "31st")

date_rule2 = {
    'entity': "Date - long",
    'ruleScope': "sentence",
    'matchScope': "token",
    'regex': "\\b[0-3]?[0-9]((st)|(nd)|(rd)|(th))?(,|\\b)",
    'prefix': ["january", "february", "march", "april", "may", "june", "july",
        "august", "september", "october", "november", "december", "jan", "feb",
        "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec"],
    'suffix': ["january", "february", "march", "april", "may", "june", "july",
        "august", "september", "october", "november", "december", "jan", "feb",
        "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec"],
    'contextLength': 15
}

# Resolve the rule file once and use the same absolute path for writing and
# for setJsonPath, so the parser loads the file written here from any working
# directory (on Colab this is still /content/rules/date_-_long.json).
date_rule2_path = os.path.abspath('rules/date_-_long.json')

with open(date_rule2_path, 'w') as f:
    json.dump(date_rule2, f)

# NOTE(review): this is the only parser created with setCaseSensitive(True),
# while the month names above are all lowercase — confirm that capitalized
# months such as "October" are still meant to act as context.
date_contextual_parser2 = ContextualParserApproach() \
    .setInputCols(['sentence', 'token']) \
    .setOutputCol('entity_date2') \
    .setJsonPath(date_rule2_path) \
    .setCaseSensitive(True) \
    .setContextMatch(False)

In [None]:
# regex matches numbers, including those broken up by commas or periods,
# prefixed by a "$", "£", or "€" and ending in a digit

money_rule = {
    'entity': "Money - short",
    'ruleScope': "sentence",
    'matchScope': "token",
    'regex': "[£€\\$]([0-9\\.,]*[0-9])"
}

# Resolve the rule file once and use the same absolute path for writing and
# for setJsonPath, so the parser loads the file written here from any working
# directory (on Colab this is still /content/rules/money_-_short.json).
money_rule_path = os.path.abspath('rules/money_-_short.json')

with open(money_rule_path, 'w') as f:
    json.dump(money_rule, f)

money_contextual_parser = ContextualParserApproach() \
    .setInputCols(['sentence', 'token']) \
    .setOutputCol('entity_money') \
    .setJsonPath(money_rule_path) \
    .setCaseSensitive(False) \
    .setContextMatch(False)

In [None]:
# regex matches numbers, including those broken up by commas or periods,
# ending in a digit
# NOTE: suffix will not match phrases like "17 pounds" because of possible
# confusion with units of weight.

money_rule2 = {
    'entity': "Money - long",
    'ruleScope': "sentence",
    'matchScope': "token",
    'regex': "[0-9\\.,]*[0-9]",
    'suffix': ["dollars", "euros", "cents", "pence", "USD", "EUR", "GBP"],
    'contextLength': 15
}

# Resolve the rule file once and use the same absolute path for writing and
# for setJsonPath, so the parser loads the file written here from any working
# directory (on Colab this is still /content/rules/money_-_long.json).
money_rule2_path = os.path.abspath('rules/money_-_long.json')

with open(money_rule2_path, 'w') as f:
    json.dump(money_rule2, f)

money_contextual_parser2 = ContextualParserApproach() \
    .setInputCols(['sentence', 'token']) \
    .setOutputCol('entity_money2') \
    .setJsonPath(money_rule2_path) \
    .setCaseSensitive(False) \
    .setContextMatch(False)

### Demographics

In [None]:
# Comma-delimited dictionary, one gender per line.
# NOTE(review): presumably the first field of each line is the normalized
# value and the remaining fields are the words that map to it — confirm
# against the ContextualParser dictionary documentation.
gender_dictionary = """female,female,she,her,hers,girl,woman,old-lady,lady
male,male,man,gentleman,boy,he,him,his
neutral,neutral,gender-neutral,agender,nonbinary,non-binary"""

# This rule has no regex; the parser is given the dictionary file below.
gender_rule = {
    'entity': "Gender",
    'ruleScope': "sentence",
    'completeMatchRegex': "true"
}

# Resolve both files once and use the same absolute paths for writing and for
# the parser, so it loads the files written here from any working directory
# (on Colab these are still /content/rules/gender.csv and gender.json).
gender_dictionary_path = os.path.abspath('rules/gender.csv')
gender_rule_path = os.path.abspath('rules/gender.json')

with open(gender_dictionary_path, 'w') as f:
    f.write(gender_dictionary)

with open(gender_rule_path, 'w') as f:
    json.dump(gender_rule, f)

gender_contextual_parser = ContextualParserApproach() \
    .setInputCols(['sentence', 'token']) \
    .setOutputCol('entity_gender') \
    .setJsonPath(gender_rule_path) \
    .setCaseSensitive(False) \
    .setContextMatch(False) \
    .setDictionary(gender_dictionary_path,
                   read_as=ReadAs.TEXT,
                   options={'delimiter': ","})

In [None]:
# regex matches any number followed by "y/o" or any form of "-[timespan]-old"
# where the timespan is year, month, week or day (optionally plural)

age_rule = {
    'entity': "Age - short",
    'ruleScope': "sentence",
    'matchScope': "token",
    'regex': "\\d+(y\\/o|-(year|month|week|day)s?-old)"
}

# Resolve the rule file once and use the same absolute path for writing and
# for setJsonPath, so the parser loads the file written here from any working
# directory (on Colab this is still /content/rules/age_-_short.json).
age_rule_path = os.path.abspath('rules/age_-_short.json')

with open(age_rule_path, 'w') as f:
    json.dump(age_rule, f)

age_contextual_parser = ContextualParserApproach() \
    .setInputCols(['sentence', 'token']) \
    .setOutputCol('entity_age') \
    .setJsonPath(age_rule_path) \
    .setCaseSensitive(False) \
    .setContextMatch(False)

In [None]:
# regex detects any number from 0 to 109

age_rule2 = {
    'entity': "Age - long",
    'ruleScope': "sentence",
    'matchScope': "token",
    'regex': "\\b(\\d?\\d|10\\d)\\b",
    'prefix': ["age"],
    'suffix': ["age", "old",
        "y/o", "year-old", "years-old", "month-old", "months-old"],
    'contextLength': 15
}

# Resolve the rule file once and use the same absolute path for writing and
# for setJsonPath, so the parser loads the file written here from any working
# directory (on Colab this is still /content/rules/age_-_long.json).
age_rule2_path = os.path.abspath('rules/age_-_long.json')

with open(age_rule2_path, 'w') as f:
    json.dump(age_rule2, f)

age_contextual_parser2 = ContextualParserApproach() \
    .setInputCols(['sentence', 'token']) \
    .setOutputCol('entity_age2') \
    .setJsonPath(age_rule2_path) \
    .setCaseSensitive(False) \
    .setContextMatch(False)

In [None]:
# regex matches heights in the form F'II", where F is any number from 0 to 7 and
# II is any number from 0 to 11, with an optional leading zero. the quotation
# mark at the end is optional.

# spelled-out heights from the example texts, which this short-form rule does
# not cover:
# healthy appearing woman: 5 foot and 6 inches tall
# pleasant elderly woman: Height 4 feet 11 inches
# social history: Height: 21 inches

height_rule = {
    'entity': "Height - short",
    'ruleScope': "sentence",
    'matchScope': "token",
    'regex': "[0-7]'((0?[0-9])|(1(0|1)))\"?"
}

# Resolve the rule file once and use the same absolute path for writing and
# for setJsonPath, so the parser loads the file written here from any working
# directory (on Colab this is still /content/rules/height_-_short.json).
height_rule_path = os.path.abspath('rules/height_-_short.json')

with open(height_rule_path, 'w') as f:
    json.dump(height_rule, f)

height_contextual_parser = ContextualParserApproach() \
    .setInputCols(['sentence', 'token']) \
    .setOutputCol('entity_height') \
    .setJsonPath(height_rule_path) \
    .setCaseSensitive(False) \
    .setContextMatch(False)

In [None]:
# matches numbers from 0 to 299, optionally with a decimal or unit of
# measurement trailing.
# alternatively, matches a spelled-out number between one and nine.

height_rule2 = {
    'entity': "Height - long",
    'ruleScope': "sentence",
    'matchScope': "token",
    'completeMatchRegex': "true",
    'regex': "\\b([1-2]?\\d?\\d(\\.\\d|cm|ft|\\.\\d{1,2}m|in)?|one|two|three|four|five|six|seven|eight|nine)\\b",
    'prefix': ["stand", "stands", "stood", "height", "tall"],
    'suffix': ["tall"],
    'contextLength': 20,
    'contextException': ["pressure"],
    'exceptionDistance': 25
}

# Resolve the rule file once and use the same absolute path for writing and
# for setJsonPath, so the parser loads the file written here from any working
# directory (on Colab this is still /content/rules/height_-_long.json).
height_rule2_path = os.path.abspath('rules/height_-_long.json')

with open(height_rule2_path, 'w') as f:
    json.dump(height_rule2, f)

height_contextual_parser2 = ContextualParserApproach() \
    .setInputCols(['sentence', 'token']) \
    .setOutputCol('entity_height2') \
    .setJsonPath(height_rule2_path) \
    .setCaseSensitive(False) \
    .setContextMatch(False)

In [None]:
# regex matches numbers between 0 and 2999 (high numbers enabled because birth
# weights are sometimes written like "1102"), with up to one decimal and
# optionally followed by "lb", "#" or "kg" (with or without an "s" trailing) and
# optionally followed by a number of ounces from 0 to 19.

# disable the contextException to match more weights other than the patient's
# body weight, such as weight gains or weights of objects.

weight_rule = {
    'entity': "Weight",
    'ruleScope': "sentence",
    'matchScope': "token",
    'regex': "\\b(\\d{1,3}(\\.\\d)?(((kg)|(#|lb))s?)?(1?\\doz)?|[0-2]\\d{3})\\b",
    'prefix': ["weighs", "weighed", "weight"],
    'suffix': ["pounds", "lbs", "lb", "#", "#s", "kg", "kgs", "oz",
               "kilograms", "kilos", "ounces"],
    'contextLength': 15,
    'contextException': ["gain", "g", "gram", "grams",
                         "mg", "milligram", "milligrams",
                         "BMI", "gain", "gains", "gained", "gaining",
                         "lose", "lost", "loses", "losing",
                         "temperature", "pulse", "height"],
    'exceptionDistance': 25
}

# Resolve the rule file once and use the same absolute path for writing and
# for setJsonPath, so the parser loads the file written here from any working
# directory (on Colab this is still /content/rules/weight.json).
weight_rule_path = os.path.abspath('rules/weight.json')

with open(weight_rule_path, 'w') as f:
    json.dump(weight_rule, f)

weight_contextual_parser = ContextualParserApproach() \
    .setInputCols(['sentence', 'token']) \
    .setOutputCol('entity_weight') \
    .setJsonPath(weight_rule_path) \
    .setCaseSensitive(False) \
    .setContextMatch(False)

## Pipeline creation

In [None]:
# Base stages shared by every rule: raw text -> document -> sentences -> tokens.
# The 'sentence' and 'token' columns are the inputs of each contextual parser.
document_assembler = (
    DocumentAssembler()
    .setInputCol('text')
    .setOutputCol('document')
)

sentence_detector = (
    SentenceDetector()
    .setInputCols(['document'])
    .setOutputCol('sentence')
)

tokenizer = (
    Tokenizer()
    .setInputCols(['sentence'])
    .setOutputCol('token')
)

In [None]:
# Text preparation stages first, then one contextual parser per rule file.
pipeline_stages = [
    # text preparation
    document_assembler,
    sentence_detector,
    tokenizer,
    # vital signs
    temperature_contextual_parser,
    blood_pressure_contextual_parser,
    pulse_contextual_parser,
    respirations_contextual_parser,
    saturation_contextual_parser,
    # dates and money amounts
    date_contextual_parser,
    date_contextual_parser2,
    money_contextual_parser,
    money_contextual_parser2,
    # demographics
    gender_contextual_parser,
    age_contextual_parser,
    age_contextual_parser2,
    height_contextual_parser,
    height_contextual_parser2,
    weight_contextual_parser,
]
pipeline = Pipeline(stages=pipeline_stages)

# Fit on a one-row DataFrame holding an empty string to obtain the pipeline
# model, then wrap it in a LightPipeline for annotating plain strings.
empty_df = spark.createDataFrame([[""]]).toDF('text')
pipeline_model = pipeline.fit(empty_df)
light_pipeline = LightPipeline(pipeline_model)

## Example generation

Create input and output file paths.

In [None]:
# Name used for the example input/output sub-directories.
MODEL_NAME = "ContextualParser"

# Relative directories (with trailing slash) for the example files.
INPUT_FILE_PATH = "inputs/{}/".format(MODEL_NAME)
OUTPUT_FILE_PATH = "outputs/{}/".format(MODEL_NAME)

Select example inputs that highlight the model.

In [None]:
# Ten clinical text excerpts used as demo inputs. Between them they mention
# vital signs, dates, money amounts, gender, age, height and weight.
example_inputs = [
    """On examination today, this is a pleasant and healthy appearing woman.
VITAL SIGNS: Blood pressure 154/72, heart rate 87, temperature 98.8, and weight 153 pounds. Pain is 0/10.
HEAD: Head is normocephalic and atraumatic. Head circumference is 54 cm, which is in the 10-25th percentile for a woman who is 5 foot and 6 inches tall and 153 lbs.""",
    """GENERAL: She is a pleasant elderly woman, currently in no acute distress.
VITAL SIGNS: Height 4 feet 11 inches, weight 128 pounds, temperature 97.2 degrees Fahrenheit, blood pressure 142/70, pulse 47, respiratory rate 16, and O2 saturation 100%""",
    """On examination today, this is a pleasant 81-year-old man who is brought back from the clinic waiting area in a wheelchair. He is well developed, well nourished, and kempt.
Vital Signs: Temperature 96.7, pulse 62, respirations 16, blood pressure 123/71, and weight 184.
Head: The head is normocephalic and atraumatic.""",
    """The baby is an ex-32 weeks small for gestational age infant with birth weight 1102. Baby was born at ABCD Hospital at 1333 on 07/14/2006. Mother is a 20-year-old gravida 1, para 0 female who received prenatal care. Prenatal course was complicated by low amniotic fluid index and hypertension. She was evaluated for evolving preeclampsia and had a C-section secondary to the nonreassuring fetal status. Baby delivered operatively, Apgar scores were 8 and 9 initially taken to level 2 satellite nursery and arrangements were to transfer to Children's Hospital. Infant was transferred to Children's Hospital for higher level of care, stayed at Children's Hospital for approximately 2 weeks, and was transferred back to ABCD where he stayed until he was discharged on 08/16/2006.""",
    """SOCIAL HISTORY: The patient lives at home with 23-year-old mother, who is a homemaker and 24-year-old father, John, who is a supervisor at Excel. The family lives in Bentley, Kansas. No smoking in the home. Family does have one pet cat.
REVIEW OF SYSTEMS: As per HPI, otherwise, negative.
OBJECTIVE: Weight: 7 pounds 12 ounces. Height: 21 inches. Head circumference: 35 cm. Temperature: 97.2 degrees. Pulse: 64 beats per minute. Blood pressure 104/63.
General: Well-developed, well-nourished, cooperative, alert, interactive 2-week-old white female in no acute distress. Temperature is significantly down from 101.2 degrees two days ago""",
    """The patient was reluctant to use medicine. She stated that she felt uncomfortable using pills that cost $20 each. We discussed getting support if $20 per dose was a financial hardship for her.""",
    """I had the pleasure of meeting Ms. ABC for evaluation for bariatric surgery. As you know she is a pleasant 54-year-old female who has multiple medical problems and is seeking evaluation for laparoscopic gastric banding. I saw her on October 3, 2008 in the office after she had attended a Fairfield County Bariatrics and Surgical Specialists seminar. She is 5'7" tall and weighs 242 pounds. She has been overweight since age 27. She is now at her highest adult weight.""",
    """In short, the patient is a 55-year-old gentleman with long-standing morbid obesity, resistant to nonsurgical methods of weight loss with BMI of 69.7 with comorbidities of hypertension, atrial fibrillation, hyperlipidemia, possible sleep apnea, and also osteoarthritis of the lower extremities. On physical examination today, he weighs 514.8 pounds, he has gained 21 pounds since the last visit with us. His pulse is 78, temperature is 97.5, blood pressure is 132/74.""",
    """29 y/o male with cerebral palsy, non-shunted hydrocephalus, spastic quadriplegia, mental retardation, bilateral sensory neural hearing loss, severe neurogenic scoliosis and multiple contractures of the 4 extremities, neurogenic bowel and bladder incontinence, and a history of seizures.
He was seen for evaluation of seizures which first began at age 27 years, two years before presentation. Spontaneous Vaginal delivery at 36weeks gestation to a G2P1 mother. Birth weight 7#10oz.""",
    """Today temperature is 100.1, weight is 73.5 kg, pulse is 84, blood pressure is 121/61, and height is 158. Patient reported temperature of 101 the day before."""
]

Write the example inputs to file.

In [None]:
! rm -r $INPUT_FILE_PATH
! mkdir -p $INPUT_FILE_PATH
for index, text in enumerate(example_inputs):
    excerpt = text[:min(len(text)-10, 100)].replace('\n', ' ') + "... \n"
    write_path = os.path.join(INPUT_FILE_PATH, f'Example{index + 1}.txt')
    open(write_path, 'w').write(excerpt + text)

rm: cannot remove 'inputs/ContextualParser/': No such file or directory


Read the example inputs back from file.

In [None]:
# os.listdir returns entries in arbitrary order; sort so that the order of the
# inputs (and of the outputs derived from them) is reproducible across runs.
file_list = sorted(os.listdir(INPUT_FILE_PATH))
file_paths = [os.path.join(INPUT_FILE_PATH, path) for path in file_list]

input_list = []
for file_path in file_paths:
    # Close each file as soon as it has been read.
    with open(file_path, 'r') as f:
        # Drop the first line (the preview header) and keep the example text.
        text = "".join(f.readlines()[1:])
    input_list.append(text)

Transform the inputs to create outputs.

In [None]:
# Build a Spark DataFrame with one 'text' row per example, run the fitted
# pipeline on it and collect the annotated result as a pandas DataFrame.
df = spark.createDataFrame(pd.DataFrame({'text': input_list}))
result = pipeline_model.transform(df).toPandas()

Write the outputs to file.

In [None]:
! rm -r $OUTPUT_FILE_PATH
! mkdir -p $OUTPUT_FILE_PATH

def add_or_replace(entity_chunks, new_chunk):
    """Add ``new_chunk`` to ``entity_chunks`` in place, de-duplicating by position.

    A chunk is a sequence whose item ``1`` is its start offset and whose item
    ``4`` is a metadata mapping holding ``'confidenceValue'``. If the list
    already holds a chunk with the same start offset, the new chunk replaces
    it only when its confidence is strictly greater; otherwise it is dropped.
    If no chunk starts at that offset, the new chunk is appended.

    Confidences are compared as numbers: they are converted with ``float``
    because comparing their string form is lexicographic (e.g. "10.0" would
    sort below "9.0").

    Args:
        entity_chunks: List of chunks collected so far; modified in place.
        new_chunk: Candidate chunk.

    Returns:
        None.
    """
    preexisting = False
    for index, entity in enumerate(entity_chunks):
        if entity[1] == new_chunk[1]:
            preexisting = True
            new_confidence = float(new_chunk[4]['confidenceValue'])
            if new_confidence > float(entity[4]['confidenceValue']):
                entity_chunks[index] = new_chunk
    if not preexisting:
        entity_chunks.append(new_chunk)


for i in result.index:
    # extract all chunks for example i from each entity column, keeping a
    # single chunk per start position
    entity_chunks = []
    for col in result.columns:
        if "entity" in col:
            for row in result[col].iloc[i]:
                add_or_replace(entity_chunks, row)

    # reformat the output to use the 'entity' key for the name of the feature
    # instead of 'field' so it is compatible with the NER streamlit app format.
    # The metadata dicts belong to `result`, so the rename is guarded: running
    # this cell a second time finds 'field' already renamed and must not fail.
    for entity in entity_chunks:
        metadata = entity[4]
        if 'field' in metadata:
            metadata['entity'] = metadata.pop('field')

    # sort the chunks in order of their first character so they don't display
    # out of order
    entity_chunks = sorted(entity_chunks, key=lambda x: x[1])

    # one JSON file per example, named after the input file it came from
    output_name = file_list[i].split('.')[0] + '.json'
    pd.Series({'ner_chunk': entity_chunks}).to_json(
        os.path.join(OUTPUT_FILE_PATH, output_name))

rm: cannot remove 'outputs/ContextualParser/': No such file or directory


In [None]:
# Annotate each example with the light pipeline and render its entities as
# highlighted HTML.
for example in example_inputs:
    annotation_to_html(light_pipeline.fullAnnotate(example))