In [1]:
import pandas as pandas

df = pandas.read_csv('../data/engel_scores_output.csv')
df.head(5)

Unnamed: 0,clinical_note,engel_score,reasoning
0,"Dear Dr,\n\nRe: \tMr Iestyn Hitchinbrook DOB 2...",IVB,The patient continues to have frequent focal s...
1,Clinic date 30/01/2018\n\nDiagnosis: Focal Se...,IIA,The patient is experiencing focal seizures con...
2,Clinic date 7th February 2018\n\nRe. Mr John J...,IIB,The patient is experiencing rare disabling sei...
3,Consultant Paediatric Neurologist\n\nClinic Da...,IIB,The patient is experiencing rare disabling gen...
4,Epilepsy service\nOur ref:\tRE/AS/Q12453\nNHS ...,IIA,The patient is experiencing rare disabling sei...


In [2]:
import nltk

nltk.download('all')

def check_nltk_data():
    try:
        nltk.data.find('tokenizers/punkt')
        nltk.data.find('taggers/averaged_perceptron_tagger')
        print("All required NLTK data is already downloaded.")
    except LookupError as e:
        print(f"Missing NLTK data: {e}")

check_nltk_data()

All required NLTK data is already downloaded.


[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /Users/Viresh/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /Users/Viresh/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /Users/Viresh/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /Users/Viresh/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /Users/Viresh/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nlt

In [4]:
import re
from datetime import datetime
import time
import os
from random import choice

import bs4 as bs
import contractions
import nltk
import pandas as pd
import requests
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from num2words import num2words

nltk.download('wordnet', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('averaged_perceptron_tagger_eng', quiet=True)

def preprocess_headline(headline):
    def replace_numbers_and_percentages(text):
        """
        Replace all numbers and percentages in a string with their English word equivalents.
        """
        number_pattern = re.compile(r'(\d+(\.\d+)?%?)')

        def convert_number_to_words(match):
            """
            Convert a matched number or percentage to words.
            """
            number_str = match.group(0)

            if '%' in number_str:
                # Handle percentages
                number = float(number_str.replace('%', ''))
                number_in_words = num2words(number) + ' percent'
            else:
                # Handle regular numbers (integers or floats)
                number = float(number_str)
                number_in_words = num2words(number)

            return number_in_words

        # Replace numbers and percentages in the text with their word equivalents
        text_with_words = number_pattern.sub(convert_number_to_words, text)
        return text_with_words

    def get_wordnet_pos(treebank_tag):
        """
        Convert TreeBank POS tags to WordNet POS tags for lemmatization.
        """
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN

    def remove_stop_words():
        stop_words = set(stopwords.words('english')) - negation_words
        return [word for word in toks if word not in stop_words]

    def lemmatize_tokens():
        lemmatizer = WordNetLemmatizer()
        pos_tags = nltk.pos_tag(toks)
        lemmatized_toks = []
        for token, pos in pos_tags:
            lemma = lemmatizer.lemmatize(token, get_wordnet_pos(pos))
            lemmatized_toks.append(lemma)
        return lemmatized_toks

    def handle_negation():
        negated_toks = []
        negate = False
        for token in toks:
            if token in negation_words:
                negate = True
                continue
            if negate:
                negated_toks.append('not_' + token)
                negate = False
            else:
                negated_toks.append(token)
        return negated_toks
    
    MEDICAL_ABBREVIATIONS = {
    # Vital Signs
    "BP": "blood pressure",
    "HR": "heart rate",
    "RR": "respiratory rate",
    "Temp": "temperature",
    "O2": "oxygen",
    "SpO2": "peripheral capillary oxygen saturation",
    "BPM": "beats per minute",

    # Routes of Administration
    "IV": "intravenous",
    "IM": "intramuscular",
    "PO": "by mouth",
    "PRN": "as needed",
    "SL": "sublingual",
    "TOP": "topical",
    "SC": "subcutaneous",
    "GTT": "gtt",

    # Medical Procedures and Tests
    "ECG": "electrocardiogram",
    "EKG": "electrocardiogram",
    "MRI": "magnetic resonance imaging",
    "CT": "computed tomography",
    "US": "ultrasound",
    "CXR": "chest x-ray",
    "CBC": "complete blood count",
    "CMP": "comprehensive metabolic panel",
    "ABG": "arterial blood gas",
    "EEG": "electroencephalogram",
    "PET": "positron emission tomography",
    "DVT": "deep vein thrombosis",
    "PT": "physical therapy",
    "OT": "occupational therapy",
    "STAT": "immediately",
    "ASAP": "as soon as possible",
    "AIDET": "acknowledge, introduce, duration, explanation, thank you",

    # Common Conditions and Diagnoses
    "COPD": "chronic obstructive pulmonary disease",
    "CHF": "congestive heart failure",
    "CAD": "coronary artery disease",
    "DM": "diabetes mellitus",
    "HTN": "hypertension",
    "PE": "pulmonary embolism",
    "TIA": "transient ischemic attack",
    "MI": "myocardial infarction",
    "CVA": "cerebrovascular accident",
    "AKI": "acute kidney injury",
    "CKD": "chronic kidney disease",
    "UTI": "urinary tract infection",
    "GERD": "gastroesophageal reflux disease",
    "OSA": "obstructive sleep apnea",
    "H&P": "history and physical",
    "CC": "chief complaint",
    "HPI": "history of present illness",
    "ROS": "review of systems",
    "PMHx": "past medical history",
    "FHx": "family history",
    "SHx": "social history",
    "PE": "physical examination",
    "VS": "vital signs",
    "LOC": "level of consciousness",
    "A&O": "alert and oriented",
    "A&O x 3": "alert and oriented to person, place, and time",
    "Q2H": "every two hours",
    "Q4H": "every four hours",
    "Q6H": "every six hours",
    "Q8H": "every eight hours",
    "QOD": "every other day",
    "CID": "three times a day",

    # Medications and Treatments
    "Rx": "prescription",
    "OTC": "over-the-counter",
    "NSAID": "nonsteroidal anti-inflammatory drug",
    "ASA": "acetylsalicylic acid",
    "PTSD": "post-traumatic stress disorder",
    "AED": "anti-epileptic drug",
    "ACEi": "angiotensin-converting enzyme inhibitor",
    "ARBs": "angiotensin II receptor blockers",
    "BB": "beta-blocker",
    "CNS": "central nervous system",
    "PNS": "peripheral nervous system",
    "TID": "three times daily",
    "BID": "twice daily",
    "QID": "four times daily",
    "HS": "at bedtime",
    "QD": "once daily",
    "PRN": "as needed",
    "SOB": "shortness of breath",
    "N/V": "nausea and vomiting",
    "LOC": "loss of consciousness",
    "A&O": "alert and oriented",
    "A&O x 3": "alert and oriented to person, place, and time",
    
    # Laboratory Values
    "Na+": "sodium",
    "K+": "potassium",
    "Cl-": "chloride",
    "HCO3-": "bicarbonate",
    "Gluc": "glucose",
    "Hb": "hemoglobin",
    "Hct": "hematocrit",
    "WBC": "white blood cell count",
    "RBC": "red blood cell count",
    "Plt": "platelets",
    "Cr": "creatinine",
    "BUN": "blood urea nitrogen",
    "AST": "aspartate aminotransferase",
    "ALT": "alanine aminotransferase",
    "ALP": "alkaline phosphatase",
    "INR": "international normalized ratio",
    "PTT": "partial thromboplastin time",
    "LDH": "lactate dehydrogenase",
    "TSH": "thyroid-stimulating hormone",
    
    # Anatomical Terms
    "CNS": "central nervous system",
    "PNS": "peripheral nervous system",
    "GI": "gastrointestinal",
    "GU": "genitourinary",
    "ENT": "ear, nose, throat",
    "MSK": "musculoskeletal",
    "HEENT": "head, eyes, ears, nose, throat",
    
    # Miscellaneous
    "NPO": "nothing by mouth",
    "ADL": "activities of daily living",
    "H&P": "history and physical",
    "CC": "chief complaint",
    "HPI": "history of present illness",
    "ROS": "review of systems",
    "PMHx": "past medical history",
    "FHx": "family history",
    "SHx": "social history",
    "PE": "physical examination",
    "Dx": "diagnosis",
    "Tx": "treatment",
    "Sx": "symptoms",
    "Fx": "fracture",
    "LMP": "last menstrual period",
    "D/C": "discontinue or discharge",
    "BID": "twice a day",
    "CID": "three times a day",
    "Q2H": "every two hours",
    "Q4H": "every four hours",
    "Q6H": "every six hours",
    "Q8H": "every eight hours",
    "PRN": "as needed",
    "SOB": "shortness of breath",
    "N/V": "nausea and vomiting",
    "LOC": "level of consciousness",
    "A&O": "alert and oriented",
    "A&O x 3": "alert and oriented to person, place, and time",
    "VS": "vital signs",
    "PR": "pulse rate"
}


    def expand_medical_abbreviations(text):
        """
        Expand medical abbreviations in the text using MEDICAL_ABBREVIATIONS dictionary.
        """
        # Create a regex pattern to match whole words only (case-insensitive)
        pattern = re.compile(r'\b(' + '|'.join(map(re.escape, MEDICAL_ABBREVIATIONS.keys())) + r')\b', flags=re.IGNORECASE)

        def replace(match):
            abbr = match.group(0)
            expanded = MEDICAL_ABBREVIATIONS.get(abbr.upper(), abbr)
            return expanded.lower()

        return pattern.sub(replace, text)

    negation_words = {'not', 'no', 'never', 'neither', 'nor', 'nobody', "n't"}
    # Convert to lowercase
    headline = headline.lower()
    # Expand contractions
    headline = contractions.fix(headline)
    # convert numbers to words
    headline = replace_numbers_and_percentages(headline)
    # Remove punctuation
    headline = re.sub(r'[^\w\s]', '', headline)
    # Tokenize the text
    toks = nltk.word_tokenize(headline)
    # Remove stop words, excluding negation
    toks = remove_stop_words()
    # Lemmatize the tokens
    toks = lemmatize_tokens()
    # Handle negation
    toks = handle_negation()

    return expand_medical_abbreviations(' '.join(toks))

# Example usage
text = "The patient's BP wasn't normal, and he's experiencing SOB. We'll need to do an ECG ASAP."
preprocessed_text = preprocess_headline(text)
print(preprocessed_text)


patient blood pressure not_normal experience shortness of breath need electrocardiogram soon possible


In [8]:
import re

def extract_json(text):
    json_pattern = r'```json\n({.*?})\n```'
    json_matches = re.findall(json_pattern, text, re.DOTALL) 
    if json_matches:
        return json_matches[0]
    return None

text = """

- The note states the patient has had "further generalized tonic clonic seizures" and describes a recent severe seizure associated with injury. This indicates no appreciable reduction or improvement in seizures, meeting criteria for Engel IVB (No appreciable change).

- While IVB fits best based on the information given, there are some details missing that leave open the possibility of other scores like IVB or IVC. But IVB is the most reasonable assumption given the continued disabling seizures described.

In summary, the lack of any documented seizure improvement or reduction means IVB (No appreciable change) is the most appropriate Engel score based on the details provided.

```json
{
  "score": "IA",
  "reasoning": "The patient has been completely seizure-free since February 2005, which was over 2 years ago from the time of this clinic note. This meets the criteria for Engel class IA - 'Completely seizure-free since surgery'. No other Engel scores seem applicable based on the information provided."
}
```
"""
json_data = extract_json(text)
# convert the JSON string to a Python dictionary
import json

# handle any escaped characters in the JSON string
json_data = json_data.replace('\\"', '"')
# convert the JSON string to a Python dictionary

json_data = json.loads(json_data)


print(json_data)

AttributeError: 'NoneType' object has no attribute 'replace'