## NegSpacy and Custom Spans exploration

In [1]:
# !pip install spacy
# !pip install negspacy
# !python -m spacy download en_core_web_sm

In [2]:
import pandas as pd
import spacy
import pandas as pd
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span
from negspacy.negation import Negex

In [3]:
# Set display options for better viewing
pd.set_option('display.max_colwidth', None)
pd.set_option('display.expand_frame_repr', False)


## Make Test Dataset

In [4]:
# Creating the sample data
data = {
    "Incident Description": [
        "The patient had a 12-lead ECG performed.",
        "I didn't do a 12-lead ECG on the patient.",
        "No signs of stroke, but we did an ECG.",
        "The 12-lead ECG was not necessary.",
        "A 12-lead ECG was performed in the ambulance, showing ST elevation in leads II, III, and aVF, prompting immediate transport to a cardiac center.",
        "The patient was assessed for chest pain, and vital signs were recorded. No ECG was performed at this stage.",
        "Paramedics conducted a full cardiac assessment, including a 12-lead ECG, which showed normal sinus rhythm with no acute ischemic changes.",
        "Due to ongoing symptoms, the crew considered a 12-lead ECG but deferred it based on the patient's history and presentation.",
        "The ambulance team focused on managing the patient's respiratory distress with oxygen therapy and positioning.",
        "Following the initial assessment, a 12-lead ECG was not deemed necessary as the patient exhibited no cardiac-related symptoms.",
        "A standard set of observations was completed, and the crew proceeded with rapid transport to the hospital without additional interventions.",
        "A 12-lead ECG was attempted but could not be completed due to patient movement and environmental factors.",
        "After ruling out immediate life threats, paramedics continued monitoring without performing an ECG.",
        "As part of the protocol for suspected cardiac events, a 12-lead ECG was conducted, confirming atrial fibrillation with a rapid ventricular response."
    ],
    "Expected Result": [
        "Performed",
        "Not Performed",
        "12 lead not mentioned",
        "Not Performed",
        "Performed",
        "12 lead not mentioned",
        "Performed",
        "Not Performed",
        "12 lead not mentioned",
        "Not Performed",
        "12 lead not mentioned",
        "Not Performed",
        "12 lead not mentioned",
        "Performed"
    ]
}

# Creating the dataframe
df = pd.DataFrame(data)

# Displaying the dataframe
df


Unnamed: 0,Incident Description,Expected Result
0,The patient had a 12-lead ECG performed.,Performed
1,I didn't do a 12-lead ECG on the patient.,Not Performed
2,"No signs of stroke, but we did an ECG.",12 lead not mentioned
3,The 12-lead ECG was not necessary.,Not Performed
4,"A 12-lead ECG was performed in the ambulance, showing ST elevation in leads II, III, and aVF, prompting immediate transport to a cardiac center.",Performed
5,"The patient was assessed for chest pain, and vital signs were recorded. No ECG was performed at this stage.",12 lead not mentioned
6,"Paramedics conducted a full cardiac assessment, including a 12-lead ECG, which showed normal sinus rhythm with no acute ischemic changes.",Performed
7,"Due to ongoing symptoms, the crew considered a 12-lead ECG but deferred it based on the patient's history and presentation.",Not Performed
8,The ambulance team focused on managing the patient's respiratory distress with oxygen therapy and positioning.,12 lead not mentioned
9,"Following the initial assessment, a 12-lead ECG was not deemed necessary as the patient exhibited no cardiac-related symptoms.",Not Performed


In [5]:
# Load Spacy model
nlp = spacy.load("en_core_web_sm")

In [6]:
# Create a PhraseMatcher outside the component
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
custom_terms = ["12-lead ECG"]
patterns = [nlp(term) for term in custom_terms]
matcher.add("CUSTOM_ENTITY", patterns)

In [7]:
# Create custom entity component
@spacy.Language.component("custom_entity_component")
def custom_entity_component(doc):
    matches = matcher(doc)

    # If no matches, return doc unchanged
    if not matches:
        return doc

    # Create new Spans for the matches
    spans = []
    for match_id, start, end in matches:
        spans.append(Span(doc, start, end, label="CUSTOM_ENTITY"))

    # Filter out any overlapping spans
    filtered_spans = spacy.util.filter_spans(spans)

    # Combine with existing entities, avoiding duplicates
    all_ents = list(doc.ents)
    for span in filtered_spans:
        # Check if this span overlaps with existing entities
        if not any(existing.start <= span.start < existing.end or
                  existing.start < span.end <= existing.end
                  for existing in all_ents):
            all_ents.append(span)

    # Sort entities by start position
    all_ents = sorted(all_ents, key=lambda x: x.start)

    # Set entities on the doc
    doc.ents = all_ents
    return doc

In [8]:
# Add components to pipeline in correct order
nlp.add_pipe("custom_entity_component", before="ner")

# Add Negex AFTER the custom entity component and NER
nlp.add_pipe("negex", last=True,
             config={"ent_types": ["CUSTOM_ENTITY"],
                    "chunk_prefix": ["no", "not", "didn't", "never", "wasn't"]
              }
                    )

<negspacy.negation.Negex at 0x18f10febbd0>

## Simple output

In [9]:
# Function to process text and detect negation
def process_text(text):
    doc = nlp(text)

    results = []
    for ent in doc.ents:
        if ent.label_ == "CUSTOM_ENTITY":
            # Access the negex extension
            is_negated = ent._.negex
            results.append((ent.text, is_negated))

    return results

In [10]:
df["negation_results"] = df["Incident Description"].apply(process_text)

In [11]:
df

Unnamed: 0,Incident Description,Expected Result,negation_results
0,The patient had a 12-lead ECG performed.,Performed,"[(12-lead ECG, False)]"
1,I didn't do a 12-lead ECG on the patient.,Not Performed,"[(12-lead ECG, True)]"
2,"No signs of stroke, but we did an ECG.",12 lead not mentioned,[]
3,The 12-lead ECG was not necessary.,Not Performed,"[(12-lead ECG, True)]"
4,"A 12-lead ECG was performed in the ambulance, showing ST elevation in leads II, III, and aVF, prompting immediate transport to a cardiac center.",Performed,"[(12-lead ECG, False)]"
5,"The patient was assessed for chest pain, and vital signs were recorded. No ECG was performed at this stage.",12 lead not mentioned,[]
6,"Paramedics conducted a full cardiac assessment, including a 12-lead ECG, which showed normal sinus rhythm with no acute ischemic changes.",Performed,"[(12-lead ECG, False)]"
7,"Due to ongoing symptoms, the crew considered a 12-lead ECG but deferred it based on the patient's history and presentation.",Not Performed,"[(12-lead ECG, False)]"
8,The ambulance team focused on managing the patient's respiratory distress with oxygen therapy and positioning.,12 lead not mentioned,[]
9,"Following the initial assessment, a 12-lead ECG was not deemed necessary as the patient exhibited no cardiac-related symptoms.",Not Performed,"[(12-lead ECG, True)]"


This isn't very easy to interpret, so let's split it out into some better columns. 

## More complex output

In [12]:
# Function to process text and detect negation
def process_text_advanced(text):
    doc = nlp(text)

    # Default values if no entity is found
    span_detected = False
    negation_detected = False

    for ent in doc.ents:
        if ent.label_ == "CUSTOM_ENTITY":
            span_detected = True
            # Access the negex extension
            negation_detected = ent._.negex
            break  # We only care about the first 12-lead ECG mention

    if span_detected:
        if negation_detected:
            result = "Not Performed"
        else:
            result = "Performed"
    else:
        result = "12 lead not mentioned"

    # Return a dictionary with all required information
    return {
        "Span Detected": span_detected,
        "Negation Detected": negation_detected,
        "12-lead ECG performed": result
    }

# Apply function to DataFrame and expand the results
results = df["Incident Description"].apply(process_text_advanced)
df_results = pd.DataFrame(results.tolist())

# Combine with original DataFrame
final_df = pd.concat([df, df_results], axis=1)

final_df['NLP Correct'] = final_df['Expected Result'] == final_df['12-lead ECG performed']

final_df

Unnamed: 0,Incident Description,Expected Result,negation_results,Span Detected,Negation Detected,12-lead ECG performed,NLP Correct
0,The patient had a 12-lead ECG performed.,Performed,"[(12-lead ECG, False)]",True,False,Performed,True
1,I didn't do a 12-lead ECG on the patient.,Not Performed,"[(12-lead ECG, True)]",True,True,Not Performed,True
2,"No signs of stroke, but we did an ECG.",12 lead not mentioned,[],False,False,12 lead not mentioned,True
3,The 12-lead ECG was not necessary.,Not Performed,"[(12-lead ECG, True)]",True,True,Not Performed,True
4,"A 12-lead ECG was performed in the ambulance, showing ST elevation in leads II, III, and aVF, prompting immediate transport to a cardiac center.",Performed,"[(12-lead ECG, False)]",True,False,Performed,True
5,"The patient was assessed for chest pain, and vital signs were recorded. No ECG was performed at this stage.",12 lead not mentioned,[],False,False,12 lead not mentioned,True
6,"Paramedics conducted a full cardiac assessment, including a 12-lead ECG, which showed normal sinus rhythm with no acute ischemic changes.",Performed,"[(12-lead ECG, False)]",True,False,Performed,True
7,"Due to ongoing symptoms, the crew considered a 12-lead ECG but deferred it based on the patient's history and presentation.",Not Performed,"[(12-lead ECG, False)]",True,False,Performed,False
8,The ambulance team focused on managing the patient's respiratory distress with oxygen therapy and positioning.,12 lead not mentioned,[],False,False,12 lead not mentioned,True
9,"Following the initial assessment, a 12-lead ECG was not deemed necessary as the patient exhibited no cardiac-related symptoms.",Not Performed,"[(12-lead ECG, True)]",True,True,Not Performed,True


Let's check how many are correctly identified.

In [13]:
final_df['NLP Correct'].value_counts()

True     12
False     2
Name: NLP Correct, dtype: int64

Performance here isn't bad but it doesn't work on the following:

- "Due to ongoing symptoms, the crew considered a 12-lead ECG but deferred it based on the patient's history and presentation."

- "A 12-lead ECG was attempted but could not be completed due to patient movement and environmental factors."

## Try improving negations

In [14]:
# Load Spacy model
nlp = spacy.load("en_core_web_sm")

# Add components to pipeline in correct order
nlp.add_pipe("custom_entity_component", before="ner")

nlp.add_pipe("negex", last=True,
             config={
                 "ent_types": ["CUSTOM_ENTITY"],
                 "chunk_prefix": ["no", "not", "didn't", "never", "wasn't"],
                "neg_termset":{
                 "pseudo_negations": ["attempted but", "considered but"],
                 "preceding_negations": ["could not", "deferred", "no", "not", "didn't", "never", "wasn't"],
                 "following_negations": ["could not be completed", "was deferred"],
                 "termination": ["but", "however", "nevertheless", "yet"]
             }
             })

# Apply function to DataFrame and expand the results
results = df["Incident Description"].apply(process_text_advanced)
df_results = pd.DataFrame(results.tolist())

# Combine with original DataFrame
final_df_advanced_negation = pd.concat([df, df_results], axis=1)

final_df_advanced_negation['NLP Correct'] = final_df_advanced_negation['Expected Result'] == final_df_advanced_negation['12-lead ECG performed']


final_df_advanced_negation[['NLP Correct', 'Incident Description', 'Expected Result',
                            '12-lead ECG performed', 'Span Detected', 'Negation Detected']]

Unnamed: 0,NLP Correct,Incident Description,Expected Result,12-lead ECG performed,Span Detected,Negation Detected
0,True,The patient had a 12-lead ECG performed.,Performed,Performed,True,False
1,True,I didn't do a 12-lead ECG on the patient.,Not Performed,Not Performed,True,True
2,True,"No signs of stroke, but we did an ECG.",12 lead not mentioned,12 lead not mentioned,False,False
3,False,The 12-lead ECG was not necessary.,Not Performed,Performed,True,False
4,True,"A 12-lead ECG was performed in the ambulance, showing ST elevation in leads II, III, and aVF, prompting immediate transport to a cardiac center.",Performed,Performed,True,False
5,True,"The patient was assessed for chest pain, and vital signs were recorded. No ECG was performed at this stage.",12 lead not mentioned,12 lead not mentioned,False,False
6,True,"Paramedics conducted a full cardiac assessment, including a 12-lead ECG, which showed normal sinus rhythm with no acute ischemic changes.",Performed,Performed,True,False
7,False,"Due to ongoing symptoms, the crew considered a 12-lead ECG but deferred it based on the patient's history and presentation.",Not Performed,Performed,True,False
8,True,The ambulance team focused on managing the patient's respiratory distress with oxygen therapy and positioning.,12 lead not mentioned,12 lead not mentioned,False,False
9,False,"Following the initial assessment, a 12-lead ECG was not deemed necessary as the patient exhibited no cardiac-related symptoms.",Not Performed,Performed,True,False


In [15]:
final_df_advanced_negation['NLP Correct'].value_counts()

True     10
False     4
Name: NLP Correct, dtype: int64

We've succeeded in making it worse!