In [None]:
# -*- coding: utf-8 -*-
# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#       jupytext_version: 1.13.7
#   kernelspec:
#     display_name: Python 3 (ipykernel)
#     language: python
#     name: python3
# ---

# Notebook 2: NLP Extraction Testing & Debugging

**Objective:** Load the spaCy model and test the NLP extraction pipeline on specific examples or subsets of notes. This is useful for debugging entity recognition, normalization, severity linking, and negation detection.

## Cell 1: Setup Project Root Path & Core Imports

In [None]:
import os
import sys
import pandas as pd
import spacy # Import spacy directly for testing
import warnings
from spacy.matcher import Matcher
from spacy.tokens import Span
import sys
# Show which interpreter and module search path this kernel is using
print(sys.executable, sys.path, sep="\n")

In [None]:
# --- Locate the project root ---
# The kernel's working directory is taken as the notebook directory
# (e.g., C:\...\ClinNLP\notebooks); its parent is the project root
# (e.g., C:\...\ClinNLP).
notebook_dir = os.getcwd()
project_root = os.path.split(notebook_dir)[0]

In [None]:
# Make the project root importable. It is inserted at position 0 so that it
# takes priority over every other entry already on sys.path.
if project_root in sys.path:
    print(f"Project root already in sys.path: {project_root}")
else:
    print(f"Adding project root to sys.path: {project_root}")
    sys.path.insert(0, project_root)
# ------------------------------------

In [None]:
# Filter warnings for cleaner output
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

In [None]:
# --- Import project modules (the src package and its config) ---
# Failures are reported with a hint instead of being raised, so the remaining
# cells can still be opened and inspected.
try:
    from src.config import (
        NOTES_DATA_PATH,
        AE_NORMALIZATION_MAP,
        SEVERITY_TERMS,
        SEVERITY_PATTERNS,
        AE_LABELS,
        DRUG_LABELS,
        NEGSPACY_AVAILABLE,  # flag checked before reading negation results
    )
    from src.data_processing import load_notes_data
    from src.nlp_extraction import (
        setup_nlp,
        link_ae_severity_improved,
        extract_entities_advanced_nlp,
    )

    print("\nSuccessfully imported modules from 'src' and config.")
    print(f"Project Root: {project_root}")
except ImportError as import_err:
    # Most likely cause: the notebook is not running from <project>/notebooks
    print(f"\nERROR: Could not import from 'src'.")
    print(f"Ensure the notebook is inside the 'notebooks' directory")
    print(f"and the 'src' directory exists at the project root: {project_root}")
    print(f"ImportError: {import_err}")
    # raise import_err
except Exception as unexpected_err:
    # Anything else raised while the project modules were being imported
    print(f"An unexpected error occurred during import: {unexpected_err}")
    # raise unexpected_err

In [None]:
# Import errors are reported by the try/except in the import cell above.
# `except` clauses cannot stand in a cell without a matching `try` (that is a
# SyntaxError), so this cell instead verifies that the project names the later
# cells rely on are actually defined.
required_imports = (
    "NOTES_DATA_PATH",
    "AE_NORMALIZATION_MAP",
    "SEVERITY_TERMS",
    "AE_LABELS",
    "NEGSPACY_AVAILABLE",
    "load_notes_data",
    "setup_nlp",
    "link_ae_severity_improved",
    "extract_entities_advanced_nlp",
)
missing_imports = [name for name in required_imports if name not in globals()]
if missing_imports:
    print(f"\nERROR: Missing project imports: {', '.join(missing_imports)}")
    print("Cells that use these names will fail until the import cell succeeds.")
else:
    print("\nAll required project imports are available.")

## Cell 2: Load Sample Notes Data

In [None]:
# Sampling settings: frames with more than NOTES_SAMPLE_THRESHOLD rows are
# reduced to NOTES_SAMPLE_SIZE rows, drawn reproducibly with NOTES_SAMPLE_SEED.
NOTES_SAMPLE_THRESHOLD = 50
NOTES_SAMPLE_SIZE = 20
NOTES_SAMPLE_SEED = 42


def select_notes_sample(notes, threshold=NOTES_SAMPLE_THRESHOLD,
                        sample_size=NOTES_SAMPLE_SIZE, seed=NOTES_SAMPLE_SEED):
    """Return the subset of ``notes`` to use for detailed testing.

    Args:
        notes: DataFrame of notes, or None when nothing was loaded.
        threshold: Frames with more rows than this are sampled.
        sample_size: Number of rows to draw when sampling (capped at the
            number of rows available).
        seed: ``random_state`` passed to ``DataFrame.sample``.

    Returns:
        None when ``notes`` is None; a random sample when ``notes`` has more
        than ``threshold`` rows; otherwise ``notes`` itself, unchanged.
    """
    if notes is None:
        return None
    if len(notes) > threshold:
        return notes.sample(n=min(sample_size, len(notes)), random_state=seed)
    return notes


print("\nLoading notes data...")
notes_df = None  # stays None when loading fails
notes_sample_df = None  # stays None when no sample could be selected

try:
    notes_df = load_notes_data(NOTES_DATA_PATH)

    # Guard against a loader that hands back nothing, so the problem is
    # reported clearly instead of surfacing as a TypeError from len(None).
    if notes_df is not None:
        print(f"\nLoaded {len(notes_df)} notes.")
        print(notes_df.head())

        # --- !! SELECT A SMALL SAMPLE FOR TESTING !! ---
        if len(notes_df) > NOTES_SAMPLE_THRESHOLD:
            print(f"\nSelecting a small sample of {NOTES_SAMPLE_SIZE} notes for detailed testing...")
        notes_sample_df = select_notes_sample(notes_df)

    # An empty frame is reported as "no sample" as well, since there is
    # nothing in it to test.
    if notes_sample_df is not None and not notes_sample_df.empty:
        print(f"\nUsing sample of {len(notes_sample_df)} notes for testing.")
    else:
        print("\nSample DataFrame could not be created (notes_df might be empty).")

except FileNotFoundError:
    print(f"\nERROR: Notes data file not found at {NOTES_DATA_PATH}")
except Exception as load_err:
    # Reported rather than raised so the cells below can skip gracefully
    print(f"An error occurred loading notes data: {load_err}")

In [None]:
# Loading errors are reported by the try/except in the loading cell above.
# `except` clauses cannot stand in a cell without a matching `try` (that is a
# SyntaxError), so this cell only records whether the notes are available.
notes_loaded = globals().get("notes_df") is not None
if not notes_loaded:
    print("\nNotes data is not loaded; cells that depend on it will skip their work.")

## Cell 3: Setup spaCy Model & Components

In [None]:
print("\nSetting up spaCy NLP model...")
nlp, matcher = None, None
# The model is only initialized when the notes data was loaded
if notes_df is None:
    print("Skipping spaCy setup because notes data failed to load.")
else:
    try:
        # setup_nlp returns the pipeline together with its severity matcher
        nlp, matcher = setup_nlp()
        print("\nspaCy model and components initialized successfully.")
    except Exception as setup_err:
        print(f"ERROR setting up spaCy model: {setup_err}")

## Cell 4: Process a Single Note Example

In [None]:
# Pick the note to inspect: the first row of the sample, or None when there is
# no sample to draw from.
example_note_index = None
if notes_sample_df is not None and not notes_sample_df.empty:
    example_note_index = notes_sample_df.index[0]
# To inspect a specific note instead, look it up by id:
# example_note_id = 'some_note_123'
# example_note_index = notes_df[notes_df['note_id'] == example_note_id].index[0]

In [None]:
# Walk one note through each stage of the pipeline and print what every stage
# produces: entities, severity matches, AE/severity links, and normalization.
if nlp and example_note_index is not None:
    example_text = notes_sample_df.loc[example_note_index, 'note_text']
    example_note_id = notes_sample_df.loc[example_note_index, 'note_id']
    print(f"\n--- Processing Example Note (ID: {example_note_id}) ---")
    print("Text:")
    print(example_text)
    print("-" * 30)

    # Run the loaded pipeline over the note text
    doc = nlp(example_text)

    # --- Entities, with negation status where it can be determined ---
    print("\nEntities Found (doc.ents):")
    if doc.ents:
        for ent in doc.ents:
            if NEGSPACY_AVAILABLE and hasattr(ent._, "negex"):
                negation_status = f" | Negated: {ent._.negex}"
            elif ent.label_ in AE_LABELS and not NEGSPACY_AVAILABLE:
                negation_status = " | Negation: (negspacy unavailable)"
            else:
                negation_status = ""

            print(f"  - Text: '{ent.text}' | Label: {ent.label_}{negation_status}")
    else:
        print("  No entities found by the model.")

    # --- Severity terms located by the matcher ---
    print("\nSeverity Terms Found (matcher):")
    severity_matches = matcher(doc)
    if severity_matches:
        for match_id, start, end in severity_matches:
            # Resolve the match id back to its string label for display
            span = Span(doc, start, end, label=nlp.vocab.strings[match_id])
            print(f"  - Text: '{span.text}' | Label: {span.label_} | Position: ({start}-{end})")
    else:
        print("  No severity terms found by the matcher.")

    # --- Link each AE entity to a severity term ---
    print("\nLinking AEs and Severity (Simple Proximity):")
    ae_entities = [candidate for candidate in doc.ents if candidate.label_ in AE_LABELS]
    if ae_entities:
        ae_severity_links = link_ae_severity_improved(doc, ae_entities, severity_matches)
        for ae, linked_info in ae_severity_links.items():
            severity_term = linked_info['severity_term']
            severity_grade = linked_info['severity_grade']
            print(f"  - AE: '{ae.text}' -> Severity Term: '{severity_term}' (Grade: {severity_grade})")
    else:
        print("  No AE entities found to link severity.")

    # --- Normalization: only entities whose text actually changes are shown ---
    print("\nNormalization Check:")
    if ae_entities:
        for ent in ae_entities:
            original_text = ent.text.lower().strip()
            # Terms missing from the map fall back to their own text
            normalized_text = AE_NORMALIZATION_MAP.get(original_text, original_text)
            if original_text != normalized_text:
                print(f"  - Original: '{original_text}' -> Normalized: '{normalized_text}'")
    else:
        print("  No AE entities found to normalize.")

else:
    print("\nSkipping single note processing (NLP model not loaded or no example index).")

## Cell 5: Run Full Extraction Pipeline on the Small Sample

This cell runs the main extraction function from `nlp_extraction.py` on the sample data.

In [None]:
# Run the complete extraction function over the sample and spot-check the
# resulting table. Skipped entirely when there is no sample data.
if notes_sample_df is None or notes_sample_df.empty:
    print("\nSkipping full extraction on sample (sample data not available).")
else:
    print(f"\n--- Running full extraction pipeline on sample of {len(notes_sample_df)} notes ---")
    try:
        # Retry the setup in case the setup cell was skipped or failed
        if not nlp:
            nlp, matcher = setup_nlp()

        if not nlp:
            print("Skipping full extraction on sample because NLP setup failed.")
        else:
            sample_nlp_results_df = extract_entities_advanced_nlp(notes_sample_df)

            if sample_nlp_results_df.empty:
                print("\nWarning: Full extraction on sample yielded no affirmative entities.")
            else:
                print(f"\nExtraction on sample completed. Found {len(sample_nlp_results_df)} affirmative entities.")
                print("Sample results from full pipeline run on sample:")
                preview_columns = ['note_id', 'entity_type', 'entity_text', 'severity_grade']
                print(sample_nlp_results_df[preview_columns].head(10))

                # Spot checks on the extracted rows
                print("\nChecking sample results for specific conditions:")
                print("AEs with Severity:")
                has_severity = sample_nlp_results_df['severity_grade'].notna()
                is_adverse_event = sample_nlp_results_df['entity_type'].isin(AE_LABELS)
                print(sample_nlp_results_df[has_severity & is_adverse_event].head())
                print("\nNormalized AEs:")
                text_changed = sample_nlp_results_df['entity_text_original'] != sample_nlp_results_df['entity_text']
                print(sample_nlp_results_df[text_changed].head())

    except Exception as pipeline_err:
        print(f"ERROR during full extraction pipeline run on sample: {pipeline_err}")

## Cell 6: Further Testing (Add more specific tests as needed)

In [None]:
# Example: check one specific normalization rule
test_term = "low white count"
expected_normalized = "Neutropenia"
# Terms missing from the map fall back to themselves
normalized = AE_NORMALIZATION_MAP.get(test_term, test_term)
print(f"\nTesting normalization for '{test_term}': Result = '{normalized}'")
# Report the outcome with a plain comparison rather than a caught `assert`:
# assert statements are removed when Python runs with -O, which would make
# this check print PASSED regardless of the map's contents. A failure is
# reported without stopping notebook execution.
if normalized == expected_normalized:
    print("  Assertion PASSED.")
else:
    print("  Assertion FAILED.")

In [None]:
# Example: check one specific severity term
test_sev = "g3"
expected_grade = 3
# .get returns None when the term is not in the severity table
grade = SEVERITY_TERMS.get(test_sev)
print(f"Testing severity for '{test_sev}': Result = {grade}")
# Report the outcome with a plain comparison rather than a caught `assert`:
# assert statements are removed when Python runs with -O, which would make
# this check print PASSED regardless of the table's contents. A failure is
# reported without stopping notebook execution.
if grade == expected_grade:
    print("  Assertion PASSED.")
else:
    print("  Assertion FAILED.")

Add more tests for edge cases, different sentence structures, negation patterns, etc.

In [None]:
# Closing banner: marks in the output that every cell above has been executed
print("\n\n--- NLP Testing Notebook Finished ---")