In [None]:
# -*- coding: utf-8 -*-
# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#       jupytext_version: 1.13.7
#   kernelspec:
#     display_name: Python 3 (ipykernel)
#     language: python
#     name: python3
# ---

## Cell 1: Setup Project Root Path & Core Imports

In [None]:
import os
import sys
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

In [None]:
# --- Locate the project root relative to the notebook ---
# The kernel's working directory is taken as the notebook folder
# (e.g., C:\...\ClinNLP\notebooks).
notebook_dir = os.getcwd()
# The project root is the parent of that folder (e.g., C:\...\ClinNLP);
# os.path.split(...)[0] is the "head" part, i.e. everything before the last component.
project_root = os.path.split(notebook_dir)[0]

In [None]:
# Make the project root importable, registering it at most once so that
# re-running this cell does not pile up duplicate sys.path entries.
if project_root in sys.path:
    print(f"Project root already in sys.path: {project_root}")
else:
    print(f"Adding project root to sys.path: {project_root}")
    # Position 0 gives this path priority over any other entry on sys.path.
    sys.path.insert(0, project_root)
# ------------------------------------

In [None]:
# Filter warnings for cleaner output
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)
# warnings.filterwarnings("ignore", message="Using categorical_feature in Dataset.") # Uncomment if needed

In [None]:
# --- Import modules from src and config ---
# The try and its except clauses must live in ONE cell: a cell containing only
# `try:` (or only `except ...:`) is a SyntaxError. No blank line is left before
# the except clauses so that they are not split off into a separate cell.
# Import failures are reported but deliberately not re-raised (best-effort);
# later cells that use these names will fail if the imports did not succeed.
try:
    from src.config import (DATA_DIR, PATIENT_DATA_PATH, NOTES_DATA_PATH,
                            AE_EVENTS_PATH, PLOT_DIR, AE_LABELS) # Add other configs if needed
    from src.data_processing import load_patient_data, load_notes_data
    from src.nlp_extraction import extract_entities_advanced_nlp # Needed to generate NLP results here
    from src.visualization import (plot_ae_frequency, plot_ae_severity,
                                   plot_ae_by_drug, plot_patient_timeline)
    print("\nSuccessfully imported modules from 'src' and config.")
    print(f"Project Root: {project_root}")
    print(f"Data Dir: {DATA_DIR}")
except ImportError as e:
    # Most likely cause: the notebook was not started from <project>/notebooks,
    # so the computed project_root does not contain the 'src' package.
    print("\nERROR: Could not import from 'src'.")
    print("Ensure the notebook is inside the 'notebooks' directory")
    print(f"and the 'src' directory exists at the project root: {project_root}")
    print(f"ImportError: {e}")
    # Optionally raise error or exit if imports fail:
    # raise e
except Exception as e:
    # Any other failure raised while the imported modules execute.
    print(f"An unexpected error occurred during import: {e}")
    # raise e

## Cell 2: Load Raw Data

In [None]:
print("\nLoading raw patient and notes data...")
# The try and both except clauses are kept in a single cell (no blank line before
# `except`), because a cell holding only half of a try statement is a SyntaxError.
# On any failure both frames are set to None so later cells can detect that
# loading did not succeed.
try:
    patients_df = load_patient_data(PATIENT_DATA_PATH)
    notes_df = load_notes_data(NOTES_DATA_PATH)
    # Optional: Load AE events if needed for cross-referencing exploration
    # ae_events_df = pd.read_csv(AE_EVENTS_PATH, parse_dates=['ae_timestamp'])
    print(f"\nLoaded {len(patients_df)} patients.")
    print(patients_df.head())
    print(f"\nLoaded {len(notes_df)} notes.")
    print(notes_df.head())
    print("\nRaw data loaded successfully.")
except FileNotFoundError as e:
    print("\nERROR: Data file not found.")
    print(e)
    print("Please ensure data files exist at the paths defined in src/config.py and are accessible.")
    # Set dfs to None to prevent errors later if loading fails
    patients_df = None
    notes_df = None
except Exception as e:
    print(f"An error occurred loading raw data: {e}")
    patients_df = None
    notes_df = None

In [None]:
# ## Cell 3: Run NLP Extraction (Potentially Time-Consuming)
#
# This step generates the `nlp_results_df` needed for the subsequent visualizations.
# If you have pre-computed NLP results, load them here instead.

In [None]:
# Start from an empty frame so the plotting cells can always test `.empty`,
# even when extraction is skipped or fails. The initialisation and the whole
# if/else are kept in ONE cell: an `else:` in its own cell is a SyntaxError,
# and keeping them together makes the cell safe to re-run on its own.
nlp_results_df = pd.DataFrame() # Initialize empty DataFrame
if notes_df is not None and not notes_df.empty:
    print("\nStarting NLP entity extraction (this may take a while)...")
    try:
        # Consider running on a smaller sample for faster exploration initially:
        # sample_notes_df = notes_df.sample(n=1000, random_state=42) # Example sample
        # nlp_results_df = extract_entities_advanced_nlp(sample_notes_df)
        # Or run on all notes:
        nlp_results_df = extract_entities_advanced_nlp(notes_df)
        if not nlp_results_df.empty:
            print(f"\nNLP extraction completed. Found {len(nlp_results_df)} affirmative entities.")
            print("Sample NLP results:")
            print(nlp_results_df.head())
        else:
            print("\nWarning: NLP extraction did not yield any affirmative entities.")
    except Exception as e:
        print(f"\nERROR during NLP extraction in the notebook: {e}")
        print("Subsequent plots relying on NLP results may fail or be empty.")
        nlp_results_df = pd.DataFrame() # Ensure it's empty on error
else:
    # Reached when notes_df is None (loading failed) or is an empty frame.
    print("\nSkipping NLP extraction because notes data failed to load.")

## Cell 4: Visualize AE Frequency

In [None]:
# Ensure plot directory exists
# PLOT_DIR is imported from src.config; exist_ok=True means no error is raised
# if the directory is already there, so this cell can be re-run freely.
os.makedirs(PLOT_DIR, exist_ok=True)

In [None]:
# AE frequency plot: only attempted when the NLP step produced rows.
# Plotting errors are reported instead of stopping the notebook.
if nlp_results_df.empty:
    print("\nSkipping AE frequency plot (no NLP results).")
else:
    try:
        # save_plot=True: per the original note, the figure is saved to output/plots
        plot_ae_frequency(nlp_results_df, save_plot=True)
    except Exception as plot_error:
        print(f"Error generating AE frequency plot: {plot_error}")

## Cell 5: Visualize AE Severity Distribution

In [None]:
# AE severity distribution: only attempted when the NLP step produced rows.
# Plotting errors are reported instead of stopping the notebook.
if nlp_results_df.empty:
    print("\nSkipping AE severity plot (no NLP results).")
else:
    try:
        plot_ae_severity(nlp_results_df, save_plot=True)
    except Exception as plot_error:
        print(f"Error generating AE severity plot: {plot_error}")

## Cell 6: Visualize AE-Drug Co-occurrence

In [None]:
# AE-drug co-occurrence: only attempted when the NLP step produced rows.
# Plotting errors are reported instead of stopping the notebook.
if nlp_results_df.empty:
    print("\nSkipping AE-drug co-occurrence plot (no NLP results).")
else:
    try:
        plot_ae_by_drug(nlp_results_df, save_plot=True)
    except Exception as plot_error:
        print(f"Error generating AE-drug co-occurrence plot: {plot_error}")

## Cell 7: Visualize Example Patient Timeline (Optional)

In [None]:
# Example patient timeline: pick the first patient that has at least one entity
# whose type is in AE_LABELS and whose severity grade is not missing.
# The selection runs inside the try block as well, so a missing column
# ('entity_type', 'severity_grade' or 'patient_id') is reported like any other
# plotting problem instead of aborting the notebook with an uncaught KeyError.
if not nlp_results_df.empty:
    try:
        # Rows that are adverse events AND carry a severity grade
        ae_with_severity = (
            nlp_results_df['entity_type'].isin(AE_LABELS)
            & nlp_results_df['severity_grade'].notna()
        )
        patients_with_aes = nlp_results_df.loc[ae_with_severity, 'patient_id'].unique()
        if len(patients_with_aes) > 0:
            example_patient_id = patients_with_aes[0] # Plot the first one found
            print(f"\nAttempting timeline plot for example Patient ID: {example_patient_id}")
            plot_patient_timeline(nlp_results_df, patient_id_to_plot=example_patient_id, save_plot=True)
        else:
            print("\nCould not find any patients with AEs+Severity in the NLP results for timeline example.")
    except Exception as e:
        print(f"Error generating patient timeline plot: {e}")
else:
    print("\nSkipping patient timeline plot (no NLP results).")

In [None]:
# Final marker: seeing this line in the output means every cell above was executed.
print("\n\n--- Data Exploration Notebook Finished ---")