In [8]:
import polars as pl
import os
from datetime import timedelta

In [5]:
# Input tables: the procedures table and the EWS table (both parquet).
PROCEDURES_PATH = "/home/alex/ews/NEWS2_Evaluation/procedures_newest.parquet"
EWS_PATH = "/home/alex/ews/NEWS2_Evaluation/finalized_with_embeddings.parquet"

procedures = pl.read_parquet(PROCEDURES_PATH)
ews = pl.read_parquet(EWS_PATH)

In [None]:
# Attach the EWS recorded_time to each procedure row, matching on patient
# (PT_ID) and hospitalization (CSN). A left join keeps procedures without a
# match; those rows get a null recorded_time and are dropped by the filter below.
# NOTE(review): if ews holds several rows per (PT_ID, CSN), each procedure row
# is repeated once per matching recorded_time - confirm this is intended.
ews_times = ews.select("PT_ID", "CSN", "recorded_time")
procedures = procedures.join(ews_times, on=["PT_ID", "CSN"], how="left")

# Keep only procedures dated from recorded_time up to 24 hours later
# (both ends inclusive). The cutoff_time column stays on the result.
window_end = pl.col("recorded_time") + timedelta(hours=24)
has_both_times = (
    pl.col("recorded_time").is_not_null()
    & pl.col("Procedure_Date").is_not_null()
)
inside_window = pl.col("Procedure_Date").is_between(
    pl.col("recorded_time"), pl.col("cutoff_time")
)
procedures = (
    procedures
    .with_columns(cutoff_time=window_end)
    .filter(has_both_times & inside_window)
    .sort(["PT_ID", "CSN"])
)

In [None]:
# Restrict procedures to hospitalizations (CSN) that also appear in the EWS data.
ews_csns = ews.get_column("CSN")
procedures = procedures.filter(pl.col("CSN").is_in(ews_csns))

# Preview the filtered procedures.
procedures.head()

In [None]:
# The previous cell already restricts procedures to CSNs present in ews, so
# filtering again here would be a no-op. Verify that invariant instead of
# repeating the filter, then preview the result.
csns_missing_from_ews = procedures.filter(~pl.col("CSN").is_in(ews["CSN"]))
assert csns_missing_from_ews.height == 0, "procedures contains CSNs that are not in ews"

procedures.head()

In [12]:
# List the distinct SKS_Group values recorded for each hospitalization (CSN):
# collect the unique groups per CSN, then explode back to one row per
# (CSN, SKS_Group) pair, ordered by CSN.
sks_groups_per_csn = procedures.group_by("CSN").agg(pl.col("SKS_Group").unique())
csns_interventions = sks_groups_per_csn.explode("SKS_Group").sort("CSN")

In [None]:
# Preview the first rows of the (CSN, SKS_Group) table built above.
csns_interventions.head()

In [14]:
# Keep the (CSN, SKS_Group) rows whose group is either
# "Anesthesia or Intensive Care" or "Surgical Operations".
target_sks_groups = ["Anesthesia or Intensive Care", "Surgical Operations"]
csns_interventions_groups = csns_interventions.filter(
    pl.col("SKS_Group").is_in(target_sks_groups)
)

In [15]:
# Distinct CSNs having at least one procedure whose SKS_Code contains "BGDA"
# (ventilation support).
has_bgda_code = pl.col("SKS_Code").str.contains("BGDA")
csns_bgda = procedures.filter(has_bgda_code).select(pl.col("CSN").unique())

In [16]:
# Union of the two criteria: CSNs with one of the selected SKS_Groups plus
# CSNs with a BGDA code, de-duplicated into a single-column (CSN) frame.
csns_with_interventions = pl.concat(
    [csns_interventions_groups.select("CSN"), csns_bgda]
).unique()

In [17]:
# Discard a null CSN, if present (the frame has the single column CSN).
csns_with_interventions = csns_with_interventions.drop_nulls(subset=["CSN"])

In [19]:
# Back on the EWS data: flag every row with Interventions_24.
# A row gets 1 when its CSN is among the hospitalizations that had
# Anesthesia / Intensive Care, Surgical Operations, or a BGDA (ventilation
# support) code, OR when early_icu_respiratory_24h == 1; otherwise it gets 0.
# Rows where the combined condition evaluates to null also fall through to 0.
#
# is_in is given the CSN column as a Series (the same form used when filtering
# procedures against ews["CSN"]) rather than a one-column DataFrame, so the
# membership test does not rely on an implicit DataFrame-to-Series conversion.
intervention_csns = csns_with_interventions["CSN"]

ews = ews.with_columns(
    Interventions_24=pl.when(
        pl.col("CSN").is_in(intervention_csns)
        | (pl.col("early_icu_respiratory_24h") == 1)
    )
    .then(1)
    .otherwise(0)
)

In [None]:
# Number of EWS rows for each value of Interventions_24.
ews.get_column("Interventions_24").value_counts()

In [None]:
# Interventions_24 counts together with each value's share of all rows,
# expressed as a percentage rounded to two decimals.
raw_counts = ews.get_column("Interventions_24").value_counts()
share_pct = (pl.col("count") / pl.col("count").sum() * 100).round(2)
intervention_counts = raw_counts.with_columns(percentage=share_pct).sort(
    "Interventions_24"
)

intervention_counts

In [None]:
# Persist the EWS frame (now carrying Interventions_24) as parquet.
# The path is relative, so the file lands in the current working directory,
# which is printed first to show where that is.
output_path = "ews_interventions_24.parquet"

print(os.getcwd())

ews.write_parquet(output_path)