In [None]:
# Load the libraries first

import polars as pl
import numpy as np
import os

# Read the procedures table from its parquet file into a polars DataFrame

procs = pl.read_parquet(
    source="/home/alex/ews/NEWS2_Evaluation/procedures_newest.parquet",
)

In [None]:
# Switch the working directory to the folder holding the metadata dataframe,
# because the next cell reads that file through a relative path.

os.chdir(path="/home/alex/ews/aggregated")

In [None]:
# Metadata dataframe (early warning scores, age, sex, mortality status, department, hospital ...).
# The file name is relative, so it is resolved against the current working directory.

df = pl.read_parquet(source="ews_interventions_24_updated.parquet")

In [None]:
# Adjust the datetime columns:
#   - recorded_time:   moved one hour earlier
#   - HOSP_DISCH_TIME: moved one hour earlier, then the time zone is removed
#   - deathDate:       time zone removed, no shift
# NOTE(review): the one-hour shift presumably corrects a timezone offset - confirm.
# NOTE: `df` is rebuilt from itself, so running this cell a second time shifts
# recorded_time and HOSP_DISCH_TIME by another hour. Reload `df` before re-running.

df = df.with_columns(
    recorded_time=pl.col("recorded_time") - pl.duration(hours=1),
    HOSP_DISCH_TIME=(
        pl.col("HOSP_DISCH_TIME") - pl.duration(hours=1)
    ).dt.replace_time_zone(None),
    deathDate=pl.col("deathDate").dt.replace_time_zone(None),
)

In [None]:
# Inspect the column types of both frames before joining them
print("Procedures data types:")
print(procs.schema)

print("\nEWS data types:")
print(df.schema)

# The two datetime columns that are compared later must have compatible types
print("\nProcedure_Date type:", procs.get_column("Procedure_Date").dtype)
print("recorded_time type:", df.get_column("recorded_time").dtype)

In [None]:
# Keep only the identifiers and the EWS timestamp; "recorded_time" is exposed as "ews_time"

ews_dates = df.select(
    "PT_ID",
    "CSN",
    pl.col("recorded_time").alias("ews_time"),
)

In [None]:
# Return to the evaluation directory, where the procedures data lives

os.chdir(path="/home/alex/ews/NEWS2_Evaluation")

# Left-join the EWS timestamps onto the procedures by encounter (CSN).
# NOTE: this overwrites `procs`; after this cell it also carries the columns
# brought in from ews_dates (ews_time, plus PT_ID - suffixed if procs already
# had one), and re-running the cell adds them again under suffixed names.

procs = procs.join(other=ews_dates, how="left", on="CSN")

In [None]:

# Build, for every EWS encounter (CSN), a " | "-separated list of the procedures
# the same patient had before that encounter's EWS recordings.
# Result: `final_aggregated` with columns PT_ID, CSN, Aggregated_Procedures
# ("Ingen" when the encounter has no earlier procedure).

# One row per EWS recording: patient, encounter and recording time
ews_with_ptid = df.select("PT_ID", "CSN", "recorded_time").rename({"recorded_time": "ews_time"})

# Collapse to one row per encounter, keeping its latest EWS time. A procedure lies
# before *some* recording of the encounter exactly when it lies before the latest
# one, so the matches are unchanged, but each procedure is now matched once per
# encounter instead of once per recording (which repeated its name in the output).
# The columns get explicit, unique names so the join predicates never depend on
# implicit "_right" suffixing, which changes meaning when `procs` already
# carries an `ews_time` / `CSN` column from an earlier join.
ews_encounters = (
    ews_with_ptid
    .group_by("PT_ID", "CSN")
    .agg(pl.col("ews_time").max().alias("last_ews_time"))
    .rename({"PT_ID": "ews_PT_ID", "CSN": "ews_CSN"})
)

# Only the columns needed for matching and naming. unique() removes row copies
# (e.g. those produced by an earlier join of `procs` with the EWS recordings);
# identical (patient, time, name) rows are therefore listed once.
proc_events = procs.select("PT_ID", "Procedure_Date", "Procedurenavn").unique()

# Conditional join: same patient, and the procedure happened before the EWS time
procs_before_ews = proc_events.join_where(
    ews_encounters,
    pl.col("PT_ID") == pl.col("ews_PT_ID"),
    pl.col("Procedure_Date") < pl.col("last_ews_time"),
)

# Concatenate the procedure names per EWS encounter. join_where does not preserve
# row order, so the names are sorted explicitly (chronologically, name as
# tie-breaker) to make the output deterministic.
aggregated_procedures = (
    procs_before_ews
    .group_by("ews_CSN")
    .agg(
        pl.col("Procedurenavn")
        .sort_by("Procedure_Date", "Procedurenavn")
        .str.join(" | ")
        .alias("Aggregated_Procedures")
    )
    .rename({"ews_CSN": "CSN"})
)

# Join back to all EWS CSNs for complete coverage, keeping PT_ID;
# encounters without any earlier procedure get the placeholder "Ingen"
final_aggregated = (
    ews_with_ptid
    .select("PT_ID", "CSN")
    .unique()
    .join(aggregated_procedures, on="CSN", how="left")
    .with_columns(pl.col("Aggregated_Procedures").fill_null("Ingen"))
)

In [None]:
# Make sure the output is written into the evaluation directory,
# and display the resulting working directory as the cell output
os.chdir(path="/home/alex/ews/NEWS2_Evaluation")
os.getcwd()

In [55]:
# Persist the per-encounter procedure summary as parquet
# (relative path, so it lands in the current working directory)

final_aggregated.write_parquet(file="procs_full_june25.parquet")