# Pipeline 3: Categorical patient demographics for predicting the length of stay

This notebook presents all code used to evaluate the code generated for the third pipeline, as well as the related relational algebra.

# Relational algebra

$D_{admissions}(subject\_id, hadm\_id, admittime, dischtime, deathtime, admission\_type, admission\_location,  insurance, language, marital\_status, race, edregtime, edouttime, hospital\_expire\_flag)$

$D_{icustays}(subject\_id, hadm\_id, stay\_id, first\_careunit, last\_careunit, intime, outtime, los)$

$D_{patients}(subject\_id, gender, anchor\_age, anchor\_year, anchor\_year\_group, dod)$

$admissions\_s = \sigma_{*}(admissions)$

$icustays\_s = \sigma_{*}(icustays)$

$patients\_s = \sigma_{*}(patients)$

$admissions\_icustays = admissions\_s\bowtie_{admissions\_s.subject\_id = icustays\_s.subject\_id AND admissions\_s.hadm\_id = icustays\_s.hadm\_id}icustays\_s$

$tables\_combined = admissions\_icustays\bowtie_{admissions\_icustays.subject\_id = patients\_s.subject\_id}patients\_s$

$tables\_combined = \rho_{staytime/los, ethnicity/race}(tables\_combined)$

$final\_data = \pi_{admission\_type, admission\_location, language, ethnicity, gender, staytime}(tables\_combined)$

In [None]:
# Necessary imports and setup of connection to the database

import polars as pl
from sqlalchemy import create_engine, inspect, Table, MetaData, Column, Integer, String, DateTime, Float, Boolean
import numpy as np

# Connection to the SQLite database holding the MIMIC-IV tables.
# SQLAlchemy SQLite URLs need three slashes before a relative file path
# (sqlite:///relative/path.db) and four before an absolute one
# (sqlite:////absolute/path.db); "sqlite://" followed directly by a path is rejected.
# Change the placeholder below to the path where mimic4.db is stored.
# If timeout occurs, add pool_size and pool_timeout to the engine creation.
engine = create_engine(r"sqlite:///path/to/mimic4.db")

In [None]:
# Ground truth code, used for comparison
#
# Explicit SQLAlchemy declarations of the three source tables. They document
# the expected schema; the data itself is read further down with plain SQL.
metadata = MetaData()

admissions = Table(
    'admissions', metadata,
    Column('subject_id', Integer, nullable=False),
    Column('hadm_id', Integer, nullable=False),
    Column('admittime', DateTime, nullable=False),
    Column('dischtime', DateTime, nullable=False),
    Column('deathtime', DateTime, nullable=True),
    Column('admission_type', String, nullable=False),
    Column('admit_provider_id', String, nullable=False),
    Column('admission_location', String, nullable=False),
    Column('insurance', String, nullable=False),
    Column('language', String, nullable=False),
    Column('marital_status', String, nullable=False),
    Column('race', String, nullable=False),
    Column('edregtime', DateTime, nullable=False),
    Column('edouttime', DateTime, nullable=False),
    Column('hospital_expire_flag', Integer, nullable=False),
)

icustays = Table(
    'icustays', metadata,
    Column('subject_id', Integer, nullable=False),
    Column('hadm_id', Integer, nullable=False),
    Column('stay_id', Integer, primary_key=True),
    Column('first_careunit', String, nullable=False),
    Column('last_careunit', String, nullable=False),
    Column('intime', DateTime, nullable=False),
    Column('outtime', DateTime, nullable=False),
    Column('los', Float, nullable=False),
)

patients = Table(
    'patients', metadata,
    Column('subject_id', Integer, primary_key=True),
    Column('gender', String, nullable=False),
    # anchor_age is an ordinary attribute. It was previously flagged
    # primary_key=True as well, which declared a composite key
    # (subject_id, anchor_age) instead of subject_id alone.
    Column('anchor_age', Integer, nullable=False),
    Column('anchor_year', Integer, nullable=False),
    Column('anchor_year_group', String, nullable=False),
    Column('dod', DateTime, nullable=True),
)

# Load definitions for any further tables found in the database. With the
# default extend_existing=False, tables already declared above are not
# overwritten by the reflected schema.
metadata.reflect(bind=engine)

# Each source table is fetched in full; joining, renaming and projecting
# are done afterwards on the resulting DataFrames.
query = """
SELECT *
FROM admissions
"""

query2 = """
SELECT *
FROM icustays
"""

query3 = """
SELECT *
FROM patients
"""

# Share one connection for all three reads and let the context manager close
# it. Calling engine.connect() once per read left three connections open.
with engine.connect() as connection:
    # The 'index' column is dropped from every table.
    # NOTE(review): presumably an artefact of how the tables were written to
    # SQLite - confirm against the database.
    df1 = pl.read_database(query=query, connection=connection).drop('index')
    df2 = pl.read_database(query=query2, connection=connection).drop('index')
    df3 = pl.read_database(query=query3, connection=connection).drop('index')

# Build the ground-truth frame in one chained expression:
#   1. join admissions (df1) with icustays (df2) on subject_id and hadm_id,
#   2. join the result with patients (df3) on subject_id,
#   3. rename los -> staytime and race -> ethnicity,
#   4. keep only the columns used for the comparison.
# NOTE(review): marital_status is selected here, but the projection in the
# relational algebra section does not list it - confirm which one is intended.
df_combined = (
    df1
    .join(df2, on=["subject_id", 'hadm_id'], suffix='_df2')
    .join(df3, on=["subject_id"], suffix='_df3')
    .rename({"los": "staytime", "race": "ethnicity"})
    .select([
        "admission_type",
        "admission_location",
        "language",
        "marital_status",
        "ethnicity",
        "gender",
        "staytime",
    ])
)

# Display the resulting frame as the cell output
df_combined

# Code Validity

In the code block below, the generated code produced by the LLM can be pasted and executed, to assess if the code works without any runtime errors.

In [None]:
# Paste generated code here 

# Code correctness

To assess code correctness, execute the cell below. If the generated solution is correct, `True` is returned. Otherwise, run the subsequent cell to inspect the differences manually. The generated solution is assumed to be stored in a variable called `final_data`.

In [None]:
# Compare the ground-truth frame with the generated solution; the cell output
# is the boolean result of the comparison.
# NOTE(review): the comparison appears to be sensitive to row and column
# order, so an otherwise correct solution with a different ordering may
# report False - confirm against the Polars documentation.
df_combined.equals(final_data)

In [None]:
# Convert both Polars DataFrames to pandas for a cell-by-cell comparison
df_combined_pandas = df_combined.to_pandas()
final_data_pandas = final_data.to_pandas()

# DataFrame.compare raises ValueError unless both frames carry identical row
# and column labels. A solution with a different shape or different columns is
# exactly the case that needs inspecting, so check the labels first instead of
# letting the cell fail.
same_labels = (
    df_combined_pandas.columns.equals(final_data_pandas.columns)
    and df_combined_pandas.index.equals(final_data_pandas.index)
)

if same_labels:
    # Compare the two dataframes
    df_diff = df_combined_pandas.compare(final_data_pandas)

    # If there are differences, print them
    if not df_combined_pandas.equals(final_data_pandas):
        print(df_diff)
        if df_diff.empty:
            # No differing cells were found, yet equals() reported False;
            # show the dtypes since they are a likely cause.
            print("No differing cells found; compare the column dtypes:")
            print(df_combined_pandas.dtypes)
            print(final_data_pandas.dtypes)
else:
    df_diff = None
    print("The DataFrames are not identically labeled and cannot be compared cell by cell.")
    print(f"Ground truth: shape={df_combined_pandas.shape}, columns={list(df_combined_pandas.columns)}")
    print(f"Generated:    shape={final_data_pandas.shape}, columns={list(final_data_pandas.columns)}")

# Relative Efficiency

To record the average running time of a generated solution, paste it into the code block below and execute it. Record the average execution time per solution; it is used later to determine the relative efficiency.


In [None]:
import time
import numpy as np
import warnings

# Ignore all warnings so they do not clutter the timing output
warnings.filterwarnings('ignore')

# Number of timed repetitions; used for both the loop and the report
N_RUNS = 10

execution_times = []

for _run in range(N_RUNS):
    # perf_counter is a monotonic, high-resolution clock intended for measuring
    # durations; time.time() follows the wall clock and can jump if the
    # system time is adjusted.
    start_time = time.perf_counter()

    # Paste generated code here

    end_time = time.perf_counter()

    execution_time = end_time - start_time
    execution_times.append(execution_time)


average_execution_time = np.mean(execution_times)

print(f"The code executed in average {average_execution_time} seconds over {N_RUNS} runs.")

In [None]:
# To calculate the relative efficiency of the generated solutions
def relative_efficiencies(times):
    """Scale running times to relative efficiencies between 0 and 100.

    The fastest time maps to 100, the slowest to 0, and every other time is
    placed linearly in between.

    Args:
        times: Sequence of running times (numbers).

    Returns:
        A list with one efficiency per entry of ``times``, in the same order.
        An empty input yields an empty list. When all times are equal
        (including a single time) there is no spread to scale by, so every
        entry is reported as 100.0 instead of dividing by zero.
    """
    if len(times) == 0:
        return []

    tmin = min(times)
    spread = max(times) - tmin
    if spread == 0:
        return [100.0] * len(times)

    return [100 * (1 - (tc - tmin) / spread) for tc in times]

time_gt = [45.4] # ground truth time
solutions = [] # Add the times from the 10 runs here
times = time_gt + solutions

if solutions:
    re = relative_efficiencies(times)
    # Calculate average over the generated solutions only; index 0 holds the
    # ground truth and is skipped.
    average = np.mean(re[1:])
else:
    # Without any solution times there is nothing to rank against the ground
    # truth; skip the computation instead of failing on a zero spread.
    re = []
    average = float("nan")
    print("No solution times provided; add them to `solutions` first.")

print(f'Average: {average}')