# Exclude Patients with Comfort Measures
This notebook produces a dataset that excludes the patients in our cohort who are on comfort measures. The output dataset contains the patients who are in our cohort AND are not on comfort measures. This dataset is used to examine how GroupFasterRisk performs on those patients.

In [1]:
import getpass

import pandas as pd

import mimic_pipeline as mmp
import mimic_pipeline.utils as utils
# Seed once, before any data is loaded.
# NOTE(review): presumably this fixes the random seeds of the libraries in use so that
# reruns are reproducible — confirm against mimic_pipeline.utils.
utils.seed_everything()

In [2]:
# Collect database credentials interactively so nothing is hard-coded in the notebook.
user = input("Enter your username: ")
# Read the password with getpass instead of input(): input() echoes the typed password
# on screen and leaves it visible in the notebook front end.
password = getpass.getpass("Enter your password: ")
# Project loader used below to look up tables by name.
loader = utils.DataBaseLoader(user=user, password=password)

In [3]:
# Look up the "code_status" table through the loader; the result is handled as a DataFrame below.
comfort_measure_df = loader["code_status"]
# Preview the first 10 rows.
comfort_measure_df.head(10)

Unnamed: 0,subject_id,hadm_id,icustay_id,fullcode_first,cmo_first,dnr_first,dni_first,dncpr_first,fullcode_last,cmo_last,...,dncpr_last,fullcode,cmo,dnr,dni,dncpr,dnr_first_charttime,dni_first_charttime,dncpr_first_charttime,timecmo_chart
0,4993,195833,201013,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,2112-03-20 18:30:00,NaT,NaT,NaT
1,6441,192486,202058,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,NaT,NaT,NaT,NaT
2,42132,181988,200812,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,NaT,NaT,NaT,NaT
3,97207,142037,205347,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,NaT,NaT,NaT,NaT
4,19160,142711,200300,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,NaT,NaT,NaT,NaT
5,17596,148565,205791,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,NaT,NaT,NaT,NaT
6,15485,129151,206932,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,NaT,NaT,NaT,NaT
7,51841,196761,204475,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,NaT,NaT,NaT,NaT
8,17993,147010,204158,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,NaT,NaT,NaT,NaT
9,3920,119880,210560,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,NaT,NaT,NaT,NaT


According to the official repo that produces this table (`https://github.com/MIT-LCP/mimic-code/blob/main/mimic-iii/concepts_postgres/code_status.sql`), the feature `cmo` indicates whether a given patient was ***at any time*** given a **Comfort Measures** or **Comfort Measures Only** status. Thus, we use `cmo` for the purpose of this study.

In [4]:
# Keep only the three identifier columns and the comfort-measures flag.
kept_columns = ["subject_id", "hadm_id", "icustay_id", "cmo"]
comfort_measure_df = comfort_measure_df[kept_columns]
# Preview the first 10 rows of the trimmed table.
comfort_measure_df.head(10)

Unnamed: 0,subject_id,hadm_id,icustay_id,cmo
0,4993,195833,201013,0.0
1,6441,192486,202058,0.0
2,42132,181988,200812,0.0
3,97207,142037,205347,0.0
4,19160,142711,200300,0.0
5,17596,148565,205791,0.0
6,15485,129151,206932,0.0
7,51841,196761,204475,0.0
8,17993,147010,204158,0.0
9,3920,119880,210560,0.0


In [5]:
# Rows flagged as comfort measures. NaN flags compare unequal to 1, so they are not counted.
is_cmo = comfort_measure_df["cmo"] == 1
# Count distinct patients so the number matches its label and is comparable with the
# "Total number of patients" line below. The previous `.count()[0]` returned the number
# of flagged *rows* (the same value as len() of the filtered frame) and relied on
# positional lookup in a label-indexed Series, which recent pandas versions deprecate.
# NOTE(review): rows carry an icustay_id, so a patient presumably can appear more than once.
n_cmo_patients = comfort_measure_df.loc[is_cmo, "subject_id"].nunique()
print(f"Number of unique patients on comfort measures: {n_cmo_patients}")
print(f"Total number of patients: {comfort_measure_df['subject_id'].nunique()}")
print(f"Unique values for feature 'cmo': {comfort_measure_df['cmo'].unique()}")

Number of unique patients on comfort measures: 1788
Total number of patients: 46476
Unique values for feature 'cmo': [ 0.  1. nan]


In [6]:
# Narrow the table to rows whose comfort-measures flag equals 1 (NaN and 0 are dropped).
on_comfort_measures = comfort_measure_df["cmo"].eq(1)
comfort_measure_df = comfort_measure_df.loc[on_comfort_measures]
# Number of rows that remain.
len(comfort_measure_df)

1788

In [7]:
# Load the cohort's train/test feature tables. The "-id" files are read because the cells
# below use their `subject_id` and `icustay_id` columns to match rows against the
# comfort-measures table.
train_df = pd.read_csv("data/TRAIN-union-features-id.csv")
test_df = pd.read_csv("data/TEST-union-features-id.csv")

In [8]:
# Distinct patients per split, computed first so the print statements stay short.
n_train_patients = train_df["subject_id"].nunique()
n_test_patients = test_df["subject_id"].nunique()
print(f"Number of unique patients in the training set: {n_train_patients}")
print(f"Number of unique patients in the test set: {n_test_patients}")

Number of unique patients in the training set: 22678
Number of unique patients in the test set: 7560


In [9]:
# Row counts of both splits, for comparison with the unique-patient counts printed above.
(train_df.shape[0], test_df.shape[0])

(22678, 7560)

In [10]:
def _without_comfort_measure_stays(features_df):
    """Return the rows of ``features_df`` whose ``icustay_id`` is absent from ``comfort_measure_df``."""
    flagged = features_df["icustay_id"].isin(comfort_measure_df["icustay_id"])
    return features_df[~flagged]


# Apply the same ICU-stay filter to both splits.
train_excluded_df = _without_comfort_measure_stays(train_df)
test_excluded_df = _without_comfort_measure_stays(test_df)

# Report how many distinct patients remain in each split.
n_train_remaining = train_excluded_df["subject_id"].nunique()
n_test_remaining = test_excluded_df["subject_id"].nunique()
print(f"Number of unique patients in the training set after excluding patients on comfort measures: {n_train_remaining}")
print(f"Number of unique patients in the test set after excluding patients on comfort measures: {n_test_remaining}")

Number of unique patients in the training set after excluding patients on comfort measures: 21942
Number of unique patients in the test set after excluding patients on comfort measures: 7323


In [11]:
# Rows removed from each split by the comfort-measures filter.
n_train_dropped = len(train_df) - len(train_excluded_df)
n_test_dropped = len(test_df) - len(test_excluded_df)
print(f"Excluded {n_train_dropped} patients in training set, {n_test_dropped} patients in test set")

Excluded 736 patients in training set, 237 patients in test set


Save the filtered datasets (one version with the ID columns, one without).

In [12]:
# Write the filtered splits with their identifier columns retained.
# index=False keeps the DataFrame row index out of the CSV files.
outputs_with_ids = {
    "data/TRAIN-union-features-id-excluded-cmo.csv": train_excluded_df,
    "data/TEST-union-features-id-excluded-cmo.csv": test_excluded_df,
}
for csv_path, split_df in outputs_with_ids.items():
    split_df.to_csv(csv_path, index=False)

In [13]:
# Another version without ids.
# The id-less frames get their own names instead of overwriting `train_excluded_df` /
# `test_excluded_df`. Overwriting made this cell raise a KeyError when run a second time
# (the id columns were already gone), and re-running the "-id-" save cell afterwards
# would have written id-less data to the files that are supposed to keep the ids.
id_columns = ["subject_id", "hadm_id", "icustay_id"]
train_excluded_noid_df = train_excluded_df.drop(columns=id_columns)
test_excluded_noid_df = test_excluded_df.drop(columns=id_columns)
# index=False keeps the DataFrame row index out of the CSV files.
train_excluded_noid_df.to_csv("data/TRAIN-union-features-excluded-cmo.csv", index=False)
test_excluded_noid_df.to_csv("data/TEST-union-features-excluded-cmo.csv", index=False)