# Feature Importance with Captum

In [1]:
import torch
import pandas as pd
import numpy as np

WORKSPACE_DIR = '/workspaces/msc-thesis-recurrent-health-modeling'
print(f"Workspace directory: {WORKSPACE_DIR}")

API_DATA_DEMO_DIR = f"{WORKSPACE_DIR}/data/mimic-api-demo"
print(f"API data demo directory: {API_DATA_DEMO_DIR}")

Workspace directory: /workspaces/msc-thesis-recurrent-health-modeling
API data demo directory: /workspaces/msc-thesis-recurrent-health-modeling/data/mimic-api-demo


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


## Utils

In [3]:
from recurrent_health_events_prediction.model_selection.deep_learning.utils import plot_feature_attributions

def get_batch_by_subject(dataset, subject_ids):
    """
    Given a HospReadmDataset and a list of subject_ids,
    return a batch dict with stacked tensors:
        - x_current: [B, D_curr]
        - x_past:    [B, max_seq_len, D_long]
        - mask_past: [B, max_seq_len]
        - (optional) y, subject_id, etc.
    """
    subject_ids = set(subject_ids)

    batch_samples = [
        dataset.samples[i]
        for i in range(len(dataset))
        if dataset.samples[i]["subject_id"] in subject_ids
    ]

    # stack components into batch tensors
    x_current = torch.stack([torch.tensor(s["x_current"]) for s in batch_samples], dim=0)
    x_past    = torch.stack([torch.tensor(s["x_past"])    for s in batch_samples], dim=0)
    mask_past = torch.stack([torch.tensor(s["mask_past"]) for s in batch_samples], dim=0)

    # optional outputs
    y = torch.stack([torch.tensor(s["y"]) for s in batch_samples], dim=0)

    return {
        "x_current": x_current,
        "x_past": x_past,
        "mask_past": mask_past,
        "y": y,
        "subject_id": [s["subject_id"] for s in batch_samples],
        "seq_len": [s["seq_len"] for s in batch_samples],
        "t_index": [s["t_index"] for s in batch_samples],
    }
    
def get_results_for_sample(subject_ids_b, sample_idx, df_curr, df_past):
    subject_id = subject_ids_b[sample_idx]
    print("Subject ID of sample {}: {}".format(sample_idx, subject_id))
    df_curr_sample = df_curr[df_curr["sample_idx"] == sample_idx].copy()
    df_curr_sample = df_curr_sample.sort_values(by="attribution", ascending=False).reset_index(drop=True)
    
    df_past_sample = df_past[df_past["sample_idx"] == sample_idx].copy()
    df_past_sample = df_past_sample.sort_values(by="attribution", ascending=False).reset_index(drop=True)
    
    fig = plot_feature_attributions(
        df_curr_sample,
        title="Current Visit Features - IG Attributions Subject {}".format(subject_id),
        feature_col="feature",
        attr_col="attribution_signed",
        top_k=None,
    )

    fig.show()
    fig.write_json(f"results/feat_importance_subject{subject_id}_curr.json")
    
    fig = plot_feature_attributions(
        df_past_sample,
        title="Longitudinal Features - IG Attributions Subject {}".format(subject_id),
        feature_col="feature",
        attr_col="attribution_signed",
        top_k=None,
    )

    fig.show()
    fig.write_json(f"results/feat_importance_subject{subject_id}_past.json")
    
    df_curr_sample[["feature", "attribution_signed", "value", "mean_value"]].to_csv(f"results/feat_importance_subject{subject_id}_curr.csv", index=False)
    df_past_sample[["feature", "attribution_signed", "value", "mean_value"]].to_csv(f"results/feat_importance_subject{subject_id}_past.csv", index=False)
    
    return df_curr_sample, df_past_sample

In [4]:
def inverse_scale_mean(df_mean, scaler, scaler_cols):
    """
    df_mean: DataFrame with columns ['feature', 'mean_value']
    scaler: fitted sklearn scaler (e.g., StandardScaler)
    scaler_cols: list of column names the scaler was trained on
    """

    # Prepare output column
    df_mean = df_mean.copy()
    df_mean["mean_value_unscaled"] = df_mean["mean_value"]

    # Work only on rows whose feature was scaled
    mask = df_mean["feature"].isin(scaler_cols)
    scaled_feats = df_mean.loc[mask, "feature"]

    if scaled_feats.empty:
        return df_mean  # nothing to inverse-transform

    # Construct the vector in *scaler column order*
    x_scaled = np.zeros((1, len(scaler_cols)))

    # Fill in only the features we want to inverse-transform
    for feat, val in zip(df_mean.loc[mask, "feature"], df_mean.loc[mask, "mean_value"]):
        idx = scaler_cols.index(feat)
        x_scaled[0, idx] = val

    # Inverse transform
    x_unscaled = scaler.inverse_transform(x_scaled)[0]

    # Assign unscaled values back to df
    for feat in scaled_feats:
        idx = scaler_cols.index(feat)
        df_mean.loc[df_mean["feature"] == feat, "mean_value_unscaled"] = x_unscaled[idx]

    return df_mean



## Load Model

In [5]:
import yaml
from api.services.prediction import ModelPrediction
from importlib import resources as impresources
import api.configs as configs

with open(impresources.files(configs) / "config.yaml", encoding="utf-8") as f:
    api_config = yaml.safe_load(f)
    
model_prediction_service = ModelPrediction(api_config=api_config)

In [6]:
model_api = model_prediction_service.model
model_api

AttentionPoolingNetCurrentQuery(
  (key_proj): Linear(in_features=8, out_features=16, bias=False)
  (val_proj): Linear(in_features=8, out_features=16, bias=False)
  (query_proj): Linear(in_features=19, out_features=16, bias=False)
  (classifier_head): Sequential(
    (fc1): Linear(in_features=35, out_features=64, bias=True)
    (relu1): ReLU()
    (dropout1): Dropout(p=0.014417996600305944, inplace=False)
    (fc2): Linear(in_features=64, out_features=1, bias=True)
  )
)

In [7]:
model_config = model_prediction_service.model_config
print("Model class:", model_config.get("model_class"))
print("Model params:", model_config.get("model_params"))

Model class: AttentionPoolingNetCurrentQuery
Model params: {'dropout': 0.014417996600305944, 'hidden_size_head': 64, 'hidden_size_seq': 16, 'input_size_curr': 19, 'input_size_seq': 8, 'scale_scores': False, 'use_separate_values': True}


In [8]:
current_feat_cols = model_config.get("current_feat_cols")
print("Current-visit features: ")
for col in current_feat_cols:
    print(" -", col)

Current-visit features: 
 - LOG_HOSPITALIZATION_DAYS
 - LOG_DAYS_IN_ICU
 - CHARLSON_INDEX
 - LOG_NUM_DRUGS
 - NUM_PROCEDURES
 - LOG_PARTICIPATION_DAYS
 - HAS_DIABETES
 - HAS_COPD
 - HAS_CONGESTIVE_HF
 - DISCHARGE_LOCATION_POST_ACUTE_CARE
 - DISCHARGE_LOCATION_HOME
 - AGE
 - GENDER_M
 - ADMISSION_TYPE_ELECTIVE
 - ETHNICITY_WHITE
 - ETHNICITY_BLACK
 - ETHNICITY_HISPANIC
 - INSURANCE_MEDICAID
 - INSURANCE_PRIVATE


In [9]:
longitudinal_feat_cols = model_config.get("longitudinal_feat_cols")
print("Longitudinal features: ")
for col in longitudinal_feat_cols:
    print(" -", col)

Longitudinal features: 
 - LOG_HOSPITALIZATION_DAYS
 - LOG_DAYS_IN_ICU
 - CHARLSON_INDEX
 - LOG_NUM_DRUGS
 - NUM_PROCEDURES
 - DISCHARGE_LOCATION_POST_ACUTE_CARE
 - ADMISSION_TYPE_ELECTIVE
 - LOG_DAYS_UNTIL_NEXT_HOSPITALIZATION


## Load Data and Create Torch Dataset

In [10]:
admissions_df = pd.read_csv(f"{API_DATA_DEMO_DIR}/admissions.csv")
diagnoses_df = pd.read_csv(f"{API_DATA_DEMO_DIR}/diagnoses.csv")
icu_stays_df = pd.read_csv(f"{API_DATA_DEMO_DIR}/icu_stays.csv")
procedures_df = pd.read_csv(f"{API_DATA_DEMO_DIR}/procedures.csv")
prescriptions_df = pd.read_csv(f"{API_DATA_DEMO_DIR}/prescriptions.csv")
patients_df = pd.read_csv(f"{API_DATA_DEMO_DIR}/patients.csv")
targets_df = pd.read_csv(f"{API_DATA_DEMO_DIR}/targets.csv")

In [11]:
preprocessed_df = model_prediction_service._get_features_and_target(
    admissions_df=admissions_df,
    diagnoses_df=diagnoses_df,
    icu_stays_df=icu_stays_df,
    procedures_df=procedures_df,
    prescriptions_df=prescriptions_df,
    patients_df=patients_df,
    targets_df=targets_df,
)
preprocessed_df = model_prediction_service._one_hot_encode_features(preprocessed_df)

In [12]:
preprocessed_df.head()

Unnamed: 0,HADM_ID,ADMITTIME,DISCHTIME,HOSPITALIZATION_DAYS,NUM_COMORBIDITIES,TYPES_COMORBIDITIES,HAS_DIABETES,HAS_COPD,HAS_CONGESTIVE_HF,NEXT_ADMISSION_TYPE,...,ADMISSION_TYPE_URGENT,INSURANCE_GOVERNMENT,INSURANCE_MEDICAID,INSURANCE_MEDICARE,INSURANCE_PRIVATE,ETHNICITY_BLACK,ETHNICITY_HISPANIC,ETHNICITY_WHITE,DISCHARGE_LOCATION_HOME,DISCHARGE_LOCATION_POST_ACUTE_CARE
0,182104,2131-04-30 07:15:00,2131-05-08 14:00:00,8.28125,2,"[other, chronic_pulmonary_disease]",False,True,False,EMERGENCY,...,False,False,False,True,False,False,False,True,True,False
1,122659,2131-05-12 19:49:00,2131-05-25 13:30:00,12.736806,2,"[other, chronic_pulmonary_disease]",False,True,False,,...,False,False,False,True,False,False,False,True,False,True
2,191941,2115-02-20 17:41:00,2115-02-21 16:30:00,0.950694,2,"[other, renal_disease]",False,False,False,EMERGENCY,...,False,False,False,False,True,False,True,False,True,False
3,182383,2121-11-30 19:24:00,2121-12-05 14:18:00,4.7875,4,"[congestive_heart_failure, other, renal_diseas...",False,False,True,,...,False,False,False,True,False,False,True,False,True,False
4,145243,2137-07-15 15:31:00,2137-07-17 12:00:00,1.853472,3,"[other, cerebrovascular_disease, myocardial_in...",False,False,False,EMERGENCY,...,False,False,False,True,False,False,False,True,True,False


In [13]:
num_hosp_per_patient_df = preprocessed_df.groupby("SUBJECT_ID")["HADM_ID"].nunique().reset_index()
num_hosp_per_patient_df.rename(columns={"HADM_ID": "num_hospitalizations"}, inplace=True)
num_hosp_per_patient_df

Unnamed: 0,SUBJECT_ID,num_hospitalizations
0,36,2
1,107,2
2,222,4
3,236,2
4,291,2
...,...,...
995,99346,2
996,99469,2
997,99503,2
998,99650,2


In [14]:
preprocessed_scaled_df = model_prediction_service._scale_features(preprocessed_df)
pytorch_dataset = model_prediction_service._create_pytorch_dataset(preprocessed_scaled_df)

In [15]:
x_curr_ex, x_past_ex, mask_ex, label_ex = pytorch_dataset[0]
print("Shapes of tensors from dataset:")
print(f"x_curr: {x_curr_ex.shape}")
print(f"x_past: {x_past_ex.shape}")
print(f"mask: {mask_ex.shape}")
print(f"label: {label_ex.shape}")

Shapes of tensors from dataset:
x_curr: torch.Size([19])
x_past: torch.Size([4, 8])
mask: torch.Size([4])
label: torch.Size([])


In [16]:
data_loader = torch.utils.data.DataLoader(
    pytorch_dataset, batch_size=model_config["batch_size"], shuffle=False
)

In [17]:
x_curr, x_past, mask, label = next(iter(data_loader))
print("Shapes of tensors from DataLoader:")
print("x_curr:", x_curr.shape)
print("x_past:", x_past.shape)
print("mask:", mask.shape)
print("label:", label.shape)

Shapes of tensors from DataLoader:
x_curr: torch.Size([64, 19])
x_past: torch.Size([64, 4, 8])
mask: torch.Size([64, 4])
label: torch.Size([64])


## Get Unscaled Mean Feature Values

In [41]:
stats = model_prediction_service.train_explain_stats
print("Train explain stats keys:", stats.keys())

scaler = model_prediction_service.scaler
scaler_cols = scaler.feature_names_in_.tolist()
print("Scaler columns:", scaler_cols)

mean_curr = stats["mean_curr"].cpu().numpy()
mean_past = stats["mean_past"].cpu().numpy()

mean_curr_df = pd.DataFrame({
    "feature": current_feat_cols,
    "mean_value": mean_curr,
})
mean_past_df = pd.DataFrame({
    "feature": longitudinal_feat_cols,
    "mean_value": mean_past,
})

mean_curr_df = inverse_scale_mean(mean_curr_df, scaler, scaler_cols)
mean_past_df = inverse_scale_mean(mean_past_df, scaler, scaler_cols)

mean_curr_df.to_csv("results/mean_current_features.csv", index=False)
mean_past_df.to_csv("results/mean_longitudinal_features.csv", index=False)

Train explain stats keys: dict_keys(['mean_curr', 'mean_past', 'has_median', 'median_curr', 'median_past'])
Scaler columns: ['AGE', 'LOG_HOSPITALIZATION_DAYS', 'LOG_DAYS_IN_ICU', 'LOG_DAYS_UNTIL_NEXT_HOSPITALIZATION', 'LOG_PARTICIPATION_DAYS', 'LOG_NUM_DRUGS', 'NUM_PROCEDURES', 'CHARLSON_INDEX', 'NUM_PREV_HOSPITALIZATIONS', 'NUM_COMORBIDITIES', 'PARTICIPATION_DAYS']



Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value '2.1887777979456757' has dtype incompatible with float32, please explicitly cast to a compatible dtype first.


Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value '2.144071655671188' has dtype incompatible with float32, please explicitly cast to a compatible dtype first.



In [48]:
import numpy as np
import pandas as pd

def inverse_log_features(df, value_col="mean_value_unscaled"):
    """
    - Applies np.expm1() to rows where feature starts with 'LOG_'
    - Removes the 'LOG_' prefix from feature names
    """
    df = df.copy()

    # Mask for logged features
    mask = df["feature"].str.startswith("LOG_")

    # Inverse log1p → expm1
    df.loc[mask, value_col] = np.expm1(df.loc[mask, value_col])

    # Remove prefix LOG_
    df.loc[mask, "feature"] = df.loc[mask, "feature"].str[len("LOG_"):]

    return df


In [49]:
df1 = inverse_log_features(mean_curr_df)

In [50]:
df1

Unnamed: 0,feature,mean_value,mean_value_unscaled
0,HOSPITALIZATION_DAYS,0.001942,7.924299
1,DAYS_IN_ICU,0.013992,2.920034
2,CHARLSON_INDEX,0.015768,5.00203
3,NUM_DRUGS,0.014111,25.290829
4,NUM_PROCEDURES,-0.015215,3.717411
5,PARTICIPATION_DAYS,0.031994,135.513152
6,HAS_DIABETES,0.450383,0.450383
7,HAS_COPD,0.368516,0.368516
8,HAS_CONGESTIVE_HF,0.501804,0.501804
9,DISCHARGE_LOCATION_POST_ACUTE_CARE,0.420613,0.420613


In [35]:
scaler.mean_

array([ 60.52149416,   2.18735499,   1.35542726,   4.69569214,
         4.84794996,   3.25384953,   3.7721202 ,   4.96264608,
         2.03568447,   3.43927379, 575.69782972])

In [51]:
scaler_cols

['AGE',
 'LOG_HOSPITALIZATION_DAYS',
 'LOG_DAYS_IN_ICU',
 'LOG_DAYS_UNTIL_NEXT_HOSPITALIZATION',
 'LOG_PARTICIPATION_DAYS',
 'LOG_NUM_DRUGS',
 'NUM_PROCEDURES',
 'CHARLSON_INDEX',
 'NUM_PREV_HOSPITALIZATIONS',
 'NUM_COMORBIDITIES',
 'PARTICIPATION_DAYS']

## Feature Importance Samples

In [19]:
at_least_4_hosp_mask = num_hosp_per_patient_df["num_hospitalizations"] >= 5
at_least_4_hosp_patient_ids = num_hosp_per_patient_df[at_least_4_hosp_mask]["SUBJECT_ID"].tolist()
at_least_4_hosp_patient_ids_sample = at_least_4_hosp_patient_ids[:8]

In [20]:
num_hosp_per_patient_df[at_least_4_hosp_mask]

Unnamed: 0,SUBJECT_ID,num_hospitalizations
27,1578,5
29,1709,10
42,2187,6
45,2365,6
51,2589,5
...,...,...
971,94977,7
974,95372,6
976,95895,9
982,97441,7


In [21]:
# get a batch to explain
batch_example = get_batch_by_subject(pytorch_dataset, subject_ids=at_least_4_hosp_patient_ids_sample)
x_curr_b = batch_example["x_current"]
x_past_b = batch_example["x_past"]
mask_b = batch_example["mask_past"]
labels_b = batch_example["y"]
subject_ids_b = batch_example["subject_id"]

# (Optional) layer to split history vs current — for your AttentionPoolingNet:
fc1_layer = model_api.classifier_head[0]  # nn.Linear
stats = model_prediction_service.train_explain_stats

In [22]:
stats

{'mean_curr': tensor([ 0.0019,  0.0140,  0.0158,  0.0141, -0.0152,  0.0320,  0.4504,  0.3685,
          0.5018,  0.4206,  0.5444, -0.0087,  0.5347,  0.0902,  0.6651,  0.2305,
          0.0415,  0.1128,  0.1962]),
 'mean_past': tensor([-0.0591, -0.0168, -0.1352, -0.0786, -0.0637,  0.3508,  0.0770, -0.0366]),
 'has_median': True,
 'median_curr': tensor([-0.0818, -0.1775,  0.0150,  0.2502, -0.2147,  0.1974,  0.0000,  0.0000,
          1.0000,  0.0000,  1.0000,  0.0912,  1.0000,  0.0000,  1.0000,  0.0000,
          0.0000,  0.0000,  0.0000]),
 'median_past': tensor([-0.1459, -0.2039, -0.3854,  0.1945, -0.2147,  0.0000,  0.0000, -0.0201])}

In [23]:
scaler = model_prediction_service.scaler
scaler_cols = scaler.feature_names_in_.tolist()

mean_curr = stats["mean_curr"].cpu().numpy()
mean_past = stats["mean_past"].cpu().numpy()

mean_curr_df = pd.DataFrame({
    "feature": current_feat_cols,
    "mean_value": mean_curr,
})
mean_past_df = pd.DataFrame({
    "feature": longitudinal_feat_cols,
    "mean_value": mean_past,
})

In [24]:
mean_curr_df = inverse_scale_mean(mean_curr_df, scaler, scaler_cols)

  df_mean.loc[df_mean["feature"] == feat, "mean_value_unscaled"] = x_unscaled[idx]


In [25]:
from recurrent_health_events_prediction.model.explainers import explain_deep_learning_model_feat

df_curr, df_past, df_split  = explain_deep_learning_model_feat(
    model_api,
    x_curr_b,
    x_past_b,
    mask_b,
    current_feat_cols,
    longitudinal_feat_cols,
    layer_for_split=fc1_layer,
    baseline_strategy="means",
    stats=stats,
    n_steps=64,
    internal_batch_size=16,
)

In [26]:
df_curr = df_curr.sort_values(by=["sample_idx", "feature"], ascending=True)
df_past = df_past.sort_values(by=["sample_idx", "feature"], ascending=True)

In [27]:
import pandas as pd

mean_curr_df = pd.DataFrame(
    {
        "feature": current_feat_cols,
        "mean_value": stats["mean_curr"].cpu().numpy(),
    }
)

feature_values_df = pd.DataFrame({
    "sample_idx": [i for i in range(len(x_curr_b)) for _ in current_feat_cols],
    "feature": current_feat_cols * len(x_curr_b),
    "value": x_curr_b.flatten().detach().numpy(),
})

df_curr = df_curr.merge(mean_curr_df, on="feature", how="left")
df_curr = df_curr.merge(feature_values_df, on=["sample_idx", "feature"], how="left")
df_curr

Unnamed: 0,sample_idx,feature,attribution,attribution_signed,mean_value,value
0,0,ADMISSION_TYPE_ELECTIVE,0.018793,-0.018793,0.090212,0.000000
1,0,AGE,0.003885,0.003885,-0.008730,1.202013
2,0,CHARLSON_INDEX,0.116103,0.116103,0.015768,0.815700
3,0,DISCHARGE_LOCATION_HOME,0.152642,0.152642,0.544429,0.000000
4,0,DISCHARGE_LOCATION_POST_ACUTE_CARE,0.000892,0.000892,0.420613,1.000000
...,...,...,...,...,...,...
147,7,LOG_DAYS_IN_ICU,0.078199,0.078199,0.013992,0.518075
148,7,LOG_HOSPITALIZATION_DAYS,0.022827,-0.022827,0.001942,0.232688
149,7,LOG_NUM_DRUGS,0.004393,0.004393,0.014111,0.250178
150,7,LOG_PARTICIPATION_DAYS,0.199881,0.199881,0.031994,1.235027


In [28]:
mean_past_df = pd.DataFrame(
    {
        "feature": longitudinal_feat_cols,
        "mean_value": stats["mean_past"].cpu().numpy(),
    }
)

df_past = df_past.merge(mean_past_df, on="feature", how="left")

x_past_mean = x_past_b.mean(dim=1)  # [B, D_long]
feature_values_past_df = pd.DataFrame({
    "sample_idx": [i for i in range(len(x_past_b)) for _ in longitudinal_feat_cols],
    "feature": longitudinal_feat_cols * len(x_past_b),
    "value": x_past_mean.flatten().detach().numpy(),
})

df_past = df_past.merge(feature_values_past_df, on=["sample_idx", "feature"], how="left")
df_past

Unnamed: 0,sample_idx,feature,attribution,attribution_signed,mean_value,value
0,0,ADMISSION_TYPE_ELECTIVE,0.064692,0.062997,0.077040,0.250000
1,0,CHARLSON_INDEX,0.067532,-0.067532,-0.135245,0.515421
2,0,DISCHARGE_LOCATION_POST_ACUTE_CARE,0.154342,-0.154342,0.350792,0.500000
3,0,LOG_DAYS_IN_ICU,0.098397,0.098397,-0.016824,0.182818
4,0,LOG_DAYS_UNTIL_NEXT_HOSPITALIZATION,0.426474,-0.372660,-0.036563,0.438343
...,...,...,...,...,...,...
59,7,LOG_DAYS_IN_ICU,0.032712,-0.014979,-0.016824,0.040125
60,7,LOG_DAYS_UNTIL_NEXT_HOSPITALIZATION,0.127083,0.127083,-0.036563,-0.591664
61,7,LOG_HOSPITALIZATION_DAYS,0.011482,-0.011482,-0.059077,-0.492618
62,7,LOG_NUM_DRUGS,0.010718,-0.010515,-0.078553,0.211169


In [29]:
idx = 3
curr_sample_df, past_sample_df = get_results_for_sample(subject_ids_b, idx, df_curr, df_past)

Subject ID of sample 3: 2365


In [30]:
curr_sample_df

Unnamed: 0,sample_idx,feature,attribution,attribution_signed,mean_value,value
0,3,CHARLSON_INDEX,0.445762,0.445762,0.015768,2.817562
1,3,LOG_DAYS_IN_ICU,0.188165,-0.188165,0.013992,-0.748686
2,3,LOG_PARTICIPATION_DAYS,0.122665,0.122665,0.031994,1.068591
3,3,HAS_DIABETES,0.107386,0.107386,0.450383,0.0
4,3,ETHNICITY_WHITE,0.091566,-0.091566,0.665088,1.0
5,3,DISCHARGE_LOCATION_HOME,0.085901,-0.085901,0.544429,1.0
6,3,GENDER_M,0.085009,-0.085009,0.534732,0.0
7,3,HAS_COPD,0.057201,-0.057201,0.368516,0.0
8,3,DISCHARGE_LOCATION_POST_ACUTE_CARE,0.042331,0.042331,0.420613,0.0
9,3,LOG_NUM_DRUGS,0.038452,0.038452,0.014111,0.302652


In [32]:
past_sample_df

Unnamed: 0,sample_idx,feature,attribution,attribution_signed,mean_value,value
0,3,LOG_DAYS_IN_ICU,0.42287,-0.224266,-0.016824,0.028207
1,3,CHARLSON_INDEX,0.36346,-0.123875,-0.135245,1.816631
2,3,LOG_DAYS_UNTIL_NEXT_HOSPITALIZATION,0.321047,0.117082,-0.036563,-0.058207
3,3,LOG_NUM_DRUGS,0.212619,0.028525,-0.078553,0.489803
4,3,DISCHARGE_LOCATION_POST_ACUTE_CARE,0.200877,0.029736,0.350792,0.25
5,3,ADMISSION_TYPE_ELECTIVE,0.114296,0.113088,0.07704,0.25
6,3,LOG_HOSPITALIZATION_DAYS,0.106014,0.106014,-0.059077,0.159906
7,3,NUM_PROCEDURES,0.02929,0.007611,-0.063654,0.132904


In [33]:
mask = (preprocessed_df["SUBJECT_ID"] == subject_ids_b[idx])
features_set = set(current_feat_cols + longitudinal_feat_cols)
feat_cols = [col for col in preprocessed_df.columns if col in features_set]
cols = ["SUBJECT_ID", "HADM_ID", "ADMITTIME"] + feat_cols
preprocessed_df[mask].iloc[-5:][cols]

Unnamed: 0,SUBJECT_ID,HADM_ID,ADMITTIME,HAS_DIABETES,HAS_COPD,HAS_CONGESTIVE_HF,LOG_DAYS_UNTIL_NEXT_HOSPITALIZATION,NUM_PROCEDURES,AGE,CHARLSON_INDEX,...,LOG_NUM_DRUGS,GENDER_M,ADMISSION_TYPE_ELECTIVE,INSURANCE_MEDICAID,INSURANCE_PRIVATE,ETHNICITY_BLACK,ETHNICITY_HISPANIC,ETHNICITY_WHITE,DISCHARGE_LOCATION_HOME,DISCHARGE_LOCATION_POST_ACUTE_CARE
132,2365,179968,2178-08-09 04:01:00,False,True,False,-0.984564,-0.214736,-0.772698,1.216073,...,0.302652,False,False,False,False,False,False,True,True,False
133,2365,169783,2178-09-02 00:24:00,False,False,False,0.266002,-0.214736,-0.772698,0.8157,...,0.352288,False,True,False,False,False,False,True,True,False
134,2365,198301,2179-03-01 05:00:00,False,True,False,1.27381,1.453936,-0.710988,2.016817,...,0.976469,False,False,False,False,False,False,True,False,True
135,2365,184294,2181-11-19 17:13:00,False,True,False,-0.788077,-0.492848,-0.587569,3.217935,...,0.327805,False,False,False,False,False,False,True,True,False
136,2365,189913,2181-12-22 04:37:00,False,False,False,,-0.492848,-0.587569,2.817562,...,0.302652,False,False,False,False,False,False,True,True,False
