In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('..')

import numpy as np
import pandas as pd
import torch
import random
import csv
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import StandardScaler, RobustScaler
import matplotlib.pyplot as plt
import torch
from torch.utils.data import TensorDataset, DataLoader
from torch.autograd import Variable
from torchmetrics.classification import AUROC, Accuracy, ConfusionMatrix, F1Score
import os, subprocess, gc, time, datetime
from itertools import product

import models.models_original as models_original
import models.models_3d_atomics as models_3d_atomics
from vasopressor.preprocess_helpers import load_and_create_MIMIC_dataloader
from models.helper import *
from models.param_initializations import *
from models.optimization_strategy import greedy_forward_selection, get_top_features_per_concept

# Device used by the metric objects and models created further down.
# get_free_gpu is not imported by name here, so it comes from one of the star imports above.
device = get_free_gpu()

current device cuda:11


In [2]:
# Build the notebook-level data loaders (split fixed with random_state = 1),
# together with the class weights and the variable-name lists used below.
(
    train_loader, val_loader, test_loader,
    class_weights, num_classes,
    changing_vars, static_names, seq_len,
) = load_and_create_MIMIC_dataloader(output_dim = 2, batch_size = 512, random_state = 1)

print(class_weights, num_classes, seq_len)

# Inspect only the first training batch: shape and device of each tensor in it.
# A plain loop is used instead of a list comprehension so no throwaway list of
# None values is built just for the print side effect.
for batch in train_loader:
    for t in batch:
        print(t.shape, t.device)
    break

# Number of batches per training epoch (shown as the cell output).
len(train_loader)

tensor([0.5797, 3.6376], dtype=torch.float64) 2 6
torch.Size([512, 6, 27]) cpu
torch.Size([512, 6, 27]) cpu
torch.Size([512, 8]) cpu
torch.Size([512, 2]) cpu


35

In [3]:
def plot_losses(train_losses, val_losses):
    """Plot the train and validation loss curves together on a log-scaled y-axis.

    Args:
        train_losses: sequence of training losses, drawn in black as "Train".
        val_losses: sequence of validation losses, drawn in green as "Val".
    """
    curves = (
        (train_losses, "black", "Train"),
        (val_losses, "green", "Val"),
    )
    for losses, colour, name in curves:
        plt.plot(losses, color=colour, label=name)

    plt.yscale("log")
    plt.legend()
    plt.show()

def plot_metrics(history, n_concepts_list):
    """Plot AUC, ACC and F1 against the first column of ``history``.

    Args:
        history: object indexed as ``history[:, j]``; column 0 is used as the
            x-axis and columns 2, 3 and 4 are drawn as AUC, ACC and F1.
        n_concepts_list: tick positions passed to ``plt.xticks``.
    """
    metric_columns = ((2, 'AUC'), (3, 'ACC'), (4, 'F1'))
    for column, name in metric_columns:
        plt.plot(history[:, 0], history[:, column], label=name)

    plt.xlabel('Num Concepts')
    plt.ylabel('Criteria')
    plt.title('Plot of Concepts vs Criteria')
    plt.xticks(n_concepts_list)

    plt.legend()
    plt.show()

def plot_atomics_concepts_metric(history, title, dec="{:.3g}"):
    """Show AUC/ACC/F1 averaged per atomics count, then per concepts count.

    Two figures are drawn one after the other: the first groups the rows of
    ``history`` by ``n_atomics``, the second by ``n_concepts``; each plots the
    group means of the ``auc``, ``acc`` and ``f1`` columns.

    Args:
        history: rows with six values each, in the order
            (n_atomics, n_concepts, val_loss, auc, acc, f1).
        title: text used as the figure suptitle on both figures.
        dec: not used by this function.
    """
    column_names = ["n_atomics", "n_concepts", "val_loss", "auc", "acc", "f1"]
    frame = pd.DataFrame(history, columns=column_names)

    # Both aggregations are computed up front, before anything is drawn.
    panels = [
        (frame.groupby("n_atomics").mean(), 'Num Atomics', "Metric as mean over atomics"),
        (frame.groupby("n_concepts").mean(), 'Num Concepts', "Metric as mean over concepts"),
    ]
    metric_labels = (("auc", 'AUC'), ("acc", 'ACC'), ("f1", 'F1'))

    for means, x_label, panel_title in panels:
        for metric, legend_label in metric_labels:
            plt.plot(means.index, means[metric], label=legend_label)
        plt.xlabel(x_label)
        plt.ylabel('Criteria')
        plt.title(panel_title)
        plt.suptitle(title)
        plt.legend()
        plt.show()


In [4]:
# Feature dimensions derived from the variable-name lists returned by the data loader.
# (The former `seq_len = seq_len` self-assignment was a no-op and has been dropped;
# seq_len is still the value unpacked when the loaders were created.)
changing_dim = len(changing_vars)
static_dim = len(static_names)

print(changing_dim, static_dim, seq_len)

# Seed used for set_seed here and as the default data-split seed in later cells.
random_seed = 1
set_seed(random_seed)


27 8 6


In [5]:
# Binary-task torchmetrics objects, each moved to `device`.
# auroc_metric, accuracy_metric and f1_metric are passed to the greedy search cells below;
# conf_matrix is not referenced by any other cell visible in this notebook.
auroc_metric = AUROC(task="binary").to(device)
accuracy_metric = Accuracy(task="binary").to(device)
f1_metric = F1Score(task="binary").to(device)
conf_matrix = ConfusionMatrix(task="binary").to(device)

## Optimization Atomics

In [6]:
experiment_folder = "/workdir/optimal-summaries-public/_models/vasopressor/atomics_new/"


def _get_atomics_cbm(random_seed, use_summaries_for_atomics):
    """Build, load-or-fit and evaluate one atomics CBM for a given seed.

    Shared implementation behind get_model_sum2con and get_model_sum2atom,
    which differ only in the ``use_summaries_for_atomics`` flag.

    Args:
        random_seed: passed as ``random_state`` to the data loader and as
            ``seed`` when formatting the checkpoint path.
        use_summaries_for_atomics: value of the config flag of the same name.

    Returns:
        The ``models_3d_atomics.CBM`` instance after ``try_load_else_fit`` and
        after ``evaluate_classification`` has been run on the test loader.
    """
    # feature weights
    # NOTE: keep the key order; the checkpoint filename built from this dict
    # appears to follow it (see the "Loaded model from ..." outputs).
    config_atomics = {
        "n_atomics": 10, # 30
        "n_concepts": 4, # 20
        "use_indicators": True,
        "use_summaries_for_atomics": use_summaries_for_atomics,
    }

    makedir(experiment_folder)
    model_path = get_filename_from_dict(experiment_folder, config_atomics)

    train_loader, val_loader, test_loader, class_weights, _num_classes, changing_vars, static_names, seq_len = load_and_create_MIMIC_dataloader(output_dim = 2, batch_size = 512, random_state = random_seed)

    top_k = None # "/workdir/optimal-summaries-public/_models/vasopressor/atomics/top-k/bottleneck_topkinds.csv"

    # Dimensions come from the loaders created just above rather than from
    # notebook-level globals, so the function no longer depends on another
    # cell having defined static_dim / changing_dim first.
    model = models_3d_atomics.CBM(**config_atomics, static_dim=len(static_names), changing_dim=len(changing_vars), seq_len=seq_len, output_dim=2, top_k=top_k, device=device)
    model.try_load_else_fit(train_loader, val_loader, p_weight=class_weights.to(device), save_model_path=model_path.format(**config_atomics, seed = random_seed), max_epochs=10000)

    evaluate_classification(model, test_loader)
    return model


def get_model_sum2con(random_seed):
    """Return the evaluated CBM configured with use_summaries_for_atomics=False."""
    return _get_atomics_cbm(random_seed, use_summaries_for_atomics=False)

model = get_model_sum2con(1)

def get_model_sum2atom(random_seed):
    """Return the evaluated CBM configured with use_summaries_for_atomics=True."""
    return _get_atomics_cbm(random_seed, use_summaries_for_atomics=True)

# Rebinds `model`; the sum2con instance created above is not kept.
model = get_model_sum2atom(1)



Loaded model from /workdir/optimal-summaries-public/_models/vasopressor/atomics_new/n_atomics_10_n_concepts_4_use_indicators_True_use_summaries_for_atomics_False_seed_1.pt
AUC macro 0.900
ACC macro 0.860
 F1 macro 0.862




Loaded model from /workdir/optimal-summaries-public/_models/vasopressor/atomics_new/n_atomics_10_n_concepts_4_use_indicators_True_use_summaries_for_atomics_True_seed_1.pt
AUC macro 0.913
ACC macro 0.836
 F1 macro 0.844


In [15]:
# Load-or-fit and evaluate the use_summaries_for_atomics=True model for seeds 1, 2 and 3.
models = [get_model_sum2atom(seed) for seed in range(1,4)]


Loaded model from /workdir/optimal-summaries-public/_models/vasopressor/atomics_new/n_atomics_10_n_concepts_4_use_indicators_True_use_summaries_for_atomics_True_seed_1.pt




AUC macro 0.913
ACC macro 0.836
 F1 macro 0.844




Loaded model from /workdir/optimal-summaries-public/_models/vasopressor/atomics_new/n_atomics_10_n_concepts_4_use_indicators_True_use_summaries_for_atomics_True_seed_2.pt
AUC macro 0.919
ACC macro 0.840
 F1 macro 0.848




Loaded model from /workdir/optimal-summaries-public/_models/vasopressor/atomics_new/n_atomics_10_n_concepts_4_use_indicators_True_use_summaries_for_atomics_True_seed_3.pt
AUC macro 0.915
ACC macro 0.837
 F1 macro 0.845


In [16]:
# Load-or-fit and evaluate the use_summaries_for_atomics=False model for seeds 1, 2 and 3.
# This rebinds `models`, replacing whatever list a previous cell stored there.
models = [get_model_sum2con(seed) for seed in range(1,4)]


  4%|▍         | 380/10000 [06:14<2:38:02,  1.01 epoch/s, Train Loss=0.56672, Val Loss=0.58711, Best Val Loss=0.58429]

Early Stopped





AUC macro 0.894
ACC macro 0.851
 F1 macro 0.855


  2%|▏         | 210/10000 [03:26<2:40:38,  1.02 epoch/s, Train Loss=0.56137, Val Loss=0.56205, Best Val Loss=0.56111]

Early Stopped





AUC macro 0.898
ACC macro 0.839
 F1 macro 0.844


  5%|▍         | 480/10000 [07:57<2:37:40,  1.01 epoch/s, Train Loss=0.56219, Val Loss=0.57548, Best Val Loss=0.56474]

Early Stopped





AUC macro 0.894
ACC macro 0.832
 F1 macro 0.840


In [7]:
# Metrics handed to the greedy search as `track_metrics`, keyed by display name.
track_metrics = dict(acc=accuracy_metric, f1=f1_metric, auc=auroc_metric)

# Only seed 1 is searched here. To cover seeds 1-3 instead, use:
#   models = [get_model_sum2atom(seed) for seed in range(1,4)]
models = [get_model_sum2atom(1)]

results = []
# Seeds are taken to be 1, 2, ... in the order the models appear in `models`.
for seed, model in enumerate(models, start=1):
    print("seed", seed)
    top_k_inds = [get_top_features_per_concept(layer) for layer in model.regularized_layers]

    save_path = experiment_folder + "top-k/bottleneck_topkinds_sum2atom_seed_{seed}.csv".format(seed=seed)

    # NOTE(review): val_loader is the notebook-level loader, not one rebuilt for
    # `seed`. If more seeds are enabled, confirm the split matches each model's seed.
    greedy_results = greedy_forward_selection(
        model=model,
        layers_to_prune=model.regularized_layers,
        top_k_inds=top_k_inds,
        val_loader=val_loader,
        optimize_metric=auroc_metric,
        track_metrics=track_metrics,
        save_path=save_path,
    )
    results.append(greedy_results)
    



Loaded model from /workdir/optimal-summaries-public/_models/vasopressor/atomics_new/n_atomics_10_n_concepts_4_use_indicators_True_use_summaries_for_atomics_True_seed_1.pt
AUC macro 0.913
ACC macro 0.836
 F1 macro 0.844
seed 1
Found 10 Concepts
90th percentile per concept [0.17485653 0.11012712 0.15025544 0.733804   0.12479137 5.350915
 0.13964465 0.11854323 0.14277679 0.37324145]
['Concept 0 len: 10', 'Concept 1 len: 10', 'Concept 2 len: 10', 'Concept 3 len: 10', 'Concept 4 len: 10', 'Concept 5 len: 12', 'Concept 6 len: 10', 'Concept 7 len: 10', 'Concept 8 len: 10', 'Concept 9 len: 10']
Found 4 Concepts
90th percentile per concept [10.933439    0.19957875  0.18998273  0.17745395]
['Concept 0 len: 25', 'Concept 1 len: 136', 'Concept 2 len: 142', 'Concept 3 len: 133']
Loaded results from save path. Skipping search...


In [8]:
# Metrics handed to the greedy search as `track_metrics`, keyed by display name.
track_metrics = dict(acc=accuracy_metric, f1=f1_metric, auc=auroc_metric)

# Only seed 1 is searched here. To cover seeds 1-3 instead, use:
#   models = [get_model_sum2con(seed) for seed in range(1,4)]
models = [get_model_sum2con(1)]

results = []
# Seeds are taken to be 1, 2, ... in the order the models appear in `models`.
for seed, model in enumerate(models, start=1):
    print("seed", seed)
    top_k_inds = [get_top_features_per_concept(layer) for layer in model.regularized_layers]

    save_path = experiment_folder + "top-k/bottleneck_topkinds_sum2con_seed_{seed}.csv".format(seed=seed)

    # NOTE(review): val_loader is the notebook-level loader, not one rebuilt for
    # `seed`. If more seeds are enabled, confirm the split matches each model's seed.
    greedy_results = greedy_forward_selection(
        model=model,
        layers_to_prune=model.regularized_layers,
        top_k_inds=top_k_inds,
        val_loader=val_loader,
        optimize_metric=auroc_metric,
        track_metrics=track_metrics,
        save_path=save_path,
    )
    results.append(greedy_results)
    



Loaded model from /workdir/optimal-summaries-public/_models/vasopressor/atomics_new/n_atomics_10_n_concepts_4_use_indicators_True_use_summaries_for_atomics_False_seed_1.pt
AUC macro 0.900
ACC macro 0.860
 F1 macro 0.862
seed 1
Found 10 Concepts
90th percentile per concept [0.09312173 0.1105315  0.11364545 0.08538713 0.08993343 0.08235046
 0.09268064 0.09022894 0.08621404 0.08993822]
['Concept 0 len: 10', 'Concept 1 len: 10', 'Concept 2 len: 10', 'Concept 3 len: 10', 'Concept 4 len: 10', 'Concept 5 len: 10', 'Concept 6 len: 10', 'Concept 7 len: 10', 'Concept 8 len: 10', 'Concept 9 len: 10']
Found 4 Concepts
90th percentile per concept [2.6416087  9.061051   0.71845627 1.6046413 ]
['Concept 0 len: 20', 'Concept 1 len: 54', 'Concept 2 len: 25', 'Concept 3 len: 92']


 64%|██████▎   | 89/140 [2:51:19<1:21:04, 95.39s/it, Score=0.92556, acc=0.864, f1=0.869, auc=0.926] 

In [None]:
# Write the first entry of `results` to the sum2con / seed-1 results file.
# NOTE(review): `results` is whatever the most recently run search cell left behind;
# confirm it holds the sum2con run (and is non-empty) before executing this cell.
save_path = experiment_folder + "top-k/bottleneck_topkinds_sum2con_seed_{seed}.csv".format(seed=1)
write_df_2_csv(save_path, results[0])

In [None]:
# Visualise the greedy-selection results for the use_summaries_for_atomics=True model.
# Model seed, results file and data split are all keyed on random_seed so they refer to the same run.
model = get_model_sum2atom(random_seed)

# Read the file the sum2atom greedy search in this notebook saves to.
# Previously this read a results file from the separate "model3d" experiment folder,
# which does not belong to the model loaded above.
# NOTE(review): assumes read_df_from_csv accepts the format the search wrote — confirm.
greedy_results_path = experiment_folder + "top-k/bottleneck_topkinds_sum2atom_seed_{seed}.csv".format(seed=random_seed)
greedy_results = read_df_from_csv(greedy_results_path)

train_loader, val_loader, test_loader, class_weights, num_classes, changing_vars, static_names, seq_len = load_and_create_MIMIC_dataloader(output_dim = 2, batch_size = 512, random_state = random_seed)

visualize_optimization_results(model, val_loader, test_loader, greedy_results)


In [None]:
# Visualise the greedy-selection results for the use_summaries_for_atomics=False model.
# Model seed, results file and data split are all keyed on random_seed so they refer to the same run.
model = get_model_sum2con(random_seed)

# Read the file the sum2con greedy search in this notebook saves to.
# Previously this read a results file from the separate "model3d" experiment folder,
# which does not belong to the model loaded above.
# NOTE(review): assumes read_df_from_csv accepts the format the search wrote — confirm.
greedy_results_path = experiment_folder + "top-k/bottleneck_topkinds_sum2con_seed_{seed}.csv".format(seed=random_seed)
greedy_results = read_df_from_csv(greedy_results_path)

train_loader, val_loader, test_loader, class_weights, num_classes, changing_vars, static_names, seq_len = load_and_create_MIMIC_dataloader(output_dim = 2, batch_size = 512, random_state = random_seed)

visualize_optimization_results(model, val_loader, test_loader, greedy_results)
