In [1]:
import os

# Move to the parent directory exactly once per kernel session.
# A bare `os.chdir("../")` climbs one more level every time this cell is re-run;
# caching the resolved absolute target makes the cell safe to re-execute.
if "_NOTEBOOK_ROOT" not in globals():
    _NOTEBOOK_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
os.chdir(_NOTEBOOK_ROOT)

# (disabled) R environment setup; note the hard-coded, machine-specific absolute path.
# # Set R environment variables using the conda environment path
# r_home = '/sfs/gpfs/tardis/home/jq2uw/llm_nicu_vitalsigns/clip_env/lib/R'
# os.environ['R_HOME'] = r_home
# os.environ['R_LIBS'] = f"{r_home}/library"
# os.environ['R_LIBS_USER'] = os.path.expanduser('~/R/goolf/4.3')
# os.environ['LD_LIBRARY_PATH'] = f"{r_home}/lib:" + os.environ.get('LD_LIBRARY_PATH', '')

import torch
# When a GPU is present, release unused cached CUDA memory and collect memory
# freed through CUDA IPC before anything else is loaded.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()

In [2]:
# Star-imports from the project modules. Names used in the following cells before
# any other definition appears (`device`, `text_config`, `update_config`,
# `get_config_dict`) must be supplied by one of these.
# NOTE(review): which module provides which name cannot be told from this
# notebook; explicit imports would make that traceable — confirm and tighten.
from config import *
from data import *
from train import *
from eval import *
from vital import *
# Report the compute device selected by the imported project code.
print("using device: ", device)

Random seed set to 333
using device:  cpu


## Customize Configuration

In [3]:
# (customize) configs
# NOTE(review): `overwrite` is not read by any cell visible in this notebook;
# presumably consumed by main_preprocess.py or a later cell — confirm.
overwrite = True
model_name = 'test_succ_inc2'
text_config['cl']['die7d'] = True # update text_config here if needed
# model_name = model_name + "___" + "_".join(get_true_components(text_config))

# Push the overrides below into the project configuration.
update_config(
    text_col = 'description_succ_inc',#'ts_description',
    y_col = 'description_succ_inc',
    y_levels = ['High amount of consecutive increases.', 'Low amount of consecutive increases.'],
    y_pred_levels = ['High amount of consecutive increases.', 'Low amount of consecutive increases.'],
    txt2ts_y_cols = ['description_succ_inc'],
    model_name = model_name,
    downsample_levels =['High amount of consecutive increases.', 'Low amount of consecutive increases.'],
    downsample = True,
    downsample_size = 50,
    custom_target_cols = ['description_succ_inc', 'label'], # 'label' is the same as the default "by_label" target
    embedded_dim = 256,
    batch_size = 512, # Data loader settings
    patience = 100, # Training settings
    num_saves = 10,
    num_epochs = 10,
    init_lr = 5e-5,
    text_config = text_config,
    **{'3d': False}  # '3d' is not a valid Python identifier, so it can only be passed via dict unpacking
)
# Read the configuration back as a dict; 'model_name' is looked up from it below.
config_dict = get_config_dict()


# Result-saving directory and the artifact paths inside it, derived from the configured model name
output_dir = './results/'+config_dict['model_name']
model_clip_path = output_dir+'/model_clip.pth' 
eval_clip_path = output_dir+'/evals_clip.pth'
model_path = output_dir+'/model.pth' 
eval_path = output_dir+'/evals.pth'
config_path = output_dir+'/config.pth'

In [4]:
# Run main_preprocess.py inside this notebook's namespace to ready the data.
# The source is compiled with its real filename so that an exception raised by
# the script reports "main_preprocess.py", line N instead of "<string>".
# NOTE(review): the names used by later cells (train_dataloader, df_train,
# df_test) are presumably created by this script — confirm.
with open('main_preprocess.py', 'r') as file:
    exec(compile(file.read(), 'main_preprocess.py', 'exec'))


Sample of patients with positive labels:
VitalID
1018    8
5170    8
1835    8
2361    8
2791    8
dtype: int64


[Parallel(n_jobs=9)]: Using backend LokyBackend with 9 concurrent workers.
[Parallel(n_jobs=9)]: Done  32 tasks      | elapsed:    4.2s
[Parallel(n_jobs=9)]: Done 2596 tasks      | elapsed:    4.7s
[Parallel(n_jobs=9)]: Done 63298 tasks      | elapsed:   12.1s
[Parallel(n_jobs=9)]: Done 65100 out of 65100 | elapsed:   12.4s finished


replace 'text' with:  description_succ_inc
text
Moderate amount of consecutive increases.    42910
Low amount of consecutive increases.         11838
High amount of consecutive increases.        10352
Name: count, dtype: int64

Sample of patients with positive labels:
TestID
508     8
707     8
1903    8
817     8
1414    7
dtype: int64


[Parallel(n_jobs=9)]: Using backend LokyBackend with 9 concurrent workers.
[Parallel(n_jobs=9)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=9)]: Done 14318 tasks      | elapsed:    2.1s
[Parallel(n_jobs=9)]: Done 60894 tasks      | elapsed:    7.4s
[Parallel(n_jobs=9)]: Done 61197 out of 61197 | elapsed:    7.4s finished


replace 'text' with:  description_succ_inc
text
Moderate amount of consecutive increases.    36173
Low amount of consecutive increases.         13165
High amount of consecutive increases.        11859
Name: count, dtype: int64
After downsampling:
description_succ_inc
High amount of consecutive increases.    50
Low amount of consecutive increases.     50
Name: count, dtype: int64
After downsampling:
description_succ_inc
High amount of consecutive increases.    50
Low amount of consecutive increases.     50
Name: count, dtype: int64
final distribution of text prediction
description_succ_inc
High amount of consecutive increases.    50
Low amount of consecutive increases.     50
Name: count, dtype: int64
description_succ_inc
High amount of consecutive increases.    50
Low amount of consecutive increases.     50
Name: count, dtype: int64


  from tqdm.autonotebook import tqdm, trange


In [5]:
# Peek at the first training batch only (the loop exits after one iteration).
# The loader is iterated directly: wrapping it in enumerate() only bound an
# unused counter to `_`, which shadows IPython's last-output variable.
for idx, ts, text_features, labels, targets in train_dataloader:
    # Keep only the target columns selected by this batch's `idx`.
    # NOTE(review): presumably `idx` holds the batch's sample indices, so the
    # result is a batch-by-batch similarity matrix — confirm against the dataset class.
    targets = targets[:,idx]
    print(targets)
    break

tensor([[1.0000, 0.0000, 0.5000,  ..., 0.0000, 0.5000, 0.5000],
        [0.0000, 1.0000, 0.0000,  ..., 0.5000, 0.0000, 0.0000],
        [0.5000, 0.0000, 1.0000,  ..., 0.0000, 0.5000, 0.5000],
        ...,
        [0.0000, 0.5000, 0.0000,  ..., 1.0000, 0.0000, 0.0000],
        [0.5000, 0.0000, 0.5000,  ..., 0.0000, 1.0000, 0.5000],
        [0.5000, 0.0000, 0.5000,  ..., 0.0000, 0.5000, 1.0000]])


In [33]:


def gen_target(df, cluster_cols):
    """Build a soft pairwise-agreement target matrix from categorical columns.

    For every column named in ``cluster_cols`` an ``n x n`` 0/1 matrix is built
    whose entry ``(i, j)`` is 1 when rows ``i`` and ``j`` of ``df`` hold the same
    value in that column. The matrices are averaged, so each entry is the
    fraction of the listed columns on which the two rows agree; the diagonal is
    always 1.

    Args:
        df: pandas DataFrame containing every column named in ``cluster_cols``.
            It is left unmodified.
        cluster_cols: non-empty iterable of column names. A name listed more
            than once is counted once per occurrence.

    Returns:
        torch.Tensor of dtype float32 and shape ``(len(df), len(df))``.

    Raises:
        ValueError: if ``cluster_cols`` is empty.
    """
    cluster_cols = list(cluster_cols)
    if not cluster_cols:
        # Without this guard the final division would be 0 / 0.
        raise ValueError("cluster_cols must name at least one column")

    target_sum = None
    for cluster_col in cluster_cols:
        # Encode the categories as 1-based integers (sorted order) so rows can
        # be compared as a tensor.
        label_mapping = {
            cat: idx
            for idx, cat in enumerate(sorted(df[cluster_col].unique()), start=1)
        }
        # Keep the encoded labels local; writing them into a df column would
        # leave a scratch 'cluster' column on the caller's frame.
        labels = torch.tensor(df[cluster_col].map(label_mapping).astype(int).values)
        # Broadcasting (1, n) against (n, 1) yields the full n x n equality matrix.
        target = (labels.unsqueeze(0) == labels.unsqueeze(1)).float()
        # A running sum (rather than a dict keyed by column name) keeps the
        # numerator consistent with the len(cluster_cols) denominator even when
        # a column is listed twice.
        target_sum = target if target_sum is None else target_sum + target

    # Normalize by the number of columns so values lie in [0, 1].
    return target_sum / len(cluster_cols)



In [34]:
# Columns whose row-to-row agreement defines the target; gen_target weights each one equally.
# NOTE(review): 'rowid' looks like a per-row identifier; if it is unique it only
# ever matches on the diagonal, capping off-diagonal values at 0.75 — confirm this is intended.
cluster_cols = ['description_succ_inc', 'description_succ_unc', 'cl_event', 'rowid']
# `df` is a second name for df_train, not a copy.
df = df_train
target = gen_target(df, cluster_cols)


In [37]:
# Display the averaged target matrix; with four cluster columns every entry is a multiple of 0.25.
target

tensor([[1.0000, 0.7500, 0.7500,  ..., 0.7500, 0.7500, 0.2500],
        [0.7500, 1.0000, 0.7500,  ..., 0.7500, 0.7500, 0.2500],
        [0.7500, 0.7500, 1.0000,  ..., 0.7500, 0.7500, 0.2500],
        ...,
        [0.7500, 0.7500, 0.7500,  ..., 1.0000, 0.7500, 0.2500],
        [0.7500, 0.7500, 0.7500,  ..., 0.7500, 1.0000, 0.2500],
        [0.2500, 0.2500, 0.2500,  ..., 0.2500, 0.2500, 1.0000]])

In [None]:

# Integer-encode the first cluster column as `label` in both splits.
# Codes are 1-based and follow the sorted order of the categories seen in df_train,
# so train and test share one encoding. A df_test value absent from df_train maps
# to NaN, and the int cast then raises.
# NOTE(review): the config cell lists 'label' in custom_target_cols; this assignment
# replaces any existing 'label' column — confirm that is intended.
cluster_col = cluster_cols[0]
label_mapping = dict(
    (cat, idx)
    for idx, cat in enumerate(sorted(df_train[cluster_col].unique()), start=1)
)
df_train['label'] = df_train[cluster_col].map(label_mapping).astype(int)
df_test['label'] = df_test[cluster_col].map(label_mapping).astype(int)

