## Hardware check

In [1]:
# gpu check
! nvidia-smi

Mon Apr  8 13:08:46 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.12             Driver Version: 535.104.12   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla V100-SXM2-32GB           On  | 00000000:3A:00.0 Off |                    0 |
| N/A   31C    P0              41W / 300W |      9MiB / 32768MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM2-32GB           On  | 00000000:3B:00.0 Off |  

In [2]:
# check number of cores
import multiprocessing

cores = multiprocessing.cpu_count() 
cores

80

In [3]:
! python --version

Python 3.9.7


## Environment Prep

In [3]:
cd /pfs/data5/home/hd/hd_hd/hd_nf283/MA_Thesis/

/pfs/data5/home/hd/hd_hd/hd_nf283/MA_Thesis


In [4]:
from keras.utils import pad_sequences
import pickle
import numpy as np
from tqdm import tqdm
tqdm.pandas()
import pandas as pd
import json

2024-04-08 10:00:02.303993: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-08 10:00:02.351161: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Version Check

In [6]:
import tensorflow as tf
print(tf. __version__)

pickle.format_version

2.12.0


'4.0'

## Train/Val

In [5]:
data_path = 'Data/sepsis_removed_0.pkl'
pkl = pickle.load(open(data_path, 'rb'))
data = pkl[0]
oc = pkl[1]
train_ind = pkl[2]
valid_ind = pkl[3]
test_ind = pkl[4]
# embs = pkl[5]
del pkl
data_path = 'Data/CLS_emb_keywords_reserved.pkl'
pkl = pickle.load(open(data_path, 'rb'))
embs = pkl[0]
del pkl
text_data = data[data['variable'] == 'Text']
text_data
embs_list = []

for emb in embs:
    embs_list.append(emb)

text_data['value'] = embs_list
text_data
physio_data = data[data['variable'] != 'Text']
physio_data
del data
data = text_data.append(physio_data, ignore_index=False)
pred_window = 2 # hours
obs_windows = range(20, 124, 4)

# Remove test patients.
data = data.merge(oc[['ts_ind', 'SUBJECT_ID']], on='ts_ind', how='left')
test_sub = oc.loc[oc.ts_ind.isin(test_ind)].SUBJECT_ID.unique()
data = data.loc[~data.SUBJECT_ID.isin(test_sub)]
oc = oc.loc[~oc.SUBJECT_ID.isin(test_sub)]
data.drop(columns=['SUBJECT_ID', 'TABLE'], inplace=True)
# Get static data with mean fill and missingness indicator.
static_varis = ['Age', 'Gender']
ii = data.variable.isin(static_varis)
static_data = data.loc[ii]
data = data.loc[~ii]
def inv_list(l, start=0):
    d = {}
    for i in range(len(l)):
        d[l[i]] = i+start
    return d
static_var_to_ind = inv_list(static_varis)
D = len(static_varis)
N = data.ts_ind.max()+1

# Get variable indices.
varis = sorted(list(set(data.variable)))
V = len(varis)
var_to_ind = inv_list(varis, start=1)
data['vind'] = data.variable.map(var_to_ind)
data = data[['ts_ind', 'vind', 'hour', 'value']].sort_values(by=['ts_ind', 'vind', 'hour'])
# Find max_len.
fore_max_len = 880
# Get forecast inputs and outputs.
fore_texts_ip = []
fore_inds = []
def f(x):
    mask = [0 for i in range(V)]
    values = [0 for i in range(V)]
    for vv in x:
        v = int(vv[0])-1
        mask[v] = 1
        values[v] = vv[1]
    return values+mask
def pad(x):
    return x+[0]*(fore_max_len-len(x))
for w in tqdm(obs_windows):
    pred_data = data.loc[(data.hour>=w)&(data.hour<=w+pred_window)]
    pred_data = pred_data.groupby(['ts_ind', 'vind']).agg({'value':'first'}).reset_index()
    pred_data['vind_value'] = pred_data[['vind', 'value']].values.tolist()
    pred_data = pred_data.groupby('ts_ind').agg({'vind_value':list}).reset_index()
    pred_data['vind_value'] = pred_data['vind_value'].apply(f)    
    obs_data = data.loc[(data.hour<w)&(data.hour>=w-24)]
    obs_data = obs_data.loc[obs_data.ts_ind.isin(pred_data.ts_ind)]
    obs_data = obs_data.groupby('ts_ind').head(fore_max_len)
    obs_data = obs_data.groupby('ts_ind').agg({'vind':list, 'hour':list, 'value':list}).reset_index()
    obs_data = obs_data.merge(pred_data, on='ts_ind')
    for col in ['vind', 'hour', 'value']:
        obs_data[col] = obs_data[col].apply(pad)
    fore_inds.append(np.array(list(obs_data.ts_ind)))
    
    matrix = list(obs_data.value)
    obs_strings = []
    for l in matrix:
        string_list = []
        for value in l:
            if not np.isscalar(value):
            # if isinstance(value, str):
                string_list.append(value)
        obs_strings.append(string_list)
    del matrix
    fore_texts_ip.append(np.array(obs_strings)) 
del data
fore_texts_ip = np.concatenate(fore_texts_ip, axis=0)
fore_inds = np.concatenate(fore_inds, axis=0)
# Get train and valid ts_ind for forecast task.
train_sub = oc.loc[oc.ts_ind.isin(train_ind)].SUBJECT_ID.unique()
valid_sub = oc.loc[oc.ts_ind.isin(valid_ind)].SUBJECT_ID.unique()
rem_sub = oc.loc[~oc.SUBJECT_ID.isin(np.concatenate((train_ind, valid_ind)))].SUBJECT_ID.unique()
bp = int(0.8*len(rem_sub))
train_sub = np.concatenate((train_sub, rem_sub[:bp]))
valid_sub = np.concatenate((valid_sub, rem_sub[bp:]))
train_ind = oc.loc[oc.SUBJECT_ID.isin(train_sub)].ts_ind.unique() # Add remaining ts_ind s of train subjects.
valid_ind = oc.loc[oc.SUBJECT_ID.isin(valid_sub)].ts_ind.unique() # Add remaining ts_ind s of train subjects.
# Generate 3 sets of inputs and outputs.
train_ind = np.argwhere(np.in1d(fore_inds, train_ind)).flatten()
valid_ind = np.argwhere(np.in1d(fore_inds, valid_ind)).flatten()
fore_train_ip = [ip[train_ind] for ip in [fore_texts_ip]]
fore_valid_ip = [ip[valid_ind] for ip in [fore_texts_ip]]
del fore_texts_ip

fore_train_text_ip = fore_train_ip[0]
fore_valid_text_ip = fore_valid_ip[0]
del fore_train_ip, fore_valid_ip
train_text_embs = []

for obs in tqdm(fore_train_text_ip):
    if len(obs) < 50:
        for i in range(50 - len(obs)):
            obs.append(np.array([0.0]*768))
    train_text_embs.append(np.array(obs))
valid_text_embs = []

for obs in tqdm(fore_valid_text_ip):
    if len(obs) < 50:
        for i in range(50 - len(obs)):
            obs.append(np.array([0.0]*768))
    valid_text_embs.append(np.array(obs))

In [24]:
# max len
max_len = 0
for obs in fore_train_text_ip:
    if len(obs) > max_len:
        max_len = len(obs)
max_len

44

In [25]:
# max len
max_len = 0
for obs in fore_valid_text_ip:
    if len(obs) > max_len:
        max_len = len(obs)
max_len

42

In [14]:
train_text_embs = []

for obs in tqdm(fore_train_text_ip):
    if len(obs) < 50:
        for i in range(50 - len(obs)):
            obs.append(np.array([0.0]*768))
    train_text_embs.append(np.array(obs))

100%|██████████| 449482/449482 [15:01<00:00, 498.45it/s]


In [1]:
# dump to pkl, update sepsis_removed_0.pkl
pickle.dump([train_text_embs], open('Data/text_emb_input_train_1_2d.pkl','wb'))

In [34]:
valid_text_embs = []

for obs in tqdm(fore_valid_text_ip):
    if len(obs) < 50:
        for i in range(50 - len(obs)):
            obs.append(np.array([0.0]*768))
    valid_text_embs.append(np.array(obs))

100%|██████████| 136823/136823 [03:16<00:00, 696.74it/s] 


In [None]:
# dump to pkl, update sepsis_removed_0.pkl
pickle.dump([valid_text_embs], open('Data/text_emb_input_val_1_2d.pkl','wb'))

## Test

In [4]:
data_path = 'Data/sepsis_removed_0.pkl'
pkl = pickle.load(open(data_path, 'rb'))
data = pkl[0]
oc = pkl[1]
train_ind = pkl[2]
valid_ind = pkl[3]
test_ind = pkl[4]
# embs = pkl[5]
del pkl

# embs
data_path = 'Data/CLS_emb_keywords_reserved.pkl'
pkl = pickle.load(open(data_path, 'rb'))
embs = pkl[0]
del pkl

text_data = data[data['variable'] == 'Text']
embs_list = []
for emb in embs:
    embs_list.append(emb)
text_data['value'] = embs_list
physio_data = data[data['variable'] != 'Text']
del data
data = text_data.append(physio_data, ignore_index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text_data['value'] = embs_list
  data = text_data.append(physio_data, ignore_index=False)


In [5]:
pred_window = 2  # hours
obs_windows = range(20, 124, 4)

# Remove train, val patients
data = data.merge(oc[['ts_ind', 'SUBJECT_ID']], on='ts_ind', how='left')
train_sub = oc.loc[oc.ts_ind.isin(train_ind)].SUBJECT_ID.unique()
valid_sub = oc.loc[oc.ts_ind.isin(valid_ind)].SUBJECT_ID.unique()
data = data.loc[~data.SUBJECT_ID.isin(train_sub)]
data = data.loc[~data.SUBJECT_ID.isin(valid_sub)]
oc = oc.loc[~oc.SUBJECT_ID.isin(train_sub)]
oc = oc.loc[~oc.SUBJECT_ID.isin(valid_sub)]

data.drop(columns=['SUBJECT_ID', 'TABLE'], inplace=True)
# Get static data with mean fill and missingness indicator.
static_varis = ['Age', 'Gender']
ii = data.variable.isin(static_varis)
static_data = data.loc[ii]
data = data.loc[~ii]


def inv_list(l, start=0):
    d = {}
    for i in range(len(l)):
        d[l[i]] = i+start
    return d


static_var_to_ind = inv_list(static_varis)
D = len(static_varis)
N = data.ts_ind.max()+1

# Get variable indices.
varis = sorted(list(set(data.variable)))
V = len(varis)
var_to_ind = inv_list(varis, start=1)
data['vind'] = data.variable.map(var_to_ind)
data = data[['ts_ind', 'vind', 'hour', 'value']
            ].sort_values(by=['ts_ind', 'vind', 'hour'])
# Find max_len.
fore_max_len = 880
# Get forecast inputs and outputs.
fore_texts_ip = []
fore_inds = []


def f(x):
    mask = [0 for i in range(V)]
    values = [0 for i in range(V)]
    for vv in x:
        v = int(vv[0])-1
        mask[v] = 1
        values[v] = vv[1]
    return values+mask


def pad(x):
    return x+[0]*(fore_max_len-len(x))


for w in tqdm(obs_windows):
    pred_data = data.loc[(data.hour >= w) & (data.hour <= w+pred_window)]
    pred_data = pred_data.groupby(['ts_ind', 'vind']).agg(
        {'value': 'first'}).reset_index()
    pred_data['vind_value'] = pred_data[['vind', 'value']].values.tolist()
    pred_data = pred_data.groupby('ts_ind').agg(
        {'vind_value': list}).reset_index()
    pred_data['vind_value'] = pred_data['vind_value'].apply(f)
    obs_data = data.loc[(data.hour < w) & (data.hour >= w-24)]
    obs_data = obs_data.loc[obs_data.ts_ind.isin(pred_data.ts_ind)]
    obs_data = obs_data.groupby('ts_ind').head(fore_max_len)
    obs_data = obs_data.groupby('ts_ind').agg(
        {'vind': list, 'hour': list, 'value': list}).reset_index()
    obs_data = obs_data.merge(pred_data, on='ts_ind')
    for col in ['vind', 'hour', 'value']:
        obs_data[col] = obs_data[col].apply(pad)
    fore_inds.append(np.array(list(obs_data.ts_ind)))

    matrix = list(obs_data.value)
    obs_strings = []
    for l in matrix:
        string_list = []
        for value in l:
            if not np.isscalar(value):
            # if isinstance(value, str):
                string_list.append(value)
        obs_strings.append(string_list)
    del matrix
    fore_texts_ip.append(np.array(obs_strings))
del data

fore_texts_ip = np.concatenate(fore_texts_ip, axis=0)

  fore_texts_ip.append(np.array(obs_strings))
  fore_texts_ip.append(np.array(obs_strings))
  fore_texts_ip.append(np.array(obs_strings))
  fore_texts_ip.append(np.array(obs_strings))
  fore_texts_ip.append(np.array(obs_strings))
  fore_texts_ip.append(np.array(obs_strings))
  fore_texts_ip.append(np.array(obs_strings))
  fore_texts_ip.append(np.array(obs_strings))
  fore_texts_ip.append(np.array(obs_strings))
  fore_texts_ip.append(np.array(obs_strings))
  fore_texts_ip.append(np.array(obs_strings))
  fore_texts_ip.append(np.array(obs_strings))
  fore_texts_ip.append(np.array(obs_strings))
  fore_texts_ip.append(np.array(obs_strings))
  fore_texts_ip.append(np.array(obs_strings))
  fore_texts_ip.append(np.array(obs_strings))
  fore_texts_ip.append(np.array(obs_strings))
  fore_texts_ip.append(np.array(obs_strings))
  fore_texts_ip.append(np.array(obs_strings))
  fore_texts_ip.append(np.array(obs_strings))
  fore_texts_ip.append(np.array(obs_strings))
  fore_texts_ip.append(np.array(ob

In [9]:
fore_texts_ip

(131920,)

In [10]:
# max len
max_len = 0
for obs in fore_texts_ip:
    if len(obs) > max_len:
        max_len = len(obs)
max_len

40

In [14]:
train_text_embs = []

for obs in tqdm(fore_texts_ip):
    if len(obs) < 50:
        for i in range(50 - len(obs)):
            obs.append(np.array([0.0]*768))
    train_text_embs.append(np.array(obs))

100%|██████████| 131920/131920 [04:04<00:00, 538.55it/s]


In [19]:
train_text_embs_np = np.array(train_text_embs)
train_text_embs_np.shape
del train_text_embs_np

(131920, 50, 768)

In [21]:
# dump to pkl, update sepsis_removed_0.pkl
pickle.dump([train_text_embs], open('Data/text_emb_input_test_1_2d.pkl','wb'))