## Hardware check

In [1]:
# gpu check
! nvidia-smi

Wed Mar 20 22:32:07 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.12             Driver Version: 535.104.12   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla V100-SXM2-32GB           On  | 00000000:B2:00.0 Off |                    0 |
| N/A   30C    P0              40W / 300W |      9MiB / 32768MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM2-32GB           On  | 00000000:B3:00.0 Off |  

In [3]:
# check number of cores
import multiprocessing

cores = multiprocessing.cpu_count() 
cores

80

In [4]:
! python --version

Python 3.9.7


## Environment Prep

In [2]:
cd /pfs/data5/home/hd/hd_hd/hd_nf283/MA_Thesis/

/pfs/data5/home/hd/hd_hd/hd_nf283/MA_Thesis


In [6]:
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization, Lambda
from keras.utils import pad_sequences
# from tensorflow.keras.models import Model
from tensorflow.keras import models
import pickle
import numpy as np
from tqdm import tqdm
# tqdm.pandas()
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc
import tensorflow.keras.backend as K
from tensorflow.keras.callbacks import Callback, EarlyStopping
import pandas as pd
import json
from torch.utils.data import Dataset
from transformers import AutoTokenizer, pipeline, AutoModel
import resources.smart_cond as sc
# from google.colab import files

2023-07-08 23:52:28.230461: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-07-08 23:52:30.609508: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Train/Val

In [7]:
data_path = 'Data/sepsis_removed_0.pkl'
pkl = pickle.load(open(data_path, 'rb'))
data = pkl[0]
oc = pkl[1]
train_ind = pkl[2]
valid_ind = pkl[3]
test_ind = pkl[4]
embs = pkl[5]
del pkl

In [8]:
text_data = data[data['variable'] == 'Text']
embs_list = []

for emb in embs:
    embs_list.append(emb)

text_data['value'] = embs_list
physio_data = data[data['variable'] != 'Text']

del data
data = text_data.append(physio_data, ignore_index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text_data['value'] = embs_list
  data = text_data.append(physio_data, ignore_index=False)


In [9]:
# Filter labeled data in first 24h.
data = data.loc[data.ts_ind.isin(np.concatenate((train_ind, valid_ind, test_ind), axis=-1))]
data = data.loc[(data.hour>=0)&(data.hour<=24)]
oc = oc.loc[oc.ts_ind.isin(np.concatenate((train_ind, valid_ind, test_ind), axis=-1))]
# Get y and N.
y = np.array(oc.sort_values(by='ts_ind')['in_hospital_sepsis']).astype('float32')
N = data.ts_ind.max() + 1
# Get static data with mean fill and missingness indicator.
static_varis = ['Age', 'Gender']
ii = data.variable.isin(static_varis)
static_data = data.loc[ii]
data = data.loc[~ii]
def inv_list(l, start=0):
    d = {}
    for i in range(len(l)):
        d[l[i]] = i+start
    return d
static_var_to_ind = inv_list(static_varis)
D = len(static_varis)
demo = np.zeros((N, D))
for row in tqdm(static_data.itertuples()):
    demo[row.ts_ind, static_var_to_ind[row.variable]] = row.value
# Normalize static data.
means = demo.mean(axis=0, keepdims=True)
stds = demo.std(axis=0, keepdims=True)
stds = (stds==0)*1 + (stds!=0)*stds
demo = (demo-means)/stds
# Trim to max len.
data = data.sample(frac=1)
data = data.groupby('ts_ind').head(880)
# Get N, V, var_to_ind.
N = data.ts_ind.max() + 1
varis = sorted(list(set(data.variable)))
V = len(varis)
def inv_list(l, start=0):
    d = {}
    for i in range(len(l)):
        d[l[i]] = i+start
    return d
var_to_ind = inv_list(varis, start=1)
data['vind'] = data.variable.map(var_to_ind)
data = data[['ts_ind', 'vind', 'hour', 'value']].sort_values(by=['ts_ind', 'vind', 'hour'])
# Add obs index.
data = data.sort_values(by=['ts_ind']).reset_index(drop=True)
data = data.reset_index().rename(columns={'index':'obs_ind'})
data = data.merge(data.groupby('ts_ind').agg({'obs_ind':'min'}).reset_index().rename(columns={ \
                                                            'obs_ind':'first_obs_ind'}), on='ts_ind')
data['obs_ind'] = data['obs_ind'] - data['first_obs_ind']
# Find max_len.
max_len = data.obs_ind.max()+1
print ('max_len', max_len)
# Generate times_ip and values_ip matrices.
# times_inp = np.zeros((N, max_len), dtype='float32')
# values_inp = np.zeros((N, max_len), dtype='float32')
# varis_inp = np.zeros((N, max_len), dtype='int32')
texts_inp = np.empty([N, max_len], dtype=object)
for row in tqdm(data.itertuples()):
    ts_ind = row.ts_ind
    l = row.obs_ind
    # times_inp[ts_ind, l] = row.hour
    # if isinstance(row.value, str):
    if row.vind == 124:
        # values_inp[ts_ind, l] = 1.0
        # print(row.value)
        texts_inp[ts_ind, l] = row.value
    else:
        # values_inp[ts_ind, l] = row.value
        texts_inp[ts_ind, l] = None
    # varis_inp[ts_ind, l] = row.vind
    
data.drop(columns=['obs_ind', 'first_obs_ind'], inplace=True)
# Generate 3 sets of inputs and outputs.
# train_ip = [ip[train_ind] for ip in [demo, times_inp, values_inp, varis_inp, texts_inp]]
# valid_ip = [ip[valid_ind] for ip in [demo, times_inp, values_inp, varis_inp, texts_inp]]
# test_ip = [ip[test_ind] for ip in [demo, times_inp, values_inp, varis_inp, texts_inp]]
# del times_inp, values_inp, varis_inp
train_ip = [ip[train_ind] for ip in [texts_inp]]
valid_ip = [ip[valid_ind] for ip in [texts_inp]]
test_ip = [ip[test_ind] for ip in [texts_inp]]
del texts_inp

114564it [00:00, 791559.58it/s]


max_len 880


19267073it [00:20, 961908.58it/s]


In [10]:
train_text_emb_list = train_ip[0]
train_text_embs = []

for l in tqdm(train_text_emb_list):
    tmp = []
    for i in l:
        # get rid of the None
        if i is not None:
            tmp.append(i)
    if tmp == []:
        tmp = [np.array([0.0])]
    train_text_embs.append(np.concatenate(tmp))

100%|██████████| 36551/36551 [00:01<00:00, 20725.11it/s]


In [11]:
valid_text_emb_list = valid_ip[0]
valid_text_embs = []

for l in tqdm(valid_text_emb_list):
    tmp = []
    for i in l:
        # get rid of the None
        if i is not None:
            tmp.append(i)
    if tmp == []:
        tmp = [np.array([0.0])]
    valid_text_embs.append(np.concatenate(tmp))

100%|██████████| 9262/9262 [00:00<00:00, 20622.32it/s]


In [12]:
test_text_emb_list = test_ip[0]
test_text_embs = []

for l in tqdm(test_text_emb_list):
    tmp = []
    for i in l:
        # get rid of the None
        if i is not None:
            tmp.append(i)
    if tmp == []:
        tmp = [np.array([0.0])]
    test_text_embs.append(np.concatenate(tmp))

100%|██████████| 11469/11469 [00:00<00:00, 20590.04it/s]


In [13]:
# padding
max_len = 33792
# padding
padded_train = pad_sequences(train_text_embs, maxlen=max_len, dtype='float32', padding='post')
padded_train.shape

(36551, 33792)

In [14]:
# padding
max_len = 33792
# padding
padded_valid = pad_sequences(valid_text_embs, maxlen=max_len, dtype='float32', padding='post')
padded_valid.shape

(9262, 33792)

In [15]:
# padding
max_len = 33792
# padding
padded_test = pad_sequences(test_text_embs, maxlen=max_len, dtype='float32', padding='post')
padded_test.shape

(11469, 33792)

In [None]:
# dump to pkl, update sepsis_removed_0.pkl
pickle.dump([padded_train, padded_valid, padded_test], open('Data/classification_embs.pkl','wb'))

In [17]:
data_path = 'Data/classification_embs.pkl'
train_text, valid_text, test_text = pickle.load(open(data_path, 'rb'))

In [19]:
train_text.shape

(36551, 33792)