In [75]:
import os
import pickle
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.preprocessing import OneHotEncoder

In [76]:
# Directory that holds the MIMIC-IV pickle files read and written by this notebook.
# Defaults to the shared notebook location; set the MIMIC_IV_DIR environment variable
# to point somewhere else without editing the notebook.
path = os.environ.get('MIMIC_IV_DIR', '/data/notebook/shared/MIMIC-IV')

In [77]:
# Load the two input pickles:
#   dtype_dict  - presumably a code -> integer index mapping (its largest value is
#                 used as the maximum index further down); TODO confirm.
#   data_dict_d - per-sample records keyed by sample id.
# NOTE(review): pickle.load can execute arbitrary code, so only load files that come
# from a trusted source.
# The `with` statement closes each file when the block exits, so no explicit
# close() call is needed afterwards.
with open(os.path.join(path, 'dict_types_mimic_240408_clinic_3_years.pkl'), 'rb') as f:
    dtype_dict = pickle.load(f)

with open(os.path.join(path, 'total_data_dict_with_timedelta_nomedi_240408_clinic_3_years.pkl'), 'rb') as f:
    data_dict_d = pickle.load(f)

In [78]:
# For every sample collect:
#   labels           - the sample's label
#   length_list      - the number of entries in visits['year']
#   code_length_list - the size of the largest entry in visits['year']
labels, length_list, code_length_list = [], [], []
for sample_id, visits in tqdm(data_dict_d.items()):
    label = visits['label']
    year_visits = visits['year']
    labels.append(label)
    length_list.append(len(year_visits))
    code_length_list.append(max(map(len, year_visits)))

100%|██████████| 8037/8037 [00:00<00:00, 353350.33it/s]


In [79]:
# Dataset summary. A notebook cell only displays its last bare expression, and this
# cell ends with print() calls, so the label distribution and the visit-count
# statistics are printed explicitly instead of being computed and thrown away.
label_values, label_counts = np.unique(labels, return_counts=True)
print('label counts:', dict(zip(label_values.tolist(), label_counts.tolist())))
print('visit length mean:', np.mean(length_list))
print('visit length std:', np.std(length_list))

# Sizes used to pad every sample to the same shape.
max_visits_length = max(length_list)    # largest number of visits in any sample
max_index = max(dtype_dict.values())    # largest value stored in dtype_dict
max_code_len = max(code_length_list)    # largest number of codes in any single visit
print('max_index:', max_index+1)
print('max_visit:', max_visits_length)
print('max_code_len:', max_code_len)

max_index: 15373
max_visit: 49
max_code_len: 13


In [91]:
def pad_sequence(seq_diagnosis_codes, maxlen, maxcode):
    """Pad one sample's visit sequence to fixed-size arrays and build its masks.

    Parameters
    ----------
    seq_diagnosis_codes : sequence of sequences of int
        One inner sequence of code indices per visit.
    maxlen : int
        Number of rows (visits) in the padded outputs.
    maxcode : int
        Number of columns (codes per visit) in the padded outputs.

    Returns
    -------
    diagnosis_codes : np.ndarray, shape (maxlen, maxcode), int64
        The codes copied into the top-left corner, zero everywhere else.
    seq_mask_code : np.ndarray, shape (maxlen, maxcode), int8
        1 at every position where a code was written, 0 elsewhere.
    seq_mask : np.ndarray, shape (maxlen,), int8
        1 for the first ``len(seq_diagnosis_codes)`` positions, 0 elsewhere.
    seq_mask_final : np.ndarray, shape (maxlen,), int8
        1 only at the position of the last visit; all zeros for an empty sequence.

    Raises
    ------
    IndexError
        If there are more visits than ``maxlen`` or a visit holds more than
        ``maxcode`` codes.
    """
    # Use the argument itself rather than a notebook-level variable, so the
    # function gives the right masks no matter where it is called from.
    lengths = len(seq_diagnosis_codes)
    diagnosis_codes = np.zeros((maxlen, maxcode), dtype=np.int64)
    seq_mask_code = np.zeros((maxlen, maxcode), dtype=np.int8)
    seq_mask = np.zeros((maxlen), dtype=np.int8)
    seq_mask_final = np.zeros((maxlen), dtype=np.int8)
    for pid, subseq in enumerate(seq_diagnosis_codes):
        for tid, code in enumerate(subseq):
            diagnosis_codes[pid, tid] = code
            seq_mask_code[pid, tid] = 1
    seq_mask[:lengths] = 1
    # Without this guard an empty sequence would index position -1 and mark the
    # last padding slot as the final visit.
    if lengths > 0:
        seq_mask_final[lengths - 1] = 1
    return diagnosis_codes, seq_mask_code, seq_mask, seq_mask_final

In [92]:
def keep_last_one_in_columns(a):
    """Return an array like ``a`` that keeps only the last 1 of every column.

    For each column of the 2-D array ``a`` the row index of the last entry equal
    to 1 is located; the result has a 1 at that position and 0 everywhere else.
    Columns that contain no 1 stay all zero. The result has the same shape and
    dtype as ``a``.
    """
    kept = np.zeros_like(a)
    n_cols = a.shape[1]
    for j in range(n_cols):
        # Row indices at which this column equals 1.
        rows = np.nonzero(a[:, j] == 1)[0]
        if rows.size == 0:
            continue
        kept[rows.max(), j] = 1
    return kept

In [93]:
# Build the padded record for every sample and collect the records in
# new_data_dict_d, keyed by sample id. Samples whose visits all fall in a single
# calendar year are skipped.
new_data_dict_d = {}
year_list = []  # not filled in this cell
for sample_id, data in tqdm(data_dict_d.items()):
    data_dict_new = {}
    # Padded code indices plus the code-level, visit-level and last-visit masks.
    pad_seq, seq_mask_code, seq_mask, seq_mask_final = pad_sequence(data['year'], max_visits_length, max_code_len)
    data_dict_new['code_index'] = pad_seq
    data_dict_new['code'] = data['code']
    data_dict_new['time'] = data['time']
    data_dict_new['timedelta'] = data['timedelta']
    # One row of (year, month, day, week) per visit timestamp, zero-padded along
    # the visit axis up to max_visits_length rows.
    time_feature = np.array([[timestamp.year, timestamp.month, timestamp.day, timestamp.week] for timestamp in data['time']])
    data_dict_new['time_feature'] = np.pad(time_feature, pad_width=((0, max_visits_length - time_feature.shape[0]),(0,0)))
    # Calendar year of every visit. Note that this array is stored unpadded.
    data_dict_new['year'] = np.array([timestamp.year for timestamp in data['time']])
    data_dict_new['seq_mask'] = seq_mask
    data_dict_new['seq_mask_final'] = seq_mask_final
    data_dict_new['seq_mask_code'] = seq_mask_code
    unique_year = np.unique(data_dict_new['year'])
    if len(unique_year) == 2:
        # Only two distinct years: append the following year as a third category
        # so the one-hot encoding gets three columns.
        unique_year = np.append(unique_year, unique_year[-1]+1)
    elif len(unique_year) == 1:
        # Single-year samples are dropped.
        continue
    # NOTE(review): samples with more than three distinct years would produce more
    # than three one-hot columns - presumably the input never contains them; confirm.
    # One-hot encode each visit's year against the sorted categories in unique_year.
    # This is done with a NumPy comparison instead of
    # OneHotEncoder(categories=[unique_year], sparse=False, handle_unknown='ignore'):
    # the `sparse` keyword was renamed to `sparse_output` and no longer exists in
    # recent scikit-learn releases, while the comparison yields the same dense
    # float64 matrix on every version.
    year_onehot = (data_dict_new['year'][:, None] == unique_year[None, :]).astype(np.float64)
    # Same matrix, but keeping only the last visit of every year.
    last_year_visit = keep_last_one_in_columns(year_onehot)
    data_dict_new['year_onehot'] = np.pad(year_onehot, pad_width=((0, max_visits_length - year_onehot.shape[0]), (0,0)))
    data_dict_new['last_year_onehot'] = np.pad(last_year_visit, pad_width=((0, max_visits_length - year_onehot.shape[0]), (0,0)))
    data_dict_new['label'] = data['label']
    new_data_dict_d[sample_id] = data_dict_new

100%|██████████| 8037/8037 [00:02<00:00, 2855.77it/s]


In [106]:
# Write the processed records to a new pickle file. The `with` statement closes
# the file when the block exits, so no explicit close() call is needed.
with open(os.path.join(path, 'total_data_dict_with_timedelta_nomedi_240421_clinic_3_years.pkl'), 'wb') as f:
    pickle.dump(new_data_dict_d, f)

In [107]:
# Display the full path of the processed pickle file.
os.path.join(path, 'total_data_dict_with_timedelta_nomedi_240421_clinic_3_years.pkl')

'/data/notebook/shared/MIMIC-IV/total_data_dict_with_timedelta_nomedi_240421_clinic_3_years.pkl'

In [108]:
# Outer product of the visit mask with itself.
# NOTE(review): seq_mask is the value left in the notebook namespace by the last
# iteration of the preprocessing loop (i.e. one sample's mask), so this looks like
# a scratch check rather than a per-sample result - confirm before relying on it.
np.outer(seq_mask, seq_mask)

array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int8)