In [49]:
import random

import numpy as np
import pandas as pd
from openpyxl import Workbook
from openpyxl.utils.dataframe import dataframe_to_rows
from scipy.stats import chi2_contingency, ttest_ind
from tqdm import tqdm

In [50]:
# Load the step-2 extract of the MIMIC data.
df = pd.read_csv('./data/mimic_data/full_step2.csv')

# Column layout change performed below:
#   [feature1~n, missing, mask1~n, missing_mask] ==> [feature, mask, missing]
# i.e. the 'missing' column is relocated to the very end of the frame.
column_to_move = 'missing'
df = df.loc[:, [col for col in df.columns if col != column_to_move] + [column_to_move]]


In [51]:
"""
input: list[numpy]
remove negative samples
"""
def remove_sample(x, y, choose_count=3):
    """Down-sample one group of windows while keeping every positive window.

    When there are more than ``choose_count`` windows, they are reordered with
    ``sort_by_missing`` (ascending sum of each window's last column), the
    first ``choose_count`` windows of that ordering are kept, and every
    remaining window whose label satisfies ``y[i][0] == 1`` is kept as well.
    In all cases the two trailing columns are stripped from each window.

    Args:
        x: list of 2-D numpy arrays (one per window).
        y: list of labels aligned with ``x``; each label is indexed as
            ``label[0]``.
        choose_count: number of lowest-ranked windows that are always kept.

    Returns:
        ``(selected_x, selected_y)`` as two lists. When
        ``len(x) <= choose_count`` nothing is dropped and ``y`` is returned
        unchanged (same object).
    """
    # Kept for backward compatibility: the original reseeds the global RNG on
    # every call even though no random draw happens in this function.
    random.seed(42)

    if len(x) <= choose_count:
        return [arr[:, :-2] for arr in x], y

    # The original tracked a `have_label_1` flag that was never actually set
    # (a bare expression statement) and both of its branches did the same
    # thing, so the sort is simply unconditional.
    x, y, _ = sort_by_missing(x, y)
    x = [arr[:, :-2] for arr in x]

    # Always keep the first `choose_count` windows of the sorted order ...
    keep_indices = list(range(choose_count))
    # ... plus every later window with a positive label. Starting the scan at
    # `choose_count` replaces the original `i not in list` membership test.
    keep_indices.extend(
        i for i in range(max(choose_count, 0), len(y)) if y[i][0] == 1
    )

    selected_x = [x[i] for i in keep_indices]
    selected_y = [y[i] for i in keep_indices]
    return selected_x, selected_y

In [52]:
def sort_by_missing(x_list, y_list):
    """Order paired samples by the ascending sum of their last column.

    Args:
        x_list: list of 2-D numpy arrays; the last column of the first array
            determines which column index is summed for every array.
        y_list: list of labels aligned with ``x_list``.

    Returns:
        ``(sorted_x_list, sorted_y_list, sorted_sum_list)`` where the three
        lists are aligned and ordered from the smallest column sum to the
        largest. Three empty lists are returned for empty input.
    """
    if len(x_list) == 0:
        # The original returned an undefined name here (NameError); callers
        # unpack three values, so return three empty lists instead.
        return [], [], []

    # Index of the last column, taken from the first sample.
    feature_missing_index = x_list[0].shape[1] - 1

    scored = [
        (np.sum(x[:, feature_missing_index]), x, y)
        for x, y in zip(x_list, y_list)
    ]
    # Sort on the score only; the sort is stable, so ties keep input order.
    scored.sort(key=lambda item: item[0])

    sorted_sum_list = [item[0] for item in scored]
    sorted_x_list = [item[1] for item in scored]
    sorted_y_list = [item[2] for item in scored]

    return sorted_x_list, sorted_y_list, sorted_sum_list

In [53]:
# Placeholder for the input-column template. get_input_and_label() replaces
# this global with a copy of its first input window (after dropping
# 'missing' / 'missing_mask' when 'missing' is present) the first time it
# sees that the frame still has no columns.
df_columns = pd.DataFrame()

In [54]:
"""
input_features, labels

input_features: dict['task'] ==> list[numpy]
"""
def get_input_and_label(df_patient, window_size, stay_id):
    """Build sliding-window inputs and next-row labels for every task.

    For each start position ``i`` the input is rows ``[i, i + window_size)``
    restricted to the non-label columns, and the label of each task is read
    from row ``i + window_size``. A (window, task) pair is skipped when the
    label equals -1. The 'Weaning', 'Weaning_successful', 'SBT_Start' and
    'SBT_Successful' tasks are additionally skipped unless the minimum of
    ``use_vent`` over the input window equals 1. Each task's windows are
    finally passed through ``remove_sample``.

    Side effect: the first time the global ``df_columns`` has no columns it is
    replaced by a copy of the current input window (without 'missing' and
    'missing_mask' when 'missing' is present).

    Args:
        df_patient: DataFrame holding the rows of one stay.
        window_size: number of consecutive rows per input window.
        stay_id: not used by this function; kept for call compatibility.

    Returns:
        ``(input_features, labels)``: two dicts keyed by task name, mapping to
        a list of 2-D numpy arrays and a list of label arrays respectively.
        Both dicts contain only empty lists when
        ``len(df_patient) <= window_size``.
    """
    global df_columns

    # Task name -> column the task's label is read from.
    task_label_columns = {
        'Mortality': 'dod',
        'Mortality_30d': 'dod_30day',
        'Mortality_60d': 'dod_60day',
        'Mortality_90d': 'dod_90day',
        'Weaning': 'Weaning',
        'Weaning_successful': 'Weaning_successful',
        'Vasopressor': 'Vasopressor',
        'SBT_Start': 'SBT Started',
        'SBT_Successful': 'SBT Successfully Completed',
        'Reintubation': 'Reintubation',
    }
    # Tasks that are only labelled when the input window is on ventilation.
    vent_dependent_tasks = ('Weaning', 'Weaning_successful', 'SBT_Start', 'SBT_Successful')

    input_features = {task: [] for task in task_label_columns}
    labels = {task: [] for task in task_label_columns}

    if len(df_patient) <= window_size:
        return input_features, labels

    label_col = ['dod', 'dod_30day', 'dod_60day', 'dod_90day', 'Weaning_successful', 'Weaning', 'Reintubation',
                 'SBT Started', 'SBT Successfully Completed', 'Vasopressor', 'use_vent']
    label_col = label_col + ['InvasiveVent', 'tracheostomy', 'NonInvasiveVent', 'SupplementalOxygen', 'HFNC']
    label_col = label_col + ['date']
    mask_col = [f'{name}_mask' for name in label_col]
    label_col = label_col + mask_col

    # Loop-invariant: boolean selector of the columns that form the input.
    is_input_column = ~df_patient.columns.isin(label_col)

    for i in range(len(df_patient) - window_size):
        window = slice(i, i + window_size)
        # Explicit positional slicing (the original relied on Series[] doing so).
        input_using_vent = df_patient['use_vent'].iloc[window].min() == 1
        df_input = df_patient.iloc[window, is_input_column]
        input_X = df_input.values

        if len(df_columns.columns) == 0:
            if 'missing' in df_input:
                df_input = df_input.drop('missing', axis=1)
                df_input = df_input.drop('missing_mask', axis=1)
            df_columns = df_input.copy()

        # Single row that directly follows the input window.
        target_row = df_patient.iloc[i + window_size:i + window_size + 1]

        for task, column in task_label_columns.items():
            label = target_row[column].values
            if task in vent_dependent_tasks and not input_using_vent:
                continue
            if label != -1:
                input_features[task].append(input_X)
                labels[task].append(label)

    # Down-sample the windows of every task.
    for task in input_features.keys():
        x_list_select, y_list_select = remove_sample(input_features[task], labels[task])
        input_features[task] = x_list_select
        labels[task] = y_list_select

    return input_features, labels

In [55]:
def select_pattern(df_P, label_name):
    """Tell whether any row has a usable label right after a ventilated row.

    Returns True when, for some pair of consecutive rows, the later row's
    ``label_name`` value differs from -1 and the earlier row's ``use_vent``
    equals 1. Rows are compared by position, so the frame's index labels do
    not matter (the original used label-based ``.at[i, ...]`` with positional
    ``i`` and raised KeyError on frames whose index is not 0..n-1).

    Args:
        df_P: DataFrame containing ``label_name`` and ``use_vent`` columns.
        label_name: name of the label column to inspect.

    Returns:
        bool. Always False for frames with fewer than two rows.
    """
    if len(df_P) <= 1:
        return False

    label_values = df_P[label_name].to_numpy()
    vent_values = df_P['use_vent'].to_numpy()

    # Row i pairs with row i-1: shift the two columns against each other.
    has_label = label_values[1:] != -1
    previous_on_vent = vent_values[:-1] == 1
    return bool(np.any(has_label & previous_on_vent))



In [56]:
def calculate_iqr(df):
    """Return, per column, the quartile range formatted as ``'<q1>-<q3>'``.

    Both quartiles come from ``Series.describe()`` and are rounded to two
    decimals. Columns with object dtype yield the integer 0 instead of a
    string.
    """
    def _format_quartiles(values):
        # Anything that is not object-typed is summarised via describe().
        if values.dtype != 'O':
            described = values.describe()
            lower = described['25%']
            upper = described['75%']
            return f'{round(lower,2)}-{round(upper,2)}'
        return 0

    return df.apply(_format_quartiles)

def statistics(df, sample_type='mean'):
    """Summarise every numeric column of ``df`` as a display string.

    Args:
        df: DataFrame to summarise. A 'date' column, when present, is
            excluded from the IQR computation; it is no longer required.
        sample_type: ``'mean'`` produces ``'<mean>±<std>'``; any other value
            produces ``'<median> (<q1>-<q3>)'`` using ``calculate_iqr``.

    Returns:
        pandas Series of strings indexed by column name. All numbers are
        rounded to two decimals.
    """
    if sample_type == 'mean':
        mean_series = df.mean(numeric_only=True)
        std_series = df.std(numeric_only=True)
        return mean_series.round(2).astype(str) + '±' + std_series.round(2).astype(str)

    # Median and IQR are only computed when they are actually reported.
    median_series = df.median(numeric_only=True)
    # errors='ignore' lets frames without a 'date' column through.
    iqr_series = calculate_iqr(df.drop(columns='date', errors='ignore'))
    return median_series.round(2).astype(str) + ' (' + iqr_series + ')'

def calculate_p_value(group1, group2):
    """Run ``scipy.stats.ttest_ind`` on two samples.

    Args:
        group1: first sample (array-like).
        group2: second sample (array-like).

    Returns:
        ``(t_statistic, p_value)`` exactly as produced by ``ttest_ind`` with
        its default arguments.
    """
    # `ttest_ind` comes from the notebook's top import cell
    # (`from scipy.stats import ... ttest_ind`); it was previously missing.
    t_statistic, p_value = ttest_ind(group1, group2)
    return t_statistic, p_value

def remove_nah(arr):
    """Return the entries of ``arr`` that are not NaN."""
    keep = np.logical_not(np.isnan(arr))
    return arr[keep]


def save_to_xlsx(df_save,file_name = 'output'):
    """Write ``df_save`` (header row, no index) to ``<file_name>.xlsx``.

    The frame is copied cell by cell into the active sheet of a new openpyxl
    workbook, starting at row 1 / column 1.
    """
    workbook = Workbook()
    sheet = workbook.active
    rows = dataframe_to_rows(df_save, index=False, header=True)
    for row_number, row_values in enumerate(rows, start=1):
        for column_number, cell_value in enumerate(row_values, start=1):
            sheet.cell(row=row_number, column=column_number, value=cell_value)
    workbook.save(f'{file_name}.xlsx')

In [57]:
#+++
# Per-stay driver: build the windowed samples of every stay and collect one
# summary row per stay in df_statistics.
#
# Summary rows are gathered in a list and turned into a DataFrame once after
# the loop (the original called pd.concat on every iteration, which re-copies
# the whole frame each time). The column order below matches the order the
# original concat produced. NOTE(review): column dtypes are now inferred from
# the collected values instead of coming out of a concat with an empty frame.
statistics_columns = ['InvasiveVent', 'Weaning', 'Weaning_successful', 'pred_weaning',
                      'pred_weaning_successful', 'stay_id', 'SBT_start', 'SBT_successful']
statistics_rows = []
input_features_list = []
labels_list = []
window_size = 1
distinct_stay_id = df['stay_id'].unique()
# Stays flagged as weaned (and successfully weaned) with invasive ventilation
# for which no Weaning / Weaning_successful sample was produced, split by
# whether exactly one row has InvasiveVent == 1 (count_1D) or not.
count_1D = 0
count_more_1D = 0
for stay_ids in tqdm(distinct_stay_id):
    df_P = df[df['stay_id'] == stay_ids]
    input_features, labels = get_input_and_label(df_P, window_size, stay_ids)
    input_features_list.append(input_features)
    labels_list.append(labels)

    # Largest label among the produced samples; -1 when there are none.
    pred_weaning = max((arr[0] for arr in labels['Weaning']), default=-1)
    pred_weaning_successful = max((arr[0] for arr in labels['Weaning_successful']), default=-1)

    stay_was_weaned = df_P['Weaning_successful'].max() == 1 and df_P['Weaning'].max() == 1
    no_weaning_sample = pred_weaning_successful == -1 and pred_weaning == -1
    if stay_was_weaned and no_weaning_sample and df_P['InvasiveVent'].max() == 1:
        if (df_P['InvasiveVent'] == 1).sum() != 1:
            count_more_1D += 1
        else:
            count_1D += 1

    statistics_rows.append({
        'stay_id': stay_ids,
        'InvasiveVent': df_P['InvasiveVent'].max(),
        'Weaning': df_P['Weaning'].max(),
        'Weaning_successful': df_P['Weaning_successful'].max(),
        'SBT_start': df_P['SBT Started'].max(),
        'SBT_successful': df_P['SBT Successfully Completed'].max(),
        'pred_weaning': pred_weaning,
        'pred_weaning_successful': pred_weaning_successful,
    })

df_statistics = pd.DataFrame(statistics_rows, columns=statistics_columns)


100%|█████████████████████████████████████████████████████████████████████████████| 2893/2893 [00:16<00:00, 177.48it/s]


In [58]:
# Tasks that are converted to arrays and written to disk by the cells below.
# NOTE(review): get_input_and_label() also collects a 'Reintubation' task that
# is not listed here — confirm the omission is intentional.
task_name_list = ['Weaning','Weaning_successful','SBT_Start','SBT_Successful','Mortality','Mortality_30d','Mortality_60d','Mortality_90d','Vasopressor']

In [59]:
# Sanity check: the driver loop appends one entry per distinct stay_id.
len(input_features_list)

2893

In [60]:
# Merge the per-stay dictionaries into one flat list of samples and one flat
# list of labels per task, preserving the order of the stays.
task_samples = {}
task_labels = {}

for x_dict, y_dict in tqdm(zip(input_features_list, labels_list), total=len(input_features_list)):
    # extend() appends in place; the original rebuilt each list with `+` on
    # every iteration, copying everything accumulated so far.
    for task, data in x_dict.items():
        task_samples.setdefault(task, []).extend(data)

    for task, label in y_dict.items():
        task_labels.setdefault(task, []).extend(label)
            



100%|████████████████████████████████████████████████████████████████████████████| 2893/2893 [00:01<00:00, 1496.33it/s]


In [61]:
# Turn each listed task's sample list into a single numpy array and squeeze
# its label list into an array without length-1 axes.
for task_name in task_name_list:
    task_samples[task_name], task_labels[task_name] = (
        np.array(task_samples[task_name]),
        np.squeeze(task_labels[task_name]),
    )

In [62]:
def check_label_distribution (data_Y):
    """Print how many labels equal 1, equal 0, or are anything else.

    Counts and percentages (rounded to two decimals) are printed on a single
    line; nothing is returned.

    Args:
        data_Y: array-like of labels. Lists are accepted (they are converted
            to a numpy array so the comparisons are element-wise). Empty
            input prints zero counts with 0.0% instead of raising
            ZeroDivisionError.
    """
    data_Y = np.asarray(data_Y)
    # Number of elements actually counted below (equals len() for 1-D input).
    total = data_Y.size

    count_1 = int(np.count_nonzero(data_Y == 1))
    count_0 = int(np.count_nonzero(data_Y == 0))
    # Everything that is neither 1 nor 0 (NaN included).
    count_others = total - count_1 - count_0

    def _percent(count):
        return round(count / total * 100, 2) if total else 0.0

    ratio_1 = _percent(count_1)
    ratio_0 = _percent(count_0)
    ratio_others = _percent(count_others)
    print(f'1=>{count_1}({ratio_1}%),  0=>{count_0}({ratio_0}%),  others=>{count_others}({ratio_others}%)')

In [63]:
import os
from datetime import datetime

date = datetime.today().strftime('%y%m%d')
route = './data/sample'

# Write an 80% / 10% / 10% train / validation / test split of every task to
# `route` as .npy files, keeping only the first half of the input columns.
# NOTE(review): rows are split in their stored order (stay after stay, no
# shuffling or grouping), so windows of a single stay can end up on both
# sides of a split boundary — confirm that this is acceptable.
is_save = input('save? (y/n)')
if is_save == 'y':
    # np.save does not create missing directories.
    os.makedirs(route, exist_ok=True)
    # Same for every task and split, so computed once.
    feature_count = int(len(df_columns.columns)/2)
    for task_name in task_name_list:
        len_of_sample = len(task_samples[task_name])
        train_end = int(len_of_sample*0.8)
        validation_end = int(len_of_sample*0.9)
        split_rows = {
            'train': slice(None, train_end),
            'validation': slice(train_end, validation_end),
            'test': slice(validation_end, None),
        }
        for data_type, rows in split_rows.items():
            x = task_samples[task_name][rows, :, :feature_count]
            y = task_labels[task_name][rows]
            np.save(f'{route}/{data_type}_X_{task_name}.npy', x)
            np.save(f'{route}/{data_type}_Y_{task_name}.npy', y)
    print('Saved successfully....', date)

print("Finish....", datetime.now())

save? (y/n)y
Saved successfully.... 240206
Finish.... 2024-02-06 15:56:21.953265


In [64]:
# Take the second half of the last axis from the first row of every
# 'Weaning_successful' sample and report the resulting shape.
feature_num = task_samples['Weaning_successful'].shape[2] / 2
mask_data = np.squeeze(task_samples['Weaning_successful'][:, 0, int(feature_num):])
print(mask_data.shape)

(3601, 102)


In [65]:
# Label the extracted array with the second half of df_columns' column names.
len_col = len(df_columns.columns) // 2
col_name_list = df_columns.columns[len_col:].to_list()
df_mask = pd.DataFrame(data=mask_data, columns=col_name_list)

In [66]:
# Show the integer value of feature_num, then export df_columns' columns from
# position 1 up to (not including) that value as a CSV without the index.
print(int(feature_num))

df_columns.iloc[:, slice(1, int(feature_num))].to_csv(f"{route}/full_feature_name.csv", index=False)

102
