In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
from tqdm import tqdm
import itertools
import pickle
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, precision_score, recall_score, brier_score_loss 


In [2]:
# --- Loss hyper-parameters (passed to BCEFocalLoss further down) ---
alpha = 0.5
gamma = 1

# --- Data / optimisation settings ---
# batch_size is forwarded to read_data(); seq_len and learning_rate are not
# referenced in the visible cells.
batch_size = 256
seq_len = 1
learning_rate = 1e-4

# --- Training-loop settings (not referenced in the visible cells) ---
max_epoch = 100
experiment_time = 5
limit_early_stop_count = 5

# --- Feature / sampling switches ---
# Only use_upsample is read in the visible cells (forwarded to read_data()).
show_shap_flag = True
select_feature_flag = False
use_upsample = False
use_mini_feature = False
only_Weaning = False

# Tasks the model is built for; one output tower is created per entry.
task_name_list = ['Weaning_successful']

#data_date = "20240104"
data_date = "20240114"
# Use the GPU when one is available, otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
class MLP_MTL(nn.Module):
    """Multi-task MLP: a shared bottom followed by one small tower per task.

    The bottom is three Linear+ReLU layers (input_dim -> 256 -> 128 -> 64).
    Each task owns a tower (64 -> 32 -> 1) that ends in a sigmoid.

    Args:
        input_dim: Number of input features after flattening.
        task_name_list: Task names; one tower is created per entry and the
            names are used as keys of the returned dictionaries.
        dropout_ratio: Dropout probability applied after the bottom and
            inside every tower.
    """

    def __init__(self, input_dim, task_name_list, dropout_ratio=0.0):
        super(MLP_MTL, self).__init__()

        self.dropout = nn.Dropout(dropout_ratio)
        self.relu = nn.ReLU()  # Activation function for hidden layers
        self.sigmoid = nn.Sigmoid()
        self.task_name_list = task_name_list
        self.num_tasks = len(task_name_list)
        hidden_dim = [256, 128, 64, 32]
        output_size = 1

        # Bottom (shared by all tasks)
        self.bt_fc1 = nn.Linear(input_dim, hidden_dim[0])
        self.bt_fc2 = nn.Linear(hidden_dim[0], hidden_dim[1])
        self.bt_fc3 = nn.Linear(hidden_dim[1], hidden_dim[2])

        # Towers (one pair of layers per task)
        self.task_fc0 = nn.ModuleList([nn.Linear(hidden_dim[2], hidden_dim[3]) for _ in range(self.num_tasks)])
        self.task_fc1 = nn.ModuleList([nn.Linear(hidden_dim[3], output_size) for _ in range(self.num_tasks)])

    def _model_device(self):
        """Return the device the model parameters currently live on."""
        return next(self.parameters()).device

    def data_check(self, x):
        """Convert the input to a 2-D tensor on the model's device.

        NumPy arrays are converted to float32 tensors and 3-D inputs are
        flattened to (first_dim, second_dim * third_dim). The tensor is moved
        to the device of the model's own parameters, so the model works
        wherever it was placed rather than depending on a global variable.
        """
        if isinstance(x, np.ndarray):
            x = torch.tensor(x, dtype=torch.float32)
        if x.ndim == 3:
            x = x.reshape(x.shape[0], x.shape[1] * x.shape[2])  # Flatten
        return x.to(self._model_device())

    def forward(self, x):
        """Run the network.

        Returns:
            The sigmoid output tensor when the model has exactly one task,
            otherwise a dict mapping each task name to its output tensor.
        """
        x = self.data_check(x)

        # Bottom
        x = self.relu(self.bt_fc1(x))
        x = self.relu(self.bt_fc2(x))
        h = self.relu(self.bt_fc3(x))
        h = self.dropout(h)

        # Towers
        task_out = {}
        for task_index, task_name in enumerate(self.task_name_list):
            hi = self.task_fc0[task_index](h)
            hi = self.relu(hi)
            hi = self.dropout(hi)
            hi = self.task_fc1[task_index](hi)
            task_out[task_name] = self.sigmoid(hi)

        if len(self.task_name_list) == 1:
            return task_out[self.task_name_list[0]]
        return task_out

    def predict_prob(self, x):
        """Switch to eval mode and return {task_name: probability tensor}.

        The single-task tensor returned by forward() is wrapped into a dict so
        callers always receive the same structure.
        """
        self.eval()
        prob_dict = self.forward(x)

        if len(self.task_name_list) == 1:
            return {self.task_name_list[0]: prob_dict}
        return prob_dict

    def predict_proba(self, x):
        """Alias of predict_prob(), kept for callers that use this spelling."""
        return self.predict_prob(x)

    def predict(self, x, threshold=0.5):
        """Return {task_name: 1-D integer array of 0/1 predictions}.

        A sample is labelled 1 when its probability is strictly greater than
        ``threshold``.
        """
        # Gradients are never needed here because the result is converted to NumPy.
        with torch.no_grad():
            prob_dict = self.predict_prob(x)

        pred_dict = {}
        for key, value in prob_dict.items():
            # tensor -> numpy, then threshold all samples at once
            value = value.cpu().detach().numpy()
            pred_dict[key] = (value.reshape(-1) > threshold).astype(int)
        return pred_dict

    def evaluate(self, X, label, task_name, criterion):
        """Score one task and return the metric dict from compute_scores().

        Args:
            X: Model input (NumPy array or tensor).
            label: NumPy array of targets (it is passed to torch.from_numpy).
            task_name: Task whose output is evaluated.
            criterion: Callable taking (probabilities, targets) and returning
                a scalar loss tensor.

        Returns:
            The dict from compute_scores() with 'task' set to ``task_name``
            and an added 'loss' entry.
        """
        with torch.no_grad():
            # One forward pass is reused for probabilities, classes and loss.
            prob_tensor = self.predict_prob(X)[task_name]
            prob = prob_tensor.cpu().detach().numpy()  # tensor => numpy
            pred = (prob.reshape(-1) > 0.5).astype(int)
            score = compute_scores(label, pred, prob)
            score['task'] = task_name
            target = torch.from_numpy(label).to(prob_tensor.device)
            loss = criterion(prob_tensor, target).item()
            # NOTE(review): if the criterion already averages over samples,
            # this divides by the sample count a second time. Kept unchanged
            # so previously reported loss values stay comparable.
            score['loss'] = loss / len(label)
            return score
    

In [4]:
def compute_scores(y_true, y_pred,y_prob):
    """Compute classification metrics, each rounded to 3 decimals.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted class labels.
        y_prob: Predicted probabilities.

    Returns:
        dict with keys 'task' (placeholder 'Null'), 'auroc', 'acc', 'f1',
        'pre', 'recall' and 'brier_score'. A metric that cannot be computed is
        reported on stdout and stored as NaN, so one failure no longer drops
        every later key from the result.
    """
    if np.any(np.isnan(y_prob)):
        # Report and carry on; the notebook previously blocked on input() here.
        print(y_prob)
        print("Warning: y_prob contains NaN values")

    scores = {'task': 'Null'}
    # Order matters: it defines the key order of the returned dict.
    metric_specs = (
        ('auroc', lambda: roc_auc_score(y_true, y_prob)),
        ('acc', lambda: accuracy_score(y_true, y_pred)),
        ('f1', lambda: f1_score(y_true, y_pred)),
        ('pre', lambda: precision_score(y_true, y_pred)),
        ('recall', lambda: recall_score(y_true, y_pred)),
        ('brier_score', lambda: brier_score_loss(y_true, y_prob)),
    )
    for metric_name, compute_metric in metric_specs:
        try:
            scores[metric_name] = round(compute_metric(), 3)
        except Exception as e:
            # Best-effort: keep the remaining metrics and mark this one as missing.
            print("An error occurred:", str(e))
            scores[metric_name] = float('nan')
    return scores

In [5]:
"""
Input:
    model
    dict: Mydataset
    loss_function
Output:
    score: dict + dict
    result: dict => ['total_auc','total_loss']
"""
def test(model, dataset_dict, criterion, is_show = True , only_Weaning = False):
    model.eval()

    task_name_list = list(dataset_dict.keys())
    score = {}
    result = {'total_auc': 0, 'total_loss': 0}
    for task_name in task_name_list:  # 循環每個任務
        X = dataset_dict[task_name].inputs.numpy()
        Y = dataset_dict[task_name].labels.unsqueeze(1).numpy()
    
        score[task_name] = model.evaluate(X,Y,task_name,criterion)
        
        if only_Weaning == True and 'Weaning_succecssful' in task_name_list:
            if task_name == 'Weaning_succecssful':
                result['total_auc'] = result['total_auc'] + score[task_name]['auroc']
                result['total_loss'] = result['total_loss'] + score[task_name]['loss']
        else:
            result['total_auc'] = result['total_auc'] + score[task_name]['auroc']
            result['total_loss'] = result['total_loss'] + score[task_name]['loss']
            
        if is_show:
            print(score[task_name])
    
    return score,result

"""
local_best_model_dict: #dict{'task_name':{'model','performance(target_score)','id'}}
model
"""
def test2(local_best_model_dict, modelr, dataset_dict, criterion, is_show = True):
    score = {}
    result = {'total_auc': 0, 'total_loss': 0}
    task_name_list = list(dataset_dict.keys())
    
    for task_name in task_name_list:
        print(f"task: {task_name} ")
        print(f"{local_best_model_dict[task_name]['performance']}")
        modelr.load_state_dict(local_best_model_dict[task_name]['model'])
        modelr.eval()
        X = dataset_dict[task_name].inputs.numpy()
        Y = dataset_dict[task_name].labels.unsqueeze(1).numpy()
        score[task_name] = modelr.evaluate(X,Y,task_name,criterion)
        result['total_auc'] = result['total_auc'] + score[task_name]['auroc']
        result['total_loss'] = result['total_loss'] + score[task_name]['loss']
        if is_show:
            print(score[task_name])
            
    return score,result

In [6]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader


class MyDataset(Dataset):
    """In-memory dataset holding scaled inputs, original inputs and labels.

    All three NumPy arrays are converted to float32 tensors. Indexing returns
    ``(scaled_input, label)``; the original (unscaled) inputs are kept in
    ``inputs_original`` alongside them.
    """

    def __init__(self, np_X_scalar,np_X_original, np_Y):
        self.inputs = torch.from_numpy(np_X_scalar).float()
        self.inputs_original = torch.from_numpy(np_X_original).float()
        self.labels = torch.from_numpy(np_Y).float()

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return self.inputs[idx], self.labels[idx]

    def remove_samples(self, feature_index, threshold, condition_type):
        """
        Remove samples based on a specified condition on a specific feature.

        Samples whose scaled feature value satisfies the condition are dropped
        from ``inputs``, ``inputs_original`` and ``labels``; all others are
        kept in their original order. (The previous implementation kept the
        matching samples instead, contradicting its name and documentation.)

        Parameters:
        - feature_index (int): Index of the feature.
        - threshold (float): Threshold value for the condition.
        - condition_type (str): Type of condition ('type1' for '<' or 'type2' for '>=').

        Raises:
        - ValueError: for an unknown condition_type, or when the selected
          feature column is not one value per sample (e.g. 3-D inputs).
        """
        feature_values = self.inputs[:, feature_index]
        if condition_type == 'type1':
            remove_mask = feature_values < threshold
        elif condition_type == 'type2':
            remove_mask = feature_values >= threshold
        else:
            raise ValueError("Invalid condition_type. Use 'type1' for '<' or 'type2' for '>='.")

        if remove_mask.ndim != 1:
            raise ValueError("remove_samples expects 2-D inputs of shape (samples, features).")

        # A boolean mask works for zero, one or many matches alike.
        keep_mask = ~remove_mask
        self.inputs = self.inputs[keep_mask]
        self.inputs_original = self.inputs_original[keep_mask]
        self.labels = self.labels[keep_mask]
    
class BCEFocalLoss(torch.nn.Module):
    """Binary focal loss computed on probabilities (not logits).

    For probability ``p`` and target ``t`` the per-element loss is
    ``-alpha * (1 - p)**gamma * t * log(p)
      - (1 - alpha) * p**gamma * (1 - t) * log(1 - p)``.

    Args:
        gamma: Focusing exponent.
        alpha: Weight of the positive-class term; (1 - alpha) weights the
            negative-class term.
        reduction: 'elementwise_mean' averages, 'sum' sums; any other value
            returns the unreduced per-element loss.
        eps: Lower bound applied to the arguments of ``log`` so that
            probabilities of exactly 0 or 1 give a finite loss instead of NaN.
    """

    def __init__(self, gamma=2, alpha=0.25, reduction='elementwise_mean', eps=1e-7):
        super().__init__()
        self.gamma = gamma
        self.alpha = alpha
        self.reduction = reduction
        self.eps = eps

    def forward(self, _input, target):
        pt = _input
        alpha = self.alpha
        # Only the log arguments are clamped: without it log(0) = -inf and
        # 0 * -inf = NaN. Values inside [eps, 1 - eps] are unaffected.
        log_pt = torch.log(pt.clamp(min=self.eps))
        log_one_minus_pt = torch.log((1 - pt).clamp(min=self.eps))
        loss = - alpha * (1 - pt) ** self.gamma * target * log_pt - \
               (1 - alpha) * pt ** self.gamma * (1 - target) * log_one_minus_pt
        if self.reduction == 'elementwise_mean':
            loss = torch.mean(loss)
        elif self.reduction == 'sum':
            loss = torch.sum(loss)
        return loss

    
def check_label_distribution (data_Y):
    """Print how many labels equal 1, equal 0, or are anything else.

    Each count is followed by its share of ``len(data_Y)`` as a percentage
    rounded to two decimals. Nothing is returned.
    """
    total = len(data_Y)

    def _percent(count):
        # Same arithmetic order as count / total * 100, rounded to 2 decimals.
        return round(count / total * 100, 2)

    n_pos = np.count_nonzero(data_Y == 1)
    n_neg = np.count_nonzero(data_Y == 0)
    n_rest = np.count_nonzero((data_Y != 1) & (data_Y != 0))

    pct_pos, pct_neg, pct_rest = _percent(n_pos), _percent(n_neg), _percent(n_rest)
    print(f'Distribution: 1=>{n_pos}({pct_pos}%),  0=>{n_neg}({pct_neg}%),  others=>{n_rest}({pct_rest}%)')

    
def upsampling_auto(X,X_original,Y,up_ratio):
    """Upsample the positive class by repeating its samples.

    The output contains all label-0 samples followed by the label-1 samples
    repeated ``up_ratio`` times. ``X`` and ``X_original`` are tiled along the
    first of three axes, so both are expected to be 3-D.

    Args:
        X: Scaled inputs.
        X_original: Unscaled inputs aligned with ``X``.
        Y: 1-D label array.
        up_ratio: Number of copies of each positive sample in the output.

    Returns:
        (X_upsampled, X_original_upsampled, Y_upsampled). When ``Y`` contains
        any label other than 0 or 1 the three inputs are returned unchanged.
    """
    check_label_distribution(Y)
    zero_idx = np.where(Y == 0)[0]
    one_idx = np.where(Y == 1)[0]
    other_idx = np.where((Y != 1) & (Y != 0))[0]
    if len(other_idx) > 0:
        # Always return three values so callers can unpack uniformly
        # (this path used to return only X and Y).
        return X, X_original, Y

    repeated_data_X = np.tile(X[one_idx], (up_ratio, 1, 1))
    repeated_data_X_original = np.tile(X_original[one_idx], (up_ratio, 1, 1))
    repeated_data_Y = np.tile(Y[one_idx], (up_ratio))

    X_upsampled = np.vstack((X[zero_idx], repeated_data_X))
    X_original_upsampled = np.vstack((X_original[zero_idx], repeated_data_X_original))

    Y_upsampled = np.concatenate((Y[zero_idx], repeated_data_Y))
    return X_upsampled,X_original_upsampled, Y_upsampled

In [7]:
import numpy as np

"""
Input:
    X: numpy
    feature_name_list : List
    select_feature_list : List   (必須是feature_name_list的子集)
Output
    select_feature_list data
"""
def select_features(X, feature_name_list, select_feature_list):
    """Return the slices of ``X`` (along its third axis) for the requested features.

    Args:
        X: Array indexed as X[:, :, feature].
        feature_name_list: Names of the features along the third axis.
        select_feature_list: Names to keep, in output order; must all appear
            in ``feature_name_list``.

    Raises:
        ValueError: if any requested name is not in ``feature_name_list``.
    """
    unknown = set(select_feature_list).difference(feature_name_list)
    if len(unknown) > 0:
        raise ValueError(f"Invalid features in select_feature_list: {unknown}")

    # Position of each requested name (first occurrence) in the full list.
    column_positions = list(map(feature_name_list.index, select_feature_list))
    return X[:, :, column_positions]

In [8]:
import numpy as np

def read_data(task_name_list,data_date,data_type, select_feature_list = [], batch_size = 256,use_upsample = False):
    """Load the .npy arrays of every task and wrap them in datasets and loaders.

    Args:
        task_name_list: Tasks to load.
        data_date: Not used by the visible code; kept for interface
            compatibility.
        data_type: Split name used in the file names (the callers in this
            notebook pass 'train', 'validation' or 'test').
        select_feature_list: Feature names to keep; empty keeps all features.
        batch_size: Batch size of the returned DataLoaders.
        use_upsample: When True, upsampling_auto() is applied to the
            'Weaning_successful' task of the 'test' split.

    Returns:
        (dataset_dict, loader_dict, feature_name_list, original_data_dict).
        ``feature_name_list`` is ``select_feature_list`` when a selection was
        applied, otherwise the full list read from the feature-name CSV.
        ``original_data_dict`` holds the raw arrays of the LAST task only,
        because its keys are overwritten on every iteration.
    """
    data_path = "data/sample/standard_data"

    # Feature name
    df_feature = pd.read_csv("data/sample/full_feature_name.csv")
    # Keep the full list untouched: every task must be indexed against it.
    # Re-using one variable used to make later tasks select the wrong columns.
    full_feature_name_list = df_feature.columns.to_list()
    feature_name_list = full_feature_name_list
    use_selection = len(select_feature_list) > 0

    # dataset
    dataset_dict = {}
    original_data_dict = {}
    for task_name in task_name_list:
        # The X file names carry no date prefix (original note: "date removed").
        # NOTE: allow_pickle=True can execute code from untrusted files; only
        # load arrays from a trusted location.
        X_scalar = np.load(f"{data_path}/{data_type}_scalar_X_{task_name}.npy", allow_pickle=True)
        X_original = np.load(f"{data_path}/{data_type}_X_{task_name}.npy", allow_pickle=True)
        X_original_with_id = np.load(f"{data_path}/{data_type}_X_with_id_{task_name}.npy", allow_pickle=True)

        if use_selection:
            X_scalar = select_features(X_scalar,full_feature_name_list,select_feature_list)
            X_original = select_features(X_original,full_feature_name_list,select_feature_list)
            feature_name_list = select_feature_list

            assert X_scalar.shape[2] == len(select_feature_list)
            assert X_original.shape[2] == len(select_feature_list)
        # Keep only the first column along the last axis.
        X_original_with_id = X_original_with_id[:,:,:1]
        # NOTE(review): the Y file still uses a fixed '20240129_' prefix and
        # ignores data_date - confirm this is intended.
        Y = np.load(f"{data_path}/20240129_{data_type}_Y_{task_name}.npy", allow_pickle=True)

        if use_upsample:
            # NOTE(review): upsampling is applied to the 'test' split only -
            # confirm this should not be 'train'.
            if task_name == 'Weaning_successful' and data_type == 'test':
                X_scalar,X_original,Y = upsampling_auto(X_scalar,X_original,Y,2)
        dataset_dict[task_name] = MyDataset(X_scalar,X_original,Y)
        original_data_dict['X_scalar'] = X_scalar
        original_data_dict['X'] = X_original
        original_data_dict['X_with_id'] = X_original_with_id
        original_data_dict['Y'] = Y

    # dataloader - honours the batch_size argument (it used to be overwritten with 256)
    loader_dict = {}
    for key, dataset in dataset_dict.items():
        loader_dict[key] = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)

    return dataset_dict,loader_dict,feature_name_list,original_data_dict

In [9]:
from datetime import datetime

# Timestamp taken when this cell runs; it is not read again in the visible cells.
start_time = datetime.now()

In [10]:
from openpyxl import Workbook
from openpyxl.utils.dataframe import dataframe_to_rows
from datetime import datetime

def group_result(df):
    """Summarise repeated runs as 'mean ± std' strings, one row per task.

    Args:
        df: DataFrame with a 'task' column and the numeric columns 'acc',
            'pre', 'f1', 'recall', 'auroc' and 'brier_score'.

    Returns:
        DataFrame with columns ['task', 'acc', 'pre', 'f1', 'recall', 'auroc',
        'brier_score'], where every metric cell is formatted as
        '<mean> ± <std>' with four decimals.
    """
    metric_names = ['acc', 'pre', 'f1', 'recall', 'auroc', 'brier_score']

    # Mean and standard deviation of every metric, grouped by task.
    stats = df.groupby('task').agg({name: ['mean', 'std'] for name in metric_names})

    summary = pd.DataFrame(index=stats.index)
    for name in metric_names:
        means = stats[(name, 'mean')]
        stds = stats[(name, 'std')]
        summary[name] = [f"{mu:.4f} ± {sigma:.4f}" for mu, sigma in zip(means, stds)]

    summary = summary.reset_index()
    summary.columns = ['task'] + metric_names
    return summary


def save_to_xlsx(df_save,file_name = 'output'):
    """Write ``df_save`` (header row included, index excluded) to '<file_name>.xlsx'."""
    workbook = Workbook()
    sheet = workbook.active

    row_number = 0
    for record in dataframe_to_rows(df_save, index=False, header=True):
        row_number += 1  # worksheet rows are 1-based
        for col_number, cell_value in enumerate(record, start=1):
            sheet.cell(row=row_number, column=col_number, value=cell_value)

    workbook.save(f'{file_name}.xlsx')
    


# Start

In [11]:
# Task evaluated below; used as the dataset key and inside the checkpoint file names.
task_name = 'Weaning_successful'

# Feature

In [13]:
# Read the selected feature names from the 'Feature' column of the vent_group CSV.
# NOTE(review): this path is fixed to vent_group even when another group is
# chosen in the selection cell further down - confirm this is intended.
path = "./model/group_result/mtl_group/vent_group"
df_feature = pd.read_csv(f"{path}/feature_name_list.csv")
select_feature_list = df_feature['Feature'].tolist()
# The model input width equals the number of selected features.
input_dim = len(select_feature_list)
# Focal loss configured from the alpha / gamma values in the config cell.
loss_func = BCEFocalLoss(alpha=alpha, gamma=gamma)


# Load the train / validation / test splits for the single task (data_date is passed as "").
train_dataset_dict,train_loader_dict,feature_name_list,_ = read_data([task_name],"",'train',select_feature_list,batch_size = batch_size,use_upsample = use_upsample)
val_dataset_dict,val_loader_dict,_ ,_= read_data([task_name],"",'validation',select_feature_list,batch_size = batch_size,use_upsample = use_upsample)
test_dataset_dict,test_loader_dict,_ ,_= read_data([task_name],"",'test',select_feature_list,batch_size = batch_size,use_upsample = use_upsample)

# Prints the feature count (the label text is Chinese for "number of features").
print(f'特徵數: {input_dim}')

特徵數: 18


# MTL_model

In [14]:
max_time = 5  # number of experiment runs (set by hand to match how many saved runs exist)
mode = 'lite'  # suffix of the checkpoint file names loaded below

In [24]:
group_list = ['stl','mtl - vent_group','mtl - mortality_group']

# Show a 1-based menu of the available model groups.
for i in range(0,len(group_list)):
    print(f'[{i+1}]...{group_list[i]}')

# Convert the 1-based answer to a 0-based index.
select_mode = int(input())-1
# Reject out-of-range answers explicitly: a negative index would otherwise
# wrap around and could print one group while loading another one's path.
if not 0 <= select_mode < len(group_list):
    raise ValueError(f'Selection must be between 1 and {len(group_list)}, got {select_mode + 1}')
print(f'==> {group_list[select_mode]}')


# Directory that holds the saved checkpoints of the chosen group.
if select_mode == 0:
    data_path = "./model/group_result/stl_group"
elif select_mode == 1:
    data_path = "./model/group_result/mtl_group/vent_group"
else:
    data_path = "./model/group_result/mtl_group/mortality_group"
    

[1]...stl
[2]...mtl - vent_group
[3]...mtl - mortality_group
2
==> mtl - vent_group


In [25]:
""" Weaning_successful (Vent_group) """

# Evaluate every saved run on the test split and collect one metric row per run.
row_list = []
for time in range(1,max_time+1):
    model_vent = MLP_MTL(input_dim, task_name_list).to(device)
    # map_location lets checkpoints saved on another device load onto `device`.
    model_vent.load_state_dict(torch.load(f'{data_path}/{task_name}_{time}_{mode}', map_location=device))
    # test() returns (per-task scores, totals); only the per-task scores are used here.
    result,_ = test(model_vent, test_dataset_dict, loss_func, is_show = False)
    row_list.append(result[task_name])

df_result = pd.DataFrame(row_list)
df_result = df_result.sort_values(by = 'auroc', ascending=False)

# Summarise the five best runs by AUROC as 'mean ± std'.
df_result_group = group_result(df_result[:5])
print(df_result_group)



                 task              acc              pre               f1  \
0  Weaning_successful  0.7546 ± 0.0042  0.6612 ± 0.0103  0.5822 ± 0.0246   

            recall            auroc      brier_score  
0  0.5216 ± 0.0435  0.8202 ± 0.0019  0.1656 ± 0.0009  


In [26]:
""" best model """
# Rebuild the model and load the checkpoint saved under the '_best_' name.
model_vent = MLP_MTL(input_dim, task_name_list).to(device)
# map_location lets a checkpoint saved on another device load onto `device`.
model_vent.load_state_dict(torch.load(f'{data_path}/{task_name}_best_{mode}', map_location=device))
# test() returns (per-task scores, totals); `result` holds the per-task scores.
result,_ = test(model_vent, test_dataset_dict, loss_func, is_show = False)
print(result)

{'Weaning_successful': {'task': 'Weaning_successful', 'auroc': 0.821, 'acc': 0.758, 'f1': 0.598, 'pre': 0.66, 'recall': 0.546, 'brier_score': 0.165, 'loss': 6.205740544593265e-05}}
