In [59]:
import time
import numpy as np
import pandas as pd
from tqdm import tqdm

------

In [60]:
#Dataset

pd.set_option('display.max_columns',None)
mimic = pd.read_csv('mimic-iv_circ12h(의료정보학회추계-1120).csv', compression='gzip')

In [61]:
dataset = mimic.copy()

# Null 값 확인
columns_with_null = dataset.columns[dataset.isnull().any()]
columns_with_null

Index([], dtype='object')

In [62]:
# 새롭게 labeling
def early_event_prediction_label(df):
    
    data = df.copy()
    
    data['classes'] = 'undefined'

    class1 = data[(data['circ_next_12h']==0) & (data['Annotation']=='no_circ')].index
    data.loc[class1,'classes'] = 0
    
    class2 = data[(data['circ_next_12h']==1) & (data['Annotation']=='no_circ')].index
    data.loc[class2,'classes'] = 1
    
    return data


result_1 = early_event_prediction_label(dataset)

In [63]:
# def patient_window(targ, mask):
    
#     window_index = targ[mask].index
#     targ_sample = targ[mask]
    
#     for j, idx in enumerate(window_index):
#         current_time = targ.loc[mask, 'Time_since_ICU_admission'].loc[idx]
#         endpoint_window = current_time + 24
        
#         if targ.loc[idx, 'Annotation'] == 'circ':
        
#             try:
#                 window = targ_sample[(targ_sample['Time_since_ICU_admission'] >= current_time) & (targ_sample['Time_since_ICU_admission'] < endpoint_window)]
#                 count_amb_no_circ = window['Annotation'].value_counts().get('ambiguous', 0) + window['Annotation'].value_counts().get('no_circ', 0)
#                 count_amb_circ = window['Annotation'].value_counts().get('ambiguous', 0) + window['Annotation'].value_counts().get('circ', 0)
#                 total_state = len(window)
                
#                 if total_state > 0:
#                     recovery_ratio = count_amb_no_circ / total_state
#                     no_recovery_ratio = count_amb_circ / total_state
                
#                     if (recovery_ratio >= 0.7) & (window['Annotation'].value_counts().get('no_circ', 0) > 0):
#                         targ.loc[idx, 'classes'] = 2
                        
#                     elif (no_recovery_ratio >= 0.7) & (window['Annotation'].value_counts().get('circ', 0) > 0):
#                         targ.loc[idx, 'classes'] = 3
                        
#                     else:
#                         targ.loc[idx, 'classes'] = 'undefined'
#                 else:
#                     targ.loc[idx, 'classes'] = 'undefined'

#             except: # stay 관측치가 1개인 경우
#                 targ.loc[idx, 'classes'] = 'undefined'   

# def circfailure_labeler(df): 
#     targ = df.copy()
    
#     unique_stay_ids = targ['stay_id'].unique()
#     for i in tqdm(range(len(unique_stay_ids))):
#         stay_id = unique_stay_ids[i]
#         mask = targ['stay_id'] == stay_id 
#         patient_window(targ, mask)

#     return targ     


def optimized_recovered_labeler(df):
    # DataFrame 복사본 생성
    targ = df.copy()

    # 각 stay_id에 대해 반복
    for stay_id in tqdm(targ['stay_id'].unique()):
        stay_df = targ[targ['stay_id'] == stay_id].sort_values(by='Time_since_ICU_admission')
        for idx, row in stay_df.iterrows():
            if row['Annotation'] == 'circ':
                current_time = row['Time_since_ICU_admission']
                endpoint_window = current_time + 12

                # 24시간 창 내 데이터 필터링
                window = stay_df[(stay_df['Time_since_ICU_admission'] >= current_time) & (stay_df['Time_since_ICU_admission'] < endpoint_window)]
                if len(window) > 0:
                    # 상태별 발생 빈도 계산
                    counts = window['Annotation'].value_counts()
                    count_amb_no_circ = counts.get('ambiguous', 0) + counts.get('no_circ', 0)
                    count_amb_circ = counts.get('ambiguous', 0) + counts.get('circ', 0)
                    total_state = len(window)

                    recovery_ratio = count_amb_no_circ / total_state
                    no_recovery_ratio = count_amb_circ / total_state

                    # 조건에 따른 'classes' 설정
                    if recovery_ratio >= 0.7 and counts.get('no_circ', 0) > 0:
                        targ.loc[idx, 'classes'] = 2
                    elif no_recovery_ratio >= 0.7 and counts.get('circ', 0) > 0:
                        targ.loc[idx, 'classes'] = 3

    return targ

In [64]:
result_2 = optimized_recovered_labeler(result_1)

100%|██████████| 13919/13919 [44:41<00:00,  5.19it/s]  


In [65]:
result_2.classes.value_counts()

0            1391814
undefined    1290424
3             362211
1              13180
2               6267
Name: classes, dtype: int64

In [70]:
result_3 = result_2[~(result_2['Annotation']=='ambiguous') & ~(result_2['classes']=='undefined')]

In [71]:
result_3.classes.value_counts().sort_index()

0    1391814
1      13180
2       6267
3     362211
Name: classes, dtype: int64

In [72]:
print('전체 샘플의 개수')
print(len(result_3))

print()

print('샘플의 클래스 개수')
print(result_3.classes.value_counts().sort_index())

print()

print('총 Stay 수')
print(len(result_3.stay_id.unique()))

print()

print('총 환자 수')
print(len(result_3.subject_id.unique()))


전체 샘플의 개수
1773472

샘플의 클래스 개수
0    1391814
1      13180
2       6267
3     362211
Name: classes, dtype: int64

총 Stay 수
11977

총 환자 수
10736


In [73]:
result_3.to_csv('mimic_df.csv', compression='gzip')

In [2]:
import pandas as pd
result_3 = pd.read_csv('mimic_df.csv', compression='gzip')