In [2]:
import time
import numpy as np
import pandas as pd
from tqdm import tqdm

------

In [7]:
#Dataset

pd.set_option('display.max_columns',None)
mimic = pd.read_csv('/Users/DAHS/Desktop/MIMIC_IV_CIRC(12h)/MIMIC-IV-CIRC(12h).csv.gz', compression='gzip')

In [8]:
dataset = mimic.copy()

# Null 값 확인
columns_with_null = dataset.columns[dataset.isnull().any()]
columns_with_null

Index([], dtype='object')

In [9]:
# 새롭게 labeling
def early_event_prediction_label(df):
    
    data = df.copy()
    
    data['classes'] = 'undefined'

    class1 = data[(data['CIRC_next_12h']==0) & (data['Annotation']=='no_circ')].index
    data.loc[class1,'classes'] = 0
    
    class2 = data[(data['CIRC_next_12h']==1) & (data['Annotation']=='no_circ')].index
    data.loc[class2,'classes'] = 1
    
    return data


result_1 = early_event_prediction_label(dataset)

In [10]:
def optimized_recovered_labeler(df):
    # DataFrame 복사본 생성
    targ = df.copy()

    # 각 stay_id에 대해 반복
    for stay_id in tqdm(targ['stay_id'].unique()):
        stay_df = targ[targ['stay_id'] == stay_id].sort_values(by='Time_since_ICU_admission')
        for idx, row in stay_df.iterrows():
            if row['Annotation'] == 'circ':
                current_time = row['Time_since_ICU_admission']
                endpoint_window = current_time + 12

                # 24시간 창 내 데이터 필터링
                window = stay_df[(stay_df['Time_since_ICU_admission'] >= current_time) & (stay_df['Time_since_ICU_admission'] < endpoint_window)]
                if len(window) > 0:
                    # 상태별 발생 빈도 계산
                    counts = window['Annotation'].value_counts()
                    count_amb_no_circ = counts.get('ambiguous', 0) + counts.get('no_circ', 0)
                    count_amb_circ = counts.get('ambiguous', 0) + counts.get('circ', 0)
                    total_state = len(window)

                    recovery_ratio = count_amb_no_circ / total_state
                    no_recovery_ratio = count_amb_circ / total_state

                    # 조건에 따른 'classes' 설정
                    if recovery_ratio >= 0.7 and counts.get('no_circ', 0) > 0:
                        targ.loc[idx, 'classes'] = 2
                    elif no_recovery_ratio >= 0.7 and counts.get('circ', 0) > 0:
                        targ.loc[idx, 'classes'] = 3

    return targ

In [11]:
result_2 = optimized_recovered_labeler(result_1)

100%|██████████| 20767/20767 [35:39<00:00,  9.70it/s]  


In [12]:
result_2.to_csv('case_study.csv.gz', compression='gzip')

In [7]:
result_2.classes.value_counts()

0            1695407
undefined     960497
3             303183
1              31842
2               9611
Name: classes, dtype: int64

In [8]:
result_3 = result_2[~(result_2['Annotation']=='ambiguous') & ~(result_2['classes']=='undefined')]

In [9]:
result_3.classes.value_counts().sort_index()

0    1695407
1      31842
2       9611
3     303183
Name: classes, dtype: int64

In [10]:
print('전체 샘플의 개수')
print(len(result_3))

print()

print('샘플의 클래스 개수')
print(result_3.classes.value_counts().sort_index())

print()

print('총 Stay 수')
print(len(result_3.stay_id.unique()))

print()

print('총 환자 수')
print(len(result_3.subject_id.unique()))


전체 샘플의 개수
2040043

샘플의 클래스 개수
0    1695407
1      31842
2       9611
3     303183
Name: classes, dtype: int64

총 Stay 수
20549

총 환자 수
18455


In [11]:
result_3.to_csv('mimic_df.csv.gz', compression='gzip')