In [1]:
import pandas as pd

# Input CSVs (all three live under the mimic-iii-10k directory).
note_path = "/data1/project/juhyeon/MAGNET/OSS/mimic-iii-10k/NOTEEVENTS/NOTEEVENTS_sorted.csv"
adm_path  = "/data1/project/juhyeon/MAGNET/OSS/mimic-iii-10k/ADMISSIONS/ADMISSIONS_sorted.csv"
icu_path  = "/data1/project/juhyeon/MAGNET/OSS/mimic-iii-10k/ICUSTAYS/ICUSTAYS_sorted.csv"

# Read each table, parsing only the timestamp columns listed in parse_dates.
# STORETIME is left out of the notes table (omit it when the column is absent).
notes_raw = pd.read_csv(note_path, low_memory=False, parse_dates=["CHARTDATE", "CHARTTIME"])

# DEATHTIME, EDREGTIME, ... are left out here (omit them when the columns are absent).
adm_raw = pd.read_csv(adm_path, low_memory=False, parse_dates=["ADMITTIME", "DISCHTIME"])

icu_raw = pd.read_csv(icu_path, low_memory=False, parse_dates=["INTIME", "OUTTIME"])


# Columns carried forward from each raw table:
#    adm   -> SUBJECT_ID, HADM_ID, ADMITTIME, HOSPITAL_EXPIRE_FLAG (+ DEATHTIME when present)
#    notes -> SUBJECT_ID, HADM_ID, CHARTTIME, CATEGORY, (ISERROR when present), TEXT
#    icu   -> SUBJECT_ID, HADM_ID, ICUSTAY_ID, INTIME, OUTTIME

# ADMISSIONS: DEATHTIME is optional and is only selected when the file has it.
adm_cols = ["SUBJECT_ID", "HADM_ID", "ADMITTIME", "HOSPITAL_EXPIRE_FLAG"]
adm_cols += ["DEATHTIME"] if "DEATHTIME" in adm_raw.columns else []
admissions = adm_raw.loc[:, adm_cols].copy()

# NOTEEVENTS: ISERROR is optional; when present it is placed just before TEXT
# so that TEXT stays the last column.
note_cols = (
    ["SUBJECT_ID", "HADM_ID", "CHARTTIME", "CATEGORY"]
    + (["ISERROR"] if "ISERROR" in notes_raw.columns else [])
    + ["TEXT"]
)
notes = notes_raw.loc[:, note_cols].copy()

# ICUSTAYS: fixed column set.
icu_cols = ["SUBJECT_ID", "HADM_ID", "ICUSTAY_ID", "INTIME", "OUTTIME"]
icustays = icu_raw.loc[:, icu_cols].copy()

# Note preprocessing
#    - drop notes flagged ISERROR == 1 (a missing flag is treated as 0, i.e. kept)
#    - TEXT: NaN -> "", collapse every whitespace run to one space, strip both ends
if "ISERROR" in notes.columns:
    # .copy() makes the filtered frame independent of the frame it was sliced
    # from, so the TEXT assignment below does not raise SettingWithCopyWarning.
    notes = notes[notes["ISERROR"].fillna(0) != 1].copy()

notes["TEXT"] = (
    notes["TEXT"]
    .fillna("")
    .str.replace(r"\s+", " ", regex=True)
    .str.strip())


# Rows without HADM_ID cannot be joined to the ICU / admission tables, so they
# are dropped from every table and the key is cast to int64.
def _keyed_by_hadm(table):
    """Return a copy of `table` without null-HADM_ID rows, with HADM_ID cast to int64.

    The cast uses errors="ignore", so a failing conversion leaves the column
    unchanged instead of raising.
    """
    keyed = table.dropna(subset=["HADM_ID"]).copy()
    keyed["HADM_ID"] = keyed["HADM_ID"].astype("int64", errors="ignore")
    return keyed


notes = _keyed_by_hadm(notes)
admissions = _keyed_by_hadm(admissions)
icustays = _keyed_by_hadm(icustays)


# ICU admission time clean-up (src style):
#    for every (SUBJECT_ID, HADM_ID) pair keep only the ICU stay that comes
#    first after sorting by INTIME, i.e. the earliest one.
stay_key = ["SUBJECT_ID", "HADM_ID"]
icustays = icustays.sort_values("INTIME")
icustays = icustays.drop_duplicates(subset=stay_key, keep="first").copy()

# Join: notes + admissions + ICU stays (inner joins on the admission key).
df = pd.merge(notes, admissions, on=stay_key, how="inner")
df = pd.merge(df, icustays, on=stay_key, how="inner")

# Keep only notes charted within 48 hours of ICU admission:
#    INTIME <= CHARTTIME <= INTIME + 48h
df = df.dropna(subset=["INTIME", "CHARTTIME"]).copy()

window_end = df["INTIME"] + pd.Timedelta(hours=48)
charted_after_admit = df["CHARTTIME"] >= df["INTIME"]
charted_before_end = df["CHARTTIME"] <= window_end
time_mask = charted_after_admit & charted_before_end
df = df[time_mask].copy()


# label: HOSPITAL_EXPIRE_FLAG as int (a missing flag becomes 0).
# Hours: time elapsed since ICU admission, (CHARTTIME - INTIME) in hours.
df = df.assign(
    label=df["HOSPITAL_EXPIRE_FLAG"].fillna(0).astype(int),
    Hours=(df["CHARTTIME"] - df["INTIME"]).dt.total_seconds() / 3600.0,
)


# Columns exposed to the later cells, in output order.
final_columns = [
    "SUBJECT_ID", "HADM_ID", "ICUSTAY_ID",
    "CATEGORY", "TEXT",
    "CHARTTIME", "INTIME", "Hours",
    "label", "HOSPITAL_EXPIRE_FLAG",
]
final_dataset = df.loc[:, final_columns].copy()

# Quick summary: first rows, row count, and label shares in percent.
print("=== 최종 데이터셋 샘플 ===")
print(final_dataset.head(3))
print(f"\n최종 행 수(=ICU 입실 후 48h 노트 수): {len(final_dataset):,}")

label_share = final_dataset["label"].value_counts(normalize=True).mul(100).round(2)
print("\n라벨 분포:")
print(label_share)


=== 최종 데이터셋 샘플 ===
   SUBJECT_ID  HADM_ID  ICUSTAY_ID       CATEGORY  \
0           2   163353      243653  Nursing/other   
1           2   163353      243653  Nursing/other   
5           3   145834      211552  Nursing/other   

                                                TEXT           CHARTTIME  \
0  Neonatology Attending Triage Note Baby [**Name... 2138-07-17 22:51:00   
1  Nursing Transfer note Pt admitted to NICU for ... 2138-07-17 23:08:00   
5  Micu Progress Nursing Note: Patient arrived in... 2101-10-21 06:58:00   

               INTIME      Hours  label  HOSPITAL_EXPIRE_FLAG  
0 2138-07-17 21:20:07   1.514722      0                     0  
1 2138-07-17 21:20:07   1.798056      0                     0  
5 2101-10-20 19:10:11  11.796944      0                     0  

최종 행 수(=ICU 입실 후 48h 노트 수): 93,781

라벨 분포:
label
0    88.32
1    11.68
Name: proportion, dtype: float64


In [2]:
# Print a summary of final_dataset: index range, per-column non-null counts,
# dtypes and memory usage.
final_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 93781 entries, 0 to 430964
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   SUBJECT_ID            93781 non-null  int64         
 1   HADM_ID               93781 non-null  int64         
 2   ICUSTAY_ID            93781 non-null  int64         
 3   CATEGORY              93781 non-null  object        
 4   TEXT                  93781 non-null  object        
 5   CHARTTIME             93781 non-null  datetime64[ns]
 6   INTIME                93781 non-null  datetime64[ns]
 7   Hours                 93781 non-null  float64       
 8   label                 93781 non-null  int64         
 9   HOSPITAL_EXPIRE_FLAG  93781 non-null  int64         
dtypes: datetime64[ns](2), float64(1), int64(5), object(2)
memory usage: 7.9+ MB


In [None]:
# NOTE: this cleanup cell is intentionally disabled (fully commented out).
# If re-enabled, it removes every *.csv file directly under folder_path. That
# glob also matches the listfile.csv that make_listfile() writes into the same
# test_note directory, so the listfile must be regenerated afterwards.
# import os
# import glob

# # Target folder
# folder_path = "/data1/project/juhyeon/MAGNET/OSS/TM-HGNN/data/DATA_RAW/in-hospital-mortality/test_note"

# # Paths of all .csv files inside the folder
# csv_files = glob.glob(os.path.join(folder_path, "*.csv"))

# # Delete the files
# for file_path in csv_files:
#     try:
#         os.remove(file_path)
#         print(f"Deleted: {file_path}")
#     except Exception as e:
#         print(f"Error deleting {file_path}: {e}")

# print("✅ 모든 .csv 파일 삭제 완료")


Deleted: /data1/project/juhyeon/MAGNET/OSS/TM-HGNN/data/DATA_RAW/in-hospital-mortality/test_note/6454_episode971.csv
Deleted: /data1/project/juhyeon/MAGNET/OSS/TM-HGNN/data/DATA_RAW/in-hospital-mortality/test_note/9220_episode2121.csv
Deleted: /data1/project/juhyeon/MAGNET/OSS/TM-HGNN/data/DATA_RAW/in-hospital-mortality/test_note/7092_episode1770.csv
Deleted: /data1/project/juhyeon/MAGNET/OSS/TM-HGNN/data/DATA_RAW/in-hospital-mortality/test_note/2074_episode1486.csv
Deleted: /data1/project/juhyeon/MAGNET/OSS/TM-HGNN/data/DATA_RAW/in-hospital-mortality/test_note/4329_episode2224.csv
Deleted: /data1/project/juhyeon/MAGNET/OSS/TM-HGNN/data/DATA_RAW/in-hospital-mortality/test_note/4419_episode1282.csv
Deleted: /data1/project/juhyeon/MAGNET/OSS/TM-HGNN/data/DATA_RAW/in-hospital-mortality/test_note/1863_episode2101.csv
Deleted: /data1/project/juhyeon/MAGNET/OSS/TM-HGNN/data/DATA_RAW/in-hospital-mortality/test_note/668_episode1629.csv
Deleted: /data1/project/juhyeon/MAGNET/OSS/TM-HGNN/data/DA

In [3]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# ⚙️ 1️⃣ Settings
BASE_PATH = "/data1/project/juhyeon/MAGNET/OSS/TM-HGNN/data"
TASK_NAME = "in-hospital-mortality"
SAVE_DIR = f"{BASE_PATH}/DATA_RAW/{TASK_NAME}"

# Create the output folder structure (one note folder per split).
os.makedirs(f"{SAVE_DIR}/train_note", exist_ok=True)
os.makedirs(f"{SAVE_DIR}/test_note", exist_ok=True)

# ⚙️ 2️⃣ Load final_dataset (skip when it is already in memory)
# final_dataset = pd.read_csv("/path/to/final_dataset.csv")

print(f"전체 데이터 수: {len(final_dataset):,}")
print(final_dataset.head(2))

# ⚙️ 3️⃣ Patient-level split: each SUBJECT_ID goes to exactly one side
patients = final_dataset["SUBJECT_ID"].unique()
train_pats, test_pats = train_test_split(patients, test_size=0.2, random_state=42)

# ⚙️ 4️⃣ Separate the notes into train / test by patient membership
in_train = final_dataset["SUBJECT_ID"].isin(train_pats)
in_test = final_dataset["SUBJECT_ID"].isin(test_pats)
train_df = final_dataset.loc[in_train].copy()
test_df = final_dataset.loc[in_test].copy()

print(f"train 환자 수: {len(train_pats)}, test 환자 수: {len(test_pats)}")
print(f"train 노트 수: {len(train_df):,}, test 노트 수: {len(test_df):,}")

# ⚙️ 5️⃣ Build episode ids from HADM_ID
def make_episode_id(df):
    """Return a copy of `df` with an added string column `episode`.

    Every distinct HADM_ID receives the group number assigned by
    ``groupby("HADM_ID").ngroup()``, stored as a string. The input frame is
    left unmodified.
    """
    episode_numbers = df.groupby("HADM_ID").ngroup()
    return df.assign(episode=episode_numbers.astype(str))

# Episode ids are numbered independently within each split.
train_df, test_df = make_episode_id(train_df), make_episode_id(test_df)

# ⚙️ 6️⃣ Write listfile.csv
def make_listfile(df, split):
    """Write ``{SAVE_DIR}/{split}_note/listfile.csv`` and return it as a DataFrame.

    One row is produced per (SUBJECT_ID, HADM_ID, episode) group, with columns
    ``patient`` (the SUBJECT_ID), ``episode`` ("episode<id>") and ``y_true``
    (the first ``label`` value of the group, as int).
    """
    grouped = df.groupby(["SUBJECT_ID", "HADM_ID", "episode"])
    rows = [
        {
            "patient": subj,
            "episode": f"episode{epi}",
            "y_true": int(group_notes["label"].iloc[0]),
        }
        for (subj, _hadm, epi), group_notes in grouped
    ]
    listfile = pd.DataFrame(rows)
    out_path = f"{SAVE_DIR}/{split}_note/listfile.csv"
    listfile.to_csv(out_path, index=False)
    print(f"{split}_note/listfile.csv 저장 완료 ({len(listfile)}행)")
    return listfile

# Write the train listfile first, then the test listfile.
train_list, test_list = make_listfile(train_df, "train"), make_listfile(test_df, "test")

# ⚙️ 7️⃣ Write one file per episode
def save_episode_csvs(df, split):
    """Write one tab-separated file per (SUBJECT_ID, HADM_ID, episode) group.

    Files are written to ``{SAVE_DIR}/{split}_note/{SUBJECT_ID}_episode{episode}.csv``
    (tab-separated despite the ``.csv`` suffix) with the columns
    Hours, HADM_ID, SUBJECT_ID, CATEGORY, TEXT, label, note_id, SENT, WORD.

    Parameters
    ----------
    df : DataFrame containing at least the columns SUBJECT_ID, HADM_ID, episode,
        Hours, CATEGORY, TEXT and label.
    split : name of the split ("train" / "test"); selects the output folder.
    """
    base_path = f"{SAVE_DIR}/{split}_note"
    # Make the function usable on its own, even if the folder was never created.
    os.makedirs(base_path, exist_ok=True)

    for (subj, hadm, epi), g in tqdm(df.groupby(["SUBJECT_ID", "HADM_ID", "episode"]),
                                     desc=f"Saving {split}_note files"):
        fname = f"{subj}_episode{epi}.csv"
        g_out = g[[
            "Hours", "HADM_ID", "SUBJECT_ID", "CATEGORY",
            "TEXT", "label"
        ]].copy()

        # Placeholder columns expected downstream (note_id, SENT, WORD); they can
        # be expanded to real sentence / word units in a later step.
        g_out["note_id"] = range(len(g_out))      # 0..n-1 in the group's row order
        g_out["SENT"] = g_out["TEXT"].str[:50]    # first 50 characters of the note
        # First 10 whitespace-separated tokens, re-joined with single spaces.
        # .str.join propagates a missing TEXT as NaN, whereas the previous
        # apply(" ".join) raised TypeError on it (empty strings become NaN when
        # final_dataset is reloaded from CSV).
        g_out["WORD"] = g_out["TEXT"].str.split().str[:10].str.join(" ")

        g_out.to_csv(f"{base_path}/{fname}", sep='\t', index=False)
    print(f"{split}_note 파일 저장 완료 ✅")

# Write the per-episode files for train first, then test.
for split_frame, split_name in ((train_df, "train"), (test_df, "test")):
    save_episode_csvs(split_frame, split_name)

print("\n✅ TM-HGNN 인풋 폴더 구조 생성 완료!")
print(f"결과 폴더: {SAVE_DIR}")


전체 데이터 수: 93,781
   SUBJECT_ID  HADM_ID  ICUSTAY_ID       CATEGORY  \
0           2   163353      243653  Nursing/other   
1           2   163353      243653  Nursing/other   

                                                TEXT           CHARTTIME  \
0  Neonatology Attending Triage Note Baby [**Name... 2138-07-17 22:51:00   
1  Nursing Transfer note Pt admitted to NICU for ... 2138-07-17 23:08:00   

               INTIME     Hours  label  HOSPITAL_EXPIRE_FLAG  
0 2138-07-17 21:20:07  1.514722      0                     0  
1 2138-07-17 21:20:07  1.798056      0                     0  
train 환자 수: 7930, test 환자 수: 1983
train 노트 수: 74,559, test 노트 수: 19,222
train_note/listfile.csv 저장 완료 (9858행)
test_note/listfile.csv 저장 완료 (2467행)


Saving train_note files: 100%|██████████| 9858/9858 [00:29<00:00, 330.10it/s]


train_note 파일 저장 완료 ✅


Saving test_note files: 100%|██████████| 2467/2467 [00:07<00:00, 346.70it/s]

test_note 파일 저장 완료 ✅

✅ TM-HGNN 인풋 폴더 구조 생성 완료!
결과 폴더: /data1/project/juhyeon/MAGNET/OSS/TM-HGNN/data/DATA_RAW/in-hospital-mortality





In [10]:
import pandas as pd
import csv


# Spot-check one written episode file (tab-separated despite the .csv suffix).
sample_path = "/data1/project/juhyeon/MAGNET/OSS/TM-HGNN/data/DATA_RAW/in-hospital-mortality/test_note/2_episode1552.csv"
df = pd.read_csv(sample_path, sep='\t', engine='python')

print(df.columns)
print(df['TEXT'])
df.head()

Index(['Hours', 'HADM_ID', 'SUBJECT_ID', 'CATEGORY', 'TEXT', 'label',
       'note_id', 'SENT', 'WORD'],
      dtype='object')
0    Neonatology Attending Triage Note Baby [**Name...
1    Nursing Transfer note Pt admitted to NICU for ...
Name: TEXT, dtype: object


Unnamed: 0,Hours,HADM_ID,SUBJECT_ID,CATEGORY,TEXT,label,note_id,SENT,WORD
0,1.514722,163353,2,Nursing/other,Neonatology Attending Triage Note Baby [**Name...,0,0,Neonatology Attending Triage Note Baby [**Name...,Neonatology Attending Triage Note Baby [**Name...
1,1.798056,163353,2,Nursing/other,Nursing Transfer note Pt admitted to NICU for ...,0,1,Nursing Transfer note Pt admitted to NICU for ...,Nursing Transfer note Pt admitted to NICU for ...


In [None]:
import pandas as pd
import csv


# Same spot-check as the previous cell, with the file encoding stated explicitly.
sample_path = "/data1/project/juhyeon/MAGNET/OSS/TM-HGNN/data/DATA_RAW/in-hospital-mortality/test_note/2_episode1552.csv"
df = pd.read_csv(sample_path, sep='\t', engine='python', encoding='utf-8')

print(df.columns)
print(df['TEXT'])
df.head()