In [7]:
import pandas as pd 
import numpy as np
import os 
import random
import gzip
import gc
from datetime import datetime
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

def seed_everything(seed=21):
    """Seed the global RNGs of `random` and NumPy and export PYTHONHASHSEED.

    Parameters
    ----------
    seed : int, default 21
        Value handed to `random.seed` and `np.random.seed`; its string form
        is stored in the ``PYTHONHASHSEED`` environment variable.
    """
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)


# Seed once at notebook start-up using the default value.
seed_everything()

In [8]:
# Item dictionaries: later cells join on their `itemid` / `label` columns.
# The context managers close each gzip handle as soon as parsing finishes,
# instead of relying on `del` + garbage collection to release it.
with gzip.open('./MIMIC-IV/icu/d_items.csv.gz') as d_items_file:
    d_items = pd.read_csv(d_items_file)

with gzip.open('./MIMIC-IV/hosp/d_labitems.csv.gz') as d_labitems_file:
    d_labitems = pd.read_csv(d_labitems_file)

# Lab items with a missing label share one placeholder name.
# The filled column is assigned back explicitly: `d_labitems.label.fillna(...,
# inplace=True)` operates on a column selection, which under pandas
# Copy-on-Write leaves `d_labitems` itself unchanged (and the resulting
# warning is hidden by the global warnings filter).
d_labitems['label'] = d_labitems['label'].fillna('Unknown_code')

# ICU Patients

In [9]:
# Load the ICU stays table; the context manager closes the gzip handle
# deterministically once the frame has been parsed.
with gzip.open('./MIMIC-IV/icu/icustays.csv.gz') as icu_stay_file:
    icu_stay = pd.read_csv(icu_stay_file)

# Kept as the cell's final expression, as in the original cell.
gc.collect()

0

In [10]:
# Build the label table: one row per hospital admission with ICU intime,
# demographics, seconds from ICU admission to death and a 3-day mortality flag.
# Requires `icu_stay` from the previous cell; writes 'icu_stay.csv' and
# 'icu_stay_with_3days.csv' to the working directory.

# Keep a single ICU stay per hadm_id.
# NOTE(review): keep='first' follows file order, which is presumably not
# guaranteed to be the earliest `intime` — confirm this is the intended stay.
icu_stay = icu_stay.drop_duplicates(subset=['hadm_id'], keep='first')
icu_stay = icu_stay[['subject_id', 'hadm_id', 'intime']]
# The file is still written for anything that reads it later, but the frame is
# no longer re-read from disk just to perform the column selection above.
icu_stay.to_csv('icu_stay.csv')

# Only the columns that are actually used are parsed from the reference tables.
admission = pd.read_csv('./MIMIC-IV/core/admissions.csv', usecols=['hadm_id', 'deathtime'])
patients = pd.read_csv('./MIMIC-IV/core/patients.csv', usecols=['subject_id', 'gender', 'anchor_age'])

icu_stay = pd.merge(icu_stay, patients, on='subject_id', how='left')
icu_stay = pd.merge(icu_stay, admission, on='hadm_id', how='left')

icu_stay['intime'] = pd.to_datetime(icu_stay['intime'])
icu_stay['deathtime'] = pd.to_datetime(icu_stay['deathtime'])

# Seconds from ICU intime to deathtime; NaN when no deathtime is recorded.
icu_stay['mortality_in_second'] = (icu_stay['deathtime'] - icu_stay['intime']).dt.total_seconds()

# Goal: predict death within 3 days from data of the first 6 hours after admission.
# Keep rows whose time-to-death is positive or missing (patient did not die).
died_after_intime_or_survived = (icu_stay['mortality_in_second'] > 0) | icu_stay['mortality_in_second'].isnull()
# .copy() makes the filtered frame independent, so the column assignment below
# is not a write onto a slice (SettingWithCopy, silenced by the warnings filter).
icu_stay = icu_stay[died_after_intime_or_survived].copy()

# NaN < threshold evaluates to False, so rows without a deathtime are labelled False.
icu_stay['mortality_in_3days'] = icu_stay['mortality_in_second'] < 86400 * 3
icu_stay = icu_stay[['hadm_id', 'intime', 'gender', 'anchor_age', 'mortality_in_second', 'mortality_in_3days']]

icu_stay.to_csv('icu_stay_with_3days.csv')
del icu_stay, admission, patients
gc.collect()

0

# Outputevents

In [11]:
# Per-admission mean of each output event label within the first 6 hours after
# ICU intime. Needs `d_items` from the dictionary cell; `data` defined here is
# reused by the following event cells.
data = pd.read_csv('./icu_stay_with_3days.csv', index_col=0)

# The context manager closes the gzip handle once the frame is parsed.
with gzip.open('./MIMIC-IV/icu/outputevents.csv.gz') as outputevents_file:
    outputevents = pd.read_csv(outputevents_file)

# Attach the ICU intime of each admission; events of admissions that are not
# in `data` get a missing intime and fall out of the window filter below.
outputevents = pd.merge(outputevents, data[['hadm_id', 'intime']], on='hadm_id', how='left')

outputevents['intime'] = pd.to_datetime(outputevents['intime'])
outputevents['storetime'] = pd.to_datetime(outputevents['storetime'])

# Seconds between ICU intime and the event's storetime.
outputevents['time_to_store'] = (outputevents['storetime'] - outputevents['intime']).dt.total_seconds()

# Events within the first 6 hours (86400 / 4 seconds), strictly after intime.
# The column name says "day" but the window is 6 hours.
outputevents['time_to_store_in_day'] = (outputevents['time_to_store'] < 86400 / 4) & (outputevents['time_to_store'] > 0)

outputevents_in_6hour = outputevents[outputevents.time_to_store_in_day]
outputevents_in_6hour = pd.merge(outputevents_in_6hour, d_items[['itemid', 'label']], on=['itemid'], how='left')

# Use the mean `value` per (admission, label) inside the 6-hour window.
tmp = outputevents_in_6hour.groupby(['hadm_id', 'label'])['value'].mean().reset_index()

# One row per admission, one column per label.
outputevents_in_6hour_pivot = tmp.pivot(index='hadm_id', columns='label', values='value').reset_index()
outputevents_in_6hour_pivot.to_csv('outputevents_in_row_mean.csv')


# Inputevents

In [12]:
# Per-admission mean `amount` of each input event label within the first
# 6 hours after ICU intime, plus one patientweight per admission.
# Needs `data` (admission intimes) and `d_items` from earlier cells.

# The context manager closes the gzip handle once the frame is parsed.
with gzip.open('./MIMIC-IV/icu/inputevents.csv.gz') as inputevents_file:
    inputevents = pd.read_csv(inputevents_file)

# Attach the ICU intime of each admission; events of admissions that are not
# in `data` get a missing intime and fall out of the window filter below.
inputevents = pd.merge(inputevents, data[['hadm_id', 'intime']], on='hadm_id', how='left')

inputevents['intime'] = pd.to_datetime(inputevents['intime'])
inputevents['storetime'] = pd.to_datetime(inputevents['storetime'])

# Seconds between ICU intime and the event's storetime.
inputevents['time_to_store'] = (inputevents['storetime'] - inputevents['intime']).dt.total_seconds()

# Events within the first 6 hours (86400 / 4 seconds), strictly after intime.
inputevents['time_to_store_in_day'] = (inputevents['time_to_store'] < 86400 / 4) & (inputevents['time_to_store'] > 0)
inputevents_in_6hour = inputevents[inputevents.time_to_store_in_day]
inputevents_in_6hour = pd.merge(inputevents_in_6hour, d_items[['itemid', 'label']], on=['itemid'], how='left')

# Use the mean `amount` per (admission, label) inside the 6-hour window.
tmp = inputevents_in_6hour.groupby(['hadm_id', 'label'])['amount'].mean().reset_index()

# One row per admission, one column per label.
inputevents_in_6hour_pivot = tmp.pivot(index='hadm_id', columns='label', values='amount').reset_index()

# Patient weight: take the value from the last in-window row of each admission
# (keep='last'; the original note records a switch from first to last).
last_weight = inputevents_in_6hour[['hadm_id', 'patientweight']].drop_duplicates(subset='hadm_id', keep='last')
inputevents_in_6hour_pivot = pd.merge(inputevents_in_6hour_pivot, last_weight, on='hadm_id', how='left')

inputevents_in_6hour_pivot.to_csv('inputevents_in_row_mean.csv')
del inputevents, inputevents_in_6hour, tmp
gc.collect()

0

# Labevents

In [13]:
# Per-admission mean `valuenum` of each lab label within the first 6 hours
# after ICU intime, streamed from the lab events file in chunks.
# Needs `data` (admission intimes) and `d_labitems` from earlier cells.

# Only these columns are used below, so only they are parsed.
lab_cols = ['hadm_id', 'storetime', 'itemid', 'valuenum']

# Parse the admission intimes once instead of once per chunk.
icu_intime = data[['hadm_id', 'intime']].copy()
icu_intime['intime'] = pd.to_datetime(icu_intime['intime'])

# Filtered chunks are collected in a list and concatenated once; growing a
# DataFrame with pd.concat inside the loop re-copies all previous rows each time.
chunks_in_6hour = []

with gzip.open('./MIMIC-IV/hosp/labevents.csv.gz') as labevents_file:
    for df in pd.read_csv(labevents_file, chunksize=1_000_000, usecols=lab_cols):
        # Rows whose hadm_id is missing or not in `data` would get a missing
        # intime and be dropped by the window filter anyway; drop them first
        # so the timestamp parsing only touches relevant rows.
        df = df[df['hadm_id'].isin(icu_intime['hadm_id'])]
        df = pd.merge(df, icu_intime, on='hadm_id', how='left')
        df['storetime'] = pd.to_datetime(df['storetime'])

        # Seconds between ICU intime and the event's storetime.
        df['time_to_store'] = (df['storetime'] - df['intime']).dt.total_seconds()

        # Events within the first 6 hours (86400 / 4 seconds), strictly after intime.
        df['time_to_store_in_day'] = (df['time_to_store'] < 86400 / 4) & (df['time_to_store'] > 0)
        df = df[df.time_to_store_in_day]
        df = pd.merge(df, d_labitems[['itemid', 'label']], on=['itemid'], how='left')
        chunks_in_6hour.append(df)

result = pd.concat(chunks_in_6hour)
del chunks_in_6hour
gc.collect()

# Every remaining row matched an admission, so hadm_id has no missing values
# and can be cast to an integer type.
result.hadm_id = result.hadm_id.astype('int64')
result = result.groupby(['hadm_id', 'label'])['valuenum'].mean().reset_index()
# One row per admission, one column per label.
result = result.pivot(index='hadm_id', columns='label', values='valuenum').reset_index()

result.to_csv('labevents_in_row_mean.csv')
del result
gc.collect()

0

# Chartevents

In [14]:
# Per-admission mean `valuenum` of each chart label within the first 6 hours
# after ICU intime, streamed from the chart events file in chunks.
# Needs `data` (admission intimes) and `d_items` from earlier cells.

cols = ['hadm_id', 'storetime', 'itemid', 'valuenum']

# Parse the admission intimes once instead of once per chunk.
icu_intime = data[['hadm_id', 'intime']].copy()
icu_intime['intime'] = pd.to_datetime(icu_intime['intime'])

# Filtered chunks are collected in a list and concatenated once; growing a
# DataFrame with pd.concat inside the loop re-copies all previous rows each time.
chunks_in_6hour = []

with gzip.open('./MIMIC-IV/icu/chartevents.csv.gz') as chartevent_file:
    for df in pd.read_csv(chartevent_file, chunksize=1_000_000, usecols=cols):
        # Rows of admissions that are not in `data` would get a missing intime
        # and be dropped by the window filter anyway; drop them first so the
        # timestamp parsing only touches relevant rows.
        df = df[df['hadm_id'].isin(icu_intime['hadm_id'])]
        df = pd.merge(df, icu_intime, on='hadm_id', how='left')
        df['storetime'] = pd.to_datetime(df['storetime'])

        # Seconds between ICU intime and the event's storetime.
        df['time_to_store'] = (df['storetime'] - df['intime']).dt.total_seconds()

        # Events within the first 6 hours (86400 / 4 seconds), strictly after intime.
        df['time_to_store_in_day'] = (df['time_to_store'] < 86400 / 4) & (df['time_to_store'] > 0)
        df = df[df.time_to_store_in_day]
        df = pd.merge(df, d_items[['itemid', 'label']], on=['itemid'], how='left')
        chunks_in_6hour.append(df)

result = pd.concat(chunks_in_6hour)
del chunks_in_6hour
gc.collect()

result = result.groupby(['hadm_id', 'label'])['valuenum'].mean().reset_index()
# One row per admission, one column per label.
result = result.pivot(index='hadm_id', columns='label', values='valuenum').reset_index()
result.to_csv('chartevents_in_row_mean.csv')

del result
gc.collect()

0