In [None]:
# In[ ]:
# ** import package **
import os
import sys
import json
import pathlib
sys.path.append("..")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import traceback
from tqdm import tqdm
from datetime import timedelta
from _utils.customlogger import customlogger as CL

# Show full cell contents. `None` is the supported "no limit" value; the old
# -1 sentinel was deprecated in pandas 1.0 and is rejected by current releases.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 50)      # display at most 50 rows

In [None]:
# In[ ]:
# ** resolve the working paths used by the rest of the notebook **
current_dir = pathlib.Path.cwd()
parent_dir = current_dir.parent
# NOTE(review): abspath('') resolves to the working directory, so this is the
# directory's base name rather than the notebook's file name - confirm intent.
curr_file_name = os.path.splitext(os.path.basename(os.path.abspath('')))[0]
# Data lives in a `data` folder next to the working directory; create it if absent.
data_dir = pathlib.Path(f'{parent_dir}/data/')
data_dir.mkdir(mode=0o777, parents=True, exist_ok=True)

In [None]:
# ** load the MICU and SICU frames from feather files in data_dir **
MICU_df = pd.read_feather(f'{data_dir}/MICU_df.feather')
SICU_df = pd.read_feather(f'{data_dir}/SICU_df.feather')

In [None]:
# ** compute the Glasgow Coma Scale total **
def calculate_gcs(eyes, motor, verbal, meds):
    """Return the summed GCS score, or NaN when ``meds`` equals 1.

    Args:
        eyes, motor, verbal: the three component scores to add together.
        meds: flag value; when it equals 1 the score is reported as missing.
            NOTE(review): presumably marks patients who could not be scored
            because of medication - confirm against the source data dictionary.

    Returns:
        ``eyes + motor + verbal``, or ``np.nan`` if ``meds == 1``. The sum is
        only evaluated when the flag is not 1.
    """
    return np.nan if meds == 1 else eyes + motor + verbal

# Derive the `gcs` column for each cohort (MICU first, then SICU) and then
# drop the raw component columns it was computed from.
gcs_input_cols = ['eyes', 'motor', 'verbal', 'meds']
for icu_frame in (MICU_df, SICU_df):
    icu_frame.loc[:, 'gcs'] = icu_frame.apply(
        lambda row: calculate_gcs(row['eyes'], row['motor'], row['verbal'], row['meds']),
        axis=1,
    )
    icu_frame.drop(gcs_input_cols, axis=1, inplace=True)

In [None]:
# Column catalogue; later cells read its `concept_table` / `concept_name` columns.
all_cols_df = pd.read_feather(f'{data_dir}/all_cols_df.feather')

In [None]:
def _features_in_table(df, cols_df, table):
    """Return the columns of ``df`` that ``cols_df`` lists under ``concept_table == table``."""
    table_names = cols_df.loc[cols_df['concept_table'] == table, 'concept_name']
    return list(set(df.columns) & set(table_names))


def _fill_with_mask_mean(df, mask, cols):
    """Fill NaNs in ``cols`` for the rows selected by ``mask`` with those rows' column means (in place)."""
    if not cols:
        # Nothing to impute; avoid indexing with an empty column list.
        return
    subset = df.loc[mask, cols]
    df.loc[mask, cols] = subset.fillna(subset.mean())


def fill_missing_values(df, cols_df):
    """Impute missing values in an ICU frame and return the imputed copy.

    Steps, in order:
      1. Drop rows whose ``hospitaldischargestatus`` is missing.
      2. Fill the categorical patient/APACHE columns with a placeholder
         (``'Other/Unknown'`` for ethnicity, ``'None'`` for the others).
      3. Fill height/weight/GCS and every lab feature with the column mean
         computed separately for ``'Expired'`` and ``'Alive'`` rows.
      4. Fill medication and admission-drug features with 0.

    Rows whose status is neither ``'Expired'`` nor ``'Alive'`` are kept but do
    not receive the mean imputation of step 3.

    NOTE(review): step 3 imputes with outcome-specific means, i.e. it uses the
    discharge status; confirm this is acceptable if that status is later used
    as a prediction target.

    Args:
        df: input frame; must contain the columns referenced above. It is not
            modified - the function works on a copy of the filtered rows.
        cols_df: catalogue with ``concept_table`` and ``concept_name`` columns
            used to look up which columns of ``df`` belong to each table.

    Returns:
        A new DataFrame with the rows and fills described above.

    Raises:
        KeyError: if one of the required columns is absent from ``df``.
    """
    pat_feature_list = _features_in_table(df, cols_df, 'pat')
    apc_feature_list = _features_in_table(df, cols_df, 'apc')
    med_feature_list = _features_in_table(df, cols_df, 'med')
    lab_feature_list = _features_in_table(df, cols_df, 'lab')
    adm_feature_list = _features_in_table(df, cols_df, 'adm')

    print(len(pat_feature_list), len(apc_feature_list), len(med_feature_list), len(lab_feature_list), len(adm_feature_list))
    print('total : ', len(pat_feature_list)+len(apc_feature_list)+len(med_feature_list)+len(lab_feature_list)+len(adm_feature_list))

    ## patient + apacheapsvar
    # Explicit copy: the assignments below would otherwise write into a slice
    # of the caller's frame (SettingWithCopyWarning / ambiguous semantics).
    df = df[~df['hospitaldischargestatus'].isna()].copy()
    expired_loc = df['hospitaldischargestatus'] == 'Expired'
    alive_loc = df['hospitaldischargestatus'] == 'Alive'

    categorical_fills = [
        ('ethnicity', 'Other/Unknown'),
        ('apacheadmissiondx', 'None'),
        ('unitdischargestatus', 'None'),
        ('hospitaladmitsource', 'None'),
        ('hospitaldischargelocation', 'None'),
        ('unitadmitsource', 'None'),
        ('unitdischargelocation', 'None'),
    ]
    for col, placeholder in categorical_fills:
        df[col] = df[col].fillna(placeholder)

    other_cols = ['admissionheight', 'admissionweight', 'dischargeweight', 'gcs']
    for status_mask in (expired_loc, alive_loc):
        _fill_with_mask_mean(df, status_mask, other_cols)

    ## lab, medication, admissiondrug
    for status_mask in (expired_loc, alive_loc):
        _fill_with_mask_mean(df, status_mask, lab_feature_list)
    if med_feature_list:
        df[med_feature_list] = df[med_feature_list].fillna(0)
    if adm_feature_list:
        df[adm_feature_list] = df[adm_feature_list].fillna(0)
    return df
    
# Impute both cohorts (SICU first, then MICU) and rebind the names to the results.
SICU_df, MICU_df = [
    fill_missing_values(icu_frame, all_cols_df) for icu_frame in (SICU_df, MICU_df)
]

In [None]:
def resumetable(df):
    """Build a per-column summary of ``df`` and print its missing-value overview.

    Prints the frame's shape, the number of columns that still contain missing
    values, and the list of those columns.

    Returns:
        DataFrame with one row per column of ``df`` and the columns
        ``feature``, ``data_type``, ``n_missingvalues``, ``n_missingrate``
        (missing count divided by row count) and ``n_eigenvalues`` (number of
        distinct non-null values, from ``DataFrame.nunique``).
    """
    print(f'data frame shape: {df.shape}')
    missing_counts = df.isnull().sum().values

    summary = (
        pd.DataFrame(df.dtypes, columns=['data_type'])
        .reset_index()
        .rename(columns={'index': 'feature'})
    )
    summary['n_missingvalues'] = missing_counts
    summary['n_missingrate'] = missing_counts / len(df)
    summary['n_eigenvalues'] = df.nunique().values

    incomplete_features = summary[summary['n_missingvalues'] != 0].feature
    print(len(incomplete_features))
    print(list(incomplete_features))
    return summary
# Keep one summary per cohort; previously the SICU summary was overwritten
# immediately by the MICU one and could not be inspected.
sicu_summary = resumetable(SICU_df)
micu_summary = resumetable(MICU_df)
# `summary` keeps its former final value (the MICU summary) for any later use.
summary = micu_summary

In [None]:
# ** build the column-rename dictionary from the eICU mapping file **
eicu_mapping = pd.read_csv('eicu_mapping.csv')
# Strip line breaks embedded in cell text. regex=True is required: without it
# DataFrame.replace only matches cells whose entire value is '\r\n' or '\n',
# so names such as 'foo\n' were left untouched.
eicu_mapping.replace(r'\r?\n', '', regex=True, inplace=True)
# Only columns whose new name differs from the current one need renaming.
eicu_mapping_diff = eicu_mapping[eicu_mapping['concept_name'] != eicu_mapping['feature_new_name']]
rename_dict = dict(zip(eicu_mapping_diff.concept_name, eicu_mapping_diff.feature_new_name))
print(rename_dict)

# ** apply the rename mapping and rebuild a clean 0..n-1 index on each cohort **
for icu_frame in (MICU_df, SICU_df):
    icu_frame.rename(rename_dict, axis='columns', inplace=True)
    icu_frame.reset_index(drop=True, inplace=True)

In [None]:
# ** check duplicated columns **
def check_duplicated_columns(df):
    """Print and return the column labels of ``df`` that repeat an earlier column.

    Every occurrence after the first is reported, in column order, so a label
    that appears three times is listed twice. Runs in a single pass instead of
    calling ``list.index`` for every column.

    Args:
        df: frame whose column labels are inspected.

    Returns:
        List of repeated labels (empty when all column names are unique).
    """
    seen = set()
    dup = []
    for col in df.columns:
        if col in seen:
            dup.append(col)
        else:
            seen.add(col)
    print(dup)
    return dup
    
# Report repeated column names for each cohort (SICU first, then MICU).
for icu_frame in (SICU_df, MICU_df):
    check_duplicated_columns(icu_frame)
    

In [None]:
# Persist the preprocessed cohorts next to the inputs in data_dir.
SICU_df.to_feather(f'{data_dir}/SICU_ps_df.feather')
MICU_df.to_feather(f'{data_dir}/MICU_ps_df.feather')