In [11]:
import os
import pathlib
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

In [12]:
# Resolve the project layout relative to the notebook's working directory:
#   <parent of cwd>/data      -> data_dir
#   <parent of cwd>/data/ps   -> data_ps_dir
current_dir = pathlib.Path.cwd()
parent_dir = current_dir.parent

# Base name of the working directory with any trailing ".ext"-like suffix removed.
working_dir_abs = os.path.abspath('')
working_dir_name = os.path.basename(working_dir_abs)
curr_file_name = os.path.splitext(working_dir_name)[0]

data_dir = pathlib.Path('{}/data/'.format(parent_dir))
data_ps_dir = data_dir.joinpath('ps')

# Only data_dir itself is created here; data_ps_dir is not.
data_dir.mkdir(mode=0o777, parents=True, exist_ok=True)


In [13]:
# Load one feather file per ICU type from data_ps_dir into icu_df_dict,
# printing the column and row count of each frame as it is read.
icu_list = ['MSICU', 'CTICU', 'SICU', 'CCUCTICU', 'MICU', 'NICU', 'CICU', 'CSICU']
icu_df_dict = {}
for icu in icu_list:
    unit_df = pd.read_feather('{}/{}_df.feather'.format(data_ps_dir, icu))
    icu_df_dict[icu] = unit_df
    print('icu : ', icu, ' len(columns) : ', len(unit_df.columns), ' len(df) : ', len(unit_df))
    

icu :  MSICU  len(columns) :  94  len(df) :  92369
icu :  CTICU  len(columns) :  212  len(df) :  5578
icu :  SICU  len(columns) :  132  len(df) :  10649
icu :  CCUCTICU  len(columns) :  117  len(df) :  14201
icu :  MICU  len(columns) :  96  len(df) :  14550
icu :  NICU  len(columns) :  135  len(df) :  12710
icu :  CICU  len(columns) :  116  len(df) :  10594
icu :  CSICU  len(columns) :  163  len(df) :  6320


In [14]:
def parsing(df):
    """Clean and numerically encode one per-ICU DataFrame.

    Steps, in order:

    1. Drop the columns named in ``drop_list`` (only those actually present).
    2. Replace categorical labels with integer codes. ``gender``, ``ethnicity``
       and the Alive/Expired columns use fixed mappings. ``unitadmitsource``,
       ``unitstaytype`` and ``hospitaladmitsource`` are numbered by order of
       first appearance in *this* frame, so their codes are not comparable
       between frames.
    3. Convert ``age`` to float (a ``'>'`` marker is stripped from string
       values) and turn ``gender`` values matching ``Other|Unknown`` into NaN.
    4. Drop every row that still contains a missing value.
    5. Rename ``hospitaldischargestatus`` -> ``death``, ``admissionheight`` ->
       ``height``, ``admissionweight`` -> ``weight``, ``meanbp`` -> ``bp`` and
       ``patientunitstayid`` -> ``연구등록번호``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``age``, ``gender``, ``unitadmitsource``, ``unitstaytype``
        and ``hospitaladmitsource``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame. Its index is not reset after rows are dropped.
    """
    drop_list = ['apacheadmissiondx', 'hospitaladmittime24', 'hospitaldischargelocation', 'hospitaldischargetime24', 'patienthealthsystemstayid',
                 'uniquepid', 'unitadmittime24', 'unitdischargelocation', 'unitdischargestatus', 'unitdischargetime24', 'unittype']
    # Non-inplace drop: the caller's frame must stay intact so the cell that
    # calls parsing() does not silently alter the frames it was given.
    df = df.drop(columns=[col for col in df.columns if col in drop_list])

    encoder = {
        'gender': {"M": 1, "F": 0, "Female": 0, "Male": 1},
        '외국인여부': {"N": 0, "Y": 1, 'K': np.nan},
        'death': {'Alive': 0, "Expired": 1},
        'ethnicity': {"African American": 0, "Caucasian": 1, "Hispanic": 2, "Asian": 3, 'Native American': 4, 'Other/Unknown': 5},
        'hospitaldischargestatus': {"Expired": 1, "Alive": 0},
    }
    # Data-driven codes: numbered by order of first appearance in this frame.
    for column in ('unitadmitsource', 'unitstaytype', 'hospitaladmitsource'):
        encoder[column] = {label: code for code, label in enumerate(df[column].unique())}

    # Encoder keys that are not columns of df are ignored by DataFrame.replace.
    df = df.replace(encoder)

    # Strip the '>' marker only when age is text; a column that is already
    # numeric has no .str accessor and is converted directly.
    age = df['age']
    if not pd.api.types.is_numeric_dtype(age):
        age = age.str.replace('>', '', regex=False)
    df['age'] = age.astype('float')

    df['gender'] = df['gender'].replace('Other|Unknown', np.nan, regex=True)
    df = df.dropna(axis=0)
    df = df.rename(columns={'hospitaldischargestatus': 'death', "admissionheight": 'height', 'admissionweight': 'weight', 'meanbp': 'bp'})
    df = df.rename(columns={'patientunitstayid': '연구등록번호'})
    return df

# Replace every loaded frame with its parsed version and report the new size.
for icu in icu_list:
    print(icu)
    parsed_df = parsing(icu_df_dict[icu])
    icu_df_dict[icu] = parsed_df
    print('icu : ', icu, ' len(columns) : ', len(parsed_df.columns), ' len(df) : ', len(parsed_df))

MSICU
icu :  MSICU  len(columns) :  83  len(df) :  92325
CTICU
icu :  CTICU  len(columns) :  201  len(df) :  5578
SICU
icu :  SICU  len(columns) :  121  len(df) :  10644
CCUCTICU
icu :  CCUCTICU  len(columns) :  106  len(df) :  14200
MICU
icu :  MICU  len(columns) :  85  len(df) :  14550
NICU
icu :  NICU  len(columns) :  124  len(df) :  12706
CICU
icu :  CICU  len(columns) :  105  len(df) :  10594
CSICU
icu :  CSICU  len(columns) :  152  len(df) :  6318


In [15]:
# Load the raw HICU table and turn its free-text vital-sign columns into numbers.
HICU = pd.read_feather(data_dir.joinpath('hicu.feather'))

# bp is text of the form "<2-3 digits>/<2-3 digits>"; it is collapsed to
# (first + 2 * second) / 3. Presumably systolic/diastolic -> mean arterial
# pressure -- confirm against the data dictionary.
# Raw strings keep "\d" from being parsed as an (invalid) string escape.
bp_parts = HICU.bp.str.extract(r'(\d{2,3})/(\d{2,3})').astype('float')
HICU['bp'] = (bp_parts[0] + 2 * bp_parts[1]) / 3

# Decimal commas are normalised to dots before the first "dd.d(d)" or "dd" token is taken.
HICU['temperature'] = (
    HICU.temperature.str.replace(',', '.', regex=False)
    .str.extract(r"(\d{2}[.,\,]\d{1,2}|\d{2})", expand=False)
    .astype('float')
)
# First 2-3 digit / 2 digit run in the text; cast to float in the astype below.
HICU['heartrate'] = HICU.heartrate.str.extract(r"(\d{2,3})", expand=False)
HICU['respiratoryrate'] = HICU.respiratoryrate.str.extract(r"(\d{2})", expand=False)

# Same label -> code mappings that parsing() uses for these columns.
encoder = {}
encoder['gender'] = {"M":1, "F":0, "Female":0, "Male":1}
encoder['외국인여부'] = {"N":0, "Y":1, 'K':np.nan}
encoder['death'] = {'Alive':0, "Expired":1}
HICU = HICU.replace(encoder)

HICU = HICU.astype({'age':'float32', 'height':"float", 'weight':"float", 'heartrate':"float", "respiratoryrate":"float",
                    "신장Z":'float', "신장P":"float", '체중Z':"float", "체중P":'float', '체표면적':"float",
                    "death":'float'})

# Missing values in this column (including 'K', mapped to NaN above) become category 2.
HICU['외국인여부'] = HICU.외국인여부.fillna(2)

# Fill remaining gaps with the column mean of rows sharing the same outcome.
# The transform result does not carry the grouping column, so 'death' is kept
# aside and re-attached afterwards.
death = HICU['death']
HICU = HICU.groupby('death').transform(lambda group : group.fillna(group.mean()))
HICU['death'] = death
HICU.to_feather(data_dir.joinpath('ps','HICU.feather'))

## 데이터 파싱 진행
- common feature는 비슷한 형식으로 encoding 진행
- specific feature에 대해서는 dummy encoding을 진행해야 한다.


Parsing with MICU, SICU

eICU에서 free text로 되어 있는 admissiondx는 사용하지 않는다.

Common Feature, Specific Feature 추리기

In [16]:
# Columns present in HICU and in every per-ICU frame, minus the outcome and
# the identifier column (both must be present, otherwise remove() raises KeyError).
common_features = set(HICU.columns).intersection(
    *(icu_df_dict[icu_name].columns for icu_name in icu_list)
)
for excluded_column in ('death', '연구등록번호'):
    common_features.remove(excluded_column)

In [17]:
# Per-cohort "specific" features: every column that is neither shared by all
# cohorts (common_features) nor the identifier / outcome column.
non_feature_columns = set(['연구등록번호', 'death'])
specific_features = {}
for icu in icu_list:
    specific_features[icu] = set(icu_df_dict[icu].columns) - common_features - non_feature_columns
    print(icu, 'specific : ', len(specific_features[icu]) )
specific_features['HICU'] = set(HICU.columns) - common_features - non_feature_columns

# Resolve and create the output directory once, before the loop. It used to be
# bound inside the loop, so the HICU export below raised NameError whenever
# icu_list was empty, and the mkdir was repeated on every iteration.
data_ps2_dir = data_dir.joinpath('ps2')
data_ps2_dir.mkdir(mode=0o777, parents=True, exist_ok=True)

for icu in icu_list:
    # parsing() drops rows without resetting the index; renumber before writing
    # (to_feather is documented to expect a default index -- confirm for the
    # installed pandas version).
    icu_df_dict[icu] = icu_df_dict[icu].reset_index(drop=True)
    icu_df_dict[icu].to_feather('{}/{}_df.feather'.format(data_ps2_dir, icu))
HICU.to_feather('{}/{}_df.feather'.format(data_ps2_dir, 'HICU'))

MSICU specific :  67
CTICU specific :  185
SICU specific :  105
CCUCTICU specific :  90
MICU specific :  69
NICU specific :  108
CICU specific :  89
CSICU specific :  136


# Data EDA

## Logistic Regression

In [18]:
import pandas as pd
import numpy as np
import pickle
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
from sklearn.svm import LinearSVC
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso
from sklearn.preprocessing import MinMaxScaler

In [19]:
def _min_max_scaler(df):
    """Min-max scale every column except the first one, in place.

    The columns are chosen by position (``df.columns[1:]``), not by name, and
    are rescaled with a default-configured ``MinMaxScaler``. The passed frame
    itself is modified and the same object is returned.
    """
    scaled_columns = df.columns[1:]
    scaler = MinMaxScaler()
    df[scaled_columns] = scaler.fit_transform(df[scaled_columns])
    return df

def collect_significant_variables(df, outcome, variables):
    """Return the candidate columns that a Lasso fit keeps with a non-zero coefficient.

    Fits ``Lasso(alpha=0.003)`` with ``df[variables]`` as predictors and
    ``df[outcome]`` as target, prints the coefficient vector, the selected
    names and their count, and returns the selected names.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both the predictors and the outcome column.
    outcome : label
        Name of the target column.
    variables : str or iterable of labels
        Candidate predictor columns. Sets are accepted and are sorted (by their
        string form) first; other iterables keep their order.

    Returns
    -------
    numpy.ndarray
        Column names whose fitted coefficient is non-zero.
    """
    # pandas >= 2.0 raises TypeError for set indexers, and a set's iteration
    # order is arbitrary, so build an explicit, reproducible column list.
    if isinstance(variables, str):
        columns = [variables]
    elif isinstance(variables, (set, frozenset)):
        columns = sorted(variables, key=str)
    else:
        columns = list(variables)

    X = df[columns]
    y = df[outcome]
    lasso = Lasso(alpha=0.003)
    lasso.fit(X, y)
    print(lasso.coef_)
    importance = np.abs(lasso.coef_)
    selected_features = np.array(X.columns)[importance > 0]
    print(selected_features)
    print(len(selected_features))
    return selected_features

# NOTE(review): the frames scaled by the first loop are overwritten by the
# reload in the second loop, so the scaling has no effect on anything that
# follows. Confirm which of the two steps is actually intended.
for icu in icu_list:
    scaled_df = _min_max_scaler(icu_df_dict[icu])
    icu_df_dict[icu] = scaled_df

# Re-read the frames that were written to data_ps2_dir.
for icu in icu_list:
    reloaded_path = '{}/{}_df.feather'.format(data_ps2_dir, icu)
    icu_df_dict[icu] = pd.read_feather(reloaded_path)


In [20]:
# Run the feature selection for every ICU type against the 'death' column and
# store each result both in selected_features and as <icu>_selected.pkl.
selected_features = {}
for icu in icu_list:
    outcome = 'death'
    chosen = collect_significant_variables(icu_df_dict[icu], outcome, specific_features[icu])
    selected_features[icu] = chosen
    pickle_path = data_ps2_dir.joinpath('{}_selected.pkl'.format(icu))
    with pickle_path.open('wb') as f:
        pickle.dump(chosen, f)
    

MSICU
       anion gap  hospitaldischargeyear  hospitaladmitsource  \
0      20.000000                   2015                    0   
1      17.000000                   2015                    1   
2      15.000000                   2015                    1   
3      13.000000                   2014                    2   
4      16.000000                   2015                    1   
...          ...                    ...                  ...   
92320   8.857143                   2015                    2   
92321   8.400000                   2015                    4   
92322   7.000000                   2015                    2   
92323  11.333333                   2014                    2   
92324   5.333333                   2015                    1   

       PANTOPRAZOLE SODIUM 40 MG IV SOLR  hospitaladmitoffset  \
0                                    1.0                    0   
1                                    0.0                  -14   
2                             

  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive


[-0.00000000e+00 -0.00000000e+00 -0.00000000e+00  1.11299247e-02
 -1.10362645e-03 -0.00000000e+00 -0.00000000e+00 -0.00000000e+00
 -0.00000000e+00  0.00000000e+00 -0.00000000e+00 -0.00000000e+00
 -0.00000000e+00 -0.00000000e+00 -3.38578811e-04 -0.00000000e+00
  4.05901718e-05 -4.79382480e-03 -0.00000000e+00 -0.00000000e+00
  1.49388751e-03 -0.00000000e+00 -0.00000000e+00 -0.00000000e+00
  5.22872928e-03  0.00000000e+00 -0.00000000e+00 -0.00000000e+00
 -0.00000000e+00 -0.00000000e+00 -0.00000000e+00 -0.00000000e+00
 -0.00000000e+00 -0.00000000e+00 -6.85395111e-04 -0.00000000e+00
 -1.16091660e-03  0.00000000e+00 -0.00000000e+00  0.00000000e+00
 -0.00000000e+00  0.00000000e+00 -0.00000000e+00  0.00000000e+00
 -0.00000000e+00 -0.00000000e+00 -0.00000000e+00  0.00000000e+00
 -6.50325191e-03 -0.00000000e+00 -0.00000000e+00 -0.00000000e+00
  1.61838740e-02 -0.00000000e+00 -0.00000000e+00 -0.00000000e+00
 -8.76762891e-04 -1.24329599e-03 -0.00000000e+00  0.00000000e+00
 -0.00000000e+00  0.00000

  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive


[ 0.00000000e+00  9.97127343e-03  0.00000000e+00 -0.00000000e+00
 -5.53842742e-03  0.00000000e+00  0.00000000e+00  0.00000000e+00
 -0.00000000e+00 -0.00000000e+00 -2.21280630e-06 -0.00000000e+00
  2.47464110e-03 -0.00000000e+00 -0.00000000e+00 -0.00000000e+00
 -4.99383380e-04  2.30693383e-04 -0.00000000e+00 -0.00000000e+00
  2.09043522e-04 -3.74154942e-04 -0.00000000e+00  6.26965659e-03
 -0.00000000e+00  0.00000000e+00  0.00000000e+00 -4.52416195e-06
 -0.00000000e+00 -0.00000000e+00  0.00000000e+00 -9.97059014e-04
 -1.31778815e-02 -4.81016633e-03 -0.00000000e+00 -0.00000000e+00
 -0.00000000e+00 -0.00000000e+00 -0.00000000e+00 -3.41734185e-04
 -0.00000000e+00 -0.00000000e+00 -3.25571645e-03  4.29625553e-03
 -0.00000000e+00 -0.00000000e+00  3.22030340e-03  4.54955023e-03
 -1.39292720e-02 -0.00000000e+00  0.00000000e+00  0.00000000e+00
 -3.15782993e-06 -0.00000000e+00  0.00000000e+00 -1.89218895e-06
  1.65381725e-02 -0.00000000e+00  0.00000000e+00 -0.00000000e+00
 -3.58728458e-03 -0.00000

In [None]:
# Display the dict of selected feature names, keyed by ICU type.
selected_features

In [None]:
import yaml

# The selected features are numpy arrays; yaml.dump would serialise those as
# python-object tags instead of plain sequences. Convert them to built-in lists
# of str so the feature book is human-readable and loadable with a safe loader.
feature_book = {
    icu_name: [str(feature) for feature in features]
    for icu_name, features in selected_features.items()
}
# allow_unicode=True writes non-ASCII names verbatim, so fix the file encoding
# instead of relying on the platform default.
with open(data_ps2_dir.joinpath('feature_book.yaml'), 'w', encoding='utf-8') as f:
    yaml.dump(feature_book, f, allow_unicode=True)

Get specific variables for the eICU data

In [None]:
def collect_significant_variables(df, outcome, variables):
    """Return the candidate columns kept by ``SelectFromModel`` on a logistic regression.

    Fits ``LogisticRegression(C=0.1, penalty="l2")`` with ``df[variables]`` as
    predictors and ``df[outcome]`` as target, wraps the fitted model in
    ``SelectFromModel(prefit=True)`` (default threshold) and returns the names
    of the columns it selects.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both the predictors and the outcome column.
    outcome : label
        Name of the target column.
    variables : str or iterable of labels
        Candidate predictor columns. Sets are accepted and are sorted (by their
        string form) first; other iterables keep their order.

    Returns
    -------
    numpy.ndarray
        Names of the selected columns.
    """
    # pandas >= 2.0 raises TypeError for set indexers, and a set's iteration
    # order is arbitrary, so build an explicit, reproducible column list.
    if isinstance(variables, str):
        columns = [variables]
    elif isinstance(variables, (set, frozenset)):
        columns = sorted(variables, key=str)
    else:
        columns = list(variables)

    model = LogisticRegression(C=0.1, penalty="l2")
    X = df[columns]
    y = df[outcome]

    logReg = model.fit(X, y)
    transModel = SelectFromModel(logReg, prefit=True)

    # Only the selected names are needed; the reduced matrix was never used.
    return transModel.get_feature_names_out(X.columns)