In [400]:
import pandas as pd
from pathlib import Path
from sklearn.linear_model import LogisticRegression

In [401]:
PROJ_PATH = Path().cwd().parent
DATA_PATH = PROJ_PATH.joinpath('data')

In [476]:
MICU = pd.read_feather('MICU_ps_df.feather')
SICU = pd.read_feather('SICU_ps_df.feather')

In [444]:
HICU = pd.read_feather(DATA_PATH.joinpath('hicu.feather'))

## 데이터 파씽 진행
- common feature는 비슷한 형식으로 encoding 진행
- specific feature에 대해서는 dummy encoding을 진행해야 한다.


In [445]:
HICU['bp'] = HICU.bp.str.extract('(\d{2,3})/(\d{2,3})').apply(lambda x : (float(x[0]) + 2*float(x[1]))/3, axis=1)

In [446]:
HICU['temperature'] = HICU.temperature.str.replace(',','.').str.extract("(\d{2}[.,\,]\d{1,2}|\d{2})").astype('float')

In [447]:
HICU['heartrate'] = HICU.heartrate.str.extract("(\d{2,3})")

In [448]:
HICU['respiratoryrate'] = HICU.respiratoryrate.str.extract("(\d{2})")

In [449]:
import numpy as np
encoder = {}

encoder['gender'] = {"M":1, "F":0, "Female":0, "Male":1}
encoder['외국인여부'] = {"N":0, "Y":1, 'K':np.nan}
encoder['death'] = {'Alive':0, "Expired":1}

In [450]:
HICU = HICU.replace(encoder)

In [451]:
HICU.외국인여부.unique()

array([ 0.,  1., nan])

In [452]:
HICU = HICU.astype({'age':'float32', 'height':"float", 'weight':"float", 'heartrate':"float", "respiratoryrate":"float", 
                                     "신장Z":'float', "신장P":"float", '체중Z':"float", "체중P":'float', '체표면적':"float",
                                     "death":'float'})

In [453]:
HICU['외국인여부'] = HICU.외국인여부.fillna(2)

In [455]:
death = HICU['death']

In [456]:
HICU = HICU.groupby('death').transform(lambda group : group.fillna(group.mean()))

In [461]:
HICU['death'] = death

In [464]:
HICU.to_feather(DATA_PATH.joinpath('processed','HICU.feather'))

Parsing with MICU, SICU

In [477]:
MICU.drop(columns = ['patienthealthsystemstayid', 'hospitaladmittime24','hospitaldischargetime24','unitadmittime24','unitdischargetime24','uniquepid',
                     'hospitaldischargelocation','unitdischargestatus','unittype','unitdischargelocation','apacheadmissiondx',], inplace=True)

In [487]:
encoder['ethnicity'] = {"African American":0, "Caucasian":1,"Hispanic":2, "Asian":3,
                        'Native American':4, 'Other/Unknown':5}
micu_unitadmitsource = {col : idx for idx, col in enumerate(MICU.unitadmitsource.unique())}

encoder['unitadmitsource'] = micu_unitadmitsource

micu_unitstaytype = {col : idx for idx, col in enumerate(MICU.unitstaytype.unique())}
encoder['unitstaytype'] = micu_unitstaytype

micu_hospitaladmitsource = {col : idx for idx, col in enumerate(MICU.hospitaladmitsource.unique())}
encoder['hospitaladmitsource'] = micu_hospitaladmitsource

encoder["death"] = {"Alive":0, "Expired":1}

In [488]:
MICU = MICU.replace(encoder)

In [482]:
MICU['age'] = MICU.age.str.replace('>','').astype('float')

In [483]:
MICU = MICU.rename(columns = {'hospitaldischargestatus':'death', "admissionheight":'height','admissionweight':'weight', 'meanbp':'bp'})

In [491]:
MICU.head()

Unnamed: 0,patientunitstayid,gender,age,ethnicity,hospitalid,wardid,height,hospitaladmitoffset,hospitaladmitsource,hospitaldischargeyear,...,alkaline phos.,anion gap,bicarbonate,calcium,chloride,magnesium,platelets x 1000,potassium,total protein,gcs
0,141328,0,76.0,0,73,97,157.5,-6,0,2014,...,92.0,11.5,30.0,8.55,95.0,2.1,174.5,4.15,7.3,13.0
1,141366,1,81.0,1,73,97,172.7,-1982,1,2015,...,82.0,9.0,32.6,8.44,102.0,2.1,295.833333,4.54,6.3,15.0
2,141392,0,78.0,1,73,97,160.0,-1,0,2014,...,72.0,9.0,32.875,8.925,100.0,1.9,172.625,3.635714,6.633333,15.0
3,141462,1,80.0,1,73,97,170.2,-1,2,2014,...,57.0,10.0,28.833333,8.566667,100.5,2.016667,368.5,3.855556,6.4,15.0
4,141475,1,87.0,1,73,97,180.3,-7267,2,2015,...,72.0,9.2,25.4,7.82,114.6,1.95192,182.5,3.566667,6.15,10.0


In [492]:
SICU.select_dtypes('object')

Unnamed: 0,gender,age,ethnicity,apacheadmissiondx,hospitaladmittime24,hospitaladmitsource,hospitaldischargetime24,hospitaldischargelocation,hospitaldischargestatus,unittype,unitadmittime24,unitadmitsource,unitstaytype,unitdischargetime24,unitdischargelocation,unitdischargestatus,uniquepid
0,Male,63,Caucasian,"Hypovolemia (including dehydration, Do not inc...",04:18:00,Floor,03:41:00,Death,Expired,SICU,17:52:00,Floor,admit,17:00:00,Floor,Alive,002-30269
1,Male,63,Caucasian,"Sepsis, pulmonary",04:18:00,Floor,03:41:00,Death,Expired,SICU,20:32:00,Floor,readmit,03:41:00,Death,Expired,002-30269
2,Male,45,Caucasian,"Aneurysm, abdominal aortic; with rupture",00:41:00,Other Hospital,18:46:00,Death,Expired,SICU,08:07:00,Operating Room,transfer,18:39:00,Death,Expired,002-70742
3,Female,58,Caucasian,"Infarction, acute myocardial (MI)",13:43:00,Emergency Department,20:43:00,Home,Alive,SICU,14:55:00,Emergency Department,admit,20:18:00,Home,Alive,002-71403
4,Male,82,Caucasian,Head/extremity trauma,14:00:00,,22:36:00,Home,Alive,SICU,14:03:00,Direct Admit,admit,21:34:00,Floor,Alive,002-17433
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10644,Female,74,Caucasian,Chest/extremity trauma,09:01:00,Emergency Department,18:37:00,Home,Alive,SICU,10:07:00,Emergency Department,admit,20:25:00,Step-Down Unit (SDU),Alive,035-12535
10645,Male,19,African American,Abdomen/pelvis trauma,14:24:00,Recovery Room,20:49:00,Rehabilitation,Alive,SICU,20:40:00,Recovery Room,admit,18:00:00,Step-Down Unit (SDU),Alive,035-22455
10646,Male,45,Caucasian,Chest/thorax only trauma,22:23:00,Emergency Department,02:10:00,Home,Alive,SICU,23:00:00,Operating Room,admit,18:25:00,Telemetry,Alive,035-6647
10647,Female,27,African American,Cesarean section,21:18:00,Operating Room,20:15:00,Home,Alive,SICU,17:42:00,Recovery Room,admit,23:46:00,Floor,Alive,035-1263


In [493]:
SICU = SICU.drop(columns = ['patienthealthsystemstayid','hospitaladmittime24','hospitaldischargetime24','unitdischargetime24','unittype',
                            'unitadmittime24','hospitaldischargelocation','unitdischargelocation','uniquepid','unitdischargestatus','apacheadmissiondx'])

In [494]:
encoder['ethnicity'] = {"African American":0, "Caucasian":1,"Hispanic":2, "Asian":3,
                        'Native American':4, 'Other/Unknown':5}
sicu_unitadmitsource = {col : idx for idx, col in enumerate(SICU.unitadmitsource.unique())}

encoder['unitadmitsource'] = sicu_unitadmitsource

sicu_unitstaytype = {col : idx for idx, col in enumerate(SICU.unitstaytype.unique())}
encoder['unitstaytype'] = sicu_unitstaytype

sicu_hospitaladmitsource = {col : idx for idx, col in enumerate(SICU.hospitaladmitsource.unique())}
encoder['hospitaladmitsource'] = sicu_hospitaladmitsource

encoder['hospitaldischargestatus'] = {"Expired":1, "Alive":0}

In [496]:
SICU = SICU.replace(encoder)

In [501]:
SICU['age'] = SICU.age.replace('>','', regex=True).astype('float')

In [518]:
SICU['gender'] = SICU.gender.replace('Other|Unknown',np.nan, regex=True)

In [526]:
SICU = SICU.dropna(subset=['gender'])

In [527]:
SICU['age'] = SICU.age.astype('float')
SICU['gender'] = SICU.gender.astype('float')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [528]:
SICU = SICU.rename(columns = {'hospitaldischargestatus':'death', "admissionheight":'height','admissionweight':'weight', 'meanbp':'bp'})

In [529]:
SICU = SICU.replace(encoder)

Free text로 되어 있는 admissiondx는 쓰지 않는다. in eICU

In [534]:
SICU.head()

Unnamed: 0,patientunitstayid,gender,age,ethnicity,hospitalid,wardid,height,hospitaladmitoffset,hospitaladmitsource,hospitaldischargeyear,...,calcium,chloride,fentaNYL,fentaNYL citrate (PF),hydrALAZINE,magnesium,morphine,platelets x 1000,potassium,gcs
0,141296,1.0,63.0,1,73,85,162.6,-2254,0,2014,...,8.416667,95.5,0.0,0.0,0.0,2.037001,0.0,437.833333,5.05,15.0
1,141297,1.0,63.0,1,73,85,162.6,-8174,0,2014,...,9.966667,97.333333,0.0,0.0,0.0,3.2,0.0,440.0,7.066667,15.0
2,141314,1.0,45.0,1,73,85,170.2,-446,1,2014,...,10.3,109.0,0.0,0.0,0.0,2.35,0.0,172.5,6.533333,3.0
3,141675,0.0,58.0,1,68,103,151.8,-72,2,2014,...,8.5,108.0,0.0,0.0,0.0,1.949268,0.0,232.0,4.2,15.0
4,141708,1.0,82.0,1,68,103,188.0,-3,3,2015,...,8.3,105.5,0.0,0.0,0.0,1.949268,0.0,188.0,3.8,15.0


In [535]:
SICU = SICU.rename(columns = {'patientunitstayid':'연구등록번호'})
MICU = MICU.rename(columns = {'patientunitstayid':'연구등록번호'})

Common Feature, Specific Feature 추리기

In [14]:
hicu_columns = set(HICU.columns) 
micu_columns = set(MICU.columns)
sicu_columns = set(SICU.columns)

In [15]:
common_features = hicu_columns & micu_columns & sicu_columns
common_features.remove('death')
common_features.remove('연구등록번호')

outcome = ['death']
patientID = ['연구등록번호']


hicu_columns.remove('death')
micu_columns.remove('death')
sicu_columns.remove('death')

In [16]:
hicu_specific = hicu_columns - common_features - set(['연구등록번호', 'death'])
micu_specific = micu_columns - common_features - set(['연구등록번호', 'death'])
sicu_specific = sicu_columns - common_features - set(['연구등록번호', 'death'])

In [17]:
len(hicu_specific), len(micu_specific), len(sicu_specific)

(13, 69, 105)

In [578]:
SICU.reset_index(drop=True, inplace=True)
MICU.reset_index(drop=True, inplace=True)

In [545]:
SICU.to_feather(DATA_PATH.joinpath('processed','SICU.feather'))
MICU.to_feather(DATA_PATH.joinpath('processed','MICU.feather'))

# Data EDA

In [252]:
SICU.columns.tolist()[-20]

'SENNOSIDES-DOCUSATE SODIUM 8.6-50 MG PO TABS'

## Logistic Regression

In [26]:
import pandas as pd
import numpy as np

In [7]:
HICU = pd.read_feather('../data/processed/HICU.feather')
MICU = pd.read_feather('../data/processed/MICU.feather')
SICU = pd.read_feather('../data/processed/SICU.feather')

In [8]:
SICU.head()
MICU.head()

Unnamed: 0,연구등록번호,gender,age,ethnicity,hospitalid,wardid,height,hospitaladmitoffset,hospitaladmitsource,hospitaldischargeyear,...,alkaline phos.,anion gap,bicarbonate,calcium,chloride,magnesium,platelets x 1000,potassium,total protein,gcs
0,141328,0,76.0,0,73,97,157.5,-6,0,2014,...,92.0,11.5,30.0,8.55,95.0,2.1,174.5,4.15,7.3,13.0
1,141366,1,81.0,1,73,97,172.7,-1982,1,2015,...,82.0,9.0,32.6,8.44,102.0,2.1,295.833333,4.54,6.3,15.0
2,141392,0,78.0,1,73,97,160.0,-1,0,2014,...,72.0,9.0,32.875,8.925,100.0,1.9,172.625,3.635714,6.633333,15.0
3,141462,1,80.0,1,73,97,170.2,-1,2,2014,...,57.0,10.0,28.833333,8.566667,100.5,2.016667,368.5,3.855556,6.4,15.0
4,141475,1,87.0,1,73,97,180.3,-7267,2,2015,...,72.0,9.2,25.4,7.82,114.6,1.95192,182.5,3.566667,6.15,10.0


Get Specific variables for eICU datas

In [9]:
from sklearn.preprocessing import MinMaxScaler

min_max_scalar = MinMaxScaler()
fitted = min_max_scalar.fit_transform(MICU[MICU.columns[1:]])

In [10]:
MICU[MICU.columns[1:]] = fitted

In [11]:
min_max_scalar = MinMaxScaler()
fitted = min_max_scalar.fit_transform(SICU[SICU.columns[1:]])
SICU[SICU.columns[1:]] = fitted

In [19]:
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
from sklearn.svm import LinearSVC
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFromModel

# lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, y)
# model = SelectFromModel(lsvc, prefit=True)
# X_new = model.transform(X)
# X_new.shape

In [20]:
def collect_significant_variables(df, outcome, variables):
    
    model = LogisticRegression(C=0.1, penalty="l2")
    X = df[variables]
    y = df[outcome]
    
    logReg = model.fit(X,y)
    transModel = SelectFromModel(logReg, prefit=True)
    
    X_new = transModel.transform(X)
    return transModel.get_feature_names_out(X.columns)

In [43]:
def collect_significant_variables(df, outcome, variables):
    
    X = df[variables]
    y = df[outcome]
    
    from sklearn.linear_model import Lasso
    lasso = Lasso(alpha=0.003)

    lasso.fit(X, y)

    print(lasso.coef_)

    importance = np.abs(lasso.coef_)

    selected_features = np.array(X.columns)[importance > 0]
    print(selected_features)
    print(len(selected_features))
    return selected_features

In [46]:
micu_selected = collect_significant_variables(MICU, outcome, micu_specific)

[-0.          0.          0.         -0.         -0.          0.00275841
 -0.30883129 -0.         -0.          0.          0.         -0.
  0.05473193 -0.02535724 -0.01262236  0.         -0.         -0.
  0.         -0.          0.         -0.         -0.          0.
  0.          0.         -0.         -0.01636387  0.          0.
 -0.         -0.         -0.         -0.         -0.         -0.22284466
  0.          0.         -0.          0.         -0.          0.02928005
 -0.          0.          0.          0.          0.         -0.
  0.          0.         -0.17289757 -0.         -0.          0.
  0.13503933 -0.         -0.         -0.          0.          0.
  0.         -0.          0.         -0.          0.3940695   0.
 -0.         -0.          0.        ]
['1000 ML FLEX CONT : SODIUM CHLORIDE 0.9 % IV SOLN' 'total protein'
 'vent' 'PANTOPRAZOLE SODIUM 40 MG PO TBEC'
 'POTASSIUM CHLORIDE CRYS ER 20 MEQ PO TBCR' '-lymphs' 'gcs'
 'VANCOMYCIN HCL 1000 MG IV SOLR' 'bicarbonate' '

In [47]:
sicu_selected = collect_significant_variables(SICU, outcome, sicu_specific)

[-0.         -0.          0.          0.         -0.          0.
 -0.          0.         -0.          0.          0.          0.05149923
 -0.02158403 -0.0087204  -0.         -0.02156485 -0.         -0.
 -0.          0.          0.         -0.          0.04318303 -0.
  0.          0.         -0.         -0.          0.          0.
  0.         -0.          0.         -0.          0.         -0.
  0.          0.         -0.00386638 -0.         -0.02142538  0.
 -0.         -0.         -0.         -0.16855515  0.         -0.
  0.         -0.         -0.          0.         -0.          0.
  0.00119984  0.          0.          0.         -0.         -0.
  0.          0.          0.         -0.         -0.          0.
 -0.01220876  0.          0.          0.         -0.          0.
 -0.07636039 -0.         -0.         -0.         -0.         -0.00435494
  0.00529569 -0.         -0.          0.         -0.          0.02230842
  0.10707517 -0.         -0.          0.         -0.          0.
 

In [48]:
import pickle 

with open('sicu_selected.pkl','wb') as f:
    pickle.dump(sicu_selected, f)
with open('micu_selected.pkl','wb') as f:
    pickle.dump(sicu_selected,f) 