## Module

In [2]:
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import statsmodels.api as sm
import statsmodels.formula.api as smf
from IPython.display import display
from matplotlib import rc
from tqdm import tqdm

from matplotlib import rc
# Global plotting / display configuration for the notebook.
# NOTE(review): 'AppleGothic' is a macOS font; confirm it is available in the
# environment this notebook actually runs in (the later cells target Colab).
rc('font', family='AppleGothic')
plt.rcParams['axes.unicode_minus'] = False
# Canonical option key. The previous spelling "display.max.columns" only
# resolved because pandas treats the key as a regex pattern ('.' matched '_').
pd.set_option("display.max_columns", None)

plt.rcParams['figure.figsize'] = (10, 10)

In [None]:
# Mount Google Drive (Colab only). The data cells read from the absolute path
# "/content/drive/...", so mount there rather than at the cwd-relative
# "./content/drive", which resolves to a different directory.
from google.colab import drive
drive.mount("/content/drive")

In [None]:
# Read the raw pre-operative dataset from the mounted shared drive.
# `datapath` is the shared-drive folder that holds the input CSV.
datapath = "/content/drive/Shared drives/2020medicalAI_team3_예후예측/"
df = pd.read_csv(os.path.join(datapath, 'data_preop.csv'))
df.head()

In [3]:
# Share of missing values per column (a 0-1 fraction), shown as one wide row.
df.isnull().mean().to_frame('Null Percentage').T

Unnamed: 0,caseid,age,sex,bmi,asa,emop,department,optype,dx,opname,approach,ane_type,anedur,los_postop,los_icu,death_inhosp,preop_htn,preop_dm,preop_arrhythmia,preop_pft,preop_hb,preop_plt,preop_pt,preop_aptt,preop_na,preop_k,preop_glucose,preop_alb,preop_got,preop_gpt,preop_bun,preop_cr,cormack,airway,aline1,aline2,cline1,cline2,intraop_uo,intraop_rbc,intraop_ffp,intraop_crystalloid,intraop_colloid,intraop_ppf,intraop_mdz,intraop_ftn,intraop_rocu,intraop_vecu,intraop_eph,intraop_phe,intraop_epi,intraop_ca,WBC,Hb,PLT,Albumin,GOT (AST),GPT (ALT),Creatinine,Sodium,Potassium,Chloride,hs-CRP quantitation,PT (INR),aPTT,Fibrinogen,pH,pCO₂,pO₂,HCO3-,BE,Calcium,Glucose,Lactic acid
Null Percentage,0.0,0.0,0.0,0.0,0.019709,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03827,0.037313,0.039227,0.042289,0.081324,0.081324,0.040949,0.039418,0.038462,0.037696,0.038462,0.040184,0.143131,0.073096,0.0,0.0,0.0,0.0,0.450057,0.0,0.0,0.063911,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.400115,0.400115,0.403368,0.412744,0.412361,0.412361,0.423268,0.289323,0.289705,0.290088,0.61328,0.474742,0.476081,0.515882,0.950823,0.950823,0.950823,0.950823,0.987754,0.377535,0.484118,0.950823


## Object Type의 변수 category 확인

# Print the set of distinct values for every object-dtype column.
for column in df.columns:
    values = df[column]
    if values.dtype != object:
        continue  # only object-dtype columns are listed
    print('Variable: {0}'.format(column))
    print('Category: {0} \n'.format(set(values)))

## Preprocessing and Cleaning

In [4]:
# Encode the two-level categorical columns as 0/1.
# Series.map turns any value that is not a key of the mapping into NaN.
df['sex'] = df['sex'].map({'F': 0, 'M': 1})
for yn_col in ('emop', 'death_inhosp', 'preop_htn', 'preop_dm'):
    df[yn_col] = df[yn_col].map({'N': 0, 'Y': 1})

# aline & cline
# Each site column is coded 0 when its text contains 'N' and 1 otherwise;
# the combined flag is 1 when at least one of the two site columns is coded 1.
for line in ('aline', 'cline'):
    site_codes = pd.DataFrame(
        {
            site: np.where(df[site].str.contains('N', regex=True), 0, 1)
            for site in (line + '1', line + '2')
        },
        index=df.index,
    )
    df[line] = site_codes.any(axis=1).astype(int)  # 0(no), 1(yes)

## preop_arrhythmia

## Numeric Value
# Compiled once instead of on every call.
# Matches "12", "12.5" and ".5"; a trailing or repeated dot is never captured,
# so whatever is captured can always be parsed by float().
_NUM_PATTERN = re.compile(r'(?P<value>[0-9]+(?:\.[0-9]+)?|\.[0-9]+)')


def clean_num(df_series):
    """Extract the first unsigned decimal number from a single cell value.

    The value is converted with ``str()`` first, so numbers, strings such as
    ``">200"`` and missing values are all accepted.

    Parameters
    ----------
    df_series : object
        One cell value (despite the name, this is called per element via
        ``Series.apply``, not on a whole Series).

    Returns
    -------
    str or float
        The matched number as a string (e.g. ``"0.82"``), or ``np.nan`` when
        the text contains no digits. Any sign or surrounding text is ignored.
    """
    match = _NUM_PATTERN.search(str(df_series))
    if match is None:
        return np.nan
    return match.group('value')

# Run clean_num over each listed lab column and store the result as float.
for lab_col in ['preop_plt', 'preop_aptt', 'preop_got', 'preop_gpt', 'preop_cr']:
    df[lab_col] = df[lab_col].apply(clean_num).astype(float)

## Drop Intra-Op and Other Excluded Variables

In [5]:
# Columns removed before the dataset is saved: descriptive/free-text fields,
# line and airway fields, and intra-operative measurements.
# ('intraop_crystalloid' was previously listed twice; each name now appears once.)
# NOTE(review): 'intraop_epi' and 'intraop_ca' are not listed here, so they
# remain in the saved file — confirm this is intended.
drop_list = [
    'department', 'optype', 'dx', 'opname', 'approach', 'ane_type',
    'aline', 'cline', 'aline1', 'aline2', 'cline1', 'cline2',
    'cormack', 'airway',
    'intraop_uo', 'intraop_rbc', 'intraop_ffp', 'intraop_crystalloid',
    'intraop_colloid', 'intraop_ppf', 'intraop_mdz', 'intraop_ftn',
    'intraop_rocu', 'intraop_vecu', 'intraop_eph', 'intraop_phe',
]

## Save Data

In [6]:
# Build the reduced frame once, write it next to the input file, and preview
# exactly what was written (previously the preview showed `df`, which still
# contained the dropped columns).
df_preprocessed = df.drop(columns=drop_list)
df_preprocessed.to_csv(
    os.path.join(datapath, 'data_preop_preprocessed.csv'),
    index=False,
)
df_preprocessed.head()

Unnamed: 0,caseid,age,sex,bmi,asa,emop,anedur,los_postop,los_icu,death_inhosp,preop_htn,preop_dm,preop_arrhythmia,preop_pft,preop_hb,preop_plt,preop_pt,preop_aptt,preop_na,preop_k,preop_glucose,preop_alb,preop_got,preop_gpt,preop_bun,preop_cr,intraop_epi,intraop_ca,WBC,Hb,PLT,Albumin,GOT (AST),GPT (ALT),Creatinine,Sodium,Potassium,Chloride,hs-CRP quantitation,PT (INR),aPTT,Fibrinogen,pH,pCO₂,pO₂,HCO3-,BE,Calcium,Glucose,Lactic acid
0,1,77.0,1,26.3,2.0,0,180,8,0,0,1,0,N,Normal,14.1,189.0,94.0,33.2,141.0,3.1,134.0,4.3,18.0,16.0,10.0,0.82,0,0,,,,,,,,138.0,3.1,100.0,,,,,,,,,,,,
1,2,54.0,1,19.6,2.0,0,245,19,0,0,0,0,N,Normal,10.2,251.0,110.0,31.9,143.0,4.7,88.0,3.8,18.0,15.0,14.0,0.86,0,0,4.36,10.2,251.0,,,,,143.0,4.7,115.0,,,,,,,,,,,,
2,3,62.0,1,24.4,1.0,0,65,2,0,0,0,0,N,Normal,14.2,373.0,103.0,30.3,144.0,4.9,87.0,4.2,17.0,34.0,14.0,1.18,0,0,,,,,,,,,,,,,,,,,,,,,,
3,4,74.0,1,20.5,2.0,0,335,7,1,0,1,0,N,Normal,14.4,275.0,103.0,34.5,141.0,4.2,108.0,4.1,23.0,18.0,10.0,0.96,0,0,,,,,,,,144.0,4.3,108.0,,,,,,,,,,,,
4,5,66.0,1,20.4,3.0,1,350,45,13,0,1,0,Left anterior fascicular block,Normal,10.1,67.0,73.0,36.5,146.0,4.4,126.0,2.6,765.0,77.0,50.0,4.43,0,2100,14.64,15.3,158.0,4.6,64.0,30.0,1.83,139.0,4.9,108.0,,,,,,,,,,9.9,140.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5221,6384,64.0,1,24.2,1.0,0,245,8,0,0,0,0,N,Normal,14.5,279.0,109.0,31.0,,,100.0,4.2,31.0,33.0,13.0,0.99,0,0,,,,,,,,143.0,4.2,107.0,,,,,,,,,,,,
5222,6385,69.0,1,24.6,2.0,0,335,20,0,0,1,0,N,Normal,15.2,239.0,114.0,28.7,144.0,4.0,140.0,3.7,18.0,28.0,19.0,0.84,0,300,5.45,15.2,239.0,3.7,18.0,28.0,0.84,143.0,3.6,108.0,,,,,,,,,,8.1,140.0,
5223,6386,61.0,0,18.8,1.0,0,310,10,0,0,0,0,N,Normal,12.6,276.0,97.0,29.2,140.0,4.5,103.0,4.8,20.0,19.0,15.0,0.66,0,0,,,,,,,,142.0,4.5,107.0,,,,,,,,,,,,
5224,6387,24.0,0,22.9,1.0,0,185,6,0,0,0,0,N,Normal,12.5,214.0,90.0,32.2,142.0,3.6,95.0,4.4,16.0,10.0,7.0,0.65,0,0,7.70,12.5,214.0,4.4,16.0,10.0,0.65,142.0,3.6,103.0,0.03,1.07,32.2,239.0,,,,,,9.4,,
