In [7]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder

In [8]:
df= pd.read_csv(f'./data/mimic_data/full_step1.csv')

In [9]:
missing_count = df[['PEEP','FiO2']].isnull().sum(axis=1)
df['missing'] =  missing_count/2

In [10]:
def impute_Gram_stain(df_in):
    Gram_stain_feature = ['Abdomen','Blood','Respiratory tract','Skin and soft tissue','Urinary tract','Others']
    df_out = df_in.copy()
    for col in Gram_stain_feature:
        df_out[col] = df_out[col].rolling(window=5, center=True, min_periods=1).apply(lambda x: (x.sum() >= 1) * 1, raw=False)
    return df_out

def impute_PEEP_FiO2(df_P,avg_PEEP,avg_FiO2):
    PEEP = 5
    FiO2 = 30
    df_P.loc[df_P['use_vent'] == 0, ['PEEP']] = PEEP
    df_P.loc[df_P['use_vent'] == 0, ['FiO2']] = FiO2
    fill_interval = False
    for i in range(len(df_P)):
        if df_P.loc[i, 'use_vent'] == 1 and not fill_interval:
            fill_interval = True
            start_index = i
        elif df_P.loc[i, 'use_vent'] == 0 and fill_interval:
            fill_interval = False
            end_index = i
            subset = df_P.loc[start_index:end_index - 1].copy()
            subset['PEEP'] = subset['PEEP'].interpolate(method='linear', limit_direction='both')
            subset['FiO2'] = subset['FiO2'].interpolate(method='linear', limit_direction='both')
            subset['PEEP'].fillna(avg_PEEP, inplace=True)
            subset['FiO2'].fillna(avg_FiO2, inplace=True)
            df_P.loc[start_index:end_index - 1] = subset
    return df_P


def impute_vent_feature(df_input,df_Mean):
    df_P = df_input.copy()

    vent_feature_list = ['PEEP','FiO2','Mean Airway Pressure','Peak Airway Pressure','RASS']
    PEEP = 5
    FiO2 = 30
    df_P.loc[df_P['use_vent'] == 0, ['PEEP']] = PEEP
    df_P.loc[df_P['use_vent'] == 0, ['FiO2']] = FiO2
    
    fill_interval = False
    for i in range(len(df_P)):
        if df_P.loc[i, 'use_vent'] == 1 and not fill_interval:
            fill_interval = True
            start_index = i
        elif df_P.loc[i, 'use_vent'] == 0 and fill_interval:
            fill_interval = False
            end_index = i
            subset = df_P.loc[start_index:end_index - 1].copy()
            
            for feature_name in vent_feature_list:
                subset[feature_name].fillna(method='ffill', inplace=True)
                subset[feature_name].fillna(df_Mean, inplace=True)
            df_P.loc[start_index:end_index - 1] = subset
            
    for feature_name in vent_feature_list:
        df_P.loc[:, feature_name].fillna(df_Mean[feature_name], inplace=True)
    
    return df_P


In [11]:
def remove_outliers(series):
    if  series.dtype == object:
        print(f'object....{series.name}')
        return series
    if series.max() == 1 and series.min() == 0:
        print(f'found 0/1!....{series.name}')
        return series
    
    Q1 = series.quantile(0.25)
    Q3 = series.quantile(0.75)
    IQR = Q3 - Q1
    return series[(series >= Q1 - 1.5 * IQR) & (series <= Q3 + 1.5 * IQR)]

In [12]:
feature_name_list = df.columns.to_list()
missing_ratio =  df.isna().mean().to_list()

In [13]:
drop_cols = ['Total Protein','Compliance','Albumin','Alkaline Phos.','Total Bilirubin','ALT (SGPT)','AST (SGOT)','Respiration','Pressure Support','ROXindex','Troponin - I','PTT_Ratio']
df = df.drop(drop_cols,axis = 1)

In [14]:
#各欄位的缺失比例
missing_data_ratios = df.isna().mean()
cols_with_missing_data = missing_data_ratios[missing_data_ratios == 1].index
df = df.drop(cols_with_missing_data, axis=1)

In [15]:
#input()

In [16]:
""" Missing value mask ==> 1 / 0 """
for col in df.columns:
    mask_col_name = col + '_mask'
    df[mask_col_name] = df[col].isna().astype(int)
    

""" Label Embedding """
object_col_list = []
le = LabelEncoder()
for col in df.select_dtypes(include='object'):
    if col == 'date' or col == 'data_type' or col == 'ventilation_status': 
        continue
    df[col].fillna("NULL", inplace=True) 
    df[col] = le.fit_transform(df[col])
    object_col_list.append(col)
    

""" Fill missing values with the binary column"""
binary_cols = df.columns[(df.isin([0, 1, np.nan])).all()]
df[binary_cols] = df[binary_cols].fillna(0)
print(binary_cols)


exclude_cols = ['date', 'data_type', 'ventilation_status','FiO2','PEEP']
dont_remove_outlier_cols = exclude_cols + list(binary_cols) + object_col_list
df = df.apply(lambda col: remove_outliers(col) if col.name not in dont_remove_outlier_cols else col, axis=0)

  df[mask_col_name] = df[col].isna().astype(int)
  df[mask_col_name] = df[col].isna().astype(int)
  df[mask_col_name] = df[col].isna().astype(int)
  df[mask_col_name] = df[col].isna().astype(int)
  df[mask_col_name] = df[col].isna().astype(int)
  df[mask_col_name] = df[col].isna().astype(int)
  df[mask_col_name] = df[col].isna().astype(int)
  df[mask_col_name] = df[col].isna().astype(int)
  df[mask_col_name] = df[col].isna().astype(int)
  df[mask_col_name] = df[col].isna().astype(int)
  df[mask_col_name] = df[col].isna().astype(int)
  df[mask_col_name] = df[col].isna().astype(int)
  df[mask_col_name] = df[col].isna().astype(int)
  df[mask_col_name] = df[col].isna().astype(int)
  df[mask_col_name] = df[col].isna().astype(int)
  df[mask_col_name] = df[col].isna().astype(int)
  df[mask_col_name] = df[col].isna().astype(int)
  df[mask_col_name] = df[col].isna().astype(int)
  df[mask_col_name] = df[col].isna().astype(int)
  df[mask_col_name] = df[col].isna().astype(int)
  df[mask_col_name] 

Index(['use_vent', 'PC mode', 'Vasopressor', 'Relaxant', 'Sedation', 'PPI',
       'Pain control', 'Aspergillus', 'Candida', 'Abdomen',
       ...
       'Ionized Calcium_mask', 'Triglycerides_mask', 'Cortisol_mask',
       'Uric Acid_mask', 'Ammonia_mask', 'Vitamin B12_mask', 'Weaning_mask',
       'Reintubation_mask', 'Weaning_successful_mask', 'missing_mask'],
      dtype='object', length=165)
found 0/1!....missing


In [17]:
import os

""" mean of column """
# 路徑
file_path = f'./data/mimic_data/df_Mean.csv'

if os.path.exists(file_path):
    df_Mean = pd.read_csv(file_path)
else:
    df_Mean = pd.DataFrame(df.mean()).T
    df_Mean.to_csv(file_path, index=False) 

  df_Mean = pd.DataFrame(df.mean()).T


In [18]:
avg_PEEP = df[df['use_vent'] == 1]['PEEP'].mean()
avg_FiO2 = df[df['use_vent'] == 1]['FiO2'].mean()

In [19]:
df_result = pd.DataFrame()
df_result_list = []
mean = df_Mean.mean()

distinct_stay_id = df['stay_id'].unique()
for stay_ids in tqdm(distinct_stay_id):     
    df_P = df[df['stay_id'] == stay_ids]
    df_P.reset_index(drop=True, inplace=True)
    

    df_P = impute_vent_feature(df_P,mean)
    df_P = impute_Gram_stain(df_P)
    
    df_P.interpolate(method='linear',limit_direction='both', inplace=True)
    df_P.interpolate(method='pad',limit_direction='forward', inplace=True)
    df_P.interpolate(method='bfill',limit_direction='backward', inplace=True)
    df_P.fillna(mean, inplace=True)
    
    df_result_list.append(df_P)
    

100%|██████████████████████████████████████████████████████████████████████████████| 2893/2893 [02:43<00:00, 17.69it/s]


In [20]:
df_result = pd.concat(df_result_list, ignore_index=True)

In [21]:
df_result.to_csv(f'./data/mimic_data/full_step2.csv',index = False)