In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
# Register tqdm's progress helpers on pandas objects and lift the cap on
# the number of columns shown when a DataFrame is displayed.
tqdm.pandas()
pd.set_option("display.max_columns", None)

In [None]:
# KUMC is used as the internal dataset.
pat_id_cols = ["CENTER_NAME", "PATID", "ONSETS_ENCOUNTERID"]  # identifier columns
data_path = '...'  # prefix prepended to every raw CSV file name below
%store pat_id_cols
%store data_path

Stored 'pat_id_cols' (list)
Stored 'data_path' (str)


In [None]:
# Load the onset table from CSV.
onset_df = pd.read_csv(filepath_or_buffer='...')

  onset_df = pd.read_csv('/blue/yonghui.wu/lideyi/AKI_VAE/NEW_ONSETS.csv')


We only need non-AKI and general AKI (i.e., no staging). Remove community-acquired AKI, i.e., keep only AKI whose onset occurred at least 3 days after admission. When multiple AKI stages occur during the hospitalization, we use the earliest onset stage; that is, the earliest onset must be at least 3 days after admission. The feature space consists of demographics (binary), lab tests (dense continuous), procedures (sparse binary) and medications (sparse continuous); SCr information is not included. Lab tests, procedures and medications are taken from a window of 3 days before onset. For non-AKI patients, the prediction point is the last SCr measurement date.

In [4]:
# Work on an independent copy of the KUMC rows only.
is_kumc = onset_df['CENTER_NAME'] == 'KUMC'
KUMC_df = onset_df.loc[is_kumc].copy()

In [5]:
# Parse the date columns (ISO 'YYYY-MM-DD' strings) and store the identifiers as strings.
time_cols = ['ADMIT_DATE', 'DISCHARGE_DATE', 'AKI1_ONSET', 'AKI2_ONSET', 'AKI3_ONSET']
KUMC_df = KUMC_df.assign(
    **{name: pd.to_datetime(KUMC_df[name], format='%Y-%m-%d') for name in time_cols}
)

KUMC_df = KUMC_df.astype({'PATID': str, 'ONSETS_ENCOUNTERID': str})

In [7]:
# Order by admission date and keep admissions from 2016-01-01 through
# 2016-12-31 (both bounds inclusive).
study_start = pd.to_datetime('2016-01-01')
study_end = pd.to_datetime('2016-12-31')
KUMC_df = KUMC_df.sort_values(by='ADMIT_DATE')
admitted_in_window = KUMC_df['ADMIT_DATE'].between(study_start, study_end)
KUMC_df = KUMC_df.loc[admitted_in_window].reset_index(drop=True)

In [8]:
# Row-wise earliest of the three stage onset dates (NaT when none is recorded).
onset_cols = ['AKI1_ONSET', 'AKI2_ONSET', 'AKI3_ONSET']
KUMC_df['EARLIEST_STAGE_DATE'] = KUMC_df[onset_cols].min(axis='columns')

In [9]:
# Whole days between admission and the earliest onset (NaN when there is no onset).
admit_to_onset = KUMC_df['EARLIEST_STAGE_DATE'].sub(KUMC_df['ADMIT_DATE'])
KUMC_df['EARLIEST_AKI_STAGE_TO_ADMIT'] = admit_to_onset.dt.days

In [10]:
# Keep encounters with no AKI onset at all, plus hospital-acquired AKI,
# taken here as a first onset at least 3 days after admission.
days_to_onset = KUMC_df['EARLIEST_AKI_STAGE_TO_ADMIT']
KUMC_df = KUMC_df[days_to_onset.isna() | (days_to_onset >= 3)]

In [11]:
# Keep encounters whose baseline SCr is strictly below 3.5; rows with a
# baseline of 3.5 or more, or with a missing baseline, are dropped.
KUMC_df = KUMC_df[KUMC_df['BASELINE_SCR'].lt(3.5)]

In [12]:
# Renumber the rows 0..n-1 after the filters above.
KUMC_df = KUMC_df.reset_index(drop=True)

# Add AKI Onset Label

In [14]:
# The label is the complement of the non-AKI flag: 1 = AKI, 0 = non-AKI.
KUMC_df['AKI_LABEL'] = KUMC_df['NONAKI_SINCE_ADMIT'].rsub(1)

In [15]:
# Class balance of the AKI label.
KUMC_df['AKI_LABEL'].value_counts()

AKI_LABEL
0    19192
1     2335
Name: count, dtype: int64

# Add 1-year Mortality Label

In [16]:
# Read patient IDs and death dates.
death_date_col = 'AKI.DEATH_DATE"+PD.DATE_SHIFT"'
KUMC_death = pd.read_csv(data_path + 'AKI_DEMO_DEATH.csv', delimiter = ',',
                         usecols = ['PATID', death_date_col])
# `usecols` only selects columns; the resulting order follows the file, not
# the list. Rename by name (instead of assigning `.columns` positionally) so
# the two columns can never be swapped, then fix the column order explicitly.
KUMC_death = KUMC_death.rename(columns = {death_date_col: 'DEATH_DATE'})[['PATID', 'DEATH_DATE']]

In [17]:
# Store patient IDs as strings and parse death dates, letting pandas infer
# the format of each value individually ('mixed').
KUMC_death = KUMC_death.assign(
    PATID=KUMC_death['PATID'].astype(str),
    DEATH_DATE=pd.to_datetime(KUMC_death['DEATH_DATE'], format='mixed'),
)

In [18]:
# Keep the first row seen for each patient.
KUMC_death = KUMC_death.drop_duplicates(subset='PATID')

In [19]:
# Attach the death date to every encounter of the patient (NaT if none).
KUMC_df = pd.merge(KUMC_df, KUMC_death, how='left', on='PATID')

In [20]:
# Whole days from discharge to death (NaN when no death date is known).
discharge_to_death = KUMC_df['DEATH_DATE'].sub(KUMC_df['DISCHARGE_DATE'])
KUMC_df['DEATH_DAYS'] = discharge_to_death.dt.days

In [21]:
# 1 when death falls fewer than 365 days after discharge, else 0
# (a missing DEATH_DAYS compares False and therefore yields 0).
KUMC_df['MORT_1_YEAR'] = KUMC_df['DEATH_DAYS'].lt(365).astype(int)

In [22]:
# Class balance of the 1-year mortality label.
KUMC_df['MORT_1_YEAR'].value_counts()

MORT_1_YEAR
0    18307
1     3220
Name: count, dtype: int64

# Read SCR to Get Prediction Points for Non-AKI Patients

In [24]:
# Load the serum creatinine (SCr) lab table.
scr_csv_path = data_path + 'AKI_LAB_SCR.csv'
SCR_df = pd.read_csv(scr_csv_path)

In [25]:
# Strip the trailing '"+PD.DATE_SHIFT"' marker from column names that carry it,
# then keep only the columns listed in `use_cols`.
use_cols = ['ONSETS_ENCOUNTERID', 'PATID', 'ENCOUNTERID', 'SPECIMEN_DATE',
            'RESULT_NUM', 'DAYS_SINCE_ADMIT']
date_shift_suffix = '"+PD.DATE_SHIFT"'
SCR_cols = []
for name in SCR_df.columns.tolist():
    if name.endswith(date_shift_suffix):
        name = name[:-len(date_shift_suffix)]
    SCR_cols.append(name)
SCR_df.columns = SCR_cols
SCR_df = SCR_df[use_cols]

In [26]:
# Compare patient IDs as strings and keep SCr rows of cohort patients only.
SCR_df = SCR_df.assign(PATID=SCR_df['PATID'].astype(str))
SCR_df = SCR_df[SCR_df['PATID'].isin(KUMC_df['PATID'])]

In [27]:
# Specimen dates are parsed with the '%d-%b-%y' format.
specimen_dates = pd.to_datetime(SCR_df['SPECIMEN_DATE'], format='%d-%b-%y')
SCR_df = SCR_df.assign(SPECIMEN_DATE=specimen_dates)

In [28]:
# Encounters flagged as non-AKI since admission.
is_non_aki = KUMC_df['NONAKI_SINCE_ADMIT'] == True
non_AKI_df = KUMC_df[is_non_aki]

In [29]:
# Pair every non-AKI encounter with all SCr specimen dates of the same patient;
# the pairs are restricted to the encounter's stay in the next step.
scr_dates = SCR_df.loc[:, ['PATID', 'SPECIMEN_DATE']]
non_AKI_SCR_df = pd.merge(non_AKI_df, scr_dates, how='left', on='PATID')

In [30]:
# Keep specimens taken strictly after admission and no later than discharge.
taken_after_admit = non_AKI_SCR_df['SPECIMEN_DATE'].gt(non_AKI_SCR_df['ADMIT_DATE'])
taken_by_discharge = non_AKI_SCR_df['SPECIMEN_DATE'].le(non_AKI_SCR_df['DISCHARGE_DATE'])
non_AKI_SCR_df = non_AKI_SCR_df[taken_after_admit & taken_by_discharge]

In [31]:
# Order chronologically within each patient/encounter so that the last row
# of every group is its latest specimen.
scr_sort_keys = ['PATID', 'ONSETS_ENCOUNTERID', 'SPECIMEN_DATE']
non_AKI_SCR_df = non_AKI_SCR_df.sort_values(by=scr_sort_keys)

In [32]:
# Only the last SCr measurement date of each encounter is kept.
encounter_keys = ['PATID', 'ONSETS_ENCOUNTERID']
last_scr_rows = non_AKI_SCR_df.groupby(encounter_keys).tail(1)
non_AKI_SCR_df = last_scr_rows.reset_index(drop=True)

In [33]:
# The remaining specimen date is the encounter's last SCr date.
non_AKI_SCR_df = non_AKI_SCR_df.rename(columns={'SPECIMEN_DATE': 'LAST_SCR_DATE'})

In [34]:
# Whole days between admission and the last SCr measurement.
admit_to_last_scr = non_AKI_SCR_df['LAST_SCR_DATE'].sub(non_AKI_SCR_df['ADMIT_DATE'])
non_AKI_SCR_df['LAST_SCR_TO_ADMIT'] = admit_to_last_scr.dt.days

In [35]:
# Require the last SCr to be at least 3 days after admission, the same
# minimum gap applied to AKI onsets.
non_AKI_SCR_df = non_AKI_SCR_df[non_AKI_SCR_df['LAST_SCR_TO_ADMIT'].ge(3)]

# Merge Non-AKI Patients Back and Derive Prediction Point

In [36]:
# Bring LAST_SCR_DATE back onto the cohort; encounters without a match get NaT.
encounter_keys = ['PATID', 'ONSETS_ENCOUNTERID']
last_scr_lookup = non_AKI_SCR_df[encounter_keys + ['LAST_SCR_DATE']]
KUMC_df = pd.merge(KUMC_df, last_scr_lookup, how='left', on=encounter_keys)

In [37]:
# Sanity checks on AKI encounters: none may carry a LAST_SCR_DATE and every
# one must have an onset date.
aki_rows = KUMC_df[KUMC_df['NONAKI_SINCE_ADMIT'] == False]
assert aki_rows['LAST_SCR_DATE'].isna().mean() == 1
assert aki_rows['EARLIEST_STAGE_DATE'].isna().mean() == 0

In [38]:
# Prediction point: the AKI onset date where present, otherwise the last SCr date.
onset_date = KUMC_df['EARLIEST_STAGE_DATE']
last_scr_date = KUMC_df['LAST_SCR_DATE']
KUMC_df['PREDICTION_POINT'] = onset_date.combine_first(last_scr_date)

In [39]:
# Drop encounters for which no prediction point could be derived.
KUMC_df = KUMC_df.dropna(subset=['PREDICTION_POINT']).reset_index(drop=True)

In [40]:
# Every prediction point must lie at least 3 days after admission.
admit_to_prediction = KUMC_df['PREDICTION_POINT'].sub(KUMC_df['ADMIT_DATE'])
KUMC_df['PREDICTION_POINT_TO_ADMIT'] = admit_to_prediction.dt.days
assert KUMC_df['PREDICTION_POINT_TO_ADMIT'].ge(3).all()

# Read Medications

In [4]:
# Load only the medication-administration columns needed downstream.
med_usecols = [
    'PATID',
    'MEDADMIN_START_DATE"+PD.DATE_SHIFT"',
    'MEDADMIN_STOP_DATE"+PD.DATE_SHIFT"',
    'MEDADMIN_TYPE',
    'MEDADMIN_CODE',
    'MEDADMIN_DOSE_ADMIN',
]
MED_df = pd.read_csv(data_path + 'AKI_AMED.csv', usecols=med_usecols)

In [42]:
# Store IDs and medication codes as strings and give the date columns short names.
MED_df = MED_df.astype({'PATID': str, 'MEDADMIN_CODE': str})
MED_df = MED_df.rename(columns={
    'MEDADMIN_START_DATE"+PD.DATE_SHIFT"': "MED_START_DATE",
    'MEDADMIN_STOP_DATE"+PD.DATE_SHIFT"': "MED_END_DATE",
})

In [43]:
# Restrict to cohort patients first so that date parsing runs on fewer rows.
MED_df = MED_df[MED_df['PATID'].isin(KUMC_df['PATID'])]

In [44]:
# Parse both medication date columns with the '%d-%b-%y' format.
for med_date_col in ('MED_START_DATE', 'MED_END_DATE'):
    MED_df[med_date_col] = pd.to_datetime(MED_df[med_date_col], format='%d-%b-%y')

In [45]:
# Pair each encounter with every medication record of the same patient.
KUMC_med = pd.merge(KUMC_df, MED_df, how='left', on='PATID')

In [46]:
# A medication counts when its administration interval overlaps the observation
# window: it starts before the prediction point and ends no earlier than
# 3 days before it.
med_window_start = KUMC_med['PREDICTION_POINT'] - pd.Timedelta(days=3)
started_before_point = KUMC_med['MED_START_DATE'] < KUMC_med['PREDICTION_POINT']
ended_within_window = KUMC_med['MED_END_DATE'] >= med_window_start
KUMC_med = KUMC_med[started_before_point & ended_within_window]

In [47]:
# Wide table: one column per medication code holding the maximum administered
# dose for the encounter; code/encounter combinations never seen become 0.
KUMC_med_final = pd.pivot_table(
    KUMC_med,
    values='MEDADMIN_DOSE_ADMIN',
    index=['PATID', 'ONSETS_ENCOUNTERID'],
    columns='MEDADMIN_CODE',
    aggfunc='max',
    fill_value=0,
).reset_index()

In [48]:
# Everything after the two leading ID columns is a medication feature.
n_id_cols = 2
medication_space = KUMC_med_final.columns[n_id_cols:]

In [49]:
# Attach the medication features to the cohort by encounter.
KUMC_df = pd.merge(KUMC_df, KUMC_med_final, how='left',
                   on=['PATID', 'ONSETS_ENCOUNTERID'])

In [50]:
# Encounters with no medication in the window got NaN from the left merge;
# record those as dose 0.
med_values = KUMC_df[medication_space]
KUMC_df[medication_space] = med_values.fillna(0)

In [51]:
# drop features with an imbalance > 0.99
def drop_highly_imbalanced(df, cols, threshold=0.99):
    """Drop columns that are dominated by a single value.

    For every column in ``cols`` the relative frequency of its most common
    value is computed with ``value_counts(normalize=True)`` (missing values
    are not counted by default). A column is dropped when that frequency is
    strictly greater than ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the candidate columns; it is not modified.
    cols : iterable of column labels
        Columns to examine (a list or a pandas Index both work).
    threshold : float, default 0.99
        Largest tolerated share of the most frequent value.

    Returns
    -------
    (pandas.DataFrame, list)
        A copy of ``df`` without the dropped columns, and the labels from
        ``cols`` that were kept, in their original order.
    """
    cols_to_drop = []
    for col in tqdm(cols):
        max_proportion = df[col].value_counts(normalize=True).max()
        if max_proportion > threshold:
            cols_to_drop.append(col)
    # Membership is checked against a set: with thousands of candidate columns
    # the former list lookup made this step quadratic.
    dropped = set(cols_to_drop)
    updated_cols = [col for col in cols if col not in dropped]
    return df.drop(columns=cols_to_drop), updated_cols

In [52]:
# Remove medication features whose most frequent value covers more than 99% of rows.
KUMC_df, medication_space = drop_highly_imbalanced(
    df=KUMC_df, cols=medication_space, threshold=0.99
)

100%|██████████| 1932/1932 [00:00<00:00, 3153.66it/s]


In [53]:
# Number of medication features kept.
len(list(medication_space))

277

# Read Procedures

In [54]:
# Load patient ID, procedure date and procedure code from the procedure table.
px_usecols = ['PATID', 'PX_DATE"+PD.DATE_SHIFT"', 'PX']
PX_df = pd.read_csv(data_path + 'AKI_PX.csv', usecols=px_usecols)

  PX_df = pd.read_csv(data_path + 'AKI_PX.csv', usecols = ['PATID', 'PX_DATE"+PD.DATE_SHIFT"', 'PX'])


In [55]:
# Store IDs and procedure codes as strings, then restrict to cohort patients
# before the date column is parsed.
PX_df = PX_df.astype({'PATID': str, 'PX': str})
PX_df = PX_df.loc[PX_df['PATID'].isin(KUMC_df['PATID'])]

In [56]:
# Give the procedure date a short name and parse it with the '%d-%b-%y' format.
PX_df = PX_df.rename(columns={'PX_DATE"+PD.DATE_SHIFT"': 'PX_DATE'})
PX_df = PX_df.assign(PX_DATE=pd.to_datetime(PX_df['PX_DATE'], format='%d-%b-%y'))

In [57]:
# Pair each encounter with every procedure record of the same patient.
KUMC_PX = pd.merge(KUMC_df, PX_df, how='left', on='PATID')

In [58]:
# Procedures must fall inside the observation window:
# [prediction point - 3 days, prediction point).
px_window_start = KUMC_PX['PREDICTION_POINT'] - pd.Timedelta(days=3)
px_in_window = KUMC_PX['PX_DATE'].ge(px_window_start) & KUMC_PX['PX_DATE'].lt(KUMC_PX['PREDICTION_POINT'])
KUMC_PX = KUMC_PX[px_in_window]

In [59]:
# HELP_COL is a constant 1 so the pivot yields a 0/1 indicator per procedure
# code: 1 when the code occurs for the encounter, 0 (the fill value) otherwise.
KUMC_PX = KUMC_PX.assign(HELP_COL=1)
KUMC_PX_final = pd.pivot_table(
    KUMC_PX,
    values='HELP_COL',
    index=['PATID', 'ONSETS_ENCOUNTERID'],
    columns='PX',
    aggfunc='max',
    fill_value=0,
).reset_index()

In [60]:
# Everything after the two leading ID columns is a procedure feature.
n_id_cols = 2
PX_space = KUMC_PX_final.columns[n_id_cols:]

In [61]:
# Attach the procedure indicators to the cohort by encounter.
KUMC_df = pd.merge(KUMC_df, KUMC_PX_final, how='left',
                   on=['PATID', 'ONSETS_ENCOUNTERID'])

In [62]:
# Encounters with no procedure in the window got NaN from the left merge;
# record those as 0.
px_values = KUMC_df[PX_space]
KUMC_df[PX_space] = px_values.fillna(0)

In [63]:
# Remove procedure features whose most frequent value covers more than 99% of rows.
KUMC_df, PX_space = drop_highly_imbalanced(
    df=KUMC_df, cols=PX_space, threshold=0.99
)

100%|██████████| 3281/3281 [00:01<00:00, 3054.82it/s]


In [64]:
# Number of procedure features kept.
len(list(PX_space))

288

# Each Patient Should Be Unique: Just use the first Encounter of Each Patients

In [66]:
# One row per patient: after ordering by admission date within each patient,
# keep the first remaining encounter (duplicates are judged on PATID alone).
KUMC_df = (
    KUMC_df
    .sort_values(by=['PATID', 'ADMIT_DATE'])
    .drop_duplicates(subset='PATID', keep='first')
)

# Read Labs

In [68]:
# Read the four lab columns needed downstream. pandas opens the path itself,
# so the previous `with open(...)` wrapper (whose handle was never passed to
# read_csv, making its errors='ignore' a no-op) has been removed. Decoding is
# unchanged: 'unicode_escape'.
LAB_df = pd.read_csv(data_path + 'AKI_LAB.csv',
                     delimiter=",",
                     usecols=['PATID', 'LAB_LOINC', 'SPECIMEN_DATE"+PD.DATE_SHIFT"',
                              'RESULT_NUM'],
                     encoding='unicode_escape')

In [69]:
# Store IDs and LOINC codes as strings and give the specimen date a short name.
LAB_df = LAB_df.astype({'PATID': str, 'LAB_LOINC': str})
LAB_df = LAB_df.rename(columns={'SPECIMEN_DATE"+PD.DATE_SHIFT"': 'SPECIMEN_DATE'})

In [70]:
# Restrict to cohort patients before the date column is parsed.
LAB_df = LAB_df.loc[LAB_df['PATID'].isin(KUMC_df['PATID'])]

In [71]:
# LOINC codes of the lab tests used as features.
used_lab_codes = [
    '17861-6', '2075-0', '2345-7', '2823-3', '2951-2',
    '3094-0', '4544-3', '718-7', '742-7', '777-3',
    '785-6', '786-4', '787-2', '788-0', '789-8',
]
LAB_df = LAB_df[LAB_df['LAB_LOINC'].isin(used_lab_codes)]

In [72]:
# Specimen dates are parsed with the '%d-%b-%y' format.
lab_specimen_dates = pd.to_datetime(LAB_df['SPECIMEN_DATE'], format='%d-%b-%y')
LAB_df = LAB_df.assign(SPECIMEN_DATE=lab_specimen_dates)

In [73]:
# Pair each encounter with every lab record of the same patient.
KUMC_LAB = pd.merge(KUMC_df, LAB_df, how='left', on='PATID')

In [74]:
# Labs must fall inside the observation window:
# [prediction point - 3 days, prediction point).
lab_window_start = KUMC_LAB['PREDICTION_POINT'] - pd.Timedelta(days=3)
lab_in_window = KUMC_LAB['SPECIMEN_DATE'].ge(lab_window_start) & KUMC_LAB['SPECIMEN_DATE'].lt(KUMC_LAB['PREDICTION_POINT'])
KUMC_LAB = KUMC_LAB[lab_in_window]

In [75]:
# Repeated results of the same lab within the window are averaged.
lab_group_keys = ['PATID', 'ONSETS_ENCOUNTERID', 'LAB_LOINC']
mean_result_by_lab = KUMC_LAB.groupby(lab_group_keys)['RESULT_NUM'].mean()
KUMC_LAB = mean_result_by_lab.reset_index()

In [76]:
# Wide table: one column per LOINC code. Labs absent for an encounter stay NaN
# so they can be imputed later.
KUMC_LAB_final = pd.pivot_table(
    KUMC_LAB,
    values='RESULT_NUM',
    index=['PATID', 'ONSETS_ENCOUNTERID'],
    columns='LAB_LOINC',
    aggfunc='max',
    fill_value=np.nan,
).reset_index()

In [77]:
# Everything after the two leading ID columns is a lab feature.
n_id_cols = 2
lab_space = KUMC_LAB_final.columns[n_id_cols:]

In [78]:
# Attach the lab features to the cohort by encounter.
KUMC_df = pd.merge(KUMC_df, KUMC_LAB_final, how='left',
                   on=['PATID', 'ONSETS_ENCOUNTERID'])

In [79]:
# Remove lab columns whose share of missing values exceeds 30%.
threshold = 0.3
labs_to_drop = []
for col in lab_space:
    if KUMC_df[col].isna().mean() > threshold:
        labs_to_drop.append(col)

KUMC_df = KUMC_df.drop(columns=labs_to_drop)

In [80]:
# Keep lab_space in step with the columns that remain in KUMC_df.
dropped_labs = set(labs_to_drop)
lab_space = [lab for lab in lab_space if lab not in dropped_labs]

In [81]:
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer

In [82]:
# Fill the remaining missing lab values with MICE (IterativeImputer).
# NOTE: the imputer is fitted on the whole table here; to avoid data leakage
# it should be fitted on the training data only.
imputer = IterativeImputer(missing_values=np.nan, max_iter=10000, random_state=42)
lab_values_imputed = imputer.fit_transform(KUMC_df.loc[:, lab_space])
KUMC_df.loc[:, lab_space] = lab_values_imputed
# Make sure the lab columns are stored as float64.
KUMC_df.loc[:, lab_space] = KUMC_df.loc[:, lab_space].astype(np.float64)

# Get Basic Statistics before Removing IDs

In [84]:
# Cohort size (number of rows).
print(KUMC_df.shape[0])

13752


In [85]:
# Sizes of the medication, procedure and lab feature spaces, in that order.
for space in (medication_space, PX_space, lab_space):
    print(len(list(space)))

277
288
14


In [86]:
# Positive counts for both labels, followed by their share of the cohort.
label_col_names = ['AKI_LABEL', 'MORT_1_YEAR']
positive_counts = [len(KUMC_df[KUMC_df[name] == 1]) for name in label_col_names]
for n_positive in positive_counts:
    print(n_positive)
for n_positive in positive_counts:
    print(n_positive / len(KUMC_df))

1636
1739
0.11896451425247237
0.1264543339150669


# Demographics

In [87]:
# Load encounter-level demographics: age, sex and race.
demo_usecols = ['ONSETS_ENCOUNTERID', 'AGE', 'PATID', 'SEX', 'RACE']
DEMO_df = pd.read_csv(data_path + "AKI_DEMO.csv", delimiter=',', usecols=demo_usecols)

In [88]:
# Store IDs and categorical codes as strings (a missing value becomes 'nan',
# which race_mapping handles explicitly).
demo_str_cols = ['ONSETS_ENCOUNTERID', 'PATID', 'SEX', 'RACE']
DEMO_df = DEMO_df.astype({name: str for name in demo_str_cols})

In [89]:
# Maps raw RACE codes to readable labels. Two code families are present:
# the two-character codes ('01'..'07', 'NI', 'UN', 'OT') and 'RACE:'-prefixed
# strings. 'nan' covers missing values after the column is cast to str.
#
# The two-character codes follow the PCORnet CDM RACE value set, in which
# 07 = "Refuse to answer" and OT = "Other". They were previously swapped
# ('07' -> 'Other', 'OT' -> 'Patient Refused'); 'RACE:ot' is kept paired
# with 'OT'.
# NOTE(review): 'RACE:asian/pac. isl' is mapped to 'Native Hawaiian' as
# before - confirm against the source site's coding.
race_mapping = \
{
    '01': 'American Indian or Alaska Native',
    'RACE:amer. indian': 'American Indian or Alaska Native',
    '02': 'Asian',
    'RACE:asian': 'Asian',
    '03': 'Black',
    'RACE:black': 'Black', 
    '04': 'Native Hawaiian',
    'RACE:asian/pac. isl': 'Native Hawaiian',
    'RACE:white': 'White',
    '05': 'White',
    '06': 'More Than One Race',
    '07': 'Patient Refused',
    'RACE:ot': 'Other',
    'OT': 'Other',
    'NI': 'No Information',
    'RACE:ni': 'No Information',
    'nan': 'No Information',
    'UN': 'Unknown',
    'RACE:unknown':  'Unknown'
}
%store race_mapping

Stored 'race_mapping' (dict)


In [90]:
# Translate race codes to labels (codes absent from the mapping are left as
# they are) and keep the first demographics row per patient/encounter.
DEMO_df = DEMO_df.assign(RACE=DEMO_df['RACE'].replace(race_mapping))
DEMO_df = DEMO_df.drop_duplicates(subset=['PATID', 'ONSETS_ENCOUNTERID'])

In [91]:
# Attach demographics to the cohort by encounter.
KUMC_df = pd.merge(KUMC_df, DEMO_df, how='left',
                   on=['PATID', 'ONSETS_ENCOUNTERID'])

In [92]:
# Number and share of rows with SEX == 'F'.
n_female = len(KUMC_df[KUMC_df['SEX'] == 'F'])
print(n_female)
print(n_female / len(KUMC_df))

6902
0.501890634089587


In [93]:
# Number and share of rows with RACE == 'Black'.
n_black = len(KUMC_df[KUMC_df['RACE'] == 'Black'])
print(n_black)
print(n_black / len(KUMC_df))

1856
0.13496218731820825


In [94]:
# Quartiles (25th, 50th, 75th percentile) of age.
quartile_levels = [0.25, 0.5, 0.75]
age_quantiles = KUMC_df['AGE'].quantile(quartile_levels)
print(age_quantiles)

0.25    48.0
0.50    61.0
0.75    71.0
Name: AGE, dtype: float64


In [95]:
# Quartiles of days from admission to the earliest AKI onset
# (rows with no onset are NaN and are skipped by quantile).
quartile_levels = [0.25, 0.5, 0.75]
admit_to_onset_quantiles = KUMC_df['EARLIEST_AKI_STAGE_TO_ADMIT'].quantile(quartile_levels)
print(admit_to_onset_quantiles)

0.25     4.0
0.50     7.0
0.75    17.0
Name: EARLIEST_AKI_STAGE_TO_ADMIT, dtype: float64


# Save Table

In [96]:
# Full feature list in a fixed order: medications, then procedures, then labs.
feature_space = [*medication_space, *PX_space, *lab_space]

In [97]:
# Sizes of the medication, procedure and lab feature spaces, in that order.
for space in (medication_space, PX_space, lab_space):
    print(len(list(space)))

277
288
14


In [98]:
# No feature column may contain a missing value.
missing_rate_per_feature = KUMC_df[feature_space].isna().mean()
assert missing_rate_per_feature.sum() == 0

In [99]:
# Keep the features followed by the two labels; IDs and helper columns are dropped.
KUMC_df = KUMC_df.loc[:, [*feature_space, *label_col_names]]

In [100]:
# before min-max norm, we process outliers first
from scipy.stats import mstats

# Winsorize (clip the lowest and highest 1% of values) the FEATURE columns only.
# Previously every column was winsorized, labels included; for a label with
# fewer than 1% positives that would overwrite all positives with 0.
winsorized_features = KUMC_df[feature_space].apply(
    lambda col: mstats.winsorize(col, limits=[0.01, 0.01])
)
# Keep the original row index so the labels line up when re-attached.
winsorized_features.index = KUMC_df.index
# Same column order as before: features first, then the untouched labels.
KUMC_df = pd.concat([winsorized_features, KUMC_df[label_col_names]], axis=1)

In [101]:
# Rescale every column to the [0, 1] range with min-max normalization.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler().fit(KUMC_df)
KUMC_df_scaled = scaler.transform(KUMC_df)
# The scaler returns a bare array; rebuild the DataFrame with the column names.
KUMC_df = pd.DataFrame(KUMC_df_scaled, columns=KUMC_df.columns)

In [None]:
# Write the final table without the row index.
KUMC_df.to_csv(path_or_buf='...', index=False)

In [104]:
%store feature_space
%store medication_space
%store PX_space
%store lab_space
%store label_col_names

Stored 'feature_space' (list)
Stored 'medication_space' (list)
Stored 'PX_space' (list)
Stored 'lab_space' (list)
Stored 'label_col_names' (list)
