In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
tqdm.pandas()
pd.set_option("...", None)
%store -r ct_names
%store -r raw_path

In [None]:
# Root location of the raw data and the list of center names to process.
# NOTE: these assignments overwrite whatever `%store -r` restored in the first cell.
raw_path = "..."
ct_names = ["...", "...", "...", "...", "...", "...", "...", "..."]
# Persist both values in IPython's store so they can be restored with `%store -r`.
%store raw_path
%store ct_names

In [None]:
# Key columns used for every groupby / merge / drop_duplicates below;
# the notebook treats one combination of these columns as one encounter.
pat_id_cols = ["...", "...",  "..."]
%store pat_id_cols

# Read AKI and Non-AKI Patient Hospitalization Records

In [None]:
def read_and_format_onsets(ct_names, raw_path):
    """Load, format and stack the onset tables of all centers.

    Reads one table per center with ``read_onsets``, formats them with
    ``format_onest_dict``, prints one line per center built from the center
    name and the length of its ``ONSETS_ENCOUNTERID`` column, and returns
    the row-wise concatenation of all formatted tables.
    """
    print("...")
    per_center = format_onest_dict(read_onsets(ct_names, raw_path))

    # one report line per center
    for center, frame in per_center.items():
        n_rows = len(frame.ONSETS_ENCOUNTERID)
        print("..." % (center, n_rows))

    return concat_dfs_to_one(per_center)

In [None]:
def concat_dfs_to_one(info_dict):
    """Stack every DataFrame stored in ``info_dict`` row-wise into one frame.

    Frames are taken in the dict's iteration order. The keys are ignored and
    each frame keeps its own row index (the result is not re-indexed).
    """
    frames = list(info_dict.values())
    return pd.concat(frames, axis = 0)

In [None]:
def read_onsets(ct_names, raw_path):
    # Read the onset table of every center in `ct_names` and return a dict
    # {center name: DataFrame restricted to `use_cols`}.
    # NOTE(review): the if/elif chain below has no final `else`. A center name
    # that matches no branch leaves `onset_df` unbound on the first iteration,
    # or silently reuses the previous center's frame afterwards.
    onset_dict = dict()
    use_cols = ["...", "...", "...", "..."]
    
    for ct_name in ct_names:

        # path prefix under which this center's files live
        data_path = get_data_path(ct_name, raw_path)
            
        if (ct_name == "...") or (ct_name == "...") or (ct_name == "..."):
            onset_df = pd.read_csv(data_path + "...", delimiter = "...", usecols = use_cols)
            
        elif (ct_name == "..."):
            onset_df = pd.read_csv(data_path + "...", delimiter = "...", usecols = use_cols)
            
        elif (ct_name == "..."):
            # read all columns, upper-case the header, then keep only use_cols
            onset_df = pd.read_csv(data_path + "...", delimiter = "...")
            onset_df.columns = [col.upper() for col in onset_df.columns] 
            onset_df = onset_df[use_cols]
            
        elif (ct_name == "..."):
            onset_df = pd.read_csv(data_path + "...", delimiter = "...", usecols = use_cols)
            
        elif (ct_name == "..."):
            onset_df = pd.read_csv(data_path + "...", delimiter = "...", usecols = use_cols)
            
        elif (ct_name == "..."):
            # read all columns, strip a fixed suffix from every column name
            # that ends with it, then keep only use_cols
            onset_df = pd.read_csv(data_path + "...", delimiter = "...")
            onset_cols = onset_df.columns.tolist()
            onset_cols = [s[:-len("..."..."...")] \
                              if s.endswith("..."..."...") else s for s in onset_cols]
            onset_df.columns = onset_cols
            onset_df = onset_df[use_cols]
            
        onset_dict[ct_name] = onset_df

    return onset_dict

In [None]:
def get_data_path(ct_name, raw_path):
    """Return the path prefix under which one center's raw files are stored.

    One specific center name maps to a fixed sub-path below ``raw_path``;
    every other center uses ``raw_path`` + its own name + a fixed suffix.
    """
    if ct_name != "...":
        return raw_path + ct_name + "..."
    # special-cased center: fixed sub-path instead of the center name
    return raw_path + "..." + "..."

In [None]:
def format_onest_dict(onset_dict):
    """Normalise the dtypes of every per-center onset DataFrame.

    For each (center name, DataFrame) pair the frame is modified in place:
    two columns are cast to ``str``, one column is renamed, two columns are
    parsed with ``pd.to_datetime`` using an explicit format, and a column
    holding the center name is added.

    Parameters
    ----------
    onset_dict : dict
        Maps center name to its onset DataFrame.

    Returns
    -------
    dict
        A new dict with the same keys. The values are the same DataFrame
        objects that were passed in (now mutated), not copies.
    """
    processed_onset_dict = dict()
    for ct_name, onset_df in onset_dict.items():    
        #convert id columns to string
        onset_df["..."] = onset_df["..."].astype(str)
        onset_df["..."] = onset_df["..."].astype(str)
        onset_df.rename(columns={"...": "..."}, inplace = True) 
        
        # Converting string data type into datetime object
        onset_df["..."] = pd.to_datetime(onset_df["..."], format="...")
        onset_df["..."] = pd.to_datetime(onset_df["..."], format="...")
        
        # tag every row with the center it came from
        onset_df["..."] = ct_name
        processed_onset_dict[ct_name] = onset_df
    return processed_onset_dict

In [None]:
# Load and format the onset tables of all centers into one DataFrame.
onset_df = read_and_format_onsets(ct_names, raw_path)

# Read SCr Trajectories

In [None]:
def read_and_format_SCR(ct_names, raw_path):
    """Load, format and stack the SCr tables of all centers.

    Chains ``read_SCR`` -> ``format_SCR_dict`` -> ``concat_dfs_to_one`` and
    returns the single concatenated DataFrame.
    """
    raw_by_center = read_SCR(ct_names, raw_path)
    formatted_by_center = format_SCR_dict(raw_by_center)
    return concat_dfs_to_one(formatted_by_center)

In [None]:
# Read the SCr records of every center and return {center name: DataFrame}.
# NOTE(review): an earlier version of this comment said only historical records
# (DAYS_SINCE_ADMIT < 0) are kept, but this function applies no row filter.
def read_SCR(ct_names, raw_path):
    # NOTE(review): the if/elif chain has no final `else`; an unmatched center
    # name would reuse the previous center's `SCR_df` (or fail on the first one).
    SCR_dict = dict()
    use_cols = ["...","...","...","...","...", "..."]

    for ct_name in tqdm(ct_names):
        
        # path prefix under which this center's files live
        data_path = get_data_path(ct_name, raw_path)
        
        if (ct_name == "...") or (ct_name == "...") or (ct_name == "..."):
            SCR_df = pd.read_csv(data_path + "...", delimiter = "...", usecols=use_cols)
        elif (ct_name == "..."):
            SCR_df = pd.read_csv(data_path + "...", delimiter = "...", usecols=use_cols)
        elif (ct_name == "..."):
            # read all columns, upper-case the header, then keep only use_cols
            SCR_df = pd.read_csv(data_path + "...", delimiter = "...")
            SCR_df.columns = [col.upper() for col in SCR_df.columns] 
            SCR_df = SCR_df[use_cols]
        elif (ct_name == "..."):
            SCR_df = pd.read_csv(data_path + "...", delimiter = "...", usecols=use_cols)
        elif (ct_name == "..."):
            SCR_df = pd.read_csv(data_path + "...", delimiter = "...", usecols=use_cols)
        elif (ct_name == "..."):
            # read all columns, strip a fixed suffix from every column name
            # that ends with it, then keep only use_cols
            SCR_df = pd.read_csv(data_path + "...", delimiter = "...")
            SCR_cols = SCR_df.columns.tolist()
            SCR_cols = [s[:-len("..."+PD.DATE_SHIFT"...")] \
                              if s.endswith("..."+PD.DATE_SHIFT"...") else s for s in SCR_cols]
            SCR_df.columns = SCR_cols
            SCR_df = SCR_df[use_cols]

        SCR_dict[ct_name] = SCR_df
        
    return SCR_dict

In [None]:
def format_SCR_dict(SCR_dict):
    """Normalise the dtypes of every per-center SCr DataFrame.

    Each frame is modified in place: id-like columns are cast to ``str``, a
    column is parsed with ``pd.to_datetime`` using an explicit format, and a
    column holding the center name is added. For one specific center a
    datetime column is additionally reduced to its date part and converted
    back to datetime.

    Parameters
    ----------
    SCR_dict : dict
        Maps center name to its SCr DataFrame.

    Returns
    -------
    dict
        A new dict with the same keys; the values are the same (mutated)
        DataFrame objects that were passed in.
    """
    processed_SCR_dict = dict()
    for ct_name, SCR_df in tqdm(SCR_dict.items()):
        # cast id-like columns to string
        SCR_df["..."] = SCR_df["..."].astype(str)
        SCR_df[["...", "..."]] = SCR_df[["...", "..."]].astype(str)
        # parse the date column
        SCR_df["..."] = pd.to_datetime(SCR_df["..."], format="...")
        if ct_name == "...":
            # .dt.date followed by to_datetime drops the time-of-day component
            # (presumably applied to the same column -- TODO confirm)
            SCR_df["..."] = SCR_df["..."].dt.date
            SCR_df["..."] = pd.to_datetime(SCR_df["..."])
        # tag every row with the center it came from
        SCR_df["..."] = ct_name
        processed_SCR_dict[ct_name] = SCR_df
    return processed_SCR_dict

In [None]:
# Load and format the SCr tables of all centers into one DataFrame.
SCR_df = read_and_format_SCR(ct_names, raw_path)

# Merge Hospitalization Records with SCR

In [None]:
# Important: the merge keys do not include the encounter id, so every encounter
# of a patient is paired with ALL SCr measurements that patient ever had
# (the result has one row per encounter x measurement).
complete_df = onset_df.merge(SCR_df[["...", "...", "...", "..."]], 
                             on = ["...", "..."], how = "...")

# Compute SCR Baseline Based on 7-day SCr Prior to Admission 

In [None]:
# SCr measured from the admission date up to and including one day after it,
# i.e. SPECIMEN_DATE in [ADMIT_DATE, ADMIT_DATE + 1 day]
admission_SCr = complete_df[(complete_df.SPECIMEN_DATE >= complete_df.ADMIT_DATE) & \
                            (complete_df.SPECIMEN_DATE <= (complete_df.ADMIT_DATE + pd.Timedelta(days=1)))].copy()

In [None]:
# Admission SCr is the mean of all measurements in that window (one value per pat_id_cols group)
admission_SCr = admission_SCr.groupby(pat_id_cols)["..."].mean().reset_index()

In [None]:
# give the aggregated column its own name before merging it back
admission_SCr.rename(columns = {"...": "..."}, inplace = True)

In [None]:
# merge the admission SCr back into the main frame
complete_df = complete_df.merge(admission_SCr, on = pat_id_cols, how = "...")

In [None]:
def check_missing_percentage_without_dup(df, col_to_check, id_col):
    """Count the distinct ids that have a missing value in ``col_to_check``.

    An id counts as missing as soon as any of its rows is NaN in
    ``col_to_check``.

    Returns
    -------
    tuple
        ``(number of distinct ids with a missing value,
        that number divided by the number of distinct ids in df)``.
    """
    missing_mask = df[col_to_check].isna()
    ids_with_missing = df.loc[missing_mask, id_col].unique()
    all_ids = df[id_col].unique()
    n_missing = len(ids_with_missing)
    return (n_missing, n_missing / len(all_ids))

In [None]:
# how many encounters do not have an admission SCr level?
check_missing_percentage_without_dup(complete_df, "...", "...")

In [None]:
# SCr within the 7 days before admission: SPECIMEN_DATE in [ADMIT_DATE - 7 days, ADMIT_DATE)
one_week_prior_admission = complete_df[(complete_df.SPECIMEN_DATE >= complete_df.ADMIT_DATE - pd.Timedelta(days=7)) & \
                                 (complete_df.SPECIMEN_DATE < complete_df.ADMIT_DATE)].copy()

In [None]:
# sort within each group, then take the last value per group
# NOTE: groupby(...).last() returns the last non-null value of the column
one_week_prior_admission = one_week_prior_admission.sort_values(by = pat_id_cols + ["..."], ascending = True)
one_week_prior_admission = one_week_prior_admission.groupby(pat_id_cols)["..."].last().reset_index()

In [None]:
# give the aggregated column its own name before merging it back
one_week_prior_admission.rename(columns = {"...": "..."}, inplace = True)

In [None]:
# merge the one-week SCr back into the main frame
complete_df = complete_df.merge(one_week_prior_admission, on = pat_id_cols, how = "...")

In [None]:
# how many encounters do not have a 7-day prior SCr level?
check_missing_percentage_without_dup(complete_df, "...", "...")

# If No Prior 7-day SCr, Use One-year SCr

In [None]:
# SCr between 365 and 7 days before admission:
# SPECIMEN_DATE in [ADMIT_DATE - 365 days, ADMIT_DATE - 7 days)
one_year_prior_admission = complete_df[(complete_df.SPECIMEN_DATE >= complete_df.ADMIT_DATE - pd.Timedelta(days=365)) & \
                                 (complete_df.SPECIMEN_DATE < complete_df.ADMIT_DATE- pd.Timedelta(days=7))].copy()

In [None]:
# mean of the records in that 7-365 day window, one value per pat_id_cols group
one_year_prior_admission = one_year_prior_admission.groupby(pat_id_cols)["..."].mean().reset_index()

In [None]:
# give the aggregated column its own name before merging it back
one_year_prior_admission.rename(columns = {"...": "..."}, inplace = True)

In [None]:
# merge the one-year SCr back into the main frame
complete_df = complete_df.merge(one_year_prior_admission, on = pat_id_cols, how = "...")

In [None]:
# how many encounters do not have a one-year SCr level?
check_missing_percentage_without_dup(complete_df, "...", "...")

In [None]:
# how many encounters have neither a one-week nor a one-year SCr level?
# (same computation as check_missing_percentage_without_dup, but with two columns both NaN)
n_unique = len(complete_df["..."].unique())
n_nan_rows = len(complete_df[(complete_df["..."].isna()) & (complete_df["..."].isna())]["..."].unique())
(n_nan_rows, n_nan_rows / n_unique)

# Apply Baseline for Patients with Computed SCR records

In [None]:
# For patients without history records but with admission SCr,
# we do not use admission SCr as baseline.
# The first np.minimum is used where the tested column is non-null, the second otherwise.
# NOTE(review): np.minimum propagates NaN, so a row where either argument is NaN
# gets a NaN result. If "ignore the missing one" is wanted when only one side is
# missing, np.fmin would be needed -- confirm the intended behaviour.
complete_df["..."] = np.where(
    pd.notnull(complete_df["..."]),
    np.minimum(complete_df["..."], complete_df["..."]),
    np.minimum(complete_df["..."], complete_df["..."])
)

In [None]:
# how many encounters still have no record-based baseline?
check_missing_percentage_without_dup(complete_df, "...", "...")

# Estimate SCR Baseline for Encounters without Any History Records

In [None]:
# rows whose record-based baseline (BASELINE_SCR_RECORD) is missing;
# keep only the key columns plus two extra columns
enc_to_MDRD = complete_df.loc[complete_df.BASELINE_SCR_RECORD.isna(), 
                              pat_id_cols + ["...", "..."]].copy(deep = True)
# one row per pat_id_cols combination
enc_to_MDRD.drop_duplicates(subset = pat_id_cols, keep="...", inplace = True)

In [None]:
# display the frame (NOTE(review): this renders the whole table; .head() would be lighter)
enc_to_MDRD

MDRD for non-CKD patients

In [None]:
def read_and_format_DX(ct_names, raw_path, pat_df):
    """Load, format and stack the diagnosis tables of all centers.

    Reads one table per center with ``read_DX``, lets ``format_DX_dict``
    combine each of them with the matching rows of ``pat_df``, and returns
    the row-wise concatenation of the per-center results.
    """
    raw_by_center = read_DX(ct_names, raw_path)
    formatted_by_center = format_DX_dict(raw_by_center, pat_df)
    return concat_dfs_to_one(formatted_by_center)

In [None]:
# Read the diagnosis (DX) table of every center and return {center name: DataFrame}.
def read_DX(ct_names, raw_path):
    # NOTE(review): the if/elif chain has no final `else`; an unmatched center
    # name would reuse the previous center's `DX_df` (or fail on the first one).
    DX_dict = dict()
    use_cols = ["...", "...", "...", "...", "...", "..."]
    # NOTE(review): not referenced anywhere in this function
    ct_missing_DX_DATE = ["...", "...", "..."]
    
    for ct_name in tqdm(ct_names):
        
        # path prefix under which this center's files live
        data_path = get_data_path(ct_name, raw_path)
        
        if (ct_name == "...") or (ct_name == "...") or (ct_name == "..."):
            DX_df = pd.read_csv(data_path + "...", delimiter = "...", usecols=use_cols)
            
            # for one center (UIOWA) re-select use_cols so the columns follow use_cols order
            if ct_name == "...":
                DX_df = DX_df[use_cols]
                
        elif (ct_name == "..."):
            DX_df = pd.read_csv(data_path + "...", delimiter = "...", usecols=use_cols)
            
        elif (ct_name == "..."):
            # read all columns, upper-case the header, then keep only use_cols
            DX_df = pd.read_csv(data_path + "...", delimiter = "...")
            DX_df.columns = [col.upper() for col in DX_df.columns] 
            DX_df = DX_df[use_cols]
            
        elif (ct_name == "..."):
            DX_df = pd.read_csv(data_path + "...", delimiter = "...", usecols=use_cols)
            
        elif (ct_name == "..."):
            # skip the file's header row, pick six columns by position and
            # name them after use_cols
            DX_df = pd.read_csv(data_path + "...", delimiter = "...", header=None, 
                                           skiprows = 1, usecols=[0, 2, 6, 8, 9, 20])
            DX_df.columns = use_cols
            
        elif (ct_name == "..."):
            # read all columns, strip a fixed suffix from every column name
            # that ends with it, then keep only use_cols
            DX_df = pd.read_csv(data_path + "...", delimiter = "...")
            DX_cols = DX_df.columns.tolist()
            DX_cols = [s[:-len("..."+PD.DATE_SHIFT"...")] \
                              if s.endswith("..."+PD.DATE_SHIFT"...") else s for s in DX_cols]
            DX_df.columns = DX_cols
            DX_df = DX_df[use_cols]
            
        DX_dict[ct_name] = DX_df
        
    return DX_dict

In [None]:
def format_DX_dict(DX_dict, pat_df):
    """Attach each center's diagnoses to the patients in ``pat_df``.

    For every center the rows of ``pat_df`` with a matching ``CENTER_NAME``
    are merged with five columns of that center's DX table, rows with a
    missing value in one column are dropped, ``DX_DATE`` is brought to
    datetime (parsed for most centers, derived from another column plus an
    offset for the centers in ``ct_missing_DX_DATE``), and only diagnoses
    dated strictly before ``ADMIT_DATE`` are kept.

    Parameters
    ----------
    DX_dict : dict
        Maps center name to its raw DX DataFrame; one column of each frame
        is cast to ``str`` in place.
    pat_df : pandas.DataFrame
        Patient/encounter rows; must contain ``CENTER_NAME`` and ``ADMIT_DATE``.

    Returns
    -------
    dict
        Maps center name to the merged and filtered DataFrame (six columns).
    """
    processed_DX_dict = dict()
    # centers handled by the else-branch below (date derived instead of parsed)
    ct_missing_DX_DATE = ["...", "...", "..."]
    
    for ct_name, DX_df in tqdm(DX_dict.items()):
        # cast the merge key to string (mutates the caller's frame)
        DX_df["..."] = DX_df["..."].astype(str)
        # patients of this center only
        pat_ct_df = pat_df[pat_df.CENTER_NAME == ct_name]
        pat_ct_df = pat_ct_df.merge(DX_df[["...", "...", "...", "...", "..."]], 
                                    on = "...", how = "...")
        pat_ct_df.dropna(subset=["..."], inplace = True)
        
        if ct_name not in ct_missing_DX_DATE:
            # parse, re-format with strftime, then parse again
            pat_ct_df["..."] = pd.to_datetime(pat_ct_df["..."], format = "...")
            pat_ct_df["..."] = pat_ct_df["..."].dt.strftime("...")
            pat_ct_df["..."] = pd.to_datetime(pat_ct_df["..."], format = "...")
        else:
            # derive the date as a column plus a timedelta built from another column
            pat_ct_df.loc[:, "..."] = pat_ct_df.loc[:, "..."] + \
            pd.to_timedelta(pat_ct_df.loc[:, "..."], unit="...")

        # keep only diagnoses dated strictly before admission
        pat_ct_df = pat_ct_df[pat_ct_df.DX_DATE < pat_ct_df.ADMIT_DATE]
        # make type consistent: cast to string and harmonise three values
        pat_ct_df["..."] = pat_ct_df["..."].astype(str)
        pat_ct_df["..."] = pat_ct_df["..."].replace("...", "...")
        pat_ct_df["..."] = pat_ct_df["..."].replace("...", "...")
        pat_ct_df["..."] = pat_ct_df["..."].replace("...", "...")
        # fixed output schema
        pat_ct_df = pat_ct_df[["...", "...", "...", 
                              "...", "...", "..."]]
        processed_DX_dict[ct_name] = pat_ct_df
        
    return processed_DX_dict

In [None]:
# diagnoses of the encounters that still need an estimated baseline
DX_df = read_and_format_DX(ct_names, raw_path, enc_to_MDRD)

In [None]:
# again, we do not limit DX to a specific encounter: drop one column before merging
DX_df.drop("...", axis = 1, inplace = True)

In [None]:
# pair every encounter with the diagnoses sharing the two merge keys
enc_to_MDRD_DX = enc_to_MDRD.merge(DX_df, on = ["...", "..."], how = "...")

In [None]:
# require DX to be strictly before admission (i.e. comorbidities)
enc_to_MDRD_DX = enc_to_MDRD_DX[enc_to_MDRD_DX.DX_DATE < enc_to_MDRD_DX.ADMIT_DATE].copy()

In [None]:
# encounters with a CKD diagnosis and no SCr history:
# a code from ICD9_CKD with one DX_TYPE value, or a code from ICD10_CKD with another
ICD9_CKD = ["...", "...", "...", "...", "...", "..."]
ICD10_CKD = ["...", "...", "...", "...", "...", "..."]
enc_to_MDRD_CKD = enc_to_MDRD_DX[((enc_to_MDRD_DX.DX.isin(ICD9_CKD)) & (enc_to_MDRD_DX.DX_TYPE == "...")) | \
              ((enc_to_MDRD_DX.DX.isin(ICD10_CKD)) & (enc_to_MDRD_DX.DX_TYPE == "..."))].copy()

In [None]:
def estimate_eGFR_CKD(row):
    """Map a row's CKD stage label to a representative eGFR value.

    The stage is read from ``row["..."]`` and compared against the known
    labels in a fixed order; the first match wins. Any label that matches
    none of them yields 7.5.
    """
    stage = row["..."]
    # (stage label, eGFR) pairs, checked top to bottom
    stage_to_egfr = (
        ("...", 90),
        ("...", 75),
        ("...", 45),
        ("...", 22.5),
        ("...", 15),
    )
    for label, egfr in stage_to_egfr:
        if stage == label:
            return egfr
    return 7.5

In [None]:
# the last character of a column's string value is taken as the CKD stage,
# then each row gets the eGFR that estimate_eGFR_CKD assigns to its stage
enc_to_MDRD_CKD.loc[:, "..."] = enc_to_MDRD_CKD.loc[:, "..."].str[-1]
enc_to_MDRD_CKD["..."] = enc_to_MDRD_CKD.progress_apply(estimate_eGFR_CKD, axis = 1)

In [None]:
# one encounter one prior eGFR: keep the smallest value per pat_id_cols group
enc_to_MDRD_CKD_without_dup = enc_to_MDRD_CKD.groupby(pat_id_cols)["..."].min().reset_index()

In [None]:
# attach the CKD-based eGFR to the encounters needing an estimate
enc_to_MDRD = enc_to_MDRD.merge(enc_to_MDRD_CKD_without_dup, on = pat_id_cols, how = "...")

In [None]:
# use 75 to represent non-CKD patients' eGFR (rows without a value after the merge)
enc_to_MDRD["..."] = enc_to_MDRD["..."].fillna(75)

# Read Demographics

In [None]:
def read_and_format_DEMO(ct_names, raw_path, race_mapping):
    """Load, format and stack the demographics tables of all centers.

    Chains ``read_DEMO`` -> ``format_DEMO_dict`` (which applies
    ``race_mapping``) -> ``concat_dfs_to_one`` and returns the single
    concatenated DataFrame.
    """
    raw_by_center = read_DEMO(ct_names, raw_path)
    formatted_by_center = format_DEMO_dict(raw_by_center, race_mapping)
    return concat_dfs_to_one(formatted_by_center)

In [None]:
#read patients' demographical data
def read_DEMO(ct_names, raw_path):
    """Read the demographics table of every center.

    Parameters
    ----------
    ct_names : iterable of str
        Center names to load.
    raw_path : str
        Root location of the raw data, forwarded to ``get_data_path``.

    Returns
    -------
    dict
        Maps center name to a DataFrame holding the ``use_cols`` columns plus
        one column filled with the center name.

    Raises
    ------
    ValueError
        If a center name has no loader branch. Without this check the loop
        would reuse the previous center's frame and relabel it with the new
        center name.
    """
    DEMO_dict = dict()
    use_cols = ["...", "...", "...", "...", "..."]

    for ct_name in ct_names:
        
        # path prefix under which this center's files live
        data_path = get_data_path(ct_name, raw_path)
        
        if (ct_name == "...") or (ct_name == "...") or (ct_name == "...") or (ct_name == "..."):
            DEMO_df = pd.read_csv(data_path + "...", delimiter = "...", usecols = use_cols)
        elif (ct_name == "..."):
            DEMO_df = pd.read_csv(data_path + "...", delimiter = "...", usecols = use_cols)
        elif (ct_name == "..."):
            # read all columns, upper-case the header, then keep only use_cols
            DEMO_df = pd.read_csv(data_path + "...", delimiter = "...")
            DEMO_df.columns = [col.upper() for col in DEMO_df.columns] 
            DEMO_df = DEMO_df[use_cols]
        elif (ct_name == "..."):
            DEMO_df = pd.read_csv(data_path + "...", delimiter = "...", usecols = use_cols)
        elif (ct_name == "..."):
            # skip the file's header row, pick five columns by position and
            # name them after use_cols
            DEMO_df = pd.read_csv(data_path + "...", delimiter = "...", 
                                           header=None, skiprows = 1, usecols=[0, 1, 2, 5, 17])
            DEMO_df.columns = use_cols
        else:
            # fail loudly instead of silently reusing the previous center's frame
            raise ValueError("No demographics loader defined for center %r" % (ct_name,))
    
        # tag every row with the center it came from
        DEMO_df["..."] = ct_name
        DEMO_dict[ct_name] = DEMO_df
        
    return DEMO_dict

In [None]:
def format_DEMO_dict(DEMO_dict, race_mapping):
    """Normalise every per-center demographics DataFrame.

    Each frame is modified in place: two id columns are cast to ``str``, the
    values of one column are replaced according to ``race_mapping``, and a
    column is set to the center name.

    Parameters
    ----------
    DEMO_dict : dict
        Maps center name to its demographics DataFrame.
    race_mapping : dict
        Passed to ``Series.replace``; values that are not keys of the mapping
        are left unchanged.

    Returns
    -------
    dict
        A new dict with the same keys; the values are the same (mutated)
        DataFrame objects that were passed in.
    """
    processed_DEMO_dict = dict()
    for ct_name, DEMO_df in DEMO_dict.items():    
        #convert id columns to string
        DEMO_df[["...", "..."]] = DEMO_df[["...", "..."]].astype(str)
        
        # harmonise the coded values using the shared mapping
        DEMO_df["..."] = DEMO_df["..."].replace(race_mapping)
        
        # tag every row with the center it came from
        DEMO_df["..."] = ct_name

        processed_DEMO_dict[ct_name] = DEMO_df
    return processed_DEMO_dict

In [None]:
# Lookup table handed to format_DEMO_dict, which applies it with Series.replace.
race_mapping = \
{
    "...": "...",
    "...": "...",
    "...": "...",
    "...": "...",
    "...": "...",
    "...": "...", 
    "...": "...",
    "...": "...",
    "...": "...",
    "...": "...",
    "...": "...",
    "...": "...",
    "...": "...",
    "...": "...",
    "...": "...",
    "...": "...",
    "...": "...",
    "...": "...",
    "...":  "..."
}
# persist the mapping in IPython's store for reuse via `%store -r`
%store race_mapping

In [None]:
# Load and format the demographics of all centers into one DataFrame.
DEMO_df = read_and_format_DEMO(ct_names, raw_path, race_mapping)

In [None]:
# check that every center contributed at least one row
for ct_name in ct_names:
    assert(len(DEMO_df[DEMO_df.CENTER_NAME == ct_name]) != 0)

In [None]:
# attach demographics to the encounters needing an estimated baseline
enc_to_MDRD = enc_to_MDRD.merge(DEMO_df, on = pat_id_cols, how = "...")

In [None]:
# drop rows that are missing any of three demographic columns
enc_to_MDRD = enc_to_MDRD.dropna(subset=["...", "...", "..."])

In [None]:
# keep one row per pat_id_cols combination
enc_to_MDRD = enc_to_MDRD.drop_duplicates(subset=pat_id_cols)

In [None]:
def calculate_SCR(row):
    """Back-calculate SCr from a row's eGFR, age, gender and race.

    Solves ``GFR = 175 * SCr**-1.154 * age**-0.203 * gender_factor * race_factor``
    for SCr. ``gender_factor`` is 0.742 for one specific gender value and 1
    otherwise; ``race_factor`` is 1.212 for one specific race value and 1
    otherwise.
    """
    egfr = row["..."]
    age = row["..."]
    sex = row["..."]
    race = row["..."]

    # adjustment coefficients default to 1 (no adjustment)
    sex_coef = 1
    if sex == "...":
        sex_coef = 0.742
    race_coef = 1
    if race == "...":
        race_coef = 1.212

    # invert the equation: SCr = (GFR / everything else) ** (1 / -1.154)
    denominator = 175 * (age ** -0.203) * sex_coef * race_coef
    return (egfr / denominator) ** (1 / -1.154)

In [None]:
# estimate SCr row by row with calculate_SCR
enc_to_MDRD["..."] = enc_to_MDRD.progress_apply(calculate_SCR, axis = 1)

In [None]:
# take the lower one between SCr est and admission SCr
# if one is nan choose the other (np.nanmin ignores NaN within a row)
enc_to_MDRD["..."] = np.nanmin(enc_to_MDRD[["...", "..."]], axis=1)

In [None]:
# bring the estimated baseline into the main frame
complete_df = complete_df.merge(enc_to_MDRD[pat_id_cols + ["..."]], 
                                on = pat_id_cols, how = "...")

In [None]:
# where the tested column is present use the first value, otherwise the second
complete_df["..."] = np.where(pd.notnull(complete_df["..."]), 
                                       complete_df["..."], 
                                       complete_df["..."])

In [None]:
# drop rows for which no baseline could be found
complete_df = complete_df.dropna(subset=["..."])

In [None]:
# only keep BASELINE_SCR: drop the five intermediate columns
complete_df.drop(["...", "...", "...", "...",
                 "..."], inplace = True, axis = 1)

# Preprocessing before Labeling AKI Stages

In [None]:
# Here we only care about SCr measurements taken during the hospitalization
# (ADMIT_DATE <= SPECIMEN_DATE <= DISCHARGE_DATE), so history records are filtered out.
# Please note that this is only for AKI labeling; for trajectory clustering we still consider community SCr.
complete_df = complete_df[(complete_df.SPECIMEN_DATE >= complete_df.ADMIT_DATE) & \
                                 (complete_df.SPECIMEN_DATE <= complete_df.DISCHARGE_DATE)].copy(deep = True)

In [None]:
# sort SCr by specimen time within each pat_id_cols group and renumber the rows
complete_df = complete_df.sort_values(pat_id_cols + ["..."]).reset_index(drop=True)

In [None]:
# drop rows that are duplicated across all columns
complete_df = complete_df.drop_duplicates()

# Label AKI Stage 1

In [None]:
from multiprocessing import Pool, cpu_count

In [None]:
# get the max SCr increment within past 2 days
def calculate_max_diff(group):
    """For each row, compute the largest increase over the preceding 2 days.

    For every row the rows of the same group whose date lies in
    ``[row date - 2 days, row date)`` are selected; the result for that row
    is the maximum of (row value - earlier value) over those rows, or NaN
    when there is no earlier row in the window.

    Parameters
    ----------
    group : pandas.DataFrame
        All rows of one group. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``group`` with the per-row maximum difference stored in a column.
    """
    # Work on a copy: `group` may be a slice of a larger frame, and writing a
    # column into it would mutate the caller's data (SettingWithCopyWarning).
    group = group.copy()
    max_diffs = []
    for _, row in group.iterrows():
        # past 2 days records
        past_2_days = group[(group["..."] >= row["..."] - \
                             pd.Timedelta(days=2)) & (group["..."] < row["..."])]

        if not past_2_days.empty:
            # difference between this row and each earlier row in the window
            diffs = row["..."] - past_2_days["..."]
            max_diff = diffs.max()
        else:
            # np.nan rather than pd.NA, so the column does not fall back to
            # object dtype; NaN compares False in the thresholds applied later
            max_diff = np.nan
        max_diffs.append(max_diff)
    group["..."] = max_diffs
    return group

In [None]:
# split the frame into one DataFrame per pat_id_cols group
encounter_grouped = complete_df.groupby(pat_id_cols)
groups = [group for _, group in encounter_grouped]

In [None]:
# run calculate_max_diff on every group in a worker pool (one worker per CPU);
# imap yields the results in the same order as `groups`
with Pool(cpu_count()) as p:
    result_list = list(tqdm(p.imap(calculate_max_diff, groups), 
                         total=len(groups), 
                         desc="..."))

In [None]:
# stitch the per-group results back together with a fresh 0..n-1 index
complete_df = pd.concat(result_list, axis = 0, ignore_index=True)

In [None]:
# running maximum of the measurements within each pat_id_cols group
complete_df["..."] = complete_df.groupby(pat_id_cols)["..."].cummax()

In [None]:
# AKI stage 1 definition part 1: absolute increment of at least 0.3 within 48 hours
condition1 = (complete_df["..."] >= 0.3)
# AKI stage 1 definition part 2: fold increment in [1.5x, 2.0x)
condition2 = (complete_df["..."] >= 1.5 * complete_df["..."]) & \
(complete_df["..."] < 2.0 * complete_df["..."])
# we require that the onset SCr is the max value up to that time, so that a higher stage will overwrite a lower stage
condition3 = (complete_df["..."] == complete_df["..."])

In [None]:
# measurements satisfying AKI-1
AKI_1 = complete_df[(condition1 | condition2) & condition3]

In [None]:
# the first measurement satisfying AKI-1
# NOTE(review): groupby(...).first() takes the first non-null value of EACH column
# independently, so the resulting row can mix values from different source rows
AKI_1 = AKI_1.groupby(pat_id_cols).first().reset_index().copy(deep = True)
AKI_1.rename(columns = {"...": "..."}, inplace = True)

In [None]:
# merge back AKI-1 onset date
complete_df = complete_df.merge(AKI_1[pat_id_cols + ["..."]], on=pat_id_cols, how="...")

# Label AKI Stage 2

In [None]:
# AKI stage 2 definition: fold increment in [2.0x, 3.0x)
condition4 = (complete_df["..."] >= 2.0 * complete_df["..."]) & \
(complete_df["..."] < 3.0 * complete_df["..."])

In [None]:
# NOTE(review): condition3 was computed before the AKI-1 merge; reusing it here
# assumes that merge kept the row count, order and index unchanged -- confirm.
AKI_2 = complete_df[condition4 & condition3]
# first non-null value of each column per group (see GroupBy.first semantics)
AKI_2 = AKI_2.groupby(pat_id_cols).first().reset_index().copy(deep = True)
AKI_2.rename(columns = {"...": "..."}, inplace = True)

In [None]:
# merge back AKI-2 onset date
complete_df = complete_df.merge(AKI_2[pat_id_cols + ["..."]], on=pat_id_cols, how="...")

# Label AKI stage 3

In [None]:
# AKI stage 3 definition: fold increment of at least 3.0x, or an absolute value of at least 4.0
condition5 = (complete_df["..."] >= 3.0 * complete_df["..."])
condition6 = (complete_df["..."] >= 4.0)

In [None]:
# NOTE(review): condition3 predates the AKI-1/AKI-2 merges; reusing it here
# assumes those merges kept the row count, order and index unchanged -- confirm.
AKI_3 = complete_df[(condition5 | condition6) & condition3]
# first non-null value of each column per group (see GroupBy.first semantics)
AKI_3 = AKI_3.groupby(pat_id_cols).first().reset_index().copy(deep = True)
AKI_3.rename(columns = {"...": "..."}, inplace = True)

In [None]:
# merge back AKI-3 onset date
complete_df = complete_df.merge(AKI_3[pat_id_cols + ["..."]], on=pat_id_cols, how="...")

# Process before Saving

In [None]:
# flag is False as soon as any of the three columns is non-null,
# and True only when all three are null
complete_df["..."] = np.where(
    complete_df["..."].notna() | complete_df["..."].notna() | complete_df["..."].notna(), 
    False, 
    True
)

In [None]:
# keep only the ten output columns
complete_df = complete_df[["...", "...", "...", "...", "...",
                          "...", "...", "...", "...", "..."]].copy(deep = True)

In [None]:
# keep a single row per encounter, so that each encounter is unique
complete_df = complete_df.drop_duplicates(pat_id_cols)

In [None]:
# distribution of the NONAKI_SINCE_ADMIT flag over encounters
complete_df.NONAKI_SINCE_ADMIT.value_counts()

# Save the AKI Onset Table

In [None]:
# write the final one-row-per-encounter table to CSV, without the row index
complete_df.to_csv("...", index=False)