In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
tqdm.pandas()
import nbimporter
from A_Label_AKI_Onsets import read_and_format_DX, read_and_format_DEMO
import warnings
warnings.simplefilter(action="...", category=FutureWarning)

Procedures:    
1. Match non-AKI patients according to step 1
2. Append comorbidities to non-AKI patients
3. Score non-AKI patients and choose the best one

# Read Data with Cluster Labels

In [None]:
# Restore shared variables from IPython's %store database (presumably saved by an
# upstream notebook); if a name is missing there, the cells below fail with NameError.
%store -r pat_id_cols
%store -r ct_names
%store -r raw_path
%store -r race_mapping

In [None]:
# Load the AKI patient table; per the section heading it already carries cluster labels.
AKI_df = pd.read_csv("...")

In [None]:
# Normalise dtypes: identifier columns become strings, date columns become datetimes.
AKI_df[pat_id_cols] = AKI_df[pat_id_cols].astype(str)
date_cols = ["...", "...", "..."]
AKI_df[date_cols] = AKI_df[date_cols].apply(lambda dates: pd.to_datetime(dates, format = "..."))

In [None]:
# Build the DX table for the AKI patients with the helper imported from A_Label_AKI_Onsets
# (presumably diagnosis records restricted to the patients in AKI_df -- confirm in the helper).
AKI_DX_df = read_and_format_DX(ct_names, raw_path, AKI_df)

In [None]:
# Remove one column from the DX table in place before it is handed to append_comorbidities.
AKI_DX_df.drop("...", axis = 1, inplace = True)

In [None]:
# Load the demographics table through the shared helper; race_mapping is passed straight to it.
DEMO_df = read_and_format_DEMO(ct_names, raw_path, race_mapping)

# Prepare DX Reference Dict

In [None]:
# Reference table of comorbidities: every top-level key maps to a dict of code lists.
# The export cell further down reads two of those lists into variables named
# icd_9_codes and icd_10_codes, so the inner keys presumably separate ICD-9 from ICD-10.
Comorbidities_dict = {
    "...": {"...": ["..."], "...":["..."]},
    "...": {"...": ["..."], "...":["..."]},
    "...": {"...": ["..."], "...":["...", "...", "...", "..."]},
    "...": {"...": ["..."], "...":["..."]},
    "...": {"...": ["..."], "...":["..."]},
    "...":{"...":["..."], "...": ["...", "..."]},
    "...":{"...":["...", "...", "...", "...", "...", "...", "...", "...", "...", "...",
                                 "...", "...", "..."], 
                             "...": ["...", "...", "...", "...", "...", "...",
                                    "...", "...", "...", "...", "...", "...",
                                   "...", "...", "...", "...", "...", "...", "...", "...",
                                   "...", "...", "...", "...", "...", "...", "...", "...", "...", "...",
                                   "...", "...", "...", "...", "...", "...", "...", "...", "...",
                                   "...", "...", "...", "...", "...", "...", "...", "...", "...", "...",
                                    "...", "...", "...", "...", "...", "...", "...", "...", "...", "...",
                                   "...", "...", "...", "...", "...", "...", "...", "...", "...", "...", "...", "..."]},
    "...": {"...":["...", "...", "...", "...", "...", "...",
                                       "...", "..."], 
                                   "...": ["...", "...", "...", "...", "...", "...", "..."]},
    "...":{"...":["...", "...", "...", "...", "...", "...", "...", "...", "..."], 
                            "...": ["...", "...", "..."]},
    "...":{"...":["...", "...", "...", "...", "...", "..."], 
                              "...": ["...", "...", "...", "..."]},
    "...":{"...":["..."], "...": ["..."]},
    "...":{"...":["...", "...", "...", "..."], "...": ["...", "..."]},
    "...":{"...":["..."], "...": ["..."]},
    "...":{"...":["..."], "...": ["..."]},
    "...":{"...":["..."], "...": ["..."]},
    "...":{"...":["..."], "...": ["..."]},
    "...": {"...":["...", "...", "...", "...", "...", "..."], 
                             "...": ["...", "...", "...", "...", "...", "..."]},
    "...": {"...":["...", "...", "...", "...", "...", "...", "..."], 
                             "...": ["...", "...", "...", "...", "...", "...", "...", "..."]},
    "...":{"...":["..."], "...": ["..."]},
    "...":{"...":["..."], "...": ["..."]},
    "...":{"...":["..."], "...": ["..."]},
}
# Persist the dict in IPython's %store database so it can be restored with `%store -r`.
%store Comorbidities_dict

In [None]:
# Flatten the comorbidity dictionary into one printable row per condition:
# a display name followed by the two joined code lists (missing lists become "").
rows = [
    [
        condition.replace("...", "...").title(),
        "...".join(codes.get("...", [])),
        "...".join(codes.get("...", [])),
    ]
    for condition, codes in Comorbidities_dict.items()
]

ICD_code_df = pd.DataFrame(data=rows, columns=["...", "...", "..."])

In [None]:
# Write the code table to disk, leaving out the DataFrame index.
ICD_code_df.to_csv("...", index = False)

In [None]:
def reverse_comorbidity_dict(Comorbidities_dict):
    """Invert the comorbidity table into a flat ``code -> disease`` lookup.

    Parameters
    ----------
    Comorbidities_dict : dict
        ``{disease: {group_name: [code, ...], ...}, ...}``.

    Returns
    -------
    dict
        One entry per code found in any group of any disease. Codes are visited
        disease by disease and group by group, in dictionary order; if the same
        code appears under several diseases, the last one visited wins.
    """
    return {
        code: disease
        for disease, code_groups in Comorbidities_dict.items()
        for group in code_groups.values()
        for code in group
    }

In [None]:
# Build the flat code -> comorbidity lookup and persist it in IPython's %store database.
reversed_Comorbidities_dict = reverse_comorbidity_dict(Comorbidities_dict)
%store reversed_Comorbidities_dict

# Append Comorbidities

In [None]:
def append_comorbidities(pat_info, DX_df, reversed_Comorbidities_dict, pat_id_cols):
    """Attach one indicator column per comorbidity to ``pat_info``.

    The function merges ``pat_info`` with ``DX_df``, keeps only DX rows dated strictly
    before admission (``DX_DATE < ADMIT_DATE``), translates each DX code through
    ``reversed_Comorbidities_dict`` (codes without a mapping are dropped), pivots the
    result to one row per ``pat_id_cols`` key, and merges that back onto ``pat_info``.

    Parameters
    ----------
    pat_info : pandas.DataFrame
        Patient table; must expose ``CENTER_NAME``, the merge keys and ``pat_id_cols``.
    DX_df : pandas.DataFrame
        DX records; after merging with ``pat_info`` the frame must expose ``DX_DATE``
        and ``ADMIT_DATE``.
    reversed_Comorbidities_dict : dict
        Lookup from DX code to comorbidity name.
    pat_id_cols : list of str
        Columns that identify one row of ``pat_info``.

    Returns
    -------
    pandas.DataFrame
        ``pat_info`` merged with the pivoted comorbidity indicators. When the lookup is
        a dict, every comorbidity name in it is guaranteed to be a column: names that no
        patient exhibits are added filled with NaN, so callers can select the complete
        comorbidity column list and fill the gaps themselves (this notebook fills them
        with ``False``).

    Raises
    ------
    AssertionError
        If a center present in ``pat_info`` has no DX row left after the date filter.
    """
    ct_names = list(pat_info.CENTER_NAME.unique())
    pat_DX_df = pat_info.merge(DX_df, on = ["...", "..."], how = "...")

    # make sure that comorbidities happened in the past; .copy() detaches the filtered
    # frame so the column edits below never act on a view of the merge result
    pat_DX_df = pat_DX_df[pat_DX_df.DX_DATE < pat_DX_df.ADMIT_DATE].copy()

    # sanity check of merge: every center must still contribute at least one row
    for ct_name in tqdm(ct_names):
        n_center_rows = len(pat_DX_df[pat_DX_df.CENTER_NAME == ct_name])
        assert n_center_rows != 0, f"no prior DX rows left for center {ct_name!r}"

    # remove DX dups for each encounter (we do not care about DX time here), since DX
    # codes are unique between ICD 9 and 10, we drop DX_TYPE
    pat_DX_df = pat_DX_df.drop(["...", "..."], axis = 1)
    pat_DX_df = pat_DX_df.drop_duplicates(subset=["...", "...", "...", "..."]).copy()

    # convert DX to the comorbidities we are interested in, drop the others
    pat_DX_df.loc[:, "..."] = pat_DX_df.loc[:, "..."].map(reversed_Comorbidities_dict)
    pat_DX_df = pat_DX_df.dropna(subset=["..."])
    pat_DX_df = pat_DX_df.drop("...", axis = 1)

    # pivot table: one row per patient key, one column per comorbidity
    pat_DX_df["..."] = True
    pat_DX_df = pat_DX_df.pivot_table(
        index=pat_id_cols,
        columns="...",
        values="...",
        aggfunc="...",
        fill_value=False
    )
    pat_DX_df = pat_DX_df.reset_index()

    final_df = pat_info.merge(pat_DX_df, on = pat_id_cols, how = "...")

    # The pivot only creates columns for comorbidities that actually occur in the data.
    # Add the absent ones so the output schema does not depend on the cohort.
    if isinstance(reversed_Comorbidities_dict, dict):
        for disease in dict.fromkeys(reversed_Comorbidities_dict.values()):
            if disease not in final_df.columns:
                final_df[disease] = np.nan

    return final_df

In [None]:
# Add the comorbidity indicator columns to the AKI patients.
AKI_df = append_comorbidities(AKI_df, AKI_DX_df, reversed_Comorbidities_dict, pat_id_cols)

In [None]:
# Column names of the comorbidity indicators are the keys of the reference dict.
comorbidity_cols = list(Comorbidities_dict.keys())

In [None]:
# Patients without any matching record come back with missing values in the
# comorbidity columns; treat "no record" as "comorbidity absent" (False).
AKI_df.loc[:, comorbidity_cols] = AKI_df.loc[:, comorbidity_cols].fillna(False)

# Get Non-AKI Patient Pool and Filtered by Step 1

In [None]:
# Load the onset table from which the non-AKI pool is selected in the next cell.
onset_df = pd.read_csv("...")

In [None]:
# Keep only rows flagged NONAKI_SINCE_ADMIT; deep-copy so later edits never touch onset_df,
# and renumber the index from zero.
NON_AKI_df = onset_df[onset_df.NONAKI_SINCE_ADMIT == True].copy(deep = True).reset_index(drop = True)

In [None]:
# Normalise dtypes: identifier columns become strings, date columns become datetimes.
NON_AKI_df[pat_id_cols] = NON_AKI_df[pat_id_cols].astype(str)
date_cols = ["...", "..."]
NON_AKI_df[date_cols] = NON_AKI_df[date_cols].apply(lambda dates: pd.to_datetime(dates, format = "..."))

In [None]:
# append demographics
NON_AKI_df = NON_AKI_df.merge(DEMO_df, on = pat_id_cols, how = "...")
# drop those who do not have demos
NON_AKI_df = NON_AKI_df.dropna(subset=["...", "...", "..."])
# only keep each patient's earliest encounter to keep them unique.
# NOTE: groupby(...).first() is deliberately NOT used here: it returns the first
# non-null value of every column separately, so a missing value in the earliest
# encounter would be silently filled from a later encounter. drop_duplicates keeps
# the earliest row as a whole (the frame is sorted just before).
NON_AKI_df = NON_AKI_df.sort_values(by=["...", "...", "..."], ascending=True)
NON_AKI_df = NON_AKI_df.drop_duplicates(subset=["...", "..."], keep="first").reset_index(drop=True)
# further remove those who are already in AKI_df so that nobody can be matched to themselves
NON_AKI_df = NON_AKI_df[~NON_AKI_df.set_index(["...", "..."]).index.isin(AKI_df.set_index(["...", "..."]).index)]

In [None]:
def find_matches_by_DEMO(row, NON_AKI_df, NON_AKI_info_dict, sample_n, random_state=None):
    """Draw up to ``sample_n`` non-AKI candidates for one AKI patient.

    Candidates are searched in progressively looser tiers; the first tier that yields
    at least one row is used and the remaining tiers are skipped. The first three tiers
    search only the patient's own center (``NON_AKI_info_dict[center]``), the later ones
    search the whole pool ``NON_AKI_df``.

    Parameters
    ----------
    row : mapping or pandas.Series
        One AKI patient; read for age, gender, race, baseline SCr, center and the
        identifying fields that are copied onto the returned rows.
    NON_AKI_df : pandas.DataFrame
        Complete non-AKI pool.
    NON_AKI_info_dict : dict
        Center name -> slice of ``NON_AKI_df`` for that center.
    sample_n : int
        Maximum number of candidates to return.
    random_state : optional
        Forwarded to ``DataFrame.sample``. The default ``None`` keeps the previous
        unseeded behaviour; pass a seed to make the draw reproducible.

    Returns
    -------
    pandas.DataFrame
        ``min(sample_n, number of candidates)`` rows sampled without replacement, with
        four columns set to values taken from ``row`` to record which AKI patient the
        candidates belong to.

    Raises
    ------
    KeyError
        If the patient's center is not a key of ``NON_AKI_info_dict``.
    """
    age = row["..."]
    gender = row["..."]
    is_black = row["..."] == "..."
    SCr_base = row["..."]
    ct_name = row["..."]

    this_center_NON_AKI = NON_AKI_info_dict[ct_name]

    # tier 1 (own center): age +/- 1, same race flag, same gender, SCr within +/- 5 %
    matched = this_center_NON_AKI[
        (this_center_NON_AKI["..."].between(age - 1, age + 1)) &
        ((this_center_NON_AKI["..."] == "...") == is_black) &
        (this_center_NON_AKI["..."] == gender) &
        (this_center_NON_AKI["..."].between(0.95 * SCr_base, 1.05 * SCr_base))
    ]

    # tier 2 (own center): widen the age window to +/- 2
    if len(matched) == 0:
        matched = this_center_NON_AKI[
                (this_center_NON_AKI["..."].between(age - 2, age + 2)) &
                ((this_center_NON_AKI["..."] == "...") == is_black) &
                (this_center_NON_AKI["..."] == gender) &
                (this_center_NON_AKI["..."].between(0.95 * SCr_base, 1.05 * SCr_base))
        ]
    # tier 3 (own center): additionally widen SCr to +/- 10 %
    if len(matched) == 0:
        matched = this_center_NON_AKI[
                (this_center_NON_AKI["..."].between(age - 2, age + 2)) &
                ((this_center_NON_AKI["..."] == "...") == is_black) &
                (this_center_NON_AKI["..."] == gender) &
                (this_center_NON_AKI["..."].between(0.9 * SCr_base, 1.1 * SCr_base))
        ]

    # tier 4 (whole pool): back to the strict tier-1 criteria, but across all centers
    if len(matched) == 0:
        matched = NON_AKI_df[
            (NON_AKI_df["..."].between(age - 1, age + 1)) &
            ((NON_AKI_df["..."] == "...") == is_black) &
            (NON_AKI_df["..."] == gender) &
            (NON_AKI_df["..."].between(0.95 * SCr_base, 1.05 * SCr_base))
        ]

    # tier 5 (whole pool): drop the race requirement and widen age to +/- 2 (SCr stays +/- 5 %)
    if len(matched) == 0:
        matched = NON_AKI_df[
            (NON_AKI_df["..."].between(age - 2, age + 2)) &
            (NON_AKI_df["..."] == gender) &
            (NON_AKI_df["..."].between(0.95 * SCr_base, 1.05 * SCr_base))
        ]

    # tier 6 (whole pool): widen SCr to +/- 10 %
    if len(matched) == 0:
        matched = NON_AKI_df[
            (NON_AKI_df["..."].between(age - 2, age + 2)) &
            (NON_AKI_df["..."] == gender) &
            (NON_AKI_df["..."].between(0.9 * SCr_base, 1.1 * SCr_base))
        ]

    # tier 7 (whole pool): widen SCr to +/- 20 %
    if len(matched) == 0:
        matched = NON_AKI_df[
            (NON_AKI_df["..."].between(age - 2, age + 2)) &
            (NON_AKI_df["..."] == gender) &
            (NON_AKI_df["..."].between(0.8 * SCr_base, 1.2 * SCr_base))
        ]

    # tier 8 (whole pool): age +/- 2 is the only requirement
    if len(matched) == 0:
        matched = NON_AKI_df[
            (NON_AKI_df["..."].between(age - 2, age + 2))
        ]

    # tier 9: last resort, every non-AKI patient is a candidate
    if len(matched) == 0:
        matched = NON_AKI_df

    # sample without replacement; never ask for more rows than are available
    n_rows = min(len(matched), sample_n)
    matched = matched.sample(n=n_rows, replace=False, random_state=random_state)

    # label each non-AKI row with the AKI patient it was matched to
    matched["..."] = ct_name
    matched["..."] = row["..."]
    matched["..."] = row["..."]
    matched["..."] = row["..."]

    return matched

In [None]:
# Split the non-AKI pool by center once, up front, so the per-patient matcher can
# look its center up directly instead of re-filtering the pool for every AKI patient.
NON_AKI_info_dict = {
    ct_name: NON_AKI_df[NON_AKI_df.CENTER_NAME == ct_name]
    for ct_name in ct_names
}

In [None]:
# Run the matcher once per AKI patient (row-wise, with a tqdm progress bar);
# the literal 50 is the sample_n argument, i.e. at most 50 candidates per AKI patient.
NON_AKI_matched_df_lists = AKI_df.progress_apply(lambda row: find_matches_by_DEMO(row, 
                                                                    NON_AKI_df, NON_AKI_info_dict, 50), axis=1)

In [None]:
# Stack the per-patient candidate frames into one long table with a fresh index.
NON_AKI_matched_df = pd.concat(NON_AKI_matched_df_lists.tolist(), ignore_index=True)

# Append Comorbidities to Non-AKI Patients 

In [None]:
# One row per distinct candidate: the identifier columns plus one extra column,
# de-duplicated on the identifiers (first occurrence kept).
NON_AKI_matched_ids = (
    NON_AKI_matched_df
    .loc[:, pat_id_cols + ["..."]]
    .drop_duplicates(subset = pat_id_cols)
)

In [None]:
# Build the DX table for the matched non-AKI candidates with the same helper used for AKI_df.
NON_AKI_DX_df = read_and_format_DX(ct_names, raw_path, NON_AKI_matched_ids)

In [None]:
# Remove one column from the DX table in place, mirroring the treatment of AKI_DX_df.
NON_AKI_DX_df.drop("...", axis = 1, inplace = True)

In [None]:
# Add the comorbidity indicator columns to the distinct non-AKI candidates.
NON_AKI_matched_DX_df = append_comorbidities(NON_AKI_matched_ids, NON_AKI_DX_df, 
                                             reversed_Comorbidities_dict, pat_id_cols)

In [None]:
# Candidates without any matching record come back with missing values in the
# comorbidity columns; treat "no record" as "comorbidity absent" (False).
NON_AKI_matched_DX_df.loc[:, comorbidity_cols] = NON_AKI_matched_DX_df.loc[:, comorbidity_cols].fillna(False)

# Screening by Scoring on Comorbidities

In [None]:
# Bring the comorbidity columns onto every (AKI patient, candidate) row; a candidate that
# was drawn for several AKI patients receives the same comorbidity values on each row.
NON_AKI_matched_df = NON_AKI_matched_df.merge(NON_AKI_matched_DX_df[pat_id_cols + comorbidity_cols],
                                             on = pat_id_cols, how = "...")

In [None]:
# Independent copy holding only the identifiers and comorbidity columns of the AKI patients.
AKI_df_for_merge = AKI_df[pat_id_cols + comorbidity_cols].copy(deep = True)

In [None]:
# Prefix every column so the AKI-side values do not collide with the candidate-side columns;
# score_each_patient_pair builds its AKI-side keys as prefix + comorbidity name in the same way.
AKI_df_for_merge.columns = ["..." + col for col in AKI_df_for_merge.columns]

In [None]:
# Put each candidate next to the comorbidity profile of the AKI patient it was drawn for.
AKI_NON_AKI_matched_df = NON_AKI_matched_df.merge(AKI_df_for_merge, 
                                                  on = ["...", "...", "..."], 
                                                  how = "...")

In [None]:
# Weight of each comorbidity in the pair score: score_each_patient_pair adds the weight when
# both patients have the comorbidity and subtracts it when they disagree.
# (The name keeps its original spelling because other cells refer to it.)
comorbidity_socre_dict = {
    "...": 4,
    "...": 4,
    "...": 4,
    "...": 4,
    "...": 4,
    "...": 1,
    "...": 2,
    "...": 1,
    "...": 2,
    "...": 1,
    "...": 1,
    "...": 2,
    "...": 2,
    "...": 1,
    "...": 1,
    "...": 1,
    "...": 2,
    "...": 2, 
}

In [None]:
def score_each_patient_pair(row, comorbidity_socre_dict):
    """Score how well two patients' comorbidity profiles agree.

    For every comorbidity ``name`` with weight ``w`` in ``comorbidity_socre_dict`` the
    row is read at ``name`` and at the prefixed key ``"..." + name``:

    * both equal to True  -> add ``w``
    * both equal to False -> no change
    * anything else (disagreement, or a value that is neither True nor False) -> subtract ``w``

    Parameters
    ----------
    row : mapping or pandas.Series
        Holds both the plain and the prefixed comorbidity values.
    comorbidity_socre_dict : dict
        Comorbidity name -> weight.

    Returns
    -------
    The accumulated score (0 for an empty weight dict).
    """
    total = 0
    for comorbidity, weight in comorbidity_socre_dict.items():
        partner_key = "..." + comorbidity
        if (row[comorbidity] == True) and (row[partner_key] == True):
            total += weight
            continue
        if (row[comorbidity] == False) and (row[partner_key] == False):
            # shared absence is neutral
            continue
        total -= weight
    return total

In [None]:
# Score every (AKI patient, candidate) row; the weight dict is passed as the extra positional argument.
AKI_NON_AKI_matched_df["..."] = AKI_NON_AKI_matched_df.progress_apply(score_each_patient_pair, 
                                                                 args = (comorbidity_socre_dict,), axis = 1)

# Pick the Highest Matching Score without Replacement

In [None]:
# Step 1: order the rows so that, inside every group, the highest score comes first
AKI_NON_AKI_matched_df = AKI_NON_AKI_matched_df.sort_values(by=["...", "...", "...", "..."], 
                                                            ascending=[True, True, True, False])

# identifiers that have already been handed out to an earlier group
selected_patids = set()
# one chosen row per group
final_rows = []

# Step 2: walk through the groups in sorted order
for _, group in tqdm(AKI_NON_AKI_matched_df.groupby(["...", "...", "..."])):
    # Step 3: best-scoring row of this group whose identifier is still unused
    selected_row = next(
        (candidate for _, candidate in group.iterrows() if candidate["..."] not in selected_patids),
        None,
    )

    if selected_row is None:
        # Step 4: every row was already used elsewhere, so reuse the top-scoring one
        selected_row = group.iloc[0]
    else:
        selected_patids.add(selected_row["..."])

    final_rows.append(selected_row)

AKI_best_match = pd.DataFrame(final_rows)

In [None]:
# Fill in the columns that are not yet aligned with AKI_df.
# This needs calculate_ckd_epi from B_Data_Preprocessing (presumably a sibling notebook loaded
# through nbimporter) and window_full restored from IPython's %store database.
from B_Data_Preprocessing import calculate_ckd_epi
%store -r window_full

In [None]:
# The window_full columns are filled with missing values for the matched patients.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
AKI_best_match[window_full] = np.nan
# day difference between two date columns
AKI_best_match["..."] = (AKI_best_match["..."] - AKI_best_match["..."]).dt.days
# row-wise value computed by the imported calculate_ckd_epi helper
AKI_best_match["..."] = AKI_best_match.apply(calculate_ckd_epi, axis = 1)

# we want the Non-AKI cluster to be the base, thus change the AKI_df cluster:
# the AKI labels are offset by the number of distinct clusters.
# NOTE(review): the offset is applied on every execution, so running this cell twice
# presumably shifts the AKI labels twice -- run it once per kernel session.
AKI_best_match["..."] = AKI_best_match["..."]
n_clusters = len(AKI_df.CLUSTERS.unique())
AKI_df["..."] = AKI_df["..."] + n_clusters

In [None]:
# Keep exactly the columns of AKI_df, in the same order; a column missing here raises KeyError.
Non_AKI_final_df = AKI_best_match.loc[:, AKI_df.columns]

In [None]:
# Stack AKI patients and their selected non-AKI matches row-wise.
all_patients_df = pd.concat([AKI_df, Non_AKI_final_df], axis = 0)

In [None]:
# The concatenation keeps both original indexes; replace them with a fresh 0..n-1 index in place.
all_patients_df.reset_index(drop = True, inplace = True)

# Sanity Check of Final DataFrame and Save

In [None]:
# Save the combined table, leaving out the DataFrame index.
all_patients_df.to_csv("...", index = False)

In [None]:
# Display the combined table for a visual check (the notebook renders a cell's last expression).
all_patients_df

In [None]:
# Number of patients per value of the CLUSTERS column in the final table.
all_patients_df.CLUSTERS.value_counts()