In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import multiprocessing
import json
from sklearn.metrics import pairwise_distances
import os
# pandas show all columns
pd.set_option('display.max_columns', None)

In [None]:
# Load the shared experiment settings stored as JSON under ./utils
with open('./utils/variables.json', 'r') as file:
    variables = json.load(file)

# Column-label lists for the SCr trajectory and the lab panel, followed by the
# row counts of the internal train / test partitions
SCR_feature_space, LAB_feature_space, train_len, test_len = (
    variables[key]
    for key in ('SCR_feature_space', 'LAB_feature_space', 'train_len', 'test_len')
)

# Sanity print: number of features per type, then the split sizes
print(*(len(space) for space in (SCR_feature_space, LAB_feature_space)))
print(train_len, test_len)

# Number of worker processes handed to the parallel helpers below.
# multiprocessing.cpu_count() reports every CPU on the machine, which oversubscribes
# the node when this kernel is restricted to fewer cores (batch scheduler, taskset, ...).
# Prefer the set of CPUs this process is actually allowed to run on when the platform
# exposes it, and fall back to the machine-wide count otherwise.
try:
    num_processors = len(os.sched_getaffinity(0))
except AttributeError:
    # os.sched_getaffinity does not exist on every platform
    num_processors = multiprocessing.cpu_count()

In [None]:
# Folder name passed to save_figure() for every figure produced in this notebook
figure_folder = "External_validation"

# Process UPITT Dataset

In [None]:
# Read in the onsets table and keep only the UPITT rows (the external cohort).
# .copy(deep=True) detaches the subset so later edits do not touch All_onsets.
All_onsets = pd.read_csv('.../NEW_ONSETS.csv')
ext_ONSET = All_onsets.loc[All_onsets.CENTER_NAME == "UPITT"].copy(deep = True)

In [None]:
# Load the raw UPITT lab table, restricted to the four columns used downstream.
# NOTE(review): raw_path is an absolute, machine-specific location.
raw_path = '/blue/yonghui.wu/hoyinchan/Data/data2022raw/'
data_path = raw_path + "UPITT" + '/raw/'
ext_LAB = pd.read_csv(
    data_path + 'AKI_LAB.csv',
    sep = ',',
    usecols = ['PATID', 'LAB_LOINC', 'SPECIMEN_DATE', 'RESULT_NUM'],
)

In [None]:
# Load the SCr trajectories from the same raw folder (only the listed columns are parsed)
SCR_use_cols = ['ONSETS_ENCOUNTERID','PATID','ENCOUNTERID','SPECIMEN_DATE','RESULT_NUM', 'DAYS_SINCE_ADMIT']
ext_SCR = pd.read_csv(
    data_path + "AKI_LAB_SCR.csv",
    sep = ',',
    usecols = SCR_use_cols,
)

In [None]:
# Clean the onset table and harmonise key dtypes ahead of the merge with the SCr table.

# Keep encounters with baseline SCr below 3.5 (higher values indicate poor renal function).
# Rows with a missing baseline also fail this comparison and are dropped.
ext_ONSET = ext_ONSET.loc[ext_ONSET.SERUM_CREAT_BASE < 3.5, :].copy(deep = True)

# Cast the join keys to str by replacing the columns outright. Assigning through
# .loc[:, cols] writes into the existing columns instead, which (depending on the pandas
# version) can leave the old dtype in place or warn/raise about an incompatible dtype.
id_cols = ["PATID", "ONSETS_ENCOUNTERID"]
ext_ONSET[id_cols] = ext_ONSET[id_cols].astype(str)

# Parse every date column; "mixed" lets pandas infer the format per element
time_cols = ["ADMIT_DATE", "DISCHARGE_DATE", "AKI1_ONSET", "AKI2_ONSET", "AKI3_ONSET"]
for time_col in time_cols:
    ext_ONSET[time_col] = pd.to_datetime(ext_ONSET[time_col], format = "mixed")

# Binary prediction task: the label is 1 when any AKI stage has an onset date,
# and the earliest of the three stage onsets is kept as the onset time.
ext_ONSET["EARLIEST_ONSET_DATE"] = ext_ONSET[["AKI1_ONSET", "AKI2_ONSET", "AKI3_ONSET"]].min(axis = 1)
ext_ONSET["AKI_LABEL"] = ext_ONSET["EARLIEST_ONSET_DATE"].notna().astype(int)

# Drop the columns that are no longer needed once the label has been derived
ext_ONSET = ext_ONSET.drop(columns = ["CENTER_NAME", "SERUM_CREAT_BASE", "NONAKI_SINCE_ADMIT",
                                      "AKI1_ONSET", "AKI2_ONSET", "AKI3_ONSET"])

# The SCr table is joined on PATID, so its key must be str as well
ext_SCR["PATID"] = ext_SCR["PATID"].astype(str)

In [None]:
# Attach every SCr measurement of a patient to each of that patient's onset encounters
ext_ONSET_SCR = ext_ONSET.merge(
    ext_SCR[["PATID", "SPECIMEN_DATE", "RESULT_NUM"]],
    how = "left",
    on = "PATID",
)
# Dates are parsed only after the merge
ext_ONSET_SCR["SPECIMEN_DATE"] = pd.to_datetime(ext_ONSET_SCR["SPECIMEN_DATE"], format = "mixed")
# Discard measurements taken after discharge; history before admission is kept on purpose
not_after_discharge = ext_ONSET_SCR.SPECIMEN_DATE <= ext_ONSET_SCR.DISCHARGE_DATE
ext_ONSET_SCR = (
    ext_ONSET_SCR
    .loc[not_after_discharge, :]
    .sort_values(by=['PATID', 'ADMIT_DATE', 'SPECIMEN_DATE'])
)
# Average the SCr values that share a SPECIMEN_DATE within an encounter
scr_per_date = ext_ONSET_SCR.groupby(['PATID', 'ONSETS_ENCOUNTERID', 'SPECIMEN_DATE'])['RESULT_NUM']
ext_ONSET_SCR_avg = scr_per_date.mean().reset_index()

In [None]:
# Re-attach the encounter-level columns that the groupby/mean step dropped
encounter_info_cols = ["PATID", "ONSETS_ENCOUNTERID", "ADMIT_DATE", "DISCHARGE_DATE", "EARLIEST_ONSET_DATE", "AKI_LABEL"]
ext_ONSET_SCR_app = ext_ONSET_SCR[encounter_info_cols].drop_duplicates()
ext_ONSET_SCR_avg = ext_ONSET_SCR_app.merge(
    ext_ONSET_SCR_avg,
    how = "left",
    on = ["PATID", "ONSETS_ENCOUNTERID"],
)

In [None]:
# Prediction point of a non-AKI encounter = the SPECIMEN_DATE on its last row
# (keep="last" relies on the rows already being ordered by date within an encounter)
non_aki_rows = ext_ONSET_SCR_avg.AKI_LABEL == 0
ext_non_AKI_pat = (
    ext_ONSET_SCR_avg
    .loc[non_aki_rows, ["PATID", "ONSETS_ENCOUNTERID", "SPECIMEN_DATE"]]
    .drop_duplicates(subset = ["PATID", "ONSETS_ENCOUNTERID"], keep = "last")
    .rename(columns = {"SPECIMEN_DATE": "PREDICTION_POINT"})
)
# Broadcast the prediction point back onto every row of the encounter
ext_ONSET_SCR_avg = ext_ONSET_SCR_avg.merge(
    ext_non_AKI_pat,
    how = "left",
    on = ["PATID", "ONSETS_ENCOUNTERID"],
)

In [None]:
# For AKI encounters the prediction point is the earliest onset date
aki_rows = ext_ONSET_SCR_avg.AKI_LABEL == 1
ext_ONSET_SCR_avg.loc[aki_rows, "PREDICTION_POINT"] = ext_ONSET_SCR_avg.loc[aki_rows, "EARLIEST_ONSET_DATE"]
# Check that every row now carries a prediction point
missing_share = ext_ONSET_SCR_avg.PREDICTION_POINT.isna().mean()
assert missing_share == 0

In [None]:
# Keep SCr measurements taken between 8 and 2 days before the prediction point (both ends inclusive)
latest_allowed = ext_ONSET_SCR_avg.PREDICTION_POINT - pd.Timedelta(days=2)
earliest_allowed = ext_ONSET_SCR_avg.PREDICTION_POINT - pd.Timedelta(days=8)
in_window = (ext_ONSET_SCR_avg.SPECIMEN_DATE <= latest_allowed) & \
            (ext_ONSET_SCR_avg.SPECIMEN_DATE >= earliest_allowed)
ext_ONSET_SCR_avg = ext_ONSET_SCR_avg[in_window]

# Drop encounters with fewer than 2 SCr rows left inside the window:
# count rows per encounter, then exclude the encounters below the threshold
measure_num = ext_ONSET_SCR_avg.groupby('ONSETS_ENCOUNTERID').size()
encounterID_to_drop = measure_num[measure_num < 2].index
has_enough_measurements = ~ext_ONSET_SCR_avg.ONSETS_ENCOUNTERID.isin(encounterID_to_drop)
ext_ONSET_SCR_avg = ext_ONSET_SCR_avg.loc[has_enough_measurements, :]

In [None]:
# Reshape to one row per encounter with one column per day offset (-8 ... -2) holding RESULT_NUM
day_offsets = (ext_ONSET_SCR_avg["SPECIMEN_DATE"] - ext_ONSET_SCR_avg["PREDICTION_POINT"]).dt.days
ext_ONSET_SCR_avg = ext_ONSET_SCR_avg.assign(DAYS_BEFORE_PREDICTION_POINT = day_offsets)

# Skeleton: every (encounter, day offset) pair, so offsets without a measurement still get a cell
unique_encounterids = list(ext_ONSET_SCR_avg['ONSETS_ENCOUNTERID'].unique())
time_window = np.arange(-8, -1)  # from -8 to -2
skeleton = pd.MultiIndex.from_product(
    [unique_encounterids, time_window],
    names=['ONSETS_ENCOUNTERID', 'DAYS_BEFORE_PREDICTION_POINT'],
).to_frame(index=False)
# Fill the skeleton with the observed measurements
skeleton = skeleton.merge(
    ext_ONSET_SCR_avg,
    how='left',
    on=['ONSETS_ENCOUNTERID', 'DAYS_BEFORE_PREDICTION_POINT'],
)

# Pivot; pivot() raises if an encounter has two rows mapped to the same day offset
ONSET_SCR_formatted = (
    skeleton
    .pivot(index='ONSETS_ENCOUNTERID', columns='DAYS_BEFORE_PREDICTION_POINT', values='RESULT_NUM')
    .reset_index()
)

# Bring the encounter-level columns back
encounter_cols = ["PATID", "ONSETS_ENCOUNTERID", "ADMIT_DATE", "DISCHARGE_DATE",
                  "PREDICTION_POINT", "AKI_LABEL"]
ONSET_SCR_app2 = ext_ONSET_SCR_avg[encounter_cols].drop_duplicates()
ext_ONSET_SCR_formatted = ONSET_SCR_formatted.merge(ONSET_SCR_app2, how = "left", on = "ONSETS_ENCOUNTERID")

In [None]:
# One row per patient: order by admission date and keep the first encounter
ext_ONSET_SCR_formatted = (
    ext_ONSET_SCR_formatted
    .sort_values(by=['PATID', 'ADMIT_DATE'])
    .drop_duplicates(subset='PATID', keep='first')
)

In [None]:
# PATID must be str on both sides of the join
ext_LAB["PATID"] = ext_LAB["PATID"].astype(str)
# Attach every lab result of the patient to the retained encounter
ext_ONSET_SCR_LAB = ext_ONSET_SCR_formatted.merge(ext_LAB, how = "left", on = "PATID")
ext_ONSET_SCR_LAB["SPECIMEN_DATE"] = pd.to_datetime(ext_ONSET_SCR_LAB["SPECIMEN_DATE"], format = "mixed")
# Lab window: from 8 days before ADMISSION up to 2 days before the prediction point.
# NOTE(review): the SCr window is anchored on PREDICTION_POINT at both ends, whereas the
# lower bound here uses ADMIT_DATE - confirm this matches the internal pipeline.
lab_latest_allowed = ext_ONSET_SCR_LAB.PREDICTION_POINT - pd.Timedelta(days=2)
lab_earliest_allowed = ext_ONSET_SCR_LAB.ADMIT_DATE - pd.Timedelta(days=8)
lab_in_window = (ext_ONSET_SCR_LAB.SPECIMEN_DATE <= lab_latest_allowed) & \
                (ext_ONSET_SCR_LAB.SPECIMEN_DATE >= lab_earliest_allowed)
ext_ONSET_SCR_LAB = ext_ONSET_SCR_LAB[lab_in_window]

In [None]:
# Keep only the latest result of each lab within the time window: order by date, then take
# the last entry per (patient, encounter, lab). groupby().last() picks the last non-null
# value of each column.
ext_ONSET_SCR_LAB_temp = (
    ext_ONSET_SCR_LAB
    .sort_values(by=['PATID', 'ONSETS_ENCOUNTERID', 'LAB_LOINC', 'SPECIMEN_DATE'])
    .groupby(['PATID', 'ONSETS_ENCOUNTERID', 'LAB_LOINC'])
    .last()
    .reset_index()
)
# Turn the labs into feature columns, one column per LAB_LOINC value
ext_LAB_info = (
    ext_ONSET_SCR_LAB_temp
    .pivot(index='ONSETS_ENCOUNTERID', columns='LAB_LOINC', values='RESULT_NUM')
    .reset_index()
)

In [None]:
# Align the lab feature space with the internal dataset.
# Plain column selection raises KeyError as soon as one lab of LAB_feature_space was never
# recorded at the external site. reindex keeps the same column order and creates an
# all-NaN column for such labs, so they are treated like any other missing value downstream.
missing_labs = [lab for lab in LAB_feature_space if lab not in ext_LAB_info.columns]
if missing_labs:
    # make the gap visible instead of filling it silently
    print("Labs absent from the external data (filled with NaN):", missing_labs)
ext_LAB_info = ext_LAB_info.reindex(columns = ["ONSETS_ENCOUNTERID"] + LAB_feature_space)
# Merge the lab features back onto the per-encounter SCr table
ext_ONSET_SCR_LAB = ext_ONSET_SCR_formatted.merge(ext_LAB_info, on = 'ONSETS_ENCOUNTERID', how = 'left')

# Sample the External Dataset

In [None]:
# Restore the date boundaries persisted earlier with IPython's %store magic
# (presumably by the internal-dataset notebook - confirm where they are stored).
# Only split_date and end_date are used in the cells shown below.
%store -r start_date
%store -r split_date
%store -r end_date

In [None]:
# Keep encounters admitted in [split_date, end_date); the sample size is matched in the next cell
admitted_in_test_period = (ext_ONSET_SCR_LAB.ADMIT_DATE >= split_date) & \
                          (ext_ONSET_SCR_LAB.ADMIT_DATE < end_date)
ext_test = ext_ONSET_SCR_LAB[admitted_in_test_period]

In [None]:
from sklearn.model_selection import train_test_split

# Draw a stratified sample of exactly test_len external encounters (same size as the
# internal test set). The sample size is passed as an integer train_size: deriving a float
# test_size from the lengths can round the sample to test_len - 1 rows.
if len(ext_test) < test_len:
    raise ValueError(
        "The external cohort has %d encounters in the test period, fewer than test_len=%d"
        % (len(ext_test), test_len)
    )
if len(ext_test) == test_len:
    # Nothing to discard; train_test_split cannot produce an empty remainder
    ext_test_sampled = ext_test.copy()
else:
    ext_test_sampled, _ = train_test_split(
        ext_test,
        train_size=test_len,
        random_state=88,
        stratify=ext_test['AKI_LABEL']
    )

In [None]:
# Persist the sampled external test set (row index not written)
ext_test_sampled.to_csv('.../ext_test.csv', index = False)

In [None]:
# Reload the sampled external test set so the notebook can resume from this point.
# After the CSV round trip the column labels are strings and no date columns are parsed.
ext_test_sampled = pd.read_csv('.../ext_test.csv')

In [None]:
# Number the external rows train_len, train_len + 1, ... so that they follow directly
# after the internal training rows once both are stacked
n_external = len(ext_test_sampled)
ext_test_sampled.index = range(train_len, train_len + n_external)

# Read Internal Dataset

In [None]:
# Load the internal dataset and keep its first train_len rows as the training part
int_dataset = pd.read_csv(".../dataset.csv")
expected_rows = train_len + test_len
assert len(int_dataset) == expected_rows
int_train = int_dataset.iloc[:train_len].copy()

In [None]:
# Make the column labels comparable (all str) on both sides
int_train.columns = int_train.columns.map(str)
ext_test_sampled.columns = ext_test_sampled.columns.map(str)
# Restrict the external rows to the model features plus the label
ext_keep_cols = SCR_feature_space + LAB_feature_space + ['AKI_LABEL']
ext_test_sampled = ext_test_sampled[ext_keep_cols]
# Stack internal train on top of external test with a fresh 0..n-1 index
dataset_full = pd.concat([int_train, ext_test_sampled], axis = 0, ignore_index = True)

In [None]:
# Preview the first rows of the stacked dataset
dataset_full.head()

# Compute Data Overlap Rates

In [None]:
# Split the stacked dataset into its SCr and lab feature blocks
SCR_full = dataset_full[SCR_feature_space]
LAB_full = dataset_full[LAB_feature_space]
# 1 where a value is present, 0 where it is missing (input to the overlap-rate computation)
SCR_full_bin, LAB_full_bin = (block.notna().astype(int) for block in (SCR_full, LAB_full))

In [None]:
from utils.Data_Overlap_Rates_Computing import parallel_overlap_matrix_comp, check_matrix_sanity, calculate_overlap_rate_SCR, calculate_overlap_rate_LAB
# Restore the weighting inputs persisted earlier with %store; they are passed to the
# overlap-rate helpers in the next two cells
%store -r normal_distribution_SCR
%store -r lab_overlap_weighting

In [None]:
# Pairwise SCr data overlap rates over all rows (internal train + external test),
# computed in parallel, sanity-checked, summarised and cached to disk
SCR_overlap = parallel_overlap_matrix_comp(SCR_full_bin, num_processors, calculate_overlap_rate_SCR, normal_distribution_SCR)
check_matrix_sanity(SCR_overlap)
print(np.median(SCR_overlap))
print(np.mean(SCR_overlap))
np.save('.../SCR_overlap_external.npy', SCR_overlap)

In [None]:
# Same as above for the lab block, using the lab-specific overlap function and weighting
LAB_overlap = parallel_overlap_matrix_comp(LAB_full_bin, num_processors, calculate_overlap_rate_LAB, lab_overlap_weighting)
check_matrix_sanity(LAB_overlap)
print(np.median(LAB_overlap))
print(np.mean(LAB_overlap))
np.save('.../lab_overlap_external.npy', LAB_overlap)

# Compute Distance

In [None]:
from utils.Distance_Computing import parallel_distance_matrix, get_DTW_distance
from utils.Z_Helping_Functions import translate_dist_mtx_to_simi, fast_argsort, min_max_normalization

In [None]:
# Pairwise DTW distances on the SCr block, which still contains missing values at this
# point (imputation happens further down); the result is cached to disk
SCR_DTW_dist_full = parallel_distance_matrix(SCR_full, num_processors, get_DTW_distance)
np.save('.../SCR_DTW_dist_external.npy', SCR_DTW_dist_full)

In [None]:
# Reload the cached DTW distance matrix (lets the expensive cell above be skipped)
SCR_DTW_dist_full = np.load('.../SCR_DTW_dist_external.npy')

In [None]:
# Transform the distance matrix into a similarity-score matrix by min-max normalization and subtraction from 1
SCR_DTW_simi_full = translate_dist_mtx_to_simi(SCR_DTW_dist_full)
# Sort the similarity-score matrix into an index matrix with the most similar ranked highest.
# This covers the entire dataset, train + test.
SCR_DTW_idx_full = fast_argsort(SCR_DTW_simi_full, num_processors)

Impute Missing Values for SCR and LAB Respectively

In [None]:
# SCr imputation works row by row, so train and test can be processed together:
# linear interpolation along each trajectory, then backward and forward fill for the ends
SCR_full = (
    SCR_full
    .interpolate(method='linear', axis = 1)
    .bfill(axis=1)
    .ffill(axis=1)
)
# Only after imputation is the block split into train and test
SCR_train = SCR_full.iloc[:train_len].copy()
SCR_test = SCR_full.iloc[train_len:].copy()

In [None]:
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer

In [None]:
# To avoid data leakage, the imputer is fitted on the training labs only and then
# applied to both the training and the (external) test labs.
LAB_train = int_train.loc[:, LAB_feature_space].copy(deep = True)
LAB_test = ext_test_sampled.loc[:, LAB_feature_space].copy(deep = True)
# Min-max normalization per lab column.
# NOTE(review): each split is scaled with its OWN min/max, so the test labs are not mapped
# onto the training scale - confirm this is intended. A column whose max equals its min
# becomes NaN here and is filled by the imputer below.
LAB_train = (LAB_train - LAB_train.min(skipna=True)) / (LAB_train.max(skipna=True) - LAB_train.min(skipna=True))
LAB_test = (LAB_test - LAB_test.min(skipna=True)) / (LAB_test.max(skipna=True) - LAB_test.min(skipna=True))
# Impute missing lab values
imputer = IterativeImputer(missing_values=np.nan, max_iter=1000, random_state=42)
imputer.fit(LAB_train)
LAB_train_temp = imputer.transform(LAB_train)
LAB_test_temp = imputer.transform(LAB_test)
# Write the imputed arrays back into the frames, keeping index and column labels
LAB_train.loc[:, :] = LAB_train_temp
LAB_test.loc[:, :] = LAB_test_temp
# Concatenate train and test (row labels continue from train into test)
LAB_full = pd.concat([LAB_train, LAB_test], axis = 0)

Euclidean, Cosine and Manhattan Matrix of SCR and LAB

In [None]:
from utils.Distance_Computing import compute_similarity

In [None]:
# SCR: similarity-score matrix and ordered index matrix per distance measure, computed on
# the imputed SCr block; the last two return values are not needed here
# Euclidean
SCR_Euc_simi_full, SCR_Euc_idx_full, _, _ = compute_similarity(SCR_full, 'euclidean', train_len, num_processors)
# Cosine
SCR_Cos_simi_full, SCR_Cos_idx_full, _, _ = compute_similarity(SCR_full, 'cosine', train_len, num_processors)
# Manhattan
SCR_Manh_simi_full, SCR_Manh_idx_full, _, _ = compute_similarity(SCR_full, 'manhattan', train_len, num_processors)

In [None]:
# LAB: same three measures on the normalised and imputed lab block
# Euclidean
LAB_Euc_simi_full, LAB_Euc_idx_full, _, _ = compute_similarity(LAB_full, 'euclidean', train_len, num_processors)
# Cosine
LAB_Cos_simi_full, LAB_Cos_idx_full, _, _ = compute_similarity(LAB_full, 'cosine', train_len, num_processors)
# Manhattan
LAB_Manh_simi_full, LAB_Manh_idx_full, _, _ = compute_similarity(LAB_full, 'manhattan', train_len, num_processors)

In [None]:
# "simi" is the un-ordered, normalized similarity score matrix; "idx" is the ordered patient index matrix.
# "nw" stands for: not weighted by the data overlap rates matrix.
nw_fea_arrs_dict = {
    "SCR": {
        measure: {"simi": {"full": simi_mtx}, "idx": {"full": idx_mtx}}
        for measure, simi_mtx, idx_mtx in (
            ("DTW", SCR_DTW_simi_full, SCR_DTW_idx_full),
            ("Euc", SCR_Euc_simi_full, SCR_Euc_idx_full),
            ("Cos", SCR_Cos_simi_full, SCR_Cos_idx_full),
            ("Manh", SCR_Manh_simi_full, SCR_Manh_idx_full),
        )
    },
    "LAB": {
        measure: {"simi": {"full": simi_mtx}, "idx": {"full": idx_mtx}}
        for measure, simi_mtx, idx_mtx in (
            ("Euc", LAB_Euc_simi_full, LAB_Euc_idx_full),
            ("Cos", LAB_Cos_simi_full, LAB_Cos_idx_full),
            ("Manh", LAB_Manh_simi_full, LAB_Manh_idx_full),
        )
    },
}

# Data Overlap Rates Weighting

In [None]:
from utils.Distance_Computing import overlap_rates_weighting

In [None]:
# Read the pre-computed pairwise data overlap rates (written by the overlap cells above)
SCR_overlap_full = np.load('.../SCR_overlap_external.npy')
LAB_overlap_full = np.load('.../lab_overlap_external.npy')

In [None]:
# On the full data (for testing): combine each unweighted similarity matrix with the
# overlap-rate matrix of its feature type; every call returns the weighted similarity
# matrix together with the corresponding ordered index matrix
# SCR
SCR_DTW_simi_wt_full, SCR_DTW_idx_wt_full = overlap_rates_weighting(SCR_overlap_full, nw_fea_arrs_dict["SCR"]["DTW"]["simi"]["full"], num_processors)
SCR_Euc_simi_wt_full, SCR_Euc_idx_wt_full = overlap_rates_weighting(SCR_overlap_full, nw_fea_arrs_dict["SCR"]["Euc"]["simi"]["full"], num_processors)
SCR_Cos_simi_wt_full, SCR_Cos_idx_wt_full = overlap_rates_weighting(SCR_overlap_full, nw_fea_arrs_dict["SCR"]["Cos"]["simi"]["full"], num_processors)
SCR_Manh_simi_wt_full, SCR_Manh_idx_wt_full = overlap_rates_weighting(SCR_overlap_full, nw_fea_arrs_dict["SCR"]["Manh"]["simi"]["full"], num_processors)

# LAB
LAB_Euc_simi_wt_full, LAB_Euc_idx_wt_full = overlap_rates_weighting(LAB_overlap_full, nw_fea_arrs_dict["LAB"]["Euc"]["simi"]["full"], num_processors)
LAB_Cos_simi_wt_full, LAB_Cos_idx_wt_full = overlap_rates_weighting(LAB_overlap_full, nw_fea_arrs_dict["LAB"]["Cos"]["simi"]["full"], num_processors)
LAB_Manh_simi_wt_full, LAB_Manh_idx_wt_full = overlap_rates_weighting(LAB_overlap_full, nw_fea_arrs_dict["LAB"]["Manh"]["simi"]["full"], num_processors)

In [None]:
# "simi" is the un-ordered, normalized similarity score matrix; "idx" is the ordered patient index matrix.
# "wt" stands for: weighted by the data overlap rates matrix.
wt_fea_arrs_dict = {
    "SCR": {
        measure: {"simi": {"full": simi_mtx}, "idx": {"full": idx_mtx}}
        for measure, simi_mtx, idx_mtx in (
            ("DTW", SCR_DTW_simi_wt_full, SCR_DTW_idx_wt_full),
            ("Euc", SCR_Euc_simi_wt_full, SCR_Euc_idx_wt_full),
            ("Cos", SCR_Cos_simi_wt_full, SCR_Cos_idx_wt_full),
            ("Manh", SCR_Manh_simi_wt_full, SCR_Manh_idx_wt_full),
        )
    },
    "LAB": {
        measure: {"simi": {"full": simi_mtx}, "idx": {"full": idx_mtx}}
        for measure, simi_mtx, idx_mtx in (
            ("Euc", LAB_Euc_simi_wt_full, LAB_Euc_idx_wt_full),
            ("Cos", LAB_Cos_simi_wt_full, LAB_Cos_idx_wt_full),
            ("Manh", LAB_Manh_simi_wt_full, LAB_Manh_idx_wt_full),
        )
    },
}

# Test the Best Distance Measures on Test Set Using KNN/LR

In [None]:
# Row labels of the internal-train and external-test parts of the stacked dataset
train_idx = dataset_full.index[:train_len]
test_idx = dataset_full.index[train_len:]
# Labels: external test only, and the full stack (train followed by test)
y_test = np.array(ext_test_sampled['AKI_LABEL'])
y_full = np.array(dataset_full['AKI_LABEL'])

In [None]:
from utils.Testing import process_idx_arr_for_test

In [None]:
# SCR: for every distance measure, turn the full index matrix into the per-test-row
# neighbour indices and labels, once for the unweighted and once for the weighted matrices
SCR_idx_y_nw_dict_test = {}
SCR_idx_y_wt_dict_test = {}
scr_sources_and_targets = (
    (nw_fea_arrs_dict["SCR"], SCR_idx_y_nw_dict_test),
    (wt_fea_arrs_dict["SCR"], SCR_idx_y_wt_dict_test),
)
for fea_arrs, idx_y_dict in scr_sources_and_targets:
    for dist_measure, arrs in tqdm(fea_arrs.items()):
        idx_arr_test_clean, y_test_arr = process_idx_arr_for_test(train_idx, test_idx, arrs["idx"]["full"], y_full)
        idx_y_dict[dist_measure] = {"idx": idx_arr_test_clean, "label": y_test_arr}

In [None]:
# LAB: same processing as for SCR, on the lab index matrices
LAB_idx_y_nw_dict_test = {}
LAB_idx_y_wt_dict_test = {}
lab_sources_and_targets = (
    (nw_fea_arrs_dict["LAB"], LAB_idx_y_nw_dict_test),
    (wt_fea_arrs_dict["LAB"], LAB_idx_y_wt_dict_test),
)
for fea_arrs, idx_y_dict in lab_sources_and_targets:
    for dist_measure, arrs in tqdm(fea_arrs.items()):
        idx_arr_test_clean, y_test_arr = process_idx_arr_for_test(train_idx, test_idx, arrs["idx"]["full"], y_full)
        idx_y_dict[dist_measure] = {"idx": idx_arr_test_clean, "label": y_test_arr}

In [None]:
# Reduced grid of neighbourhood sizes to test: 10, 20, ..., 200
k_sizes_test = list(range(10, 201, 10))
print(len(k_sizes_test))

In [None]:
from utils.Testing import evluate_on_test_set

In [None]:
# KNN on the external test set over all k in k_sizes_test, SCr neighbours ranked by DTW.
# "DTW" / "Euc" presumably select the SCr / lab entries of the idx dicts - confirm in utils.Testing.
# DTW exists for SCr only, so the second (lab) result is discarded.
SCR_DTW_control_KNN, _ = evluate_on_test_set(SCR_train, SCR_test, LAB_train, LAB_test, "DTW", "Euc", 
                                             y_test, k_sizes_test, SCR_idx_y_nw_dict_test, SCR_idx_y_wt_dict_test, 
                                             LAB_idx_y_nw_dict_test, LAB_idx_y_wt_dict_test, "KNN")

In [None]:
# Same comparison with the "LR" model option instead of KNN; the lab result is discarded again
SCR_DTW_control_LR, _ = evluate_on_test_set(SCR_train, SCR_test, LAB_train, LAB_test, "DTW", "Euc", 
                                             y_test, k_sizes_test, SCR_idx_y_nw_dict_test, SCR_idx_y_wt_dict_test, 
                                             LAB_idx_y_nw_dict_test, LAB_idx_y_wt_dict_test, "LR")

In [None]:
# Euclidean for both feature types, KNN; both the SCr and the lab result are kept
SCR_Euc_control_KNN, LAB_Euc_control_KNN = evluate_on_test_set(SCR_train, SCR_test, LAB_train, LAB_test, "Euc", "Euc", y_test, k_sizes_test, 
                                                               SCR_idx_y_nw_dict_test, SCR_idx_y_wt_dict_test, 
                                                               LAB_idx_y_nw_dict_test, LAB_idx_y_wt_dict_test, "KNN")

In [None]:
# Euclidean for both feature types, "LR" model option
SCR_Euc_control_LR, LAB_Euc_control_LR = evluate_on_test_set(SCR_train, SCR_test, LAB_train, LAB_test, "Euc", "Euc", 
                                                             y_test, k_sizes_test, SCR_idx_y_nw_dict_test, SCR_idx_y_wt_dict_test, 
                                                             LAB_idx_y_nw_dict_test, LAB_idx_y_wt_dict_test, "LR")

In [None]:
# Cosine for both feature types, KNN
SCR_Cos_control_KNN, LAB_Cos_control_KNN = evluate_on_test_set(SCR_train, SCR_test, LAB_train, LAB_test, "Cos", "Cos", 
                                                               y_test, k_sizes_test, SCR_idx_y_nw_dict_test, SCR_idx_y_wt_dict_test, 
                                                               LAB_idx_y_nw_dict_test, LAB_idx_y_wt_dict_test, "KNN")

In [None]:
# Cosine for both feature types, "LR" model option
SCR_Cos_control_LR, LAB_Cos_control_LR = evluate_on_test_set(SCR_train, SCR_test, LAB_train, LAB_test, "Cos", "Cos", 
                                                               y_test, k_sizes_test, SCR_idx_y_nw_dict_test, SCR_idx_y_wt_dict_test, 
                                                               LAB_idx_y_nw_dict_test, LAB_idx_y_wt_dict_test, "LR")

In [None]:
# Manhattan for both feature types, KNN
SCR_Manh_control_KNN, LAB_Manh_control_KNN = evluate_on_test_set(SCR_train, SCR_test, LAB_train, LAB_test, "Manh", "Manh", 
                                                                 y_test, k_sizes_test, SCR_idx_y_nw_dict_test, SCR_idx_y_wt_dict_test, 
                                                                 LAB_idx_y_nw_dict_test, LAB_idx_y_wt_dict_test, "KNN")

In [None]:
# Manhattan for both feature types, "LR" model option
SCR_Manh_control_LR, LAB_Manh_control_LR = evluate_on_test_set(SCR_train, SCR_test, LAB_train, LAB_test, "Manh", "Manh", 
                                                                y_test, k_sizes_test, SCR_idx_y_nw_dict_test, SCR_idx_y_wt_dict_test, 
                                                                LAB_idx_y_nw_dict_test, LAB_idx_y_wt_dict_test, "LR")

In [None]:
# Collect every unweighted-vs-weighted evaluation result under the name of its variable
NW_WT_performace_results = {
    "SCR_DTW_control_KNN": SCR_DTW_control_KNN,
    "SCR_DTW_control_LR": SCR_DTW_control_LR,
    "SCR_Euc_control_KNN": SCR_Euc_control_KNN,
    "LAB_Euc_control_KNN": LAB_Euc_control_KNN,
    "SCR_Euc_control_LR": SCR_Euc_control_LR,
    "LAB_Euc_control_LR": LAB_Euc_control_LR,
    "SCR_Cos_control_KNN": SCR_Cos_control_KNN,
    "LAB_Cos_control_KNN": LAB_Cos_control_KNN,
    "SCR_Cos_control_LR": SCR_Cos_control_LR,
    "LAB_Cos_control_LR": LAB_Cos_control_LR,
    "SCR_Manh_control_KNN": SCR_Manh_control_KNN,
    "LAB_Manh_control_KNN": LAB_Manh_control_KNN,
    "SCR_Manh_control_LR": SCR_Manh_control_LR,
    "LAB_Manh_control_LR": LAB_Manh_control_LR,
}

# Save to a JSON file. The output folder is created first: open(..., "w") does not create
# missing directories, and failing here would lose the results of the long evaluation cells.
results_path = "./Results_dict/External_validation/NW_WT_performace_results.json"
os.makedirs(os.path.dirname(results_path), exist_ok=True)
with open(results_path, "w") as json_file:
    json.dump(NW_WT_performace_results, json_file, indent=4)  # Use indent for readability

# Plot and Prove: Data Overlap Rates Weighting can Improve Performance in External Validation

In [None]:
from utils.Plotting import plot_metric_along_k, add_subplot_index, save_figure

In [None]:
# Reload the saved evaluation results so the plots can be redrawn without re-running the
# evaluation cells, then unpack them into the variable names used by the plotting cells
with open("./Results_dict/External_validation/NW_WT_performace_results.json", "r") as json_file:
    NW_WT_performace_results = json.load(json_file)
SCR_DTW_control_KNN = NW_WT_performace_results["SCR_DTW_control_KNN"]
SCR_DTW_control_LR = NW_WT_performace_results["SCR_DTW_control_LR"]
SCR_Euc_control_KNN = NW_WT_performace_results["SCR_Euc_control_KNN"]
LAB_Euc_control_KNN = NW_WT_performace_results["LAB_Euc_control_KNN"]
SCR_Euc_control_LR = NW_WT_performace_results["SCR_Euc_control_LR"]
LAB_Euc_control_LR = NW_WT_performace_results["LAB_Euc_control_LR"]
SCR_Cos_control_KNN = NW_WT_performace_results["SCR_Cos_control_KNN"]
LAB_Cos_control_KNN = NW_WT_performace_results["LAB_Cos_control_KNN"]
SCR_Cos_control_LR = NW_WT_performace_results["SCR_Cos_control_LR"]
LAB_Cos_control_LR = NW_WT_performace_results["LAB_Cos_control_LR"]
SCR_Manh_control_KNN = NW_WT_performace_results["SCR_Manh_control_KNN"]
LAB_Manh_control_KNN = NW_WT_performace_results["LAB_Manh_control_KNN"]
SCR_Manh_control_LR = NW_WT_performace_results["SCR_Manh_control_LR"]
LAB_Manh_control_LR = NW_WT_performace_results["LAB_Manh_control_LR"]

In [None]:
# SCr / KNN: unweighted (NW) vs overlap-weighted (WT) AUPRC along k, one panel per distance measure
fig, axs = plt.subplots(2, 2, figsize=(11, 8))

metric = "AUPRC"

# (axis, result dict, title template) per panel, in row-major order
panels = [
    (axs[0,0], SCR_DTW_control_KNN, "SCr-KNN, DTW-AROW: %s"),
    (axs[0,1], SCR_Euc_control_KNN, "SCr-KNN, Euclidean: %s"),
    (axs[1,0], SCR_Cos_control_KNN, "SCr-KNN, Cosine: %s"),
    (axs[1,1], SCR_Manh_control_KNN, "SCr-KNN, Manhattan: %s"),
]
for ax, result, title_template in panels:
    plot_metric_along_k(ax, k_sizes_test, result["NW"][metric], result["WT"][metric], title_template%(metric), metric, 'KNN')
add_subplot_index(axs, 2, 2)
plt.tight_layout()
plt.show()
save_figure(fig, figure_folder, "SCR-KNN-AUPRC")

In [None]:
# SCr / KNN: unweighted (NW) vs overlap-weighted (WT) AUROC along k, one panel per distance measure
fig, axs = plt.subplots(2, 2, figsize=(11, 8))

metric = "AUROC"

# (axis, result dict, title template) per panel, in row-major order
panels = [
    (axs[0,0], SCR_DTW_control_KNN, "SCr-KNN, DTW-AROW: %s"),
    (axs[0,1], SCR_Euc_control_KNN, "SCr-KNN, Euclidean: %s"),
    (axs[1,0], SCR_Cos_control_KNN, "SCr-KNN, Cosine: %s"),
    (axs[1,1], SCR_Manh_control_KNN, "SCr-KNN, Manhattan: %s"),
]
for ax, result, title_template in panels:
    plot_metric_along_k(ax, k_sizes_test, result["NW"][metric], result["WT"][metric], title_template%(metric), metric, 'KNN')
add_subplot_index(axs, 2, 2)
plt.tight_layout()
plt.show()
save_figure(fig, figure_folder, "SCR-KNN-AUROC")

In [None]:
# SCr / LR: unweighted (NW) vs overlap-weighted (WT) AUPRC along k, one panel per distance measure
fig, axs = plt.subplots(2, 2, figsize=(11, 8))

metric = "AUPRC"

# (axis, result dict, title template) per panel, in row-major order
panels = [
    (axs[0,0], SCR_DTW_control_LR, "SCr-LR, DTW-AROW: %s"),
    (axs[0,1], SCR_Euc_control_LR, "SCr-LR, Euclidean: %s"),
    (axs[1,0], SCR_Cos_control_LR, "SCr-LR, Cosine: %s"),
    (axs[1,1], SCR_Manh_control_LR, "SCr-LR, Manhattan: %s"),
]
for ax, result, title_template in panels:
    plot_metric_along_k(ax, k_sizes_test, result["NW"][metric], result["WT"][metric], title_template%(metric), metric, "LR")
add_subplot_index(axs, 2, 2)
plt.tight_layout()
plt.show()
save_figure(fig, figure_folder, "SCR-LR-AUPRC")

In [None]:
# SCr / LR: unweighted (NW) vs overlap-weighted (WT) AUROC along k, one panel per distance measure
fig, axs = plt.subplots(2, 2, figsize=(11, 8))

metric = "AUROC"

# (axis, result dict, title template) per panel, in row-major order
panels = [
    (axs[0,0], SCR_DTW_control_LR, "SCr-LR, DTW-AROW: %s"),
    (axs[0,1], SCR_Euc_control_LR, "SCr-LR, Euclidean: %s"),
    (axs[1,0], SCR_Cos_control_LR, "SCr-LR, Cosine: %s"),
    (axs[1,1], SCR_Manh_control_LR, "SCr-LR, Manhattan: %s"),
]
for ax, result, title_template in panels:
    plot_metric_along_k(ax, k_sizes_test, result["NW"][metric], result["WT"][metric], title_template%(metric), metric, "LR")
add_subplot_index(axs, 2, 2)
plt.tight_layout()
plt.show()
save_figure(fig, figure_folder, "SCR-LR-AUROC")

In [None]:
# Lab / KNN: unweighted (NW) vs overlap-weighted (WT) AUPRC along k, one panel per distance measure
fig, axs = plt.subplots(1, 3, figsize=(12.5, 3.5))

metric = "AUPRC"

# (axis, result dict, title template) per panel, left to right
panels = [
    (axs[0], LAB_Euc_control_KNN, "Lab-KNN, Euclidean: %s"),
    (axs[1], LAB_Cos_control_KNN, "Lab-KNN, Cosine: %s"),
    (axs[2], LAB_Manh_control_KNN, "Lab-KNN, Manhattan: %s"),
]
for ax, result, title_template in panels:
    plot_metric_along_k(ax, k_sizes_test, result["NW"][metric], result["WT"][metric], title_template%(metric), metric, "KNN")
add_subplot_index(axs, 1, 3)
plt.tight_layout()
plt.show()
save_figure(fig, figure_folder, "LAB-KNN-AUPRC")

In [None]:
# Lab / KNN: unweighted (NW) vs overlap-weighted (WT) AUROC along k, one panel per distance measure
fig, axs = plt.subplots(1, 3, figsize=(12.5, 3.5))

metric = "AUROC"

# (axis, result dict, title template) per panel, left to right
panels = [
    (axs[0], LAB_Euc_control_KNN, "Lab-KNN, Euclidean: %s"),
    (axs[1], LAB_Cos_control_KNN, "Lab-KNN, Cosine: %s"),
    (axs[2], LAB_Manh_control_KNN, "Lab-KNN, Manhattan: %s"),
]
for ax, result, title_template in panels:
    plot_metric_along_k(ax, k_sizes_test, result["NW"][metric], result["WT"][metric], title_template%(metric), metric, "KNN")
add_subplot_index(axs, 1, 3)
plt.tight_layout()
plt.show()
save_figure(fig, figure_folder, "LAB-KNN-AUROC")

In [None]:
# Lab / LR: unweighted (NW) vs overlap-weighted (WT) AUPRC along k, one panel per distance measure
fig, axs = plt.subplots(1, 3, figsize=(12.5, 3.5))

metric = "AUPRC"

# (axis, result dict, title template) per panel, left to right
panels = [
    (axs[0], LAB_Euc_control_LR, "Lab-LR, Euclidean: %s"),
    (axs[1], LAB_Cos_control_LR, "Lab-LR, Cosine: %s"),
    (axs[2], LAB_Manh_control_LR, "Lab-LR, Manhattan: %s"),
]
for ax, result, title_template in panels:
    plot_metric_along_k(ax, k_sizes_test, result["NW"][metric], result["WT"][metric], title_template%(metric), metric, "LR")
add_subplot_index(axs, 1, 3)
plt.tight_layout()
plt.show()
save_figure(fig, figure_folder, "LAB-LR-AUPRC")

In [None]:
# Lab / LR: unweighted (NW) vs overlap-weighted (WT) AUROC along k, one panel per distance measure
fig, axs = plt.subplots(1, 3, figsize=(12.5, 3.5))

metric = "AUROC"

# (axis, result dict, title template) per panel, left to right
panels = [
    (axs[0], LAB_Euc_control_LR, "Lab-LR, Euclidean: %s"),
    (axs[1], LAB_Cos_control_LR, "Lab-LR, Cosine: %s"),
    (axs[2], LAB_Manh_control_LR, "Lab-LR, Manhattan: %s"),
]
for ax, result, title_template in panels:
    plot_metric_along_k(ax, k_sizes_test, result["NW"][metric], result["WT"][metric], title_template%(metric), metric, "LR")
add_subplot_index(axs, 1, 3)
plt.tight_layout()
plt.show()
save_figure(fig, figure_folder, "LAB-LR-AUROC")

# Test Final Performance

In [None]:
from utils.Testing import test_final_personalized_model, get_best_weights, combine_best_weights_for_test, KNN, predict_by_LR

In [None]:
# Load the saved grid-search table and keep, for every column, its most
# frequent entry (first mode when several values tie).
grid_search_table = pd.read_csv(".../grid_search_table_imput2_LR.csv", index_col = 0)
best_distance_measures = {
    _column: _column_values.mode()[0]
    for _column, _column_values in grid_search_table.items()
}

In [None]:
# For each feature type, pick the "simi"/"full" entry stored under the distance
# measure selected in best_distance_measures ("<feature> NW" / "<feature> WT").
opt_measure_simi_nw_full_dict = {
    _fea: nw_fea_arrs_dict[_fea][best_distance_measures[_fea + " NW"]]["simi"]["full"]
    for _fea in ("SCR", "LAB")
}
opt_measure_simi_wt_full_dict = {
    _fea: wt_fea_arrs_dict[_fea][best_distance_measures[_fea + " WT"]]["simi"]["full"]
    for _fea in ("SCR", "LAB")
}

# Keep the individual names available as well.
opt_SCR_simi_nw_full = opt_measure_simi_nw_full_dict["SCR"]
opt_LAB_simi_nw_full = opt_measure_simi_nw_full_dict["LAB"]
opt_SCR_simi_wt_full = opt_measure_simi_wt_full_dict["SCR"]
opt_LAB_simi_wt_full = opt_measure_simi_wt_full_dict["LAB"]

In [None]:
# Build the model inputs by placing the SCR and LAB feature frames side by side.
X_train = pd.concat([SCR_train, LAB_train], axis="columns")
X_test = pd.concat([SCR_test, LAB_test], axis="columns")

# Sanity checks: same number of feature columns, and no missing values in either split.
assert len(X_train.columns) == len(X_test.columns)
assert not X_train.isna().to_numpy().any(), "The DataFrame contains NaN values!"
assert not X_test.isna().to_numpy().any(), "The DataFrame contains NaN values!"

Performance 1: Optimized distance measure + optimized feature type weights + no overlap rates weighting

In [None]:
# Performance 1, KNN variant: evaluated with the NW similarity dict.
final_model_performance_nw_KNN = test_final_personalized_model(
    X_train,
    X_test,
    k_sizes_test,
    grid_search_table,
    train_idx,
    test_idx,
    y_full,
    y_test,
    opt_measure_simi_nw_full_dict,
    num_processors,
    "KNN",
    False,  # presumably the overlap-rate weighting switch (off here) — TODO confirm in utils.Testing
)

In [None]:
# Performance 1, LR variant: evaluated with the NW similarity dict.
final_model_performance_nw_LR = test_final_personalized_model(
    X_train,
    X_test,
    k_sizes_test,
    grid_search_table,
    train_idx,
    test_idx,
    y_full,
    y_test,
    opt_measure_simi_nw_full_dict,
    num_processors,
    "LR",
    False,  # presumably the overlap-rate weighting switch (off here) — TODO confirm in utils.Testing
)

Performance 2: Optimized distance measure + optimized feature type weights + overlap rates weighting

In [None]:
# Performance 2, KNN variant: evaluated with the WT similarity dict.
final_model_performance_wt_KNN = test_final_personalized_model(
    X_train,
    X_test,
    k_sizes_test,
    grid_search_table,
    train_idx,
    test_idx,
    y_full,
    y_test,
    opt_measure_simi_wt_full_dict,
    num_processors,
    "KNN",
    True,  # presumably the overlap-rate weighting switch (on here) — TODO confirm in utils.Testing
)

In [None]:
# Performance 2, LR variant: evaluated with the WT similarity dict.
final_model_performance_wt_LR = test_final_personalized_model(
    X_train,
    X_test,
    k_sizes_test,
    grid_search_table,
    train_idx,
    test_idx,
    y_full,
    y_test,
    opt_measure_simi_wt_full_dict,
    num_processors,
    "LR",
    True,  # presumably the overlap-rate weighting switch (on here) — TODO confirm in utils.Testing
)

Performance 3: Fixed distance measures (DTW for SCR, Euclidean for LAB) + fixed feature type weights (looked up at k = 20) + no overlap rates weighting

In [None]:
# Baseline with fixed similarity measures (SCR -> "DTW", LAB -> "Euc") and the
# feature-type weights looked up once at base_k, evaluated for every k in k_sizes_test.
base_k = 20
base_fix_distance_performance_nw_KNN = dict(AUPRC=[], AUROC=[])
base_fix_distance_performance_nw_LR = dict(AUPRC=[], AUROC=[])

# NOTE(security): eval() executes whatever string get_best_weights returns, and that
# string is derived from the grid-search CSV. Only run this on a trusted table;
# consider ast.literal_eval if the stored value is a plain literal — TODO confirm format.
A_nw, B_nw = eval(get_best_weights(grid_search_table, base_k, False))

base_fix_SCR_simi_nw_full = nw_fea_arrs_dict["SCR"]["DTW"]["simi"]["full"]
base_fix_LAB_simi_nw_full = nw_fea_arrs_dict["LAB"]["Euc"]["simi"]["full"]
fix_combined_weights_dict_nw = combine_best_weights_for_test(
    base_fix_SCR_simi_nw_full,
    base_fix_LAB_simi_nw_full,
    A_nw,
    B_nw,
    train_idx,
    test_idx,
    y_full,
    num_processors,
)

# Both helpers return their scores in (AUPRC, AUROC) order.
for k in tqdm(k_sizes_test):
    for _metric_name, _score in zip(("AUPRC", "AUROC"), KNN(fix_combined_weights_dict_nw, k, y_test)):
        base_fix_distance_performance_nw_KNN[_metric_name].append(_score)
    for _metric_name, _score in zip(("AUPRC", "AUROC"), predict_by_LR(X_train, X_test, fix_combined_weights_dict_nw, k, y_test)):
        base_fix_distance_performance_nw_LR[_metric_name].append(_score)

Performance 4: Optimized distance measure + fixed feature type weights, k = 20 + no overlap rates weighting

In [None]:
# Baseline with the selected (optimized) distance measures but fixed feature-type weights.
# A_nw and B_nw are not defined in this cell: they are reused from the Performance 3
# cell, which therefore has to run first.
base_opt_distance_performance_nw_KNN = dict(AUPRC=[], AUROC=[])
base_opt_distance_performance_nw_LR = dict(AUPRC=[], AUROC=[])

base_opt_SCR_simi_nw_full = nw_fea_arrs_dict["SCR"][best_distance_measures["SCR NW"]]["simi"]["full"]
base_opt_LAB_simi_nw_full = nw_fea_arrs_dict["LAB"][best_distance_measures["LAB NW"]]["simi"]["full"]
# Rebinds fix_combined_weights_dict_nw, replacing the value built for Performance 3.
fix_combined_weights_dict_nw = combine_best_weights_for_test(
    base_opt_SCR_simi_nw_full,
    base_opt_LAB_simi_nw_full,
    A_nw,
    B_nw,
    train_idx,
    test_idx,
    y_full,
    num_processors,
)

# Both helpers return their scores in (AUPRC, AUROC) order.
for k in tqdm(k_sizes_test):
    for _metric_name, _score in zip(("AUPRC", "AUROC"), KNN(fix_combined_weights_dict_nw, k, y_test)):
        base_opt_distance_performance_nw_KNN[_metric_name].append(_score)
    for _metric_name, _score in zip(("AUPRC", "AUROC"), predict_by_LR(X_train, X_test, fix_combined_weights_dict_nw, k, y_test)):
        base_opt_distance_performance_nw_LR[_metric_name].append(_score)

Performance 5: Global Euclidean Distance

In [None]:
# Build a normalized copy of the feature matrices for the global-distance baselines.
# Deep copies are used so that X_train / X_test themselves are left untouched.
X_train_norm = X_train.copy(deep = True)
X_test_norm = X_test.copy(deep = True) 
# NOTE(review): train and test are passed to min_max_normalization separately, so each
# split appears to be scaled with its own statistics — confirm this is intended rather
# than reusing the training min/max for the test rows.
X_train_norm.loc[:, :] = min_max_normalization(X_train_norm, axis = 0)
X_test_norm.loc[:, :] = min_max_normalization(X_test_norm, axis = 0)
# Stack the training rows on top of the test rows.
X_full_norm = pd.concat([X_train_norm, X_test_norm], axis = 0)

In [None]:
# Global Euclidean baseline: run compute_similarity on the normalized train + test matrix
# and keep only the second of its four return values.
# NOTE(review): binding the discarded values to `_` rebinds the name IPython uses for
# "last output" in this kernel.
_, X_Euc_idx_full, _, _ = compute_similarity(X_full_norm, 'euclidean', train_len, num_processors)
# presumably maps each test row to its training-set neighbour indices and their labels — TODO confirm
Euc_idx_arr_test_glob, Euc_y_test_arr_glob = process_idx_arr_for_test(train_idx, test_idx, X_Euc_idx_full, y_full)
# Bundle both arrays under the "idx" / "label" keys; this dict is what the evaluation
# loop passes to KNN and predict_by_LR.
glob_Euc_idx_y_dict_test = {"idx": Euc_idx_arr_test_glob, "label": Euc_y_test_arr_glob}

In [None]:
# Evaluate the global Euclidean baseline for every k with both KNN and personalized LR.
# Note that predict_by_LR receives the un-normalized X_train / X_test.
base_glob_Euc_performance_KNN = dict(AUPRC=[], AUROC=[])
base_glob_Euc_performance_LR = dict(AUPRC=[], AUROC=[])

# Both helpers return their scores in (AUPRC, AUROC) order.
for k in tqdm(k_sizes_test):
    for _metric_name, _score in zip(("AUPRC", "AUROC"), KNN(glob_Euc_idx_y_dict_test, k, y_test)):
        base_glob_Euc_performance_KNN[_metric_name].append(_score)
    for _metric_name, _score in zip(("AUPRC", "AUROC"), predict_by_LR(X_train, X_test, glob_Euc_idx_y_dict_test, k, y_test)):
        base_glob_Euc_performance_LR[_metric_name].append(_score)

Performance 6: Global Cosine Distance

In [None]:
# Global Cosine baseline: run compute_similarity on the normalized train + test matrix
# and keep only the second of its four return values.
# NOTE(review): binding the discarded values to `_` rebinds the name IPython uses for
# "last output" in this kernel.
_, X_Cos_idx_full, _, _ = compute_similarity(X_full_norm, 'cosine', train_len, num_processors)
# presumably maps each test row to its training-set neighbour indices and their labels — TODO confirm
Cos_idx_arr_test_glob, Cos_y_test_arr_glob = process_idx_arr_for_test(train_idx, test_idx, X_Cos_idx_full, y_full)
# Bundle both arrays under the "idx" / "label" keys; this dict is what the evaluation
# loop passes to KNN and predict_by_LR.
glob_Cos_idx_y_dict_test = {"idx": Cos_idx_arr_test_glob, "label": Cos_y_test_arr_glob}

In [None]:
# Evaluate the global Cosine baseline for every k with both KNN and personalized LR.
# Note that predict_by_LR receives the un-normalized X_train / X_test.
base_glob_Cos_performance_KNN = dict(AUPRC=[], AUROC=[])
base_glob_Cos_performance_LR = dict(AUPRC=[], AUROC=[])

# Both helpers return their scores in (AUPRC, AUROC) order.
for k in tqdm(k_sizes_test):
    for _metric_name, _score in zip(("AUPRC", "AUROC"), KNN(glob_Cos_idx_y_dict_test, k, y_test)):
        base_glob_Cos_performance_KNN[_metric_name].append(_score)
    for _metric_name, _score in zip(("AUPRC", "AUROC"), predict_by_LR(X_train, X_test, glob_Cos_idx_y_dict_test, k, y_test)):
        base_glob_Cos_performance_LR[_metric_name].append(_score)

# Plot the Final Performance of Personalized Models 

In [None]:
# Persist every performance dict to its own JSON file so the plots below can be
# regenerated without re-running the evaluation cells.
# Each keyword is both the variable name and the stem of the output file.
dicts_to_save = dict(
    base_glob_Euc_performance_KNN=base_glob_Euc_performance_KNN,
    base_glob_Cos_performance_KNN=base_glob_Cos_performance_KNN,
    base_fix_distance_performance_nw_KNN=base_fix_distance_performance_nw_KNN,
    base_opt_distance_performance_nw_KNN=base_opt_distance_performance_nw_KNN,
    final_model_performance_nw_KNN=final_model_performance_nw_KNN,
    final_model_performance_wt_KNN=final_model_performance_wt_KNN,
    base_glob_Euc_performance_LR=base_glob_Euc_performance_LR,
    base_glob_Cos_performance_LR=base_glob_Cos_performance_LR,
    base_fix_distance_performance_nw_LR=base_fix_distance_performance_nw_LR,
    base_opt_distance_performance_nw_LR=base_opt_distance_performance_nw_LR,
    final_model_performance_nw_LR=final_model_performance_nw_LR,
    final_model_performance_wt_LR=final_model_performance_wt_LR,
)

# Target folder; created on first use, reused afterwards.
output_dir = "./Results_dict/External_validation/"
os.makedirs(output_dir, exist_ok=True)

# One pretty-printed (4-space indent) JSON file per dict.
for filename, data in dicts_to_save.items():
    file_path = os.path.join(output_dir, filename + ".json")
    with open(file_path, "w") as json_file:
        json.dump(data, json_file, indent=4)
    print(f"Saved {filename} to {file_path}")

In [None]:
# Reload the saved performance dicts so the plotting cells can run without
# recomputing the evaluation above.

# Directory containing the JSON files
input_dir = "./Results_dict/External_validation/"

# Dictionary to store the loaded data: variable name -> loaded object
loaded_dicts = {}

# Sorted listing keeps the load order deterministic across platforms.
for filename in sorted(os.listdir(input_dir)):
    # Split off only the final extension; str.replace would also strip ".json"
    # occurring in the middle of a file name.
    variable_name, _extension = os.path.splitext(filename)
    if _extension != ".json":  # Process only JSON files
        continue
    if not variable_name.isidentifier():
        # Such a file could never be referenced as a notebook variable; skip it
        # instead of injecting an unusable key into the global namespace.
        print(f"Skipped {filename}: '{variable_name}' is not a valid variable name")
        continue
    file_path = os.path.join(input_dir, filename)
    with open(file_path, "r") as json_file:
        loaded_dicts[variable_name] = json.load(json_file)

# Expose every loaded dict under its file stem, as the plotting cells expect.
# Done once at the end so a failed load does not leave a half-updated namespace.
# NOTE(review): this overwrites any existing global of the same name — keep the
# results folder limited to files written by the save cell.
globals().update(loaded_dicts)

In [None]:
from utils.Plotting import plot_final_performance_metrics

In [None]:
# 2 x 2 summary figure: rows are the model families (KNN, personalized LR),
# columns are the metrics (AUPRC, AUROC). Each panel receives the same six
# result dicts in the order: global Euclidean, global Cosine, fixed-distance
# baseline, optimized-distance baseline, final NW model, final WT model.
fig, axs = plt.subplots(2, 2, figsize=(13, 9.5))

_knn_curves = (
    base_glob_Euc_performance_KNN,
    base_glob_Cos_performance_KNN,
    base_fix_distance_performance_nw_KNN,
    base_opt_distance_performance_nw_KNN,
    final_model_performance_nw_KNN,
    final_model_performance_wt_KNN,
)
_lr_curves = (
    base_glob_Euc_performance_LR,
    base_glob_Cos_performance_LR,
    base_fix_distance_performance_nw_LR,
    base_opt_distance_performance_nw_LR,
    final_model_performance_nw_LR,
    final_model_performance_wt_LR,
)

for _row, (_curves, _model_label) in enumerate(((_knn_curves, "KNN"), (_lr_curves, "Personalized LR"))):
    for _col, _metric_name in enumerate(("AUPRC", "AUROC")):
        plot_final_performance_metrics(axs[_row, _col], k_sizes_test, _metric_name, *_curves, _model_label)

add_subplot_index(axs, 2, 2)
plt.tight_layout()
plt.show()
# NOTE(review): the figure is saved after plt.show(); this relies on save_figure
# writing from the `fig` handle rather than the current pyplot figure — confirm.
save_figure(fig, figure_folder, "Final_model_performance_external")