In [19]:
# Plotting libraries; sns.set() below applies a global seaborn theme.
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid", context="notebook",font_scale=1.2)
import pandas as pd
# Disable pandas' SettingWithCopyWarning globally.
# NOTE(review): this also hides genuine chained-assignment mistakes in later cells.
pd.options.mode.chained_assignment = None
import numpy as np
import os
import warnings
# Silence every FutureWarning for the rest of the session.
warnings.filterwarnings("ignore", category=FutureWarning)
# NOTE(review): duplicate of the matplotlib import at the top of this cell; harmless.
import matplotlib.pyplot as plt
import pydicom
import json

In [20]:
# PV selects which pair of subgroup labels is used below:
# "age" -> old/young, any other value -> male/female.
PV = "age"
groups = ["old", "young"] if PV == "age" else ["male", "female"]

In [21]:
def _first_with_prefix(folder, prefix):
    """Return the path of the (alphabetically) first entry in ``folder`` starting with ``prefix``."""
    matches = sorted(f for f in os.listdir(folder) if f.startswith(prefix))
    if not matches:
        raise FileNotFoundError(
            "no file starting with {!r} found in {!r}".format(prefix, folder)
        )
    return os.path.join(folder, matches[0])


def load_csvs_into_df(dir):
    """Collect the per-seed result and sample CSVs of every run folder under ``dir``.

    Layout expected by the lookups below::

        dir/NAME_INT/seed*/test_results*    (CSV read into the results frame)
        dir/NAME_INT/seed*/train_loader*    (CSV read into the samples frame)

    The trailing integer of each run folder name (the part after the last
    underscore) is stored in a ``train_loader_num`` column on both frames.

    Run folders and seed folders are visited in sorted order so that the row
    order of the returned frames does not depend on ``os.listdir`` ordering.
    Entries named ``seed*`` that are not directories are skipped.

    Parameters
    ----------
    dir : str
        Directory containing one sub-directory per run.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(results, samples)``: the concatenated ``test_results*`` frames and
        the concatenated ``train_loader*`` frames, both with a fresh index.

    Raises
    ------
    FileNotFoundError
        If a seed folder lacks a ``test_results*`` or ``train_loader*`` file.
    ValueError
        If a run folder name does not end in an integer, or if no seed folder
        was found anywhere under ``dir``.
    """
    run_dirs = sorted(
        os.path.join(dir, name)
        for name in os.listdir(dir)
        if os.path.isdir(os.path.join(dir, name))
    )
    dfs = []
    sample_dfs = []
    for run_dir in run_dirs:
        # The run number is the last '_'-separated token of the folder name.
        train_loader_num = int(os.path.basename(run_dir).split('_')[-1])
        seed_dirs = sorted(
            os.path.join(run_dir, name)
            for name in os.listdir(run_dir)
            if name.startswith('seed') and os.path.isdir(os.path.join(run_dir, name))
        )
        for seed_dir in seed_dirs:
            results_df = pd.read_csv(_first_with_prefix(seed_dir, 'test_results'))
            sample_df = pd.read_csv(_first_with_prefix(seed_dir, 'train_loader'))
            results_df["train_loader_num"] = train_loader_num
            sample_df["train_loader_num"] = train_loader_num
            dfs.append(results_df)
            sample_dfs.append(sample_df)
    if not dfs:
        # pd.concat([]) would raise an opaque "No objects to concatenate".
        raise ValueError("no seed folders with result CSVs found under {!r}".format(dir))
    return pd.concat(dfs, ignore_index=True), pd.concat(sample_dfs, ignore_index=True)

In [22]:
# Read the results and sample frames from the run directory below, then tag
# both frames with num_samples = 1.
df_1, df_samples_1 = load_csvs_into_df(
    './2023-10-13 11:56:26-FAE-rsna-age-bs32-dataset-distillation-nsamples1-noDP'
)
for _frame in (df_1, df_samples_1):
    _frame["num_samples"] = 1

In [23]:
# Keep the two subgroup AUROC columns plus the join key, add their sum, and
# attach the sample information of the matching train loader.
relevant_cols = ["test/lungOpacity_old_subgroupAUROC","test/lungOpacity_young_subgroupAUROC", "train_loader_num"]
df_1 = (
    df_1[relevant_cols]
    # assign() returns a new frame, so no column is written into a selection
    # of the original frame (the pattern SettingWithCopyWarning flags).
    .assign(
        AUROC_sum=lambda frame: frame["test/lungOpacity_old_subgroupAUROC"]
        + frame["test/lungOpacity_young_subgroupAUROC"]
    )
    # Inner join (pandas default) with the sample frame on the run number.
    .merge(df_samples_1, on="train_loader_num")
)

In [24]:
# Ranking criteria used below: each subgroup AUROC on its own, then their sum.
score_vars = [
    "test/lungOpacity_old_subgroupAUROC",
    "test/lungOpacity_young_subgroupAUROC",
    "AUROC_sum",
]

In [25]:
# For every ranking criterion, record the `subset_size` rows with the lowest
# ("min") or highest ("max") score, then dump all subsets to subsets.json.
subset_sizes = [1,3,5,10,15,25,30,35,40,45,50]
subsets = []
for mode in ["min","max"]:
    for score_var in score_vars:
        # Sort once per (mode, score_var) instead of once per subset size.
        # mergesort is stable, so tied scores keep df_1's row order and every
        # smaller subset is a prefix of the larger ones. The sort works on a
        # copy; df_1 itself is left unmodified.
        ranked = df_1.sort_values(by=score_var, ascending=(mode != "max"), kind="mergesort")
        for subset_size in subset_sizes:
            top_rows = ranked.head(subset_size)
            subsets.append({
                "mode": mode,
                "score_var": score_var,
                # .tolist() yields plain Python values, which json can serialise.
                "filenames": top_rows["filenames"].tolist(),
                "scores": top_rows[score_var].tolist(),
                "size": subset_size
            })
with open('subsets.json', 'w') as outfile:
    json.dump(subsets, outfile)

In [26]:
# Combined subsets: for each mode and size, pair the first two single-score
# subsets (everything except "AUROC_sum") into one entry keyed by score
# variable, then dump the result to subsets_combined.json.
combined_sets = []
for mode in ["min","max"]:
    for subset_size in [1,3,5,10,15,25,30,35,40,45,50]:
        # Entries of `subsets` matching this mode and size, excluding the summed score.
        subset = [
            entry for entry in subsets
            if entry["mode"] == mode
            and entry["size"] == subset_size
            and entry["score_var"] != "AUROC_sum"
        ]
        pair = (subset[0], subset[1])
        combined_set = {
            "mode": mode,
            "score_vars": [entry["score_var"] for entry in pair],
            "filenames": {entry["score_var"]: entry["filenames"] for entry in pair},
            "scores": {entry["score_var"]: entry["scores"] for entry in pair},
            # Two subsets of `subset_size` entries each.
            "size": subset_size*2
        }
        combined_sets.append(combined_set)
with open('subsets_combined.json', 'w') as outfile:
    json.dump(combined_sets, outfile)