In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid", context="notebook",font_scale=1.2)
import pandas as pd
# disable setting with copy warning
pd.options.mode.chained_assignment = None
import numpy as np
import os
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
import matplotlib.pyplot as plt
import pydicom
import json

In [3]:
# PV selects which pair of subgroup names is used
# (presumably "protected variable" - TODO confirm the abbreviation).
PV = "age"
# "age" -> old/young; any other value -> male/female.
groups = ["old", "young"] if PV == "age" else ["male", "female"]

In [4]:
def _first_file_with_prefix(folder, prefix):
    """Return the alphabetically first entry of ``folder`` whose name starts with ``prefix``."""
    matches = sorted(name for name in os.listdir(folder) if name.startswith(prefix))
    if not matches:
        # IndexError (rather than FileNotFoundError) matches what a plain
        # ``[0]`` lookup on an empty list raises, so callers see the same type.
        raise IndexError(f"no file starting with {prefix!r} found in {folder!r}")
    return matches[0]


def load_csvs_into_df(dir):
    """Load the per-seed result and sample CSVs found below ``dir``.

    Expected layout, as read by this function::

        dir/<anything>_<int>/seed*/test_results*   (CSV)
        dir/<anything>_<int>/seed*/train_loader*   (CSV)

    The integer after the last underscore of each run directory is stored in
    a ``train_loader_num`` column on both frames. Directories and files are
    visited in sorted order so that the row order of the returned frames does
    not depend on the arbitrary order of ``os.listdir``.

    Args:
        dir: Directory containing one sub-directory per run.

    Returns:
        Tuple ``(results, samples)`` of two DataFrames: all ``test_results*``
        CSVs concatenated, and all ``train_loader*`` CSVs concatenated.

    Raises:
        ValueError: If a run directory does not end in ``_<int>``, or if no
            CSVs were found at all (raised by ``pd.concat``).
        IndexError: If a seed directory lacks one of the two expected files.
    """
    run_dirs = sorted(
        os.path.join(dir, name)
        for name in os.listdir(dir)
        if os.path.isdir(os.path.join(dir, name))
    )
    result_frames = []
    sample_frames = []
    for run_dir in run_dirs:
        # get last number from directory name
        train_loader_num = int(run_dir.split('_')[-1])
        # Only directories count as seeds; a stray file whose name happens to
        # start with "seed" is skipped instead of breaking os.listdir below.
        seed_names = sorted(
            name
            for name in os.listdir(run_dir)
            if name.startswith('seed') and os.path.isdir(os.path.join(run_dir, name))
        )
        for seed_name in seed_names:
            seed_dir = os.path.join(run_dir, seed_name)
            results_file = _first_file_with_prefix(seed_dir, 'test_results')
            samples_file = _first_file_with_prefix(seed_dir, 'train_loader')
            results_df = pd.read_csv(os.path.join(seed_dir, results_file))
            sample_df = pd.read_csv(os.path.join(seed_dir, samples_file))
            results_df["train_loader_num"] = train_loader_num
            sample_df["train_loader_num"] = train_loader_num
            result_frames.append(results_df)
            sample_frames.append(sample_df)
    return (
        pd.concat(result_frames, ignore_index=True),
        pd.concat(sample_frames, ignore_index=True),
    )

In [5]:
def get_subsets(df, df_samples, model:str, dataset:str, score_variables, top_k=500):
    """Collect the highest-scoring rows separately for each score column.

    ``df`` is reduced to the score columns plus ``train_loader_num`` and
    ``model``, joined with ``df_samples`` on ``train_loader_num``, and the
    joined frame is then ranked once per score column (highest first).
    Every ranking starts from the full joined frame, so the rows chosen for
    one score column do not restrict the rows available to the next.

    Args:
        df: Results frame containing every column named in
            ``score_variables`` plus ``train_loader_num`` and ``model``.
        df_samples: Frame with the columns ``train_loader_num``,
            ``index_mapping``, ``labels``, ``meta`` and ``filenames``.
        model: Not used by the function; kept so existing calls keep working.
        dataset: Not used by the function; kept so existing calls keep working.
        score_variables: Names of the score columns to rank by.
        top_k: Maximum number of rows kept per score column.

    Returns:
        dict mapping each score column name to a dict with the keys
        ``idx_map``, ``labels``, ``meta``, ``filenames`` and ``scores``;
        each value is a list ordered by descending score.
    """
    # Accept any iterable of column names (tuple, list, ...).
    score_variables = list(score_variables)
    # join the selected result columns with the sample information
    merged = df[score_variables + ["train_loader_num", "model"]].merge(
        df_samples, on="train_loader_num"
    )
    subset_by_score_var = {}
    for score_var in score_variables:
        # Non-mutating, stable sort: ``merged`` stays intact for the next
        # score column and ties keep a reproducible order.
        top_rows = merged.sort_values(
            by=score_var, ascending=False, kind="mergesort"
        ).head(top_k)
        subset_by_score_var[score_var] = {
            "idx_map": list(top_rows["index_mapping"]),
            "labels": list(top_rows["labels"]),
            "meta": list(top_rows["meta"]),
            "filenames": list(top_rows["filenames"]),
            "scores": list(top_rows[score_var]),
        }
    return subset_by_score_var

In [6]:
# Container for the selected subsets; entries are added per dataset key
# and the whole mapping is written to JSON at the end of the notebook.
subsets = dict()

# RSNA Balanced

In [7]:
# Root folder holding the logged runs (relative to the working directory).
current_file_dir = "src/logs_persist/dataset_distillation/distillation_paper"
# Folder of the specific run whose CSVs are loaded below.
path_to_logs = os.path.join(
    current_file_dir,
    "2023-11-03 14:14:39-FAE-rsna-balanced-bs32-balanced-dataset-distillation-fullmodel-nsamples1-noDP",
)

In [8]:
# Score columns used for ranking: the overall AUROC followed by the
# per-subgroup AUROC columns.
score_vars = [
    "test/AUROC",
    "test/old_subgroupAUROC",
    "test/young_subgroupAUROC",
    "test/male_subgroupAUROC",
    "test/female_subgroupAUROC",
]

In [10]:
# Read the result and sample tables from the log directory, then tag every
# result row with the model name.
df_fae, df_samples_fae = load_csvs_into_df(path_to_logs)
df_fae = df_fae.assign(model="FAE")

In [11]:
# Store the per-score selections for this dataset under the "rsna" key.
subsets["rsna"] = get_subsets(
    df=df_fae,
    df_samples=df_samples_fae,
    model="FAE",
    dataset="RSNA",
    score_variables=score_vars,
)

## Save

In [12]:
# Write the collected subsets to disk as JSON. The path is a plain string:
# it has no placeholders, so an f-string prefix would be misleading.
with open("src/subsets.json", "w") as out_file:
    json.dump(subsets, out_file)