In [29]:
import os
import pickle
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import ProcessPoolExecutor, as_completed
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import sys
sys.path.append(str(Path.cwd().parent))

In [30]:
# Folder that holds the pickled result files
directory = Path("../results/classification/quantitative")

# Keep only the directory entries whose extension is exactly ".pkl"
pkl_files = list(filter(lambda entry: entry.suffix == ".pkl", directory.iterdir()))

# Efficient loader
def load_file(file_path):
    """Load one pickled result file and return it keyed by its file name.

    Parameters
    ----------
    file_path : pathlib.Path
        Path of the pickle file to read. ``file_path.name`` is used both in
        the return value and in the diagnostic messages.

    Returns
    -------
    tuple or None
        ``(file_path.name, data)`` when the file unpickles to a ``dict``.
        ``None`` when the file cannot be read/unpickled or when the payload
        is not a ``dict``; a message is printed in each of those cases.
    """
    try:
        # NOTE(security): pickle.load can execute arbitrary code embedded in
        # the file, so this must only be pointed at trusted result files.
        with open(file_path, "rb") as f:
            data = pickle.load(f)
    except (EOFError, pickle.UnpicklingError) as e:
        # Typical for empty, truncated or corrupt pickle files.
        print(f"Warning: Failed to load {file_path.name}: {e}")
        return None
    except Exception as e:
        # Best-effort loader: report anything else and skip the file.
        print(f"Unexpected error with {file_path.name}: {e}")
        return None

    if not isinstance(data, dict):
        # Previously such files vanished without a trace; make the skip visible.
        print(
            f"Warning: Skipping {file_path.name}: "
            f"expected a dict, got {type(data).__name__}"
        )
        return None

    return file_path.name, data

# Load files using multithreading (I/O bound).
# The paths are sorted and consumed through executor.map, which yields results
# in the order of its inputs. This keeps the insertion order of `all_data`
# (and therefore the row order of anything built from it) identical on every
# run instead of depending on which thread happens to finish first.
all_data = {}
with ThreadPoolExecutor() as executor:
    for result in executor.map(load_file, sorted(pkl_files)):
        if result:  # load_file returns None for files it could not use
            filename, data = result
            all_data[filename] = data

# File names of the successfully loaded results, in load order
files = list(all_data.keys())


In [31]:
# Peek at one loaded result dictionary (the last one inserted into `all_data`)
# to see which keys a run records. Looking it up through `all_data`/`files`
# keeps this cell runnable top-to-bottom on a fresh kernel instead of relying
# on a loop variable that is only bound by a cell further down.
all_data[files[-1]]

{'prox_method': 'rfgap',
 'conformity_k': 50,
 'random_state': 732181,
 'oob_score_': 0.964824120603015,
 'diff_proba_auc': 0.9882716008358128,
 'diff_proba_auc_test': 0.9827427757289594,
 'conformity_auc': 0.996740796032977,
 'conformity_auc_test': 0.9930149849153908,
 'ice_auc': 0.9723633543323503,
 'ice_auc_test': 0.972585042506119}

In [32]:
# Gather one result dictionary per loaded pickle file.
# The loop variables keep the names `file` and `dictionary` because other
# cells in this notebook display them after the loop has run.
records = []
for file, dictionary in all_data.items():
    records += [dictionary]

# Build the combined frame from the collected dictionaries
df = pd.DataFrame(data=records)

# Report (rows, columns) of the combined frame
n_rows, n_cols = df.shape
print((n_rows, n_cols))



(2933, 13)


In [33]:
# Show the file name left bound by the `for file, dictionary in all_data.items()`
# loop, i.e. the last key iterated from `all_data`.
file

'wdbc_rfgap_k5_rs732181.pkl'

In [None]:
# Main methods: RF-ICE, Conformity (based on three proximity types), difference in probabilities (independent of proximities)

In [43]:
# List the column labels of the combined results frame
df.columns

Index(['prox_method', 'conformity_k', 'random_state', 'oob_score_',
       'diff_proba_auc', 'diff_proba_auc_test', 'conformity_auc',
       'conformity_auc_test', 'ice_auc', 'ice_auc_test', 'name', 'n_features',
       'n_samples'],
      dtype='object')

In [53]:
# Mean ice_auc and oob_score_ for every (name, random_state, prox_method) group
(
    df
    .groupby(['name', 'random_state', 'prox_method'])[['ice_auc', 'oob_score_']]
    .mean()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,ice_auc,oob_score_
name,random_state,prox_method,Unnamed: 3_level_1,Unnamed: 4_level_1
analcatdata_authorship,54887,oob,0.983180,0.989796
analcatdata_authorship,54887,original,0.983180,0.989796
analcatdata_authorship,54887,rfgap,0.983180,0.989796
analcatdata_authorship,110269,oob,0.978248,0.991497
analcatdata_authorship,110269,original,0.978248,0.991497
...,...,...,...,...
wdbc,671156,original,0.977950,0.957286
wdbc,671156,rfgap,0.977950,0.957286
wdbc,732181,oob,0.972363,0.964824
wdbc,732181,original,0.972363,0.964824


In [39]:
# n_samples next to conformity_k, ordered from the largest conformity_k down
(
    df
    .loc[:, ['n_samples', 'conformity_k']]
    .sort_values(by=['conformity_k'], ascending=False)
)

Unnamed: 0,n_samples,conformity_k
2024,1055,200
2114,1055,200
2108,1055,200
2317,958,200
2316,958,200
...,...,...
1080,1000,1
1073,1000,1
1069,1000,1
1068,1000,1
