In [None]:
import ast
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from matplotlib import ticker

from constants import cat_name_mapping, exclude_models_w_mae
from notebooks.helper import save_or_show

# Use seaborn's 'ticks' axes style for every figure in this notebook.
sns.set_style(style='ticks')

In [None]:
# When True, the output directory below is created up front.
SAVE = False

# Directory that receives the outputs of this notebook.
# NOTE(review): absolute, machine-specific path; the cluster equivalent was
# /home/space/diverse_priors/results/plots/distribution_similarities
storing_path = Path(
    '/Users/lciernik/Documents/TUB/projects/divers_prios/results/analysis_model_similarities_across_datasets/distribution_similarities'
)

if SAVE:
    storing_path.mkdir(parents=True, exist_ok=True)

In [None]:
from helper import load_model_configs_and_allowed_models

# Load the model configuration and the collection of models to analyse.
# NOTE(review): `model_configs` is later indexed with `.loc[model_id, columns]`
# and `allowed_models` is used with `Series.isin`; their exact types come from
# the project helper - confirm there.
model_configs, allowed_models = load_model_configs_and_allowed_models(
    exclude_alignment=True,
    exclude_models=[],
    path='../scripts/models_config_wo_barlowtwins_n_alignment.json',
)

# Pair column of the similarity table -> column name used to index `model_configs`.
orig_cols = {
    'Objective pair': 'objective',
    'Architecture pair': 'architecture_class',
    'Dataset pair': 'dataset_class',
    'Model size pair': 'size_class',
}

In [None]:
# Root directory of the aggregated results.
# NOTE(review): absolute, machine-specific path; the cluster equivalent was
# /home/space/diverse_priors/results/aggregated
base_path_aggregated = Path('/Users/lciernik/Documents/TUB/projects/divers_prios/results/aggregated')

# Similarity table; later cells read the model columns, the category-pair
# columns, 'Similarity metric' and 'Similarity value' from it.
sim_data = pd.read_csv(
    base_path_aggregated.joinpath('model_sims/all_metric_ds_model_pair_similarity.csv')
)

In [None]:
# Keep only the pairs in which both models are allowed. `.copy()` turns the
# filtered slice into an independent frame, so the column assignments made in
# the following cells do not raise pandas' SettingWithCopyWarning.
sim_data = sim_data[sim_data['Model 1'].isin(allowed_models) & sim_data['Model 2'].isin(allowed_models)].copy()

In [None]:
# Category-pair columns analysed below, kept in alphabetical order so the
# subplot columns always appear in the same sequence.
pair_columns = ['Objective pair', 'Architecture pair', 'Dataset pair', 'Model size pair']
pair_columns.sort()

In [None]:
# Turn each pair column from its textual form into display names.
# NOTE(review): the cells are presumably the repr of a 2-tuple of category
# keys (e.g. "('a', 'b')") - confirm against the code that writes the CSV.
for col in pair_columns:
    # `ast.literal_eval` only accepts Python literals, so unlike `eval` a
    # malformed or malicious cell in the CSV cannot execute arbitrary code.
    parsed_pairs = sim_data[col].apply(ast.literal_eval)
    # Display name of the category of the first / second model of each pair.
    sim_data[f"M1 {col}"] = parsed_pairs.apply(lambda pair: cat_name_mapping[pair[0]])
    sim_data[f"M2 {col}"] = parsed_pairs.apply(lambda pair: cat_name_mapping[pair[1]])
    # Human-readable label of the pair, used as the x-axis category in the plots.
    sim_data[col] = parsed_pairs.apply(
        lambda pair: f"{cat_name_mapping[pair[0]]} – {cat_name_mapping[pair[1]]}"
    )

In [None]:
# Make sure every pair label is a plain `str` before it is sorted and plotted.
for pair_col in pair_columns:
    sim_data[pair_col] = sim_data[pair_col].map(str)

In [None]:
def get_box_plt_sim_distributions(all_data, curr_pair_columns):
    """Plot the distribution of similarity values per category pair.

    Builds a grid of seaborn boxen plots with one row per distinct
    'Similarity metric' in ``all_data`` and one column per entry of
    ``curr_pair_columns``. Within a panel the x axis lists the values of the
    pair column and the y axis shows 'Similarity value'.

    Parameters
    ----------
    all_data : pandas.DataFrame
        Needs the columns 'Similarity metric', 'Similarity value' and every
        column named in ``curr_pair_columns``.
    curr_pair_columns : list of str
        Columns to group by; each one becomes a subplot column.

    Returns
    -------
    matplotlib.figure.Figure
        The figure holding the grid. It is neither shown nor saved here.

    Raises
    ------
    ValueError
        If ``all_data`` contains no similarity metric or
        ``curr_pair_columns`` is empty, because there is nothing to draw.
    """
    # Missing metric names are dropped so that the number of rows always
    # matches the metrics iterated over below.
    metrics = all_data['Similarity metric'].dropna().unique()
    n = len(metrics)
    m = len(curr_pair_columns)
    if n == 0 or m == 0:
        raise ValueError(
            f"Nothing to plot: found {n} similarity metric(s) and {m} pair column(s)."
        )

    cm = 0.393701  # centimetres -> inches; matplotlib figure sizes are in inches
    fontsize_title = 16
    fontsize_lbl = 16
    fontsize_ticks = 14

    # `squeeze=False` keeps `axes` two-dimensional even when the grid has a
    # single row or column, so `axes[i, j]` is always valid.
    fig, axes = plt.subplots(
        n, m,
        figsize=(12 * cm * m, 6 * cm * n),
        sharey=True,
        sharex='col',
        squeeze=False,
    )

    for i, metric in enumerate(metrics):
        # The rows of one metric are shared by every panel of this grid row.
        metric_data = all_data[all_data['Similarity metric'] == metric]
        for j, col in enumerate(curr_pair_columns):
            ax = axes[i, j]

            # Sort so the categories appear in a stable order on the x axis.
            data = metric_data.sort_values(by=col)
            sns.boxenplot(
                data=data,
                x=col,
                y='Similarity value',
                ax=ax,
                hue=col,
                palette='tab10',
            )
            ax.tick_params(axis='x',  # x axis only
                           which='major',  # major ticks only
                           rotation=90,
                           labelsize=fontsize_ticks)
            ax.yaxis.set_major_formatter(ticker.FormatStrFormatter('%.2f'))

            # Column titles only on the top row, metric labels only on the
            # left-most column; the axes are shared, so repeats add nothing.
            title = col if i == 0 else ''
            ax.set_title(title, fontsize=fontsize_title)
            ax.set_xlabel('', fontsize=fontsize_lbl)

            ylbl = f'{metric}\nSimilarity value' if j == 0 else ''
            ax.set_ylabel(ylbl, fontsize=fontsize_lbl)

    fig.subplots_adjust(wspace=0.1, hspace=0.1)
    return fig


from matplotlib.backends.backend_pdf import PdfPages
# The PDF is written regardless of `SAVE`, so make sure its directory exists
# before the file is opened.
storing_path.mkdir(parents=True, exist_ok=True)

# Multi-page PDF collecting the figures of this notebook; it stays open until
# `pdf.close()` is called in a later cell.
pdf = PdfPages(storing_path / 'output.pdf')

# First page: all model pairs, no category fixed.
fig = get_box_plt_sim_distributions(sim_data, pair_columns)
fig.suptitle("No fixed models", fontsize=16)
pdf.savefig(fig, bbox_inches='tight')

In [None]:
# Every setting pins one or more pair columns to a single category value.
# 'fix_cols' lists the pinned columns and 'fix_vals' the value for each of
# them, in the same order.
settings_to_fix_cat = [
    {'fix_cols': list(fixed), 'fix_vals': list(fixed.values())}
    for fixed in (
        # one fixed category
        {'Architecture pair': 'CNN'},
        {'Architecture pair': 'TX'},
        {'Dataset pair': 'IN1k'},
        {'Dataset pair': 'XLarge DS'},
        {'Model size pair': 'small'},
        {'Model size pair': 'medium'},
        {'Model size pair': 'large'},
        {'Objective pair': 'Img-Txt'},
        {'Objective pair': 'SSL'},
        {'Objective pair': 'Sup'},
        # two fixed categories
        {'Architecture pair': 'CNN', 'Objective pair': 'Sup'},
        {'Architecture pair': 'TX', 'Objective pair': 'Sup'},
        {'Dataset pair': 'IN1k', 'Architecture pair': 'CNN'},
        {'Dataset pair': 'IN1k', 'Architecture pair': 'TX'},
        {'Dataset pair': 'Large DS', 'Architecture pair': 'TX'},
        {'Dataset pair': 'IN1k', 'Objective pair': 'SSL'},
        {'Dataset pair': 'IN1k', 'Objective pair': 'Sup'},
        {'Dataset pair': 'IN21k', 'Objective pair': 'Sup'},
    )
]


In [None]:
# Setting name -> list of (model id, [category names of the non-fixed columns]).
all_model_sets = {}

for setting in settings_to_fix_cat:
    fixed_vals = setting['fix_vals']
    setting_name = '_'.join(fixed_vals)

    # Boolean filtering returns new frames, so `sim_data` itself is never modified.
    subset_data = sim_data
    curr_pair_cols = pair_columns.copy()
    for col_name, fix_value in zip(setting['fix_cols'], fixed_vals):
        # Both models of a pair must belong to the fixed category.
        both_match = (subset_data[f'M1 {col_name}'] == fix_value) & (subset_data[f'M2 {col_name}'] == fix_value)
        subset_data = subset_data[both_match]
        # A fixed column has a single value left, so it is not plotted.
        curr_pair_cols.remove(col_name)

    if subset_data.empty:
        # Nothing to plot; skip instead of failing mid-loop with the PDF still open.
        print(f"No model pairs left for fixed values: {', '.join(fixed_vals)} - skipping.")
        all_model_sets[setting_name] = []
        continue

    # Every model that appears on either side of a remaining pair.
    model_ids = sorted(set(subset_data['Model 1']) | set(subset_data['Model 2']))
    free_config_cols = [orig_cols[c] for c in curr_pair_cols]
    models_with_categories = [
        (mid, [cat_name_mapping[k] for k in model_configs.loc[mid, free_config_cols].to_list()])
        for mid in model_ids
    ]
    print("Models:")
    for mid, categories in models_with_categories:
        print(f"Model: {mid}, {categories}")
    all_model_sets[setting_name] = models_with_categories

    fig = get_box_plt_sim_distributions(subset_data, curr_pair_cols)
    fig.suptitle(f"Fixed values: {', '.join(fixed_vals)}", fontsize=16)
    pdf.savefig(fig, bbox_inches='tight')
    # `plt.show` does not take a figure (its only parameter is keyword-only
    # `block`); it displays the open figures.
    plt.show()
    # Release the figure so the loop does not keep every figure in memory.
    plt.close(fig)

In [None]:
# Close the multi-page PDF that the cells above saved their figures into.
pdf.close()

In [None]:
import json

# Store the model sets of every setting next to the PDF, as indented JSON.
with (storing_path / 'output.json').open('w') as out_file:
    out_file.write(json.dumps(all_model_sets, indent=4))