In [1]:
import os
import sys
from pathlib import Path

os.chdir(Path(os.getcwd()).parents[0] / "src")
sys.path.append(os.getcwd())

import glob
import numpy as np
import torch
import pandas as pd

from scipy.stats import pearsonr
from scipy.stats import sem

from utils.plot_utils import *


def NormalizeData(data, min, max):
    return (data - min) / ((max - min) + 0.00000000001)

  from .autonotebook import tqdm as notebook_tqdm


### Import Evaluation Scores

In [2]:
# File loading per dataset

file_image_inet = "/Image/eval_imagenet_dataset.npz"
file_image_oct = "/Image/eval_oct_dataset.npz"
file_image_r45 = "/Image/eval_resisc45_dataset.npz"

file_volume_adr = "/Volume/eval_AdrenalMNIST3D_dataset.npz"
file_volume_org = "/Volume/eval_OrganMNIST3D_dataset.npz"
file_volume_ves = "/Volume/eval_VesselMNIST3D_dataset.npz"

file_pc_coma = "/Point_Cloud/eval_coma_dataset.npz"
file_pc_m40 = "/Point_Cloud/eval_modelnet40_dataset.npz"
file_pc_shpn = "/Point_Cloud/eval_shapenet_dataset.npz"

file_loc = str(Path(os.getcwd()).parents[0]) + "/data/evaluation"

file = np.load(file_loc + file_image_inet, allow_pickle=True)
arr_image_inet = [file["arr_0"], file["arr_1"], file["arr_2"]]
file = np.load(file_loc + file_image_oct, allow_pickle=True)
arr_image_oct = [file["arr_0"], file["arr_1"], file["arr_2"]]
file = np.load(file_loc + file_image_r45, allow_pickle=True)
arr_image_r45 = [file["arr_0"], file["arr_1"], file["arr_2"]]

file = np.load(file_loc + file_volume_adr, allow_pickle=True)
arr_volume_adr = [file["arr_0"], file["arr_1"], file["arr_2"]]
file = np.load(file_loc + file_volume_org, allow_pickle=True)
arr_volume_org = [file["arr_0"], file["arr_1"], file["arr_2"]]
file = np.load(file_loc + file_volume_ves, allow_pickle=True)
arr_volume_ves = [file["arr_0"], file["arr_1"], file["arr_2"]]

file = np.load(file_loc + file_pc_coma, allow_pickle=True)
arr_pc_coma = [file["arr_0"], file["arr_1"], file["arr_2"]]
file = np.load(file_loc + file_pc_m40, allow_pickle=True)
arr_pc_m40 = [file["arr_0"], file["arr_1"], file["arr_2"]]
file = np.load(file_loc + file_pc_shpn, allow_pickle=True)
arr_pc_shpn = [file["arr_0"], file["arr_1"], file["arr_2"]]

### Ranking Computation

In [48]:
# Full Ranking
arr_image = [arr_image_inet, arr_image_oct, arr_image_r45]
arr_volume = [arr_volume_adr, arr_volume_org, arr_volume_ves]
arr_pc = [arr_pc_coma, arr_pc_m40, arr_pc_shpn]
arr_modalities = [arr_image, arr_volume, arr_pc]

arr_ranking = np.empty(
    [3, 3, 14, 20], dtype=float
)  # , dataset, model, xai, eval
arr_ranking[:] = np.nan

bup_order = [0, 1, 2, 4, 5, 7, 9, 12, 17]


for dataset in range(3):
    for model in range(3):
        for xai in range(14):
            for eval in range(20):
                ranking = np.median(
                    arr_modalities[0][dataset][model][:14, eval, :], -1
                ).argsort()  # compute ranking based on median obs score
                if eval in bup_order:
                    ranking = ranking[
                        ::-1
                    ]  # reverse ranking to bottom up if larger is better

                pos = (
                    ranking.argsort()[xai] + 1
                )  # get rankin position of xai method (+1 so ranking starts at 1 and not 0)
                arr_ranking[dataset, model, xai, eval] = pos

In [66]:
l = []
for i in range(3):
    for j in range(14):
        l.append(np.std(arr_ranking[0,i,j,17:])) # datasets , models

np.round(np.mean(l),2)

0.29

In [72]:
F = 3.1
R = 2.48
C = 0.51

np.round(0.5 * F + (7/20) * R + (3/20) * C,2)

2.49

### Variance test versus random 

In [40]:
from scipy.stats import randint, levene
import scipy
alpha = 0.1

arr_sign_test = np.empty(
    [3, 3, 14, 3], dtype=int # dataset, model, xai methods, eval_criteria
)
random_sample = randint.rvs(0,15,size = 999999)


for crit in range(3):
    for modality in range(2):
        for method in range(14):
            for model in range(3):
                for dataset in range(3):
                    metric_sample = [arr_ranking[dataset, model, method, :10], arr_ranking[dataset, model, method, 10:17], arr_ranking[dataset, model, method, 17:20]][crit]

                    if metric_sample.var() > 16.25:
                        arr_sign_test[dataset, model, method, crit] = 0
                    else:
                        test_pvalue = levene(random_sample, metric_sample, center = "median").pvalue
                        arr_sign_test[dataset, model, method, crit] = 1 if test_pvalue < alpha else 0

KeyboardInterrupt: 

In [5]:
table_sign_test = np.empty(
    [14, 3], dtype=float # xai methods, eval_criteria
)

for method in range(14):
    for crit in range(3):
        table_sign_test[method, crit] = np.round(arr_sign_test[:,:,method, crit].mean(),2)

table_sign_test = pd.DataFrame(table_sign_test).transpose()

table_sign_test = table_sign_test.append(pd.DataFrame(np.round(np.average(table_sign_test, axis=0, weights=[0.5, 0.35, 0.15]), 2).reshape(1,-1)), ignore_index=True)
table_sign_test["mean"] = np.round(table_sign_test.mean(axis=1), 2)

table_sign_test.columns = [
    "OC",
    "LI",
    "KS",
    "VG",
    "IxG",
    "GB",
    "GC",
    "SC",
    "C+",
    "IG",
    "EG",
    "DL",
    "DLS",
    "LRP",
    "Average"
]

table_sign_test.index = ["Faithfulness", "Robustness", "Complexity", "Weighted Average"]
table_sign_test.to_csv(os.getcwd().split("src")[0] + "data/figures/variance_sign_test.csv")
table_sign_test

  table_sign_test = table_sign_test.append(pd.DataFrame(np.round(np.average(table_sign_test, axis=0, weights=[0.5, 0.35, 0.15]), 2).reshape(1,-1)), ignore_index=True)


Unnamed: 0,OC,LI,KS,VG,IxG,GB,GC,SC,C+,IG,EG,DL,DLS,LRP,Average
Faithfulness,0.0,0.56,0.78,0.67,0.22,0.22,0.56,0.11,0.56,0.33,1.0,0.22,0.67,0.22,0.44
Robustness,0.33,0.56,1.0,0.56,0.0,0.56,0.44,0.22,0.33,0.11,0.33,0.0,0.89,0.33,0.4
Complexity,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Weighted Average,0.27,0.63,0.89,0.68,0.26,0.46,0.58,0.28,0.55,0.35,0.77,0.26,0.8,0.38,0.51


### Metric differences Figure

In [13]:
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

colors = list(map(px.colors.qualitative.G10.__getitem__, [0,2,3,4]))
metrics_faith = [
    "Faithfulness Corr.",
    "Faithfulness Est.",
    "Monotonicity Corr.",
    "Pixel Flipping",
    "Region Pert.",
    "Insertion",
    "Deletion",
    "IROF",
    "ROAD",
    "Sufficiency",
]
metrics_robust = [
    "Local Lipschitz Est.",
    "Max Sensitivity",
    "Continuity",
    "Rel. Input Stab.",
    "Rel. Output Stab.",
    "Rel. Repr. Stab.",
    "Infidelity",
]
metrics_complex = ["Sparseness", "Complexity", "Eff. Complexity"]

methods = [1, 6, 9, 12]

fig = make_subplots(
    rows=1,
    cols=3,
    subplot_titles=["Faithfulness Metrics", "Robustness Metrics", "Complexity Metrics"],
    column_widths=[0.5, 0.35, 0.15],
    shared_yaxes=True,
)

df_table = pd.DataFrame(arr_ranking[0, 0, :, :10])
df_table.index = [
    "OC",
    "LIME",
    "KS",
    "VG",
    "IxG",
    "GB",
    "GC",
    "SC",
    "C+",
    "IG",
    "EG",
    "DL",
    "DLS",
    "LRP",
]

faith = df_table.iloc[methods, :].transpose()

df_table = pd.DataFrame(arr_ranking[0, 0, :, 10:17])
robust = df_table.iloc[methods, :].transpose()

df_table = pd.DataFrame(arr_ranking[0, 0, :, 17:20])
complex = df_table.iloc[methods, :].transpose()


for i in range(len(faith.columns)):
    fig.add_trace(
        go.Scatter(
            x=metrics_faith,
            y=faith.iloc[:, i],
            mode="lines+markers",
            name=faith.columns[i],
            marker=dict(color=colors[i], size=8),
        ),
        col=1,
        row=1,
    )

    fig.add_trace(
        go.Scatter(
            x=["Average"],
            y=[np.mean(faith.iloc[:, i])],
            mode="markers+text",
            text=[str(np.mean(faith.iloc[:, i]))],
            textposition="middle left",
            showlegend=False,
            marker=dict(color=colors[i], size=8, symbol="square"),
        ),
        col=1,
        row=1,
    )

for i in range(len(robust.columns)):
    fig.add_trace(
        go.Scatter(
            x=metrics_robust,
            y=robust.iloc[:, i],
            mode="lines+markers",
            showlegend=False,
            marker=dict(color=colors[i], size=8),
        ),
        col=2,
        row=1,
    )

    fig.add_trace(
        go.Scatter(
            x=["Average"],
            y=[np.mean(robust.iloc[:, i]), 1],
            mode="markers+text",
            text=[str(np.round(np.mean(robust.iloc[:, i]), 1))],
            textposition="middle left",
            showlegend=False,
            marker=dict(color=colors[i], size=8, symbol="square"),
        ),
        col=2,
        row=1,
    )

for i in range(len(complex.columns)):
    fig.add_trace(
        go.Scatter(
            x=metrics_complex,
            y=complex.iloc[:, i],
            mode="lines+markers",
            showlegend=False,
            marker=dict(color=colors[i], size=8),
        ),
        col=3,
        row=1,
    )

    fig.add_trace(
        go.Scatter(
            x=["Average"],
            y=[np.mean(complex.iloc[:, i])],
            mode="markers+text",
            text=[str(np.round(np.mean(complex.iloc[:, i])))],
            textposition="middle left",
            showlegend=False,
            marker=dict(color=colors[i], size=8, symbol="square"),
        ),
        col=3,
        row=1,
    )

fig.update_yaxes(
    autorange="reversed",
    range=[1, 14],
    tickvals=[1, 5, 10, 14],
    zeroline=False,
    showticklabels=True,
)

fig.update_yaxes(title="Rank", col = 1, row = 1)

fig.update_xaxes(tickangle=35)

fig.update_layout(
    height=400,
    width=1500,
    legend_title_text="XAI Method",
    template="plotly_white",
    font=dict(
        family="Helvetica",
        color="#000000",
        size = 12,
    ),
    title_font=dict(
        family="Helvetica",
        color="#000000",
        size = 12
    ),
)

fig = left_align_facet_plot_titles(fig)
fig.write_image(os.getcwd().split("src")[0] + "data/figures/meta_eval_example.png", scale=2)
fig.show()

### Average ranking disagreement

In [7]:
from scipy.spatial import distance_matrix

list = []

for model in range(3):
    for dataset in range(3):
        for method in range(14):
            list.append(
                distance_matrix(
                    np.expand_dims(arr_ranking[dataset, model, method, :10], 0).T,
                    np.expand_dims(arr_ranking[dataset, model, method, :10], 0).T,
                )
            )

dist_matrix = np.round(np.mean(np.array(list[:42]), 0), 2)
dist_matrix = pd.DataFrame(dist_matrix)
dist_faith_resnet50 = dist_matrix.where(np.tril(np.ones(dist_matrix.shape)).astype(bool))

dist_matrix = np.round(np.mean(np.array(list[42:84]), 0), 2)
dist_matrix = pd.DataFrame(dist_matrix)
dist_faith_effnetb0 = dist_matrix.where(np.tril(np.ones(dist_matrix.shape)).astype(bool))

dist_matrix = np.round(np.mean(np.array(list[84:]), 0), 2)
dist_matrix = pd.DataFrame(dist_matrix)
dist_faith_vit = dist_matrix.where(np.tril(np.ones(dist_matrix.shape)).astype(bool))

list = []

for model in range(3):
    for dataset in range(3):
        for method in range(14):
            list.append(
                distance_matrix(
                    np.expand_dims(arr_ranking[dataset, model, method, 10:17], 0).T,
                    np.expand_dims(arr_ranking[dataset, model, method, 10:17], 0).T,
                )
            )

dist_matrix = np.round(np.mean(np.array(list[:42]), 0), 2)
dist_matrix = pd.DataFrame(dist_matrix)
dist_robust_resnet50 = dist_matrix.where(np.tril(np.ones(dist_matrix.shape)).astype(bool))

dist_matrix = np.round(np.mean(np.array(list[42:84]), 0), 2)
dist_matrix = pd.DataFrame(dist_matrix)
dist_robust_effnetb0 = dist_matrix.where(np.tril(np.ones(dist_matrix.shape)).astype(bool))

dist_matrix = np.round(np.mean(np.array(list[84:]), 0), 2)
dist_matrix = pd.DataFrame(dist_matrix)
dist_robust_vit = dist_matrix.where(np.tril(np.ones(dist_matrix.shape)).astype(bool))

In [12]:
import plotly.graph_objects as go
import plotly.express as px

colors = px.colors.qualitative.G10

fig = make_subplots(
    rows=2,
    cols=3,
    subplot_titles=["ResNet50","EfficientNetb0","DeiT ViT", "ResNet50","EfficientNetb0","DeiT ViT"],
    #column_widths=[0.7, 0.3],
)

for i in range(3):
    fig.add_trace(
        go.Heatmap(
            z=[dist_faith_resnet50,dist_faith_effnetb0,dist_faith_vit][i],
            x=metrics_faith,
            y=metrics_faith,
            texttemplate="%{z}",
            colorscale="RdYlGn",
            reversescale=True,
            zmin=0.5,
            zmax=6.1,
            colorbar=dict(ticks="outside", thickness=10),
        ),
        col = i + 1,
        row = 1

    )

for i in range(3):
    fig.add_trace(
        go.Heatmap(
            z=[dist_robust_resnet50,dist_robust_effnetb0,dist_robust_vit][i],
            x=metrics_robust,
            y=metrics_robust,
            texttemplate="%{z}",
            colorscale="RdYlGn",
            reversescale=True,
            zmin=0.5,
            zmax=6.1,
            colorbar=dict(ticks="outside", thickness=10),
        ),
        col = i + 1,
        row = 2

    )

fig.update_yaxes(showgrid=False)
fig.update_yaxes(title ="Faithfulness Metrics",row = 1, col = 1)
fig.update_yaxes(title ="Robustness Metrics",row = 2, col = 1)

fig.update_layout(
    font=dict(family="Helvetica", color="#000000", size=13),
    template="plotly_white",
    height=800,
    width=1600,
    title_font=dict(
        family="Helvetica",
        color="#000000",
        size= 14
    ),
)

fig = left_align_facet_plot_titles(fig)

fig.write_image(os.getcwd().split("src")[0] + "data/figures/meta_eval_dist.png", scale=3)
fig.show()