# Compare the Heatmaps of Two Different Models

## Load Libraries and Modules

In [None]:
%matplotlib inline

import os
import h5py
import numpy as np
import pandas as pd
import gc as gci

import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.metrics import confusion_matrix, roc_curve, auc

import tensorflow as tf
from tensorflow import keras

print("TF  Version",tf.__version__)

In [None]:
# Check and set the working directory before loading the project modules
# (presumably the local `functions_*` modules imported below live there — verify).
print(os.getcwd())
INPUT_DIR = "/tf/notebooks/bule/explainable_AI/"
OUTPUT_DIR = "/tf/notebooks/bule/explainable_AI/"
# os.getcwd() never ends with a path separator, so comparing it with the raw
# constant (which has a trailing "/") is always unequal; compare normalized paths.
if os.path.normpath(os.getcwd()) != os.path.normpath(OUTPUT_DIR):
    os.chdir(OUTPUT_DIR)
    
import functions_metrics as fm
import functions_read_data as rdat
import functions_model_definition as md
import functions_gradcam as gc
import functions_occlusion as oc
import functions_plot_heatmap as phm

## Load Data

In [None]:
# Names of the two models to compare; they are used to build the input file
# paths of the result table and the heatmap arrays loaded below.
model_1 = "10Fold_CIB"
model_2 = "10Fold_CIBLSX"

In [None]:
# Result table of model 1, ordered by patient id.
# NOTE(review): the heatmap arrays are presumably stored in the same p_id order — confirm.
meta_dat = pd.read_csv(
    INPUT_DIR + "pictures/" + model_1 + "/all_tab_results_hm_unc_" + model_1 + "_M1_avg_gc_bothcl.csv"
).sort_values(by=["p_id"])

In [None]:
# Load the ensemble heatmaps of both models (model 1 first, then model 2)
# and drop all singleton axes.
all_heatmaps_1, all_heatmaps_2 = (
    np.load(
        INPUT_DIR + "pictures/" + name + "/all_ensemble_heatmaps_" + name + "_M1_avg_gc_bothcl.npy"
    ).squeeze()
    for name in (model_1, model_2)
)

In [None]:
# Load the averaged heatmaps of both models and move the last axis to
# position 1, so the layout matches the ensemble arrays (patients first).
mean_heatmaps_1, mean_heatmaps_2 = (
    np.moveaxis(
        np.load(INPUT_DIR + "pictures/" + name + "/all_heatmaps_" + name + "_M1_avg_gc_bothcl.npy"),
        -1,
        1,
    )
    for name in (model_1, model_2)
)

In [None]:
 # Sanity check: the ensemble heatmaps should be (n_patients, n_heatmaps, 128, 128, 28)
print(all_heatmaps_1.shape, all_heatmaps_2.shape)
# Averaged heatmaps after np.moveaxis(-1, 1); axis 1 is presumably of size 1 — TODO confirm
print(mean_heatmaps_1.shape, mean_heatmaps_2.shape)

In [None]:
# Stack the heatmaps of both models along axis 1: the entries of model 1 come
# first, followed by those of model 2.
all_heatmaps_1_2 = np.concatenate((all_heatmaps_1, all_heatmaps_2), axis=1)
all_heatmaps_1_2.shape

In [None]:
# New data frame that collects all distance measures (one row per patient)
fold_cols = ["fold0", "fold1", "fold2", "fold3", "fold4", "fold5", "fold6", "fold7", "fold8", "fold9"]
fold_roles = meta_dat[fold_cols]

dist_dat = meta_dat[["p_id", "mrs", "unfavorable"]].copy()
# idxmax over the boolean mask gives the first fold column carrying the role.
# NOTE: a row without any match silently falls back to the first column ("fold0").
dist_dat["val_fold"] = fold_roles.eq("val").idxmax(axis=1)
dist_dat["test_fold"] = fold_roles.eq("test").idxmax(axis=1)

### Calculate Mean for Both Model Types

In [None]:
# One scalar per patient: the mean over all remaining axes (1-4).
# A = heatmaps of model_1, B = heatmaps of model_2, AB = both stacked along axis 1.
dist_dat["A_mean"] = np.mean(all_heatmaps_1, axis=(1,2,3,4))
dist_dat["B_mean"] = np.mean(all_heatmaps_2, axis=(1,2,3,4))
dist_dat["AB_mean"] = np.mean(all_heatmaps_1_2, axis=(1,2,3,4))

### Calculate SD for Both Model Types

In [None]:
def calc_sd(hm):
    """Return one pooled standard deviation per entry of the first axis.

    The variance is taken along axis 1 (the heatmap axis when ``hm`` is laid
    out as (n_patients, n_heatmaps, x, y, z)), averaged over the three
    remaining axes and finally square-rooted.

    hm: 5-dimensional numpy array
    returns: 1-dimensional numpy array of length ``hm.shape[0]``
    """
    variance_per_position = np.var(hm, axis=1)
    pooled_variance = np.mean(variance_per_position, axis=(1, 2, 3))
    return np.sqrt(pooled_variance)

In [None]:
# Pooled standard deviation per patient (see calc_sd) within the heatmaps of
# model_1 (A), of model_2 (B) and of both models stacked together (AB).
dist_dat["A_sd"] = calc_sd(all_heatmaps_1)
dist_dat["B_sd"] = calc_sd(all_heatmaps_2)
dist_dat["AB_sd"] = calc_sd(all_heatmaps_1_2)

In [None]:
dist_dat

### Loop Over All Patients and Calculate Different Distance Measures

Calculate distance measures between all 10 heatmaps.

- cosine distance (angle between vectors; magnitude is not considered)
- euclidean (raw direct distance)

In [None]:
import itertools
from tqdm import tqdm
from scipy.spatial.distance import pdist

def calc_dist_of_all(all_hms, dist_mes, n_heatmaps=10):
    """Compute all pairwise heatmap distances per patient, split by group.

    Along axis 1 the heatmaps with index ``< n_heatmaps / 2`` form group A and
    the remaining ones form group B.

    all_hms: numpy array of shape (n_patients, n_heatmaps, ...),
             e.g. (n_patients, n_heatmaps, 128, 128, 28)
    dist_mes: distance metric to use (e.g. "euclidean", "cosine", "correlation"),
              must be supported by scipy.spatial.distance.pdist
    n_heatmaps: number of heatmaps per patient (size of axis 1)

    returns: 3 lists (A, AB, B) with one entry per patient; each entry is the
             list of distances within A, between A and B, and within B

    raises: ValueError if axis 1 of ``all_hms`` does not have ``n_heatmaps`` entries
    """
    if all_hms.shape[1] != n_heatmaps:
        # Reshaping with a wrong count would silently mix voxels of different heatmaps.
        raise ValueError(
            f"expected {n_heatmaps} heatmaps along axis 1, got {all_hms.shape[1]}"
        )

    half = n_heatmaps / 2
    # pdist returns the condensed distance vector in exactly the order of
    # itertools.combinations(range(n), 2), so pairs and distances can be zipped.
    pairs = list(itertools.combinations(range(n_heatmaps), 2))

    A = []
    AB = []
    B = []

    for patient in tqdm(range(all_hms.shape[0])):
        # one flattened row per heatmap
        flat_hms = all_hms[patient].reshape(n_heatmaps, -1)
        distances = pdist(flat_hms, metric=dist_mes)

        a = []
        ab = []
        b = []

        for (i, j), dist in zip(pairs, distances):
            # i < j always holds, so j < half implies that both are in group A
            if j < half:
                a.append(dist)
            elif i < half:
                ab.append(dist)
            else:
                b.append(dist)

        A.append(a)
        AB.append(ab)
        B.append(b)

    return A, AB, B

In [None]:
# Euclidean distances between all heatmaps of both models; every cell holds the
# list of distances of one patient (within A, between A and B, within B).
dist_dat["A_euclidean"], dist_dat["AB_euclidean"], dist_dat["B_euclidean"] = calc_dist_of_all(
    all_heatmaps_1_2, "euclidean")

In [None]:
# Cosine distances between all heatmaps of both models; every cell holds the
# list of distances of one patient (within A, between A and B, within B).
dist_dat["A_cosine"], dist_dat["AB_cosine"], dist_dat["B_cosine"] = calc_dist_of_all(
    all_heatmaps_1_2, "cosine")


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Cosine-distance columns of the patient at row position 10. The slice keeps the
# first three matching columns (A, AB, B when the cells are run in order; a later
# "avg_AB_cosine" column would also match the filter but comes after them).
data = dist_dat.filter(like='cosin').iloc[10][0:3]

# Box plot with the individual distances overlaid as points
sns.boxplot(data=data, color='lightgray', width=0.4)
sns.stripplot(data=data)
plt.xticks(ticks=[0, 1, 2], labels=['a', 'ab', 'b'])
plt.xlabel('Groups')
plt.ylabel('Values')
plt.title('Stripplot and Boxplot')
plt.show()

### Add distance measure of averaged heatmaps

- cosine
- euclidean

In [None]:
def calc_dist_of_two(hm_a, hm_b, dist_mes, adjustment=0):
    """Compute, per patient, the distance between two heatmaps.

    hm_a: numpy array of shape (n_patients, 1, 128, 128, 28)
    hm_b: numpy array of shape (n_patients, 1, 128, 128, 28)
    dist_mes: distance metric to use (e.g. "euclidean", "cosine", "correlation"),
              must be supported by scipy.spatial.distance.pdist
    adjustment: value added to the heatmaps before calculating the distance
                (useful for heatmaps which have both negative and positive
                values and cosine distance)

    returns: numpy array of shape (n_patients, 1) holding the distance between
             the two heatmaps of every patient
    """
    n_patients = hm_a.shape[0]
    stacked = np.concatenate((hm_a, hm_b), axis=1)
    # two flattened rows per patient: row 0 from hm_a, row 1 from hm_b
    pairs = stacked.reshape(n_patients, 2, -1) + adjustment

    return np.array([pdist(pair, metric=dist_mes) for pair in pairs])


In [None]:
# Distance between the averaged heatmaps of model_1 and model_2, one value per patient
dist_dat["avg_AB_cosine"] = calc_dist_of_two(mean_heatmaps_1, mean_heatmaps_2, "cosine")
dist_dat["avg_AB_euclidean"] = calc_dist_of_two(mean_heatmaps_1, mean_heatmaps_2, "euclidean")

## View Results

In [None]:
def summarize_array(arr, decimals=None):
    """Return descriptive statistics of ``arr`` as a dictionary.

    arr: array-like of numbers
    decimals: number of decimals to round every statistic to; None keeps the
              raw values

    returns: dict with the keys 'Min', '1st Qu', 'Median', '3rd Qu', 'Max',
             'Mean' and 'Std Dev' (in this order)
    """
    stats = (
        ('Min', np.min(arr)),
        ('1st Qu', np.percentile(arr, 25)),
        ('Median', np.median(arr)),
        ('3rd Qu', np.percentile(arr, 75)),
        ('Max', np.max(arr)),
        ('Mean', np.mean(arr)),
        ('Std Dev', np.std(arr)),
    )

    if decimals is None:
        return dict(stats)

    return {label: round(value, decimals) for label, value in stats}


In [None]:
dist_dat

In [None]:
# Example usage
summary = summarize_array(dist_dat["avg_AB_cosine"], decimals=2)
print(summary)

In [None]:
np.corrcoef(dist_dat["avg_AB_cosine"], dist_dat["avg_AB_euclidean"])

In [None]:
import itertools

# Pool the per-patient distance lists of every group into one flat list per group
data = [
    list(itertools.chain.from_iterable(dist_dat[column]))
    for column in ('A_euclidean', 'AB_euclidean', 'B_euclidean')
]

# Box plot per group with the individual distances overlaid as translucent points
sns.boxplot(data=data, color='lightgray', width=0.4)
sns.stripplot(data=data, alpha=0.25)
plt.xticks(ticks=[0, 1, 2], labels=['A', 'AB', 'B'])
plt.xlabel('Groups')
plt.ylabel('Values')
plt.title('Stripplot and Boxplot')
plt.show()


In [None]:
from sklearn.decomposition import PCA

# Flatten every averaged model-1 heatmap into one feature vector per sample
# (the first dimension is the number of samples)
n_samples = mean_heatmaps_1.shape[0]
mean_heatmaps_1_reshaped = mean_heatmaps_1.reshape(n_samples, -1)

# Perform PCA: fit on the model-1 heatmaps and keep the first 5 components
pca1 = PCA(n_components=5)
components1 = pca1.fit_transform(mean_heatmaps_1_reshaped)

# Fraction of the variance explained by each of the 5 components
pca1.explained_variance_ratio_


In [None]:
# Reshape mean_heatmaps_2 to have the first dimension as the number of samples
n_samples = mean_heatmaps_2.shape[0]
mean_heatmaps_2_reshaped = mean_heatmaps_2.reshape(n_samples, -1)

# Perform a separate PCA fitted on the model-2 heatmaps
pca2 = PCA(n_components=5)
components2 = pca2.fit_transform(mean_heatmaps_2_reshaped)

# Fraction of the variance explained by each of the 5 components
pca2.explained_variance_ratio_

In [None]:
# Project the model-2 heatmaps onto the components fitted on model 1, so both
# models share one component space. NOTE: this overwrites the `components2`
# computed with pca2 in the previous cell.
components2 = pca1.transform(mean_heatmaps_2_reshaped)


In [None]:
print(components1[:10, 0])
print(components2[:10, 0])


In [None]:
summarize_array(components1[:, 0], decimals=2)

In [None]:
summarize_array(components2[:, 0], decimals=2)

In [None]:
np.corrcoef(components1[:, 0], components2[:, 0])


In [None]:
import matplotlib.pyplot as plt

# Element-wise difference between the averaged heatmaps of model_1 and model_2
# for the patient at position 0, flattened to one vector
data = (mean_heatmaps_1 - mean_heatmaps_2)[0].reshape(-1)

# Distribution of the differences
plt.hist(data, bins=10)
plt.xlabel('Values')
plt.ylabel('Frequency')
plt.title('Histogram')
plt.show()

In [None]:
data.mean()

#### Check some patients 

In [None]:
measure = "avg_AB_cosine"
dist_dat.sort_values(by=measure, ascending=False)[["p_id", "mrs", "unfavorable", measure]].head(10)

In [None]:
index = 247 # 220, 11 

In [None]:
print(mean_heatmaps_1[index].min(), mean_heatmaps_1[index].max())

In [None]:
all_heatmaps_1[index].max(axis=(1,2,3))

In [None]:
# Negative part of every ensemble heatmap of the selected patient, scaled by the
# largest value of the negated heatmaps and averaged over axis 0
# (presumably the ensemble members — TODO confirm).
# The negation has to happen before .max(): `-x[index].max()` is parsed as
# `-(x[index].max())`, which would divide by the negated maximum of the
# original heatmaps instead of the maximum of the negated ones.
neg_hm = -all_heatmaps_1[index]
asdf = (np.maximum(neg_hm, 0) / neg_hm.max()).mean(axis=0)
# asdf = (np.maximum(all_heatmaps_1[index], 0)/all_heatmaps_1[index].max()).mean(axis=3)

In [None]:
# Min-max scale `asdf` to the range [0, 1] (undefined if all values are equal)
qwer = ((asdf - asdf.min()) / (asdf.max() - asdf.min()))

In [None]:
qwer.mean(axis=2).max()

In [None]:
# plt.imshow(qwer.mean(axis=2), cmap='bwr', vmin=-1, vmax=1)
plt.imshow(mean_heatmaps_1[index].squeeze().mean(axis=2), cmap='bwr', vmin=-1, vmax=1)
plt.colorbar()
plt.show()


In [None]:
meta_dat.iloc[index]
