# Transfer Learning

Imports

In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..' + os.sep + '..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
from sklearn.covariance import LedoitWolf
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import math
import xgboost as xgb
import prince
import scipy
import plotly



In [3]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

In [4]:
from experiment_parameters.model_builder.Model import XGBoostModel
from experiment_parameters.TrainerFactory import dataset_model_dictionary

2025-11-08 21:47:04.745482: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-11-08 21:47:05.007624: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
CONST_ROUTE_MAIN_DIR = ".." + os.sep + ".." + os.sep

In [6]:
def get_data_from_route(dataset_name, type_of_partition, additional_parameter):
    if type_of_partition == "manual":
        path_to_train_datasets = CONST_ROUTE_MAIN_DIR + "data" + os.sep + "partitioned_training_data" + os.sep + type_of_partition + os.sep + additional_parameter
    else:
        path_to_train_datasets = CONST_ROUTE_MAIN_DIR + "data" + os.sep + "partitioned_training_data" + os.sep + type_of_partition + os.sep + "dataset_" + dataset_name + os.sep + "alpha_" + additional_parameter

    return path_to_train_datasets

In [7]:
def get_results_from_route(dataset_name, type_of_partition, additional_parameter):
    # path_to_result_dataframes = CONST_ROUTE_MAIN_DIR + "results" + os.sep + "FedAvg" + os.sep + dataset_name + os.sep + type_of_partition + os.sep + additional_parameter + os.sep + "mlp"
    if type_of_partition == "dirichlet":
        path_to_result_dataframes = CONST_ROUTE_MAIN_DIR + "results" + os.sep + "dataframes" + os.sep + "FedAvg" + os.sep + dataset_name + os.sep + type_of_partition + os.sep + "alpha_" + additional_parameter + os.sep + "mlp"
    else:
        path_to_result_dataframes = CONST_ROUTE_MAIN_DIR + "results" + os.sep + "dataframes" + os.sep + "FedAvg" + os.sep + dataset_name + os.sep + type_of_partition + os.sep + additional_parameter + os.sep + "mlp"
    return path_to_result_dataframes

In [8]:
def get_distances_from_route(dataset_name, type_of_partition, additional_parameter):
    path_to_result_dataframes = CONST_ROUTE_MAIN_DIR + "results" + os.sep + "distances_values" + os.sep + dataset_name + os.sep + type_of_partition + os.sep + additional_parameter
    return path_to_result_dataframes

In [9]:
from itertools import combinations_with_replacement

def number_of_clients_and_all_combinations(path_to_train_datasets):
    num_clients = int(len(os.listdir(path_to_train_datasets)) / 4)
    client_numbers_original_order = list(range(num_clients))
    client_numbers_reverse_order = list(range(num_clients - 1, -1, -1))

    all_combinations = list(combinations_with_replacement(client_numbers_original_order, 2)) + list(combinations_with_replacement(client_numbers_reverse_order, 2))
    all_combinations = sorted(list(set(all_combinations)))
    print(all_combinations)
    return num_clients, all_combinations

In [10]:
def downcast_types(dataframe):
    for column in dataframe.select_dtypes("int"):
        dataframe[column] = dataframe[column].astype("int16")

    for column in dataframe.select_dtypes("float"):
        dataframe[column] = dataframe[column].astype("float32")

    return dataframe

# Distances measuring functions

In [11]:
from metrics.Distances import compute_coupling, compute_CE
from experiment_parameters.TrainerFactory import dataset_model_dictionary
import torch

use_cuda = torch.cuda.is_available()
dtype = torch.cuda.FloatTensor if use_cuda else torch.FloatTensor



In [12]:
from geomloss import SamplesLoss

def wasserstein_distance(src_x, tar_x):
    # Define a Sinkhorn (~Wasserstein) loss between sampled measures
    loss = SamplesLoss(loss="sinkhorn", p=2, blur=0.01, scaling=0.8) # Although the euclidean distance usually square's root the results, it is not done here. No clue.

    L = loss(torch.tensor(src_x.values).type(dtype), torch.tensor(tar_x.values).type(dtype))  # By default, use constant weights = 1/number of samples
    if use_cuda:
        torch.cuda.synchronize()
    return 2 * L.item()

In [13]:
def gaussian_mmd_distance(src_x, tar_x):
    # Define a Gaussian MMD loss between sampled measures
    loss = SamplesLoss(loss="gaussian", blur=0.05) # Although the euclidean distance usually square's root the results, it is not done here. No clue.

    L = loss(torch.tensor(src_x.values).type(dtype), torch.tensor(tar_x.values).type(dtype))  # By default, use constant weights = 1/number of samples
    if use_cuda:
        torch.cuda.synchronize()
    return L.item()

In [14]:
# global_X_training, global_y_training = dataset_model_dictionary["covertype"]().get_dataset().get_training_data()
# global_X_test, global_y_test = dataset_model_dictionary["covertype"]().get_dataset().get_test_data()

# display(wasserstein_distance(global_X_training, global_X_test))
# display(gaussian_mmd_distance(global_X_training, global_X_test))

In [15]:
# def optimal_transport_conditional_entropy(src_x, src_y, tar_x, tar_y):
#     P, W = compute_coupling(src_x, tar_x, src_y, tar_y)
#     # compute the conditonal entropy (ce)
#     ce = compute_CE(P, src_y, tar_y)
#     print('Wasserstein distance:%.4f, Conditonal Entropy: %.4f' % (W, ce))

#     return W, ce

In [16]:
# display(optimal_transport_conditional_entropy(torch.tensor(global_X_training.to_numpy(), dtype=torch.float), np.argmax(global_y_training, axis=1).astype(int), torch.tensor(global_X_test.to_numpy(), dtype=torch.float), np.argmax(global_y_test, axis=1).astype(int)))

In [17]:
# def regularized_h_score(features: np.ndarray, labels: np.ndarray):
#     r"""
#     Regularized H-score in `Newer is not always better: Rethinking transferability metrics, their peculiarities, stability and performance (NeurIPS 2021)
#     <https://openreview.net/pdf?id=iz_Wwmfquno>`_.

#     The  regularized H-Score :math:`\mathcal{H}_{\alpha}` can be described as:

#     .. math::
#         \mathcal{H}_{\alpha}=\operatorname{tr}\left(\operatorname{cov}_{\alpha}(f)^{-1}\left(1-\alpha \right)\operatorname{cov}\left(\mathbb{E}[f \mid y]\right)\right)

#     where :math:`f` is the features extracted by the model to be ranked, :math:`y` is the groud-truth label vector and :math:`\operatorname{cov}_{\alpha}` the  Ledoit-Wolf
#     covariance estimator with shrinkage parameter :math:`\alpha`
#     Args:
#         features (np.ndarray):features extracted by pre-trained model.
#         labels (np.ndarray):  groud-truth labels.

#     Shape:
#         - features: (N, F), with number of samples N and feature dimension F.
#         - labels: (N, ) elements in [0, :math:`C_t`), with target class number :math:`C_t`.
#         - score: scalar.
#     """
#     f = features.astype('float64')
#     f = f - np.mean(f, axis=0, keepdims=True)  # Center the features for correct Ledoit-Wolf Estimation
#     y = labels

#     C = int(y.max() + 1)
#     g = np.zeros_like(f)

#     cov = LedoitWolf(assume_centered=False).fit(f)
#     alpha = cov.shrinkage_
#     covf_alpha = cov.covariance_

#     for i in range(C):
#         Ef_i = np.mean(f[y == i, :], axis=0)
#         g[y == i] = Ef_i

#     covg = np.cov(g, rowvar=False)
#     score = np.trace(np.dot(np.linalg.pinv(covf_alpha, rcond=1e-15), (1 - alpha) * covg))

#     return score

In [18]:
def negative_conditional_entropy(source_labels: np.ndarray, target_labels: np.ndarray):
    r"""
    Negative Conditional Entropy in `Transferability and Hardness of Supervised
    Classification Tasks (ICCV 2019) <https://arxiv.org/pdf/1908.08142v1.pdf>`_.

    The NCE :math:`\mathcal{H}` can be described as:

    .. math::
        \mathcal{H}=-\sum_{y \in \mathcal{C}_t} \sum_{z \in \mathcal{C}_s} \hat{P}(y, z) \log \frac{\hat{P}(y, z)}{\hat{P}(z)}

    where :math:`\hat{P}(z)` is the empirical distribution and :math:`\hat{P}\left(y \mid z\right)` is the empirical
    conditional distribution estimated by source and target label.

    Args:
        source_labels (np.ndarray): predicted source labels.
        target_labels (np.ndarray): groud-truth target labels.

    Shape:
        - source_labels: (N, ) elements in [0, :math:`C_s`), with source class number :math:`C_s`.
        - target_labels: (N, ) elements in [0, :math:`C_t`), with target class number :math:`C_t`.
    """
    C_t = int(np.max(target_labels) + 1)
    C_s = int(np.max(source_labels) + 1)
    N = len(source_labels)

    joint = np.zeros((C_t, C_s), dtype=float)  # placeholder for the joint distribution, shape [C_t, C_s]
    for s, t in zip(source_labels, target_labels):
        s = int(s)
        t = int(t)
        joint[t, s] += 1.0 / N
    p_z = joint.sum(axis=0, keepdims=True)

    p_target_given_source = (joint / p_z).T  # P(y | z), shape [C_s, C_t]
    mask = p_z.reshape(-1) != 0  # valid Z, shape [C_s]
    p_target_given_source = p_target_given_source[mask] + 1e-20  # remove NaN where p(z) = 0, add 1e-20 to avoid log (0)
    entropy_y_given_z = np.sum(- p_target_given_source * np.log(p_target_given_source), axis=1, keepdims=True)
    conditional_entropy = np.sum(entropy_y_given_z * p_z.reshape((-1, 1))[mask])

    return -conditional_entropy

In [19]:
# from numba import njit

# def log_maximum_evidence(features: np.ndarray, targets: np.ndarray, regression=False, return_weights=False):
#     r"""
#     Log Maximum Evidence in `LogME: Practical Assessment of Pre-trained Models
#     for Transfer Learning (ICML 2021) <https://arxiv.org/pdf/2102.11005.pdf>`_.

#     Args:
#         features (np.ndarray): feature matrix from pre-trained model.
#         targets (np.ndarray): targets labels/values.
#         regression (bool, optional): whether to apply in regression setting. (Default: False)
#         return_weights (bool, optional): whether to return bayesian weight. (Default: False)

#     Shape:
#         - features: (N, F) with element in [0, :math:`C_t`) and feature dimension F, where :math:`C_t` denotes the number of target class
#         - targets: (N, ) or (N, C), with C regression-labels.
#         - weights: (F, :math:`C_t`).
#         - score: scalar.
#     """
#     f = features.astype(np.float64)
#     y = targets
#     if regression:
#         y = targets.astype(np.float64)

#     fh = f
#     f = f.transpose()
#     D, N = f.shape
#     v, s, vh = np.linalg.svd(f @ fh, full_matrices=True)

#     evidences = []
#     weights = []
#     if regression:
#         C = y.shape[1]
#         for i in range(C):
#             y_ = y[:, i]
#             evidence, weight = each_evidence(y_, f, fh, v, s, vh, N, D)
#             evidences.append(evidence)
#             weights.append(weight)
#     else:
#         C = int(y.max() + 1)
#         for i in range(C):
#             y_ = (y == i).astype(np.float64)
#             evidence, weight = each_evidence(y_, f, fh, v, s, vh, N, D)
#             evidences.append(evidence)
#             weights.append(weight)

#     score = np.mean(evidences)
#     weights = np.vstack(weights)

#     if return_weights:
#         return score, weights
#     else:
#         return score


# @njit
# def each_evidence(y_, f, fh, v, s, vh, N, D):
#     """
#     compute the maximum evidence for each class
#     """
#     alpha = 1.0
#     beta = 1.0
#     lam = alpha / beta
#     print(f.shape)
#     print(y_.shape)
#     tmp = (vh @ (f @ y_))

#     for _ in range(11):
#         # should converge after at most 10 steps
#         # typically converge after two or three steps
#         gamma = (s / (s + lam)).sum()
#         m = v @ (tmp * beta / (alpha + beta * s))
#         alpha_de = (m * m).sum()
#         alpha = gamma / alpha_de
#         beta_de = ((y_ - fh @ m) ** 2).sum()
#         beta = (N - gamma) / beta_de
#         new_lam = alpha / beta
#         if np.abs(new_lam - lam) / lam < 0.01:
#             break
#         lam = new_lam

#     evidence = D / 2.0 * np.log(alpha) \
#                + N / 2.0 * np.log(beta) \
#                - 0.5 * np.sum(np.log(alpha + beta * s)) \
#                - beta / 2.0 * beta_de \
#                - alpha / 2.0 * alpha_de \
#                - N / 2.0 * np.log(2 * np.pi)

#     return evidence / N, m

In [20]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from metrics.Evaluator import evaluator
from xgboost import XGBClassifier

def accuracy(y_test, y_pred):
    y_pred = np.argmax(y_pred, axis=1)
    ground_truth_np = np.argmax(y_test, axis=1)
    return accuracy_score(ground_truth_np, y_pred)

def degradation_decomp(source_X, source_y, other_X_raw, other_y_raw, best_method, column_names, data_sum=20000, K=8, domain_classifier=None, draw_calibration=False, save_calibration_png='calibration.png'):
    perm1 = np.random.permutation(other_X_raw.shape[0])
    other_X = other_X_raw[perm1[:data_sum],:]
    other_y = other_y_raw[perm1[:data_sum]]

    piA = np.zeros(source_X.shape[0])
    piB = np.zeros(other_X.shape[0])
    permA = np.random.permutation(piA.shape[0])
    permB = np.random.permutation(piB.shape[0])

    kf = KFold(n_splits=K, shuffle=False)
    A_train_index_list = []
    A_test_index_list = []
    B_train_index_list = []
    B_test_index_list = []
    for i, (train_index, test_index) in enumerate(kf.split(source_X)):
        A_train_index_list.append(train_index)
        A_test_index_list.append(test_index)
    for i, (train_index, test_index) in enumerate(kf.split(other_X)):
        B_train_index_list.append(train_index)
        B_test_index_list.append(test_index)

    for i in range(K):
        trainX = np.concatenate([source_X[permA[A_train_index_list[i]]],other_X[permB[B_train_index_list[i]]]], axis=0)
        trainT = np.zeros(trainX.shape[0])
        trainT[len(A_train_index_list[i]):] = 1.0

        if domain_classifier is None:
            model = XGBClassifier(random_state=0).fit(trainX, trainT)
        else:
            model = domain_classifier.fit(trainX, trainT)

        piA[permA[A_test_index_list[i]]] = model.predict_proba(source_X[permA[A_test_index_list[i]]])[:,1]
        piB[permB[B_test_index_list[i]]] = model.predict_proba(other_X[permB[B_test_index_list[i]]])[:,1]

    # if draw_calibration:
    #     plot_calibration(piA, piB, save_dir=save_calibration_png)

    alpha = (other_X.shape[0])/ (source_X.shape[0]+other_X.shape[0])
    wA = piA / ((1-alpha)*piA + alpha * (1-piA))
    wB = (1-piB) / ((1-alpha)*piB + alpha * (1-piB))
    # Changing to support the model type of XGBoost.
    # accuracyA = best_method.score(source_X, source_y)
    # accuracyB = best_method.score(other_X, other_y)
    pd_source_X = pd.DataFrame(source_X, columns=column_names)
    pd_other_X = pd.DataFrame(other_X, columns=column_names)
    accuracyA = accuracy(source_y, best_method.predict_proba(pd_source_X))
    accuracyB = accuracy(other_y, best_method.predict_proba(pd_other_X))
    wA = wA / np.sum(wA)
    wB = wB / np.sum(wB)
    # predA = (best_method.predict(source_X) == source_y)
    # predB = (best_method.predict(other_X) == other_y)
    predA = (np.argmax(best_method.predict_proba(pd_source_X), axis=1) == np.argmax(source_y, axis=1))
    predB = (np.argmax(best_method.predict_proba(pd_other_X), axis=1) == np.argmax(other_y, axis=1))
    sx_A = np.dot(wA, predA)
    sx_B = np.dot(wB, predB)
    return accuracyA, accuracyB, sx_A, sx_B


In [21]:
def y_shift(src_x, src_y, tar_x, tar_y, tree_model, column_names):
    p2p, q2q, p2s, s2q = degradation_decomp(src_x, src_y, tar_x, tar_y, tree_model, column_names, data_sum=20000, K=8, draw_calibration=False, save_calibration_png='calibration.png')
    # print(f"Total Performance Degradation is {p2p-q2q}")
    # print(f"Proportion of Y|X-shift is {(p2s-s2q)/(p2p-q2q)}")
    perf_degradation = p2p-q2q
    proportion_yshift = (p2s-s2q)/(p2p-q2q)
    return perf_degradation, proportion_yshift

In [22]:
def task_agnostic_data_valuation(src_x, tar_x):
    cov_mat_src = src_x.cov()
    src_eig_vals, src_eig_vecs = np.linalg.eig((cov_mat_src.T @ cov_mat_src) * (1 / len(src_x)))
    cov_mat_tar = ((tar_x.cov().T @ tar_x.cov()) * (1 / len(tar_x)))
    tar_eig_vals = [np.sqrt(np.sum(np.square(cov_mat_tar.dot(eigen_vec)))) for eigen_vec in src_eig_vecs]
    src_eig_vals = np.array(src_eig_vals)
    tar_eig_vals = np.array(tar_eig_vals)
    diversity, relevance = 1, 1
    for src_eig, tar_eig in zip(src_eig_vals, tar_eig_vals):
        diversity *= np.power((abs(src_eig - tar_eig) / max(src_eig, tar_eig)), 1 / len(src_eig_vals))
        relevance *= np.power((min(src_eig, tar_eig) / max(src_eig, tar_eig)), 1 / len(src_eig_vals))
    return relevance, diversity

In [23]:
# def compute_volumes(X, d=1):
def compute_volumes(datasets, d=1):
    d = datasets[0].shape[1]
    for i in range(len(datasets)):
        datasets[i] = datasets[i].reshape(-1 ,d)

    X = np.concatenate(datasets, axis=0).reshape(-1, d)
    volumes = np.zeros(len(datasets))
    for i, dataset in enumerate(datasets):
        volumes[i] = np.sqrt(np.linalg.det( dataset.T @ dataset ) + 1e-8)

    volume_all = np.sqrt(np.linalg.det(X.T @ X) + 1e-8).round(3)
    return volumes, volume_all
    # return volume_all

In [24]:
from math import ceil, floor
from collections import defaultdict, Counter

import torch
import numpy as np
from torch import stack, cat, zeros_like, pinverse

def compute_X_tilde_and_counts(X, omega):
    """
    Compresses the original feature matrix X to  X_tilde with the specified omega.

    Returns:
       X_tilde: compressed np.ndarray
       cubes: a dictionary of cubes with the respective counts in each dcube
    """
    D = X.shape[1]

    # assert 0 < omega <= 1, "omega must be within range [0,1]."

    m = ceil(1.0 / omega) # number of intervals for each dimension

    cubes = Counter() # a dictionary to store the freqs
    # key: (1,1,..)  a d-dimensional tuple, each entry between [0, m-1]
    # value: counts

    Omega = defaultdict(list)
    # Omega = {}

    min_ds = torch.min(X, axis=0).values

    # a dictionary to store cubes of not full size
    for x in X:
        cube = []
        for d, xd in enumerate(x - min_ds):
            d_index = floor(xd / omega)
            cube.append(d_index)

        cube_key = tuple(cube)
        cubes[cube_key] += 1

        Omega[cube_key].append(x)

        '''
        if cube_key in Omega:

            # Implementing mean() to compute the average of all rows which fall in the cube

            Omega[cube_key] = Omega[cube_key] * (1 - 1.0 / cubes[cube_key]) + 1.0 / cubes[cube_key] * x
            # Omega[cube_key].append(x)
        else:
             Omega[cube_key] = x
        '''
    X_tilde = stack([stack(list(value)).mean(axis=0) for key, value in Omega.items()])

    # X_tilde = stack(list(Omega.values()))

    return X_tilde, cubes

def compute_robust_volumes(X_tildes, dcube_collections):

    N = sum([len(X_tilde) for X_tilde in X_tildes])
    alpha = 1.0 / (10 * N) # it means we set beta = 10
    # print("alpha is :{}, and (1 + alpha) is :{}".format(alpha, 1 + alpha))

    volumes, volume_all = compute_volumes(X_tildes, d=X_tildes[0].shape[1])
    robust_volumes = np.zeros_like(volumes)
    for i, (volume, hypercubes) in enumerate(zip(volumes, dcube_collections)):
        rho_omega_prod = 1.0
        for cube_index, freq_count in hypercubes.items():

            # if freq_count == 1: continue # volume does not monotonically increase with omega
            # commenting this if will result in volume monotonically increasing with omega
            rho_omega = (1 - alpha**(freq_count + 1)) / (1 - alpha)

            rho_omega_prod *= rho_omega

        robust_volumes[i] = (volume * rho_omega_prod).round(3)
    return robust_volumes


def robust_volume(Xs, omega=0.1):
    # M = len(Xs)
    D = Xs.shape[1]
    # orderings = list(permutations(range(M)))

    # s_values = torch.zeros(M)
    # monte_carlo_s_values = torch.zeros(M)

    # s_value_robust = torch.zeros(M)
    # monts_carlo_s_values_robust = torch.zeros(M)

    # Monte-carlo : shuffling the ordering and taking the first K orderings
    # random.shuffle(orderings)
    # K = 4 # number of permutations to sample
    # for ordering_count, ordering in enumerate(orderings):

        # prefix_vol = 0
        # prefix_robust_vol = 0
        # for position, i in enumerate(ordering):

        #     curr_indices = set(ordering[:position+1])

    # curr_train_X = torch.cat(torch.tensor(Xs.values)).reshape(-1, D)
    curr_train_X = torch.tensor(Xs.values).reshape(-1, D)

    # curr_train_X = torch.tensor(Xs.values)

    # curr_vol = torch.sqrt(torch.linalg.det(curr_train_X.T @ curr_train_X) + 1e-8)


    # marginal = curr_vol - prefix_vol
    # prefix_vol = curr_vol
    # s_values[i] += marginal

    X_tilde, cubes = compute_X_tilde_and_counts(curr_train_X, omega)

    robust_vol = compute_robust_volumes([X_tilde], [cubes])[0]

    return robust_vol

In [25]:
def get_xgb_tree(train_x, train_y, test_x, test_y):
    parameters_dict = {"batch_size": 64}
    d_matrix = xgb.DMatrix(train_x, label=np.argmax(train_y, axis=1))
    d_test_matrix = xgb.DMatrix(test_x, label=np.argmax(test_y, axis=1))
    if train_y.shape[1] == 2:
        parameters_dict["objective"] = "binary:logistic"
        parameters_dict["eval_metric"] = "logloss"
    elif train_y.shape[1] > 2:
        parameters_dict["objective"] = "multi:softprob"
        parameters_dict['num_class'] = train_y.shape[1]
        parameters_dict["disable_default_eval_metric"] = 1
        parameters_dict["eval_metric"] = "mlogloss"
    tree_model = xgb.train(parameters_dict, d_matrix, evals=[(d_matrix, "train"), (d_test_matrix, "validate")], num_boost_round=500, early_stopping_rounds=10)
    tree_model = XGBoostModel(tree_model)
    return tree_model

In [26]:
from sklearn.neighbors import KernelDensity

def hellinger_distance(src_y, tar_y, type_continuous=False):
    # if type_continuous:
    #     kde = KernelDensity(kernel='gaussian', bandwidth=0.2).fit(src_y.reshape(-1, 1))
    #     src_y = kde.score_samples(src_y.reshape(-1, 1))

    #     kde = KernelDensity(kernel='gaussian', bandwidth=0.2).fit(tar_y.reshape(-1, 1))
    #     tar_y = kde.score_samples(tar_y.reshape(-1, 1))

    # else:
    total_instances_per_label_src = np.sum(src_y, axis=0)
    total_instances_per_label_tar = np.sum(tar_y, axis=0)
    p = np.divide(total_instances_per_label_src, np.sum(total_instances_per_label_src))
    q = np.divide(total_instances_per_label_tar, np.sum(total_instances_per_label_tar))
    
    return (1/math.sqrt(2)) * np.sqrt(np.sum(np.square(np.sqrt(p) - np.sqrt(q))))

In [27]:
# def hellinger_distance(p, q):
#     """Hellinger distance between two discrete distributions.
#        In pure Python.
#        Fastest version."""
#     z = np.sqrt(p) - np.sqrt(q)
#     return np.sqrt(z @ z / 2)

In [28]:
from util import OptunaConnection
from experiment_parameters.model_builder.ModelBuilder import Director, get_training_configuration
from experiment_parameters.model_builder.Model import XGBoostModel, KerasModel
from sklearn.metrics import log_loss, mean_absolute_error
import gc

director = Director()

def get_parameters(trial, model_type):
    parameters = get_training_configuration(trial=trial, model_type=model_type)
    return parameters

def get_mlp(input_dim, num_classes, parameters):
    return director.create_mlp(input_parameters=input_dim, num_classes=num_classes, parameters=parameters)

def performance_degradation(train_src_x, train_src_y, train_tar_x, train_tar_y, test_tar_x, test_tar_y, dataset_name):
    if dataset_name == "har":
        study = OptunaConnection.load_study("mlp_har")
    elif dataset_name == "edge-iot-coreset":
        study = OptunaConnection.load_study("mlp_edge_iiot_coreset")
    elif dataset_name == "electric-consumption":
        study = OptunaConnection.load_study("mlp_electric_consumption")
    best_trial = study.best_trial
    parameters_dict = get_training_configuration(best_trial, "mlp")

    model_src = get_mlp(train_src_x.shape[1], train_src_y.shape[1], parameters_dict)
    model_src.fit(train_src_x, train_src_y, epochs=30)

    model_tar = get_mlp(train_src_x.shape[1], train_src_y.shape[1], parameters_dict)
    model_tar.fit(train_tar_x, train_tar_y, epochs=30)

    if train_src_y.shape[1] == 1:
        metric_list = ["MAE"]
        evaluation_result_src = evaluator(test_tar_x, test_tar_y, model_src, metric_list=metric_list).get_value_of_metric("MAE")
        evaluation_result_tar = evaluator(test_tar_x, test_tar_y, model_tar, metric_list=metric_list).get_value_of_metric("MAE")
    else:
        metric_list = ["CrossEntropyLoss"]
        evaluation_result_src = evaluator(test_tar_x, test_tar_y, model_src, metric_list=metric_list).get_value_of_metric("CrossEntropyLoss")
        evaluation_result_tar = evaluator(test_tar_x, test_tar_y, model_tar, metric_list=metric_list).get_value_of_metric("CrossEntropyLoss")

    return evaluation_result_src - evaluation_result_tar

!Comment about the prince library.

In [29]:
def compute_all_distances_and_volumes(dataset_name, path_to_train_datasets):
    wassersteinDataframe = pd.DataFrame()
    gaussianMMDDataframe = pd.DataFrame()
    relevanceDataframe = pd.DataFrame()
    diversityDataframe = pd.DataFrame()
    volumeDataframe = pd.Series()

    performanceDegradation = pd.DataFrame()
    negativeConditionalEntropyDataframe = pd.DataFrame()
    yShiftDataframe = pd.DataFrame()
    hellingerDistanceDataframe = pd.DataFrame()
    
    # regularizedHScoreDataframe = pd.DataFrame()
    classification = False

    global_X_training, global_y_training = dataset_model_dictionary[dataset_name]().get_dataset().get_training_data()
    global_X_test, global_y_test = dataset_model_dictionary[dataset_name]().get_dataset().get_test_data()
    column_names = global_X_training.columns.values.tolist()

    if len(global_y_training.shape) > 1: # Classification problem
        classification = True

    # global_X_training = downcast_types(global_X_training)
    # global_X_test = downcast_types(global_X_test)

    num_clients, all_combinations = number_of_clients_and_all_combinations(path_to_train_datasets)
    # volumeDataframe.loc["Global"] = compute_volumes(prince.PCA(n_components=10).fit_transform(pd.concat([global_X_training, global_X_test]).reset_index(drop=True)).to_numpy())
    # volumeDataframe.loc["Global"] = robust_volume(prince.PCA(n_components=10).fit_transform(pd.concat([global_X_training, global_X_test])))

    for source_client in range(num_clients):
        src_train_x = pd.read_csv(path_to_train_datasets + os.sep + "client_" + str(source_client) + "_X_training.csv", index_col=0)
        src_train_y = pd.read_csv(path_to_train_datasets + os.sep + "client_" + str(source_client) + "_y_training.csv", index_col=0)
        src_test_x = pd.read_csv(path_to_train_datasets + os.sep + "client_" + str(source_client) + "_X_test.csv", index_col=0)
        src_test_y = pd.read_csv(path_to_train_datasets + os.sep + "client_" + str(source_client) + "_y_test.csv", index_col=0)

        src_train_x = downcast_types(src_train_x)
        src_test_x = downcast_types(src_test_x)

        # tree_model = get_xgb_tree(src_train_x, src_train_y, src_test_x, src_test_y)
        # wassersteinDataframe.loc[source_client, "Global"], conditionalEntropy.loc[source_client, "Global"] = optimal_transport_conditional_entropy(torch.tensor(pd.concat([src_train_x, src_test_x]).to_numpy(), dtype=torch.float),
        #                                                                 np.argmax(pd.concat([src_train_y, src_test_y]).astype(int).to_numpy(), axis=1),
        #                                                                 torch.tensor(pd.concat([global_X_training, global_X_test]).to_numpy(), dtype=torch.float),
        #                                                                 np.argmax(pd.concat([global_y_training, global_y_test]).astype(int).to_numpy(), axis=1))
        # wassersteinDataframe.loc[source_client, "Global"]= wasserstein_distance(pd.concat([src_train_x, src_test_x]),
        #                                                                         pd.concat([global_X_training, global_X_test]))
        # gaussianMMDDataframe.loc[source_client, "Global"]= gaussian_mmd_distance(pd.concat([src_train_x, src_test_x]),
        #                                                                         pd.concat([global_X_training, global_X_test]))
        # relevanceDataframe.loc[source_client, "Global"], diversityDataframe.loc[source_client, "Global"] = task_agnostic_data_valuation(
        #     prince.PCA(n_components=10).fit_transform(pd.concat([src_train_x, src_test_x])),
        #     prince.PCA(n_components=10).fit_transform(pd.concat([global_X_training, global_X_test]))
        # )
        # volumeDataframe.loc[source_client] = robust_volume(prince.PCA(n_components=10).fit_transform(pd.concat([src_train_x, src_test_x])))

        # if classification:
        #     _, yShiftDataframe.loc[source_client, "Global"] = y_shift(pd.concat([src_train_x, src_test_x]).to_numpy(),
        #                                                             pd.concat([src_train_y, src_test_y]).to_numpy(),
        #                                                             pd.concat([global_X_training, global_X_test]).to_numpy(),
        #                                                             pd.concat([global_y_training, global_y_test]).to_numpy(),
        #                                                             tree_model,
        #                                                             column_names)
            # negativeConditionalEntropyDataframe.loc[source_client, "Global"] = negative_conditional_entropy(np.argmax(pd.concat([src_train_y, src_test_y]).astype(int).to_numpy(), axis=1), np.argmax(pd.concat([global_y_training, global_y_test]).astype(int).to_numpy(), axis=1))

        performanceDegradation.loc[source_client, "Global"] = performance_degradation(src_train_x, src_train_y, global_X_training, global_y_training, global_X_test, global_y_test, dataset_name=dataset_name)
        # if dataset_name == "electric-consumption":
        #     hellingerDistanceDataframe.loc[source_client, "Global"] = hellinger_distance(pd.concat([src_train_y, src_test_y]).to_numpy(), pd.concat([global_y_training, global_y_test]).to_numpy(), type_continuous=True)
        # else:
        #     hellingerDistanceDataframe.loc[source_client, "Global"] = hellinger_distance(pd.concat([src_train_y, src_test_y]).to_numpy(), pd.concat([global_y_training, global_y_test]).to_numpy())

        gc.collect()
        for target_client in range(num_clients):
            tar_train_x = pd.read_csv(path_to_train_datasets + os.sep + "client_" + str(target_client) + "_X_training.csv", index_col=0)
            tar_train_y = pd.read_csv(path_to_train_datasets + os.sep + "client_" + str(target_client) + "_y_training.csv", index_col=0)
            tar_test_x = pd.read_csv(path_to_train_datasets + os.sep + "client_" + str(target_client) + "_X_test.csv", index_col=0)
            tar_test_y = pd.read_csv(path_to_train_datasets + os.sep + "client_" + str(target_client) + "_y_test.csv", index_col=0)

            tar_train_x = downcast_types(tar_train_x)
            tar_test_x = downcast_types(tar_test_x)

            performanceDegradation.loc[source_client, target_client] = performance_degradation(src_train_x, src_train_y, tar_train_x, tar_train_y, tar_test_x, tar_test_y, dataset_name=dataset_name)

            # wassersteinDataframe.loc[source_client, target_client]= wasserstein_distance(pd.concat([src_train_x, src_test_x]),
            #                                                                     pd.concat([tar_train_x, tar_test_x]))
            # gaussianMMDDataframe.loc[source_client, target_client]= gaussian_mmd_distance(pd.concat([src_train_x, src_test_x]),
            #                                                                     pd.concat([tar_train_x, tar_test_x]))
            # relevanceDataframe.loc[source_client, target_client], diversityDataframe.loc[source_client, target_client] = task_agnostic_data_valuation(
            #     prince.PCA(n_components=10).fit_transform(pd.concat([src_train_x, src_test_x])),
            #     prince.PCA(n_components=10).fit_transform(pd.concat([tar_train_x, tar_test_x]))
            # )

            # if classification:
            #     _, yShiftDataframe.loc[source_client, target_client] = y_shift(pd.concat([src_train_x, src_test_x]).to_numpy(),
            #                                                                 pd.concat([src_train_y, src_test_y]).to_numpy(),
            #                                                                 pd.concat([tar_train_x, tar_test_x]).to_numpy(),
            #                                                                 pd.concat([tar_train_y, tar_test_y]).to_numpy(),
            #                                                                 tree_model,
            #                                                                 column_names)
                # negativeConditionalEntropyDataframe.loc[source_client, target_client] = negative_conditional_entropy(np.argmax(pd.concat([src_train_y, src_test_y]).astype(int).to_numpy(), axis=1), np.argmax(pd.concat([tar_train_y, tar_test_y]).astype(int).to_numpy(), axis=1))
            # if dataset_name == "electric-consumption":
            #     hellingerDistanceDataframe.loc[source_client, target_client] = hellinger_distance(pd.concat([src_train_y, src_test_y]).to_numpy(), pd.concat([tar_train_y, tar_test_y]).to_numpy(), type_continuous=True)
            # else:
            #     hellingerDistanceDataframe.loc[source_client, target_client] = hellinger_distance(pd.concat([src_train_y, src_test_y]).to_numpy(), pd.concat([tar_train_y, tar_test_y]).to_numpy())
            gc.collect()


    return wassersteinDataframe, gaussianMMDDataframe, performanceDegradation, yShiftDataframe, negativeConditionalEntropyDataframe, relevanceDataframe, diversityDataframe, volumeDataframe, hellingerDistanceDataframe

In [None]:
def compute_distances_and_values(dataset_name, type_of_partition, additional_parameter):
    path_to_result_dataframes = get_distances_from_route(dataset_name, type_of_partition, additional_parameter)
    path_to_train_datasets = get_data_from_route(dataset_name, type_of_partition, additional_parameter)
    wassersteinDataframe, gaussianMMDDataframe, performanceDegradation, yShiftDataframe, negativeConditionalEntropy, relevance, diversity, volume, hellinger = compute_all_distances_and_volumes(dataset_name, path_to_train_datasets)
    
    os.makedirs(path_to_result_dataframes, exist_ok=True)

    if yShiftDataframe.shape[0] > 0:
        yShiftDataframe.to_csv(path_to_result_dataframes + os.sep + "yShiftDataframe.csv")
    
    if negativeConditionalEntropy.shape[0] > 0:
        negativeConditionalEntropy.to_csv(path_to_result_dataframes + os.sep + "negativeConditionalEntropy.csv")

    if performanceDegradation.shape[0] > 0:
        performanceDegradation.to_csv(path_to_result_dataframes + os.sep + "performanceDegradation.csv")

    if hellinger.shape[0] > 0:
        hellinger.to_csv(path_to_result_dataframes + os.sep + "hellinger.csv")

    if wassersteinDataframe.shape[0] > 0:
        wassersteinDataframe.to_csv(path_to_result_dataframes + os.sep + "wasserstein.csv")

    if gaussianMMDDataframe.shape[0] > 0:
        gaussianMMDDataframe.to_csv(path_to_result_dataframes + os.sep + "gaussian_mmd.csv")
    # conditionalEntropy.to_csv(path_to_result_dataframes + os.sep + "conditionalEntropy.csv")

    if relevance.shape[0] > 0:
        relevance.to_csv(path_to_result_dataframes + os.sep + "relevance.csv")

    if diversity.shape[0] > 0:
        diversity.to_csv(path_to_result_dataframes + os.sep + "diversity.csv")
    
    if volume.shape[0] > 0:
        volume.to_csv(path_to_result_dataframes + os.sep + "volume.csv")
    

In [None]:
training_configurations_for_distances = [
    # ("adult", "dirichlet", "0.1"),
    # ("adult", "dirichlet", "1"),
    # ("adult", "dirichlet", "10"),
    # ("adult", "dirichlet", "100"),
    # ("adult", "manual", "Adult_FeatureSkew_Education_Doctorate"),
    # ("adult", "manual", "Adult_FeatureSkew_Education_Masters"),
    # ("adult", "manual", "Adult_FeatureSkew_Occupation"),
    # ("adult", "manual", "Adult_FeatureSkew_Occupation_ExecManagerial"),
    # ("adult", "manual", "Adult_FeatureSkew_Studies"),
    ("har", "dirichlet", "1"),
    ("har", "dirichlet", "10"),
    ("har", "dirichlet", "100"),
    ("har", "manual", "HAR_1_Maverick_1_label_skew"),
    ("har", "manual", "HAR_1_Maverick_1_LessLabels"),
    ("har", "manual", "HAR_1_Maverick_1_MissingTwoLabels"),
    ("har", "manual", "HAR_1_Maverick_Laying"),
    ("har", "manual", "HAR_1_Maverick_1_HellingerTrap"),
    ("har", "manual", "HAR_1_Maverick_Balanced_Laying_1"),
    ("har", "manual", "HAR_1_Maverick_Balanced_Laying_2"),
    ("har", "manual", "HAR_1_Maverick_Balanced_Laying_3"),
    ("har", "manual", "HAR_1_Maverick_Balanced_Laying_4"),
    ("har", "manual", "HAR_1_Maverick_Balanced_Laying_5"),
    ("har", "manual", "HAR_1_Maverick_1_Balanced_WalkingUpstairs"),
    ("har", "manual", "HAR_1_Maverick_1_Balanced_WalkingUpstairs_2"),
    ("har", "manual", "HAR_1_Maverick_1_Balanced_WalkingUpstairs_3"),
    ("har", "manual", "HAR_1_Maverick_1_Balanced_WalkingUpstairs_4"),
    ("har", "manual", "HAR_1_Maverick_1_Balanced_WalkingUpstairs_5"),
    # # ("covertype", "dirichlet", "1"),
    # # ("covertype", "dirichlet", "10"),
    # # ("covertype", "dirichlet", "100"),
    # # ("covertype", "manual", "Covertype_Maverick_LeastNumerousClass"),
    # # ("covertype", "manual", "Covertype_Maverick_MostNumerousClass"),
    # # ("covertype", "manual", "Covertype_FS_WildernessArea"),
    # # ("new_adult", "dirichlet", "1"),
    # # ("new_adult", "dirichlet", "10"),
    # # ("new_adult", "dirichlet", "100"),
    # # ("new_adult", "manual", "New_Adult_AL_CO_CT_MA")
    # ("edge-iot-coreset", "dirichlet", "10"),
    # ("edge-iot-coreset", "dirichlet", "100"),
    # ("edge-iot-coreset", "dirichlet", "1000"),
    # ("edge-iot-coreset", "dirichlet", "10000"),
    ("edge-iot-coreset", "manual", "EdgeIIOT_Maverick_LeastAttack"),
    ("edge-iot-coreset", "manual", "EdgeIIOT_Maverick_OnlyNormal"),
    ("electric-consumption", "manual", "Electric_Consumption_Random_Sampling"),
    ("electric-consumption", "manual", "ElectricConsumptionFacilityType"),
    ("electric-consumption", "manual", "ElectricConsumptionNoniid"),
    ("electric-consumption", "manual", "ElectricConsumptionStateFactor"),
    ("electric-consumption", "manual", "ElectricConsumption_FeatureSkew_MaverickMultifamilyUncategorized"),
    ("electric-consumption", "manual", "ElectricConsumption_FeatureSkew_MaverickGroceryStore")
]

In [None]:
for dataset_name, type_of_partition, additional_parameter in training_configurations_for_distances:
    compute_distances_and_values(dataset_name, type_of_partition, additional_parameter)

In [None]:
def get_results_of_training(dataset_name, type_of_partition, additional_parameter):
    results_dataframe = pd.Series()

    path_to_result_dataframes = get_results_from_route(dataset_name, type_of_partition, additional_parameter)

    # metrics = []
    # result = []

    results_dataframe.loc["Dataset"] = dataset_name
    results_dataframe.loc["Type_of_partition"] = type_of_partition
    results_dataframe.loc["dirichlet/NamePartition"] = additional_parameter

    for file in os.listdir(path_to_result_dataframes + os.sep + "Evaluation"):
        # spearman_rank_distance_sv = pd.DataFrame(columns=["Dataset", "TypePartition", "AdditionalParameter", "Metric", "Evaluator", "Distance", "SpearmanRank", "p-value"])
        metric_name = file.split("_")[1]

        if file != "Evaluation_F1Score":
            evaluation_dataframe = pd.read_csv(path_to_result_dataframes + os.sep + "Evaluation" + os.sep + file)
            results_dataframe.loc[metric_name] = evaluation_dataframe.loc[evaluation_dataframe.index[-1], "Global"]

    return results_dataframe

In [None]:
results_dataframe = pd.DataFrame()

for dataset_name, type_of_partition, additional_parameter in training_configurations_for_distances:
    # display(get_results_of_training(dataset_name, type_of_partition, additional_parameter))
    results_dataframe = pd.concat([results_dataframe, get_results_of_training(dataset_name, type_of_partition, additional_parameter).to_frame().T], ignore_index=True)

In [None]:
display(results_dataframe)

## Computing distances and value for the different experiments

In [None]:
training_configurations = [
    # ("adult", "dirichlet", "0.1"),
    # ("adult", "dirichlet", "1"),
    # ("adult", "dirichlet", "10"),
    # ("adult", "dirichlet", "100"),
    # ("adult", "manual", "Adult_FeatureSkew_Education_Doctorate"),
    # ("adult", "manual", "Adult_FeatureSkew_Education_Masters"),
    # ("adult", "manual", "Adult_FeatureSkew_Occupation"),
    # ("adult", "manual", "Adult_FeatureSkew_Occupation_ExecManagerial"),
    # ("adult", "manual", "Adult_FeatureSkew_Studies"),
    # ("har", "dirichlet", "1"),
    # ("har", "dirichlet", "10"),
    # ("har", "dirichlet", "100"),
    # ("har", "manual", "HAR_1_Maverick_1_label_skew"),
    # ("har", "manual", "HAR_1_Maverick_1_LessLabels"),
    # ("har", "manual", "HAR_1_Maverick_1_MissingTwoLabels"),
    # ("har", "manual", "HAR_1_Maverick_1_HellingerTrap"),
    # ("har", "manual", "HAR_1_Maverick_Laying"),
    # ("har", "manual", "HAR_1_Maverick_Balanced_Laying_1"),
    # ("har", "manual", "HAR_1_Maverick_Balanced_Laying_2"),
    # ("har", "manual", "HAR_1_Maverick_Balanced_Laying_3"),
    # ("har", "manual", "HAR_1_Maverick_Balanced_Laying_4"),
    # ("har", "manual", "HAR_1_Maverick_Balanced_Laying_5"),
    # ("har", "manual", "HAR_1_Maverick_1_Balanced_WalkingUpstairs"),
    # ("har", "manual", "HAR_1_Maverick_1_Balanced_WalkingUpstairs_2"),
    # ("har", "manual", "HAR_1_Maverick_1_Balanced_WalkingUpstairs_3"),
    # ("har", "manual", "HAR_1_Maverick_1_Balanced_WalkingUpstairs_4"),
    # ("har", "manual", "HAR_1_Maverick_1_Balanced_WalkingUpstairs_5"),
    # ("covertype", "dirichlet", "1"),
    # ("covertype", "dirichlet", "10"),
    # ("covertype", "dirichlet", "100"),
    # ("covertype", "manual", "Covertype_Maverick_LeastNumerousClass"),
    # ("covertype", "manual", "Covertype_Maverick_MostNumerousClass"),
    # ("covertype", "manual", "Covertype_FS_WildernessArea"),
    # ("new_adult", "dirichlet", "1"),
    # ("new_adult", "dirichlet", "10"),
    # ("new_adult", "dirichlet", "100"),
    # ("new_adult", "manual", "New_Adult_AL_CO_CT_MA")
    # ("edge-iot-coreset", "dirichlet", "10"),
    # ("edge-iot-coreset", "dirichlet", "100"),
    # ("edge-iot-coreset", "dirichlet", "1000"),
    # ("edge-iot-coreset", "dirichlet", "10000"),
    # ("edge-iot-coreset", "manual", "EdgeIIOT_Maverick_LeastAttack"),
    # ("edge-iot-coreset", "manual", "EdgeIIOT_Maverick_OnlyNormal"),
    # ("electric-consumption", "manual", "Electric_Consumption_Random_Sampling"),
    # ("electric-consumption", "manual", "ElectricConsumptionFacilityType"),
    # ("electric-consumption", "manual", "ElectricConsumptionNoniid"),
    # ("electric-consumption", "manual", "ElectricConsumptionStateFactor"),
    ("electric-consumption", "manual", "ElectricConsumption_FeatureSkew_MaverickMultifamilyUncategorized"),
    ("electric-consumption", "manual", "ElectricConsumption_FeatureSkew_MaverickGroceryStore")
]

# training_configurations = [
#     ("adult", "manual", "Adult_FeatureSkew_Occupation")

In [None]:
def compute_spearman_correlation_sv(dataset_name, type_of_partition, additional_parameter):
    path_to_result_dataframes = get_results_from_route(dataset_name, type_of_partition, additional_parameter)

    spearman_rank_sv_metric_centralized_decentralized = pd.DataFrame(columns=["Dataset", "TypePartition", "AdditionalParameter", "Metric", "SpearmanRank", "p-value"])
    spearman_sv_metrics = []
    spearman_sv_values = []
    spearman_sv_p_values = []

    for file in os.listdir(path_to_result_dataframes + os.sep + "Shapley_Value"):
        results_dataframe = pd.read_csv(path_to_result_dataframes + os.sep + "Shapley_Value" + os.sep + file)
        # display(path_to_result_dataframes)
        # display(file)
        spearman_sv_metrics.append(file.split('_')[1])
        if "SV_F1Score" == file:
            grouped_sv = results_dataframe.groupby(["Evaluator", "Classes"]).sum().reindex(sorted(results_dataframe.columns), axis=1).drop(["Evaluator", "Classes", "Round"], axis=1)
            spearman_sv_values.append(0)
            spearman_sv_p_values.append(0)
        else:
            grouped_sv = results_dataframe.groupby(["Evaluator"]).sum().reindex(sorted(results_dataframe.columns), axis=1).drop(["Evaluator", "Round"], axis=1)
            # display(grouped_sv)
        # spearman_rank, pvalue = scipy.stats.spearmanr(grouped_sv.loc["Centralized"], grouped_sv.loc["Aggregated"])
            def statistic(x): # permute only `x`
                return scipy.stats.spearmanr(x, grouped_sv.loc["Aggregated"]).statistic
            res_exact = scipy.stats.permutation_test((grouped_sv.loc["Centralized"],), statistic,
                permutation_type='pairings')
            # res_asymptotic = scipy.stats.spearmanr(grouped_sv.loc["Centralized"], grouped_sv.loc["Aggregated"])
            # res_exact.pvalue # asymptotic pvalue is too low
            # display(res_exact.statistic)
            # display(res_exact.pvalue)
            spearman_sv_values.append(res_exact.statistic)
            spearman_sv_p_values.append(res_exact.pvalue)
            # display(spearman_sv_values)
            # display(spearman_sv_p_values)

    spearman_rank_sv_metric_centralized_decentralized["Dataset"] = [dataset_name for _ in range(len(spearman_sv_metrics))]
    spearman_rank_sv_metric_centralized_decentralized["TypePartition"] = [type_of_partition for _ in range(len(spearman_sv_metrics))]
    spearman_rank_sv_metric_centralized_decentralized["AdditionalParameter"] = [additional_parameter for _ in range(len(spearman_sv_metrics))]
    spearman_rank_sv_metric_centralized_decentralized["Metric"] = spearman_sv_metrics
    spearman_rank_sv_metric_centralized_decentralized["SpearmanRank"] = spearman_sv_values
    spearman_rank_sv_metric_centralized_decentralized["p-value"] = spearman_sv_p_values

    return spearman_rank_sv_metric_centralized_decentralized

In [None]:
def compute_pearson_correlation_sv(dataset_name, type_of_partition, additional_parameter):
    path_to_result_dataframes = get_results_from_route(dataset_name, type_of_partition, additional_parameter)

    pearson_rank_sv_metric_centralized_decentralized = pd.DataFrame(columns=["Dataset", "TypePartition", "AdditionalParameter", "Metric", "PearsonCorrelation", "p-value"])
    pearson_sv_metrics = []
    pearson_sv_values = []
    pearson_sv_p_values = []

    for file in os.listdir(path_to_result_dataframes + os.sep + "Shapley_Value"):
        results_dataframe = pd.read_csv(path_to_result_dataframes + os.sep + "Shapley_Value" + os.sep + file)
        # display(path_to_result_dataframes)
        # display(file)
        pearson_sv_metrics.append(file.split('_')[1])
        if file == "SV_F1Score":
            grouped_sv = results_dataframe.groupby(["Evaluator", "Classes"]).sum().reindex(sorted(results_dataframe.columns), axis=1).drop(["Evaluator", "Classes", "Round"], axis=1)
            pearson_sv_values.append(0)
            pearson_sv_p_values.append(0)
        elif file == "SV_CosineSimilarity":
            grouped_sv = results_dataframe.groupby(["Evaluator"]).sum().reindex(sorted(results_dataframe.columns), axis=1).drop(["Evaluator", "Round"], axis=1)
            pearson_sv_values.append(0)
            pearson_sv_p_values.append(0)
        else:
            grouped_sv = results_dataframe.groupby(["Evaluator"]).sum().reindex(sorted(results_dataframe.columns), axis=1).drop(["Evaluator", "Round"], axis=1)
            # def statistic(x): # permute only `x`
            #     return scipy.stats.pearsonr(x, grouped_sv.loc["Aggregated"]).statistic
            # res_exact = scipy.stats.permutation_test((grouped_sv.loc["Centralized"],), statistic,
            #     permutation_type='pairings')
            res_exact = scipy.stats.pearsonr(grouped_sv.loc["Centralized"], grouped_sv.loc["Aggregated"])
            pearson_sv_values.append(res_exact.statistic)
            pearson_sv_p_values.append(res_exact.pvalue)

    pearson_rank_sv_metric_centralized_decentralized["Dataset"] = [dataset_name for _ in range(len(pearson_sv_metrics))]
    pearson_rank_sv_metric_centralized_decentralized["TypePartition"] = [type_of_partition for _ in range(len(pearson_sv_metrics))]
    pearson_rank_sv_metric_centralized_decentralized["AdditionalParameter"] = [additional_parameter for _ in range(len(pearson_sv_metrics))]
    pearson_rank_sv_metric_centralized_decentralized["Metric"] = pearson_sv_metrics
    pearson_rank_sv_metric_centralized_decentralized["PearsonCorrelation"] = pearson_sv_values
    pearson_rank_sv_metric_centralized_decentralized["p-value"] = pearson_sv_p_values

    return pearson_rank_sv_metric_centralized_decentralized

In [None]:
spearman_rank_sv_metric_centralized_decentralized = pd.DataFrame(columns=["Dataset", "TypePartition", "AdditionalParameter", "Metric", "SpearmanRank", "p-value"])
pearson_correlation_sv_metric_centralized_decentralized = pd.DataFrame(columns=["Dataset", "TypePartition", "AdditionalParameter", "Metric", "PearsonCorrelation", "p-value"])


for dataset_name, type_of_partition, additional_parameter in training_configurations:
    spearman_rank_sv_metric_centralized_decentralized = pd.concat([spearman_rank_sv_metric_centralized_decentralized, compute_spearman_correlation_sv(dataset_name, type_of_partition, additional_parameter)], ignore_index=True)
    pearson_correlation_sv_metric_centralized_decentralized = pd.concat([pearson_correlation_sv_metric_centralized_decentralized, compute_pearson_correlation_sv(dataset_name, type_of_partition, additional_parameter)], ignore_index=True)

In [None]:
spearman_rank_sv_metric_centralized_decentralized_no_f1 = spearman_rank_sv_metric_centralized_decentralized[(spearman_rank_sv_metric_centralized_decentralized["Metric"] != "F1Score") & (spearman_rank_sv_metric_centralized_decentralized["Metric"] != "CosineSimilarity")]

spearman_rank_sv_metric_centralized_decentralized_no_f1["SpearmanRank"] = spearman_rank_sv_metric_centralized_decentralized_no_f1["SpearmanRank"].astype('float')
spearman_rank_sv_metric_centralized_decentralized_no_f1["p-value"] = spearman_rank_sv_metric_centralized_decentralized_no_f1["p-value"].astype('float')

pearson_correlation_sv_metric_centralized_decentralized_no_f1 = pearson_correlation_sv_metric_centralized_decentralized[(pearson_correlation_sv_metric_centralized_decentralized["Metric"] != "F1Score") & (spearman_rank_sv_metric_centralized_decentralized["Metric"] != "CosineSimilarity")]

pearson_correlation_sv_metric_centralized_decentralized_no_f1["PearsonCorrelation"] = pearson_correlation_sv_metric_centralized_decentralized_no_f1["PearsonCorrelation"].astype('float')
pearson_correlation_sv_metric_centralized_decentralized_no_f1["p-value"] = pearson_correlation_sv_metric_centralized_decentralized_no_f1["p-value"].astype('float')

display(spearman_rank_sv_metric_centralized_decentralized_no_f1.groupby(["Dataset"]).mean(numeric_only=True))

display(spearman_rank_sv_metric_centralized_decentralized_no_f1.groupby(["Dataset", "Metric"]).mean(numeric_only=True))

display(spearman_rank_sv_metric_centralized_decentralized_no_f1.groupby(["Metric"]).mean(numeric_only=True))

display(spearman_rank_sv_metric_centralized_decentralized_no_f1[spearman_rank_sv_metric_centralized_decentralized_no_f1["SpearmanRank"] != 1.0])

In [None]:
display(spearman_rank_sv_metric_centralized_decentralized_no_f1)

In [None]:
display(pearson_correlation_sv_metric_centralized_decentralized_no_f1.groupby(["Dataset"]).mean(numeric_only=True))

display(pearson_correlation_sv_metric_centralized_decentralized_no_f1.groupby(["Dataset", "Metric"]).mean(numeric_only=True))

display(pearson_correlation_sv_metric_centralized_decentralized_no_f1[pearson_correlation_sv_metric_centralized_decentralized_no_f1["PearsonCorrelation"] != 1.0])

In [None]:
path_to_results_dataframes = get_results_from_route("har", "manual", "HAR_1_Maverick_Balanced_Laying_1")

results_dataframe = pd.read_csv(path_to_results_dataframes + os.sep + "Shapley_Value" + os.sep + "SV_CrossEntropyLoss")
grouped_sv = results_dataframe.groupby(["Evaluator"]).sum().reindex(sorted(results_dataframe.columns), axis=1).drop(["Evaluator", "Round"], axis=1)

In [32]:
def compute_spearman_correlation_distance(dataset_name, type_of_partition, additional_parameter, type_of_loss):
    path_to_distance_dataframes = get_distances_from_route(dataset_name, type_of_partition, additional_parameter)
    path_to_results_dataframes = get_results_from_route(dataset_name, type_of_partition, additional_parameter)

    spearman_rank_sv_metric_centralized_decentralized = pd.DataFrame()
    distances = []
    spearman_values = []
    spearman_p_values = []

    print(f"Dataset name: {dataset_name}")
    print(f"Type_of_partition name: {type_of_partition}")
    print(f"Additional_parameter name: {additional_parameter}")
    
    if type_of_loss == "MAE":
        results_dataframe = pd.read_csv(path_to_results_dataframes + os.sep + "Shapley_Value" + os.sep + "SV_MAE")
    else:
        results_dataframe = pd.read_csv(path_to_results_dataframes + os.sep + "Shapley_Value" + os.sep + "SV_CrossEntropyLoss")
    
    grouped_sv = results_dataframe.groupby(["Evaluator"]).sum().reindex(sorted(results_dataframe.columns), axis=1).drop(["Evaluator", "Round"], axis=1)
    grouped_sv.drop("Aggregated", inplace=True)

    # display(grouped_sv)

    for file in os.listdir(path_to_distance_dataframes):
        distances.append(file.split('.')[0])
        if file == "volume.csv":
            volume_series = pd.read_csv(path_to_distance_dataframes + os.sep + file, index_col=0)
            volume_series.drop("Global", inplace=True)
            def statistic(x): # permute only `x`
                return scipy.stats.spearmanr(x, grouped_sv.loc["Centralized"]).statistic
            res_exact = scipy.stats.permutation_test((volume_series.loc[:],), statistic,
                permutation_type='pairings')
            spearman_values.append(res_exact.statistic[0])
            spearman_p_values.append(res_exact.pvalue[0])

        else:
            distance_dataframe = pd.read_csv(path_to_distance_dataframes + os.sep + file, index_col=0)
            spearman_list = []
            pvalue_list = []
            for index, row in grouped_sv.iterrows():
                if index == "Centralized":
                    # def statistic(x): # permute only `x`
                    #     return scipy.stats.spearmanr(x, grouped_sv.loc["Centralized"]).statistic
                    # res_exact = scipy.stats.permutation_test((distance_dataframe.loc[:, "Global"],), statistic,
                    #     permutation_type='pairings')
                    res_exact = scipy.stats.spearmanr(grouped_sv.loc["Centralized"], distance_dataframe.loc[:, "Global"])
                    spearman_list.append(res_exact.statistic)
                    pvalue_list.append(res_exact.pvalue)
                else:
                    # def statistic(x): # permute only `x`
                    #     return scipy.stats.spearmanr(x, grouped_sv.loc[str(index)]).statistic
                    # res_exact = scipy.stats.permutation_test((distance_dataframe.loc[:, str(index)],), statistic,
                    #     permutation_type='pairings')
                    res_exact = scipy.stats.spearmanr(grouped_sv.loc[str(index)], distance_dataframe.loc[:, str(index)])
                    spearman_list.append(res_exact.statistic)
                    pvalue_list.append(res_exact.pvalue)

            # Fisher transformation for averaging correlations.
            spearman_values.append(np.tanh(np.mean(np.arctanh(spearman_list))))
            spearman_p_values.append(np.mean(pvalue_list))

    spearman_rank_sv_metric_centralized_decentralized["Dataset"] = [dataset_name for _ in range(len(distances))]
    spearman_rank_sv_metric_centralized_decentralized["TypePartition"] = [type_of_partition for _ in range(len(distances))]
    spearman_rank_sv_metric_centralized_decentralized["AdditionalParameter"] = [additional_parameter for _ in range(len(distances))]
    spearman_rank_sv_metric_centralized_decentralized["Distance"] = distances
    spearman_rank_sv_metric_centralized_decentralized["SpearmanCorrelation"] = spearman_values
    spearman_rank_sv_metric_centralized_decentralized["p-value-spearman"] = spearman_p_values

    return spearman_rank_sv_metric_centralized_decentralized

In [33]:
def compute_pearson_correlation_distance(dataset_name, type_of_partition, additional_parameter, type_of_loss="CrossEntropyLoss"):
    path_to_distance_dataframes = get_distances_from_route(dataset_name, type_of_partition, additional_parameter)
    path_to_results_dataframes = get_results_from_route(dataset_name, type_of_partition, additional_parameter)

    pearson_sv_metric_centralized_decentralized = pd.DataFrame()
    distances = []
    pearson_values = []
    pearson_p_values = []
    variance_scenario = []

    print(f"Dataset name: {dataset_name}")
    print(f"Type_of_partition name: {type_of_partition}")
    print(f"Additional_parameter name: {additional_parameter}")

    if type_of_loss == "MAE":
        results_dataframe = pd.read_csv(path_to_results_dataframes + os.sep + "Shapley_Value" + os.sep + "SV_MAE")
    else:
        results_dataframe = pd.read_csv(path_to_results_dataframes + os.sep + "Shapley_Value" + os.sep + "SV_CrossEntropyLoss")
        
    grouped_sv = results_dataframe.groupby(["Evaluator"]).sum().reindex(sorted(results_dataframe.columns), axis=1).drop(["Evaluator", "Round"], axis=1)
    grouped_sv.drop("Aggregated", inplace=True)

    # display(grouped_sv)

    for file in os.listdir(path_to_distance_dataframes):
        distances.append(file.split('.')[0])
        if file == "volume.csv":
            volume_series = pd.read_csv(path_to_distance_dataframes + os.sep + file, index_col=0)
            volume_series.drop("Global", inplace=True)
            res_exact = scipy.stats.pearsonr(grouped_sv.loc["Centralized"], volume_series.to_numpy().reshape(-1))
            pearson_values.append(res_exact.statistic)
            pearson_p_values.append(res_exact.pvalue)
            # variance_scenario.append(np.var(grouped_sv.loc["Centralized"]))
        else:
            distance_dataframe = pd.read_csv(path_to_distance_dataframes + os.sep + file, index_col=0)
            if file == "yShiftDataframe.csv":
                distance_dataframe.replace([np.inf, -np.inf], 0, inplace=True)
                distance_dataframe.replace([np.NAN], 0, inplace=True)
            pearsonr_list = []
            pvalue_list = []
            for index, row in grouped_sv.iterrows():
                if index == "Centralized":
                    res_exact = scipy.stats.pearsonr(grouped_sv.loc["Centralized"], distance_dataframe.loc[:, "Global"])
                    pearsonr_list.append(res_exact.statistic)
                    pvalue_list.append(res_exact.pvalue)
                else:            
                    res_exact = scipy.stats.pearsonr(grouped_sv.loc[str(index)], distance_dataframe.loc[:, str(index)])
                    pearsonr_list.append(res_exact.statistic)
                    pvalue_list.append(res_exact.pvalue)

            # variance_scenario.append(np.var(grouped_sv.loc["Centralized"]))
            # Fisher transformation for averaging correlations.
            pearson_values.append(np.tanh(np.mean(np.arctanh(pearsonr_list))))
            pearson_p_values.append(np.mean(pvalue_list))

    pearson_sv_metric_centralized_decentralized["Dataset"] = [dataset_name for _ in range(len(distances))]
    pearson_sv_metric_centralized_decentralized["TypePartition"] = [type_of_partition for _ in range(len(distances))]
    pearson_sv_metric_centralized_decentralized["AdditionalParameter"] = [additional_parameter for _ in range(len(distances))]
    pearson_sv_metric_centralized_decentralized["Distance"] = distances
    pearson_sv_metric_centralized_decentralized["PearsonCorrelation"] = pearson_values
    pearson_sv_metric_centralized_decentralized["p-value-pearson"] = pearson_p_values
    # pearson_sv_metric_centralized_decentralized["VarianceSV"] = variance_scenario

    return pearson_sv_metric_centralized_decentralized

In [None]:
distances_to_compare = [
    # ("adult", "dirichlet", "0.1"),
    # ("adult", "dirichlet", "1"),
    # ("adult", "dirichlet", "10"),
    # ("adult", "dirichlet", "100"),
    # ("adult", "manual", "Adult_FeatureSkew_Education_Doctorate"),
    # ("adult", "manual", "Adult_FeatureSkew_Education_Masters"),
    # ("adult", "manual", "Adult_FeatureSkew_Occupation"),
    # ("adult", "manual", "Adult_FeatureSkew_Occupation_ExecManagerial"),
    # ("adult", "manual", "Adult_FeatureSkew_Studies"),
    ("har", "dirichlet", "1", "CrossEntropyLoss"),
    ("har", "dirichlet", "10", "CrossEntropyLoss"),
    ("har", "dirichlet", "100", "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_1_label_skew", "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_1_LessLabels", "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_1_MissingTwoLabels", "CrossEntropyLoss"),
    # ("har", "manual", "HAR_1_Maverick_1_HellingerTrap", "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_Laying", "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_Balanced_Laying_1", "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_Balanced_Laying_2", "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_Balanced_Laying_3", "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_Balanced_Laying_4", "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_Balanced_Laying_5", "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_1_Balanced_WalkingUpstairs", "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_1_Balanced_WalkingUpstairs_2", "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_1_Balanced_WalkingUpstairs_3", "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_1_Balanced_WalkingUpstairs_4", "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_1_Balanced_WalkingUpstairs_5", "CrossEntropyLoss"),
    # ("covertype", "dirichlet", "1"),
    # ("covertype", "dirichlet", "10"),
    # ("covertype", "dirichlet", "100"),
    # ("covertype", "manual", "Covertype_Maverick_LeastNumerousClass"),
    # ("covertype", "manual", "Covertype_Maverick_MostNumerousClass"),
    # ("covertype", "manual", "Covertype_FS_WildernessArea"),
    # ("new_adult", "dirichlet", "1"),
    # ("new_adult", "dirichlet", "10"),
    # ("new_adult", "dirichlet", "100"),
    # ("new_adult", "manual", "New_Adult_AL_CO_CT_MA")
    ("edge-iot-coreset", "dirichlet", "10", "CrossEntropyLoss"),
    ("edge-iot-coreset", "dirichlet", "100", "CrossEntropyLoss"),
    ("edge-iot-coreset", "dirichlet", "1000", "CrossEntropyLoss"),
    ("edge-iot-coreset", "dirichlet", "10000", "CrossEntropyLoss"),
    ("edge-iot-coreset", "manual", "EdgeIIOT_Maverick_LeastAttack", "CrossEntropyLoss"),
    ("edge-iot-coreset", "manual", "EdgeIIOT_Maverick_OnlyNormal", "CrossEntropyLoss"),
    ("electric-consumption", "manual", "Electric_Consumption_Random_Sampling", "MAE"),
    ("electric-consumption", "manual", "ElectricConsumptionFacilityType", "MAE"),
    ("electric-consumption", "manual", "ElectricConsumptionNoniid", "MAE"),
    ("electric-consumption", "manual", "ElectricConsumptionStateFactor", "MAE"),
    ("electric-consumption", "manual", "ElectricConsumption_FeatureSkew_MaverickMultifamilyUncategorized", "MAE"),
    ("electric-consumption", "manual", "ElectricConsumption_FeatureSkew_MaverickGroceryStore", "MAE")
]

In [45]:
spearman_rank_sv_distance = pd.DataFrame()
pearson_rank_sv_distance = pd.DataFrame()


for dataset_name, type_of_partition, additional_parameter, type_of_loss in distances_to_compare:
    spearman_rank_sv_distance = pd.concat([spearman_rank_sv_distance, compute_spearman_correlation_distance(dataset_name, type_of_partition, additional_parameter, type_of_loss)], ignore_index=True)
    pearson_rank_sv_distance = pd.concat([pearson_rank_sv_distance, compute_pearson_correlation_distance(dataset_name, type_of_partition, additional_parameter, type_of_loss)], ignore_index=True)

correlation_sv_distance = spearman_rank_sv_distance.copy()
correlation_sv_distance["PearsonCorrelation"] = pearson_rank_sv_distance["PearsonCorrelation"]
correlation_sv_distance["p-value-pearson"] = pearson_rank_sv_distance["p-value-pearson"]
# correlation_sv_distance["VarianceSV"] = pearson_rank_sv_distance["VarianceSV"]

Dataset name: har
Type_of_partition name: dirichlet
Additional_parameter name: 1
Dataset name: har
Type_of_partition name: dirichlet
Additional_parameter name: 1
Dataset name: har
Type_of_partition name: dirichlet
Additional_parameter name: 10



divide by zero encountered in arctanh


divide by zero encountered in arctanh


divide by zero encountered in arctanh


divide by zero encountered in arctanh



Dataset name: har
Type_of_partition name: dirichlet
Additional_parameter name: 10
Dataset name: har
Type_of_partition name: dirichlet
Additional_parameter name: 100
Dataset name: har
Type_of_partition name: dirichlet
Additional_parameter name: 100
Dataset name: har
Type_of_partition name: manual
Additional_parameter name: HAR_1_Maverick_1_label_skew
Dataset name: har
Type_of_partition name: manual
Additional_parameter name: HAR_1_Maverick_1_label_skew
Dataset name: har
Type_of_partition name: manual
Additional_parameter name: HAR_1_Maverick_1_LessLabels
Dataset name: har
Type_of_partition name: manual
Additional_parameter name: HAR_1_Maverick_1_LessLabels
Dataset name: har
Type_of_partition name: manual
Additional_parameter name: HAR_1_Maverick_1_MissingTwoLabels
Dataset name: har
Type_of_partition name: manual
Additional_parameter name: HAR_1_Maverick_1_MissingTwoLabels
Dataset name: har
Type_of_partition name: manual
Additional_parameter name: HAR_1_Maverick_1_HellingerTrap



divide by zero encountered in arctanh



Dataset name: har
Type_of_partition name: manual
Additional_parameter name: HAR_1_Maverick_1_HellingerTrap
Dataset name: har
Type_of_partition name: manual
Additional_parameter name: HAR_1_Maverick_Laying
Dataset name: har
Type_of_partition name: manual
Additional_parameter name: HAR_1_Maverick_Laying
Dataset name: har
Type_of_partition name: manual
Additional_parameter name: HAR_1_Maverick_Balanced_Laying_1
Dataset name: har
Type_of_partition name: manual
Additional_parameter name: HAR_1_Maverick_Balanced_Laying_1
Dataset name: har
Type_of_partition name: manual
Additional_parameter name: HAR_1_Maverick_Balanced_Laying_2
Dataset name: har
Type_of_partition name: manual
Additional_parameter name: HAR_1_Maverick_Balanced_Laying_2
Dataset name: har
Type_of_partition name: manual
Additional_parameter name: HAR_1_Maverick_Balanced_Laying_3
Dataset name: har
Type_of_partition name: manual
Additional_parameter name: HAR_1_Maverick_Balanced_Laying_3
Dataset name: har
Type_of_partition name: m


divide by zero encountered in arctanh


divide by zero encountered in arctanh


divide by zero encountered in arctanh



Dataset name: electric-consumption
Type_of_partition name: manual
Additional_parameter name: ElectricConsumptionFacilityType
Dataset name: electric-consumption
Type_of_partition name: manual
Additional_parameter name: ElectricConsumptionNoniid
Dataset name: electric-consumption
Type_of_partition name: manual
Additional_parameter name: ElectricConsumptionNoniid
Dataset name: electric-consumption
Type_of_partition name: manual
Additional_parameter name: ElectricConsumptionStateFactor
Dataset name: electric-consumption
Type_of_partition name: manual
Additional_parameter name: ElectricConsumptionStateFactor



divide by zero encountered in arctanh


divide by zero encountered in arctanh


divide by zero encountered in arctanh


divide by zero encountered in arctanh


divide by zero encountered in arctanh


divide by zero encountered in arctanh


divide by zero encountered in arctanh



Dataset name: electric-consumption
Type_of_partition name: manual
Additional_parameter name: ElectricConsumption_FeatureSkew_MaverickMultifamilyUncategorized
Dataset name: electric-consumption
Type_of_partition name: manual
Additional_parameter name: ElectricConsumption_FeatureSkew_MaverickMultifamilyUncategorized
Dataset name: electric-consumption
Type_of_partition name: manual
Additional_parameter name: ElectricConsumption_FeatureSkew_MaverickGroceryStore
Dataset name: electric-consumption
Type_of_partition name: manual
Additional_parameter name: ElectricConsumption_FeatureSkew_MaverickGroceryStore


In [None]:
display(spearman_rank_sv_distance)

In [46]:
display(correlation_sv_distance[correlation_sv_distance["Distance"] == "performanceDegradation"])

Unnamed: 0,Dataset,TypePartition,AdditionalParameter,Distance,SpearmanCorrelation,p-value-spearman,PearsonCorrelation,p-value-pearson
0,har,dirichlet,1,performanceDegradation,-1.0,0.136397,-0.739649,0.211859
9,har,dirichlet,10,performanceDegradation,-0.829407,0.083019,-0.895273,0.023951
18,har,dirichlet,100,performanceDegradation,0.0007,0.574081,0.095417,0.53469
27,har,manual,HAR_1_Maverick_1_label_skew,performanceDegradation,-0.46952,0.309531,-0.86072,0.29971
36,har,manual,HAR_1_Maverick_1_LessLabels,performanceDegradation,-0.670174,0.221135,-0.839781,0.148267
45,har,manual,HAR_1_Maverick_1_MissingTwoLabels,performanceDegradation,-0.315995,0.362822,-0.486999,0.202048
54,har,manual,HAR_1_Maverick_1_HellingerTrap,performanceDegradation,-0.589194,0.291905,-0.833615,0.230653
63,har,manual,HAR_1_Maverick_Laying,performanceDegradation,-0.414618,0.3369,-0.739578,0.280458
72,har,manual,HAR_1_Maverick_Balanced_Laying_1,performanceDegradation,-0.59431,0.298389,-0.782848,0.262692
81,har,manual,HAR_1_Maverick_Balanced_Laying_2,performanceDegradation,-0.257906,0.523555,-0.714719,0.309511


In [47]:
import plotly.graph_objects as go
import plotly.express as px
import numpy as np
np.random.seed(1)

#garbage graph
fig = px.scatter(x=[0, 1, 2, 3, 4], y=[0, 1, 4, 9, 16])
fig.show()
fig.write_image("random.pdf")


distance_dictionary = {
    "performanceDegradation": "Performance Degradation (I)",
    "yShiftDataframe": "X->Y Shift (I)",
    "relevance": "Relevance (D)",
    "diversity": "Diversity (D)",
    "hellinger": "Hellinger Distance (I)",
    "wasserstein": "Wasserstein Distance (I)",
    "gaussian_mmd": "MMD - Gaussian Kernel (I)",
    "volume": "Volume of data (D)"
}

fig = go.Figure()
for metric, name_of_metric in distance_dictionary.items():
    sliced_df = correlation_sv_distance[correlation_sv_distance["Distance"] == metric]
    fig.add_trace(go.Box(y=sliced_df["PearsonCorrelation"], name=name_of_metric))


fig.update_layout(title_text="Pearson Correlation between Distance and Value", showlegend=False)

fig.show()

fig.write_image("images/distance_value_pearson_correlation.pdf")


distutils Version classes are deprecated. Use packaging.version instead.



In [64]:
distances_only_mavericks_to_compare = [
    # ("adult", "dirichlet", "0.1"),
    # ("adult", "dirichlet", "1"),
    # ("adult", "dirichlet", "10"),
    # ("adult", "dirichlet", "100"),
    # ("adult", "manual", "Adult_FeatureSkew_Education_Doctorate"),
    # ("adult", "manual", "Adult_FeatureSkew_Education_Masters"),
    # ("adult", "manual", "Adult_FeatureSkew_Occupation"),
    # ("adult", "manual", "Adult_FeatureSkew_Occupation_ExecManagerial"),
    # ("adult", "manual", "Adult_FeatureSkew_Studies"),
    # ("har", "dirichlet", "1", "CrossEntropyLoss"),
    # ("har", "dirichlet", "10", "CrossEntropyLoss"),
    # ("har", "dirichlet", "100", "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_1_label_skew", "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_1_LessLabels", "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_1_MissingTwoLabels", "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_Laying", "CrossEntropyLoss"),
    # ("har", "manual", "HAR_1_Maverick_1_HellingerTrap", "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_Balanced_Laying_1", "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_Balanced_Laying_2", "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_Balanced_Laying_3", "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_Balanced_Laying_4", "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_Balanced_Laying_5", "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_1_Balanced_WalkingUpstairs", "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_1_Balanced_WalkingUpstairs_2", "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_1_Balanced_WalkingUpstairs_3", "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_1_Balanced_WalkingUpstairs_4", "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_1_Balanced_WalkingUpstairs_5", "CrossEntropyLoss"),
    # ("covertype", "dirichlet", "1"),
    # ("covertype", "dirichlet", "10"),
    # ("covertype", "dirichlet", "100"),
    # ("covertype", "manual", "Covertype_Maverick_LeastNumerousClass"),
    # ("covertype", "manual", "Covertype_Maverick_MostNumerousClass"),
    # ("covertype", "manual", "Covertype_FS_WildernessArea"),
    # ("new_adult", "dirichlet", "1"),
    # ("new_adult", "dirichlet", "10"),
    # ("new_adult", "dirichlet", "100"),
    # ("new_adult", "manual", "New_Adult_AL_CO_CT_MA")
    # ("edge-iot-coreset", "dirichlet", "10", "CrossEntropyLoss"),
    # ("edge-iot-coreset", "dirichlet", "100", "CrossEntropyLoss"),
    # ("edge-iot-coreset", "dirichlet", "1000", "CrossEntropyLoss"),
    # ("edge-iot-coreset", "dirichlet", "10000", "CrossEntropyLoss"),
    ("edge-iot-coreset", "manual", "EdgeIIOT_Maverick_LeastAttack", "CrossEntropyLoss"),
    ("edge-iot-coreset", "manual", "EdgeIIOT_Maverick_OnlyNormal", "CrossEntropyLoss"),
    # ("electric-consumption", "manual", "Electric_Consumption_Random_Sampling", "MAE"),
    # ("electric-consumption", "manual", "ElectricConsumptionFacilityType", "MAE"),
    # ("electric-consumption", "manual", "ElectricConsumptionNoniid", "MAE"),
    # ("electric-consumption", "manual", "ElectricConsumptionStateFactor", "MAE"),
    ("electric-consumption", "manual", "ElectricConsumption_FeatureSkew_MaverickMultifamilyUncategorized", "MAE"),
    ("electric-consumption", "manual", "ElectricConsumption_FeatureSkew_MaverickGroceryStore", "MAE")
]

In [65]:
spearman_rank_sv_distance = pd.DataFrame()
pearson_rank_sv_distance = pd.DataFrame()


for dataset_name, type_of_partition, additional_parameter, type_of_loss in distances_only_mavericks_to_compare:
    spearman_rank_sv_distance = pd.concat([spearman_rank_sv_distance, compute_spearman_correlation_distance(dataset_name, type_of_partition, additional_parameter, type_of_loss)], ignore_index=True)
    pearson_rank_sv_distance = pd.concat([pearson_rank_sv_distance, compute_pearson_correlation_distance(dataset_name, type_of_partition, additional_parameter, type_of_loss)], ignore_index=True)

correlation_sv_distance_only_mavericks = spearman_rank_sv_distance.copy()
correlation_sv_distance_only_mavericks["PearsonCorrelation"] = pearson_rank_sv_distance["PearsonCorrelation"]
correlation_sv_distance_only_mavericks["p-value-pearson"] = pearson_rank_sv_distance["p-value-pearson"]
# correlation_sv_distance["VarianceSV"] = pearson_rank_sv_distance["VarianceSV"]

Dataset name: har
Type_of_partition name: manual
Additional_parameter name: HAR_1_Maverick_1_label_skew
Dataset name: har
Type_of_partition name: manual
Additional_parameter name: HAR_1_Maverick_1_label_skew
Dataset name: har
Type_of_partition name: manual
Additional_parameter name: HAR_1_Maverick_1_LessLabels
Dataset name: har
Type_of_partition name: manual
Additional_parameter name: HAR_1_Maverick_1_LessLabels
Dataset name: har
Type_of_partition name: manual
Additional_parameter name: HAR_1_Maverick_1_MissingTwoLabels
Dataset name: har
Type_of_partition name: manual
Additional_parameter name: HAR_1_Maverick_1_MissingTwoLabels
Dataset name: har
Type_of_partition name: manual
Additional_parameter name: HAR_1_Maverick_Laying
Dataset name: har
Type_of_partition name: manual
Additional_parameter name: HAR_1_Maverick_Laying
Dataset name: har
Type_of_partition name: manual
Additional_parameter name: HAR_1_Maverick_Balanced_Laying_1
Dataset name: har
Type_of_partition name: manual
Additional


divide by zero encountered in arctanh


divide by zero encountered in arctanh



In [66]:
correlation_sv_distance_only_mavericks[correlation_sv_distance_only_mavericks["Distance"] == "performanceDegradation"]

Unnamed: 0,Dataset,TypePartition,AdditionalParameter,Distance,SpearmanCorrelation,p-value-spearman,PearsonCorrelation,p-value-pearson
0,har,manual,HAR_1_Maverick_1_label_skew,performanceDegradation,-0.46952,0.309531,-0.86072,0.29971
9,har,manual,HAR_1_Maverick_1_LessLabels,performanceDegradation,-0.670174,0.221135,-0.839781,0.148267
18,har,manual,HAR_1_Maverick_1_MissingTwoLabels,performanceDegradation,-0.315995,0.362822,-0.486999,0.202048
27,har,manual,HAR_1_Maverick_Laying,performanceDegradation,-0.414618,0.3369,-0.739578,0.280458
36,har,manual,HAR_1_Maverick_Balanced_Laying_1,performanceDegradation,-0.59431,0.298389,-0.782848,0.262692
45,har,manual,HAR_1_Maverick_Balanced_Laying_2,performanceDegradation,-0.257906,0.523555,-0.714719,0.309511
54,har,manual,HAR_1_Maverick_Balanced_Laying_3,performanceDegradation,-0.201825,0.616456,-0.655634,0.478997
63,har,manual,HAR_1_Maverick_Balanced_Laying_4,performanceDegradation,-0.118655,0.54388,-0.412555,0.359112
72,har,manual,HAR_1_Maverick_Balanced_Laying_5,performanceDegradation,-0.368818,0.45545,-0.739509,0.382954
81,har,manual,HAR_1_Maverick_1_Balanced_WalkingUpstairs,performanceDegradation,-0.241428,0.534357,-0.720465,0.304173


In [68]:
import plotly.graph_objects as go
import numpy as np
np.random.seed(1)

distance_dictionary = {
    "performanceDegradation": "Performance Degradation (I)",
    "yShiftDataframe": "X->Y Shift (I)",
    "relevance": "Relevance (D)",
    "diversity": "Diversity (D)",
    "hellinger": "Hellinger Distance (I)",
    "wasserstein": "Wasserstein Distance (I)",
    "gaussian_mmd": "MMD - Gaussian Kernel (I)",
    "volume": "Volume of data (D)"
}

fig = go.Figure()
for metric, name_of_metric in distance_dictionary.items():
    sliced_df = correlation_sv_distance_only_mavericks[correlation_sv_distance_only_mavericks["Distance"] == metric]
    fig.add_trace(go.Box(y=sliced_df["PearsonCorrelation"], name=name_of_metric))


fig.update_layout(title_text="Pearson Correlation between Distance and Value only Mavericks", showlegend=False)

fig.show()

fig.write_image("images/distance_value_only_mavericks_pearson_correlation.pdf")

In [None]:
display(correlation_sv_distance[correlation_sv_distance["Dataset"] == "electric-consumption"])

In [None]:
display(correlation_sv_distance.groupby(["Dataset", "Distance"]).mean(numeric_only=True))

In [None]:
display(correlation_sv_distance.groupby(["TypePartition", "Distance"]).mean(numeric_only=True))

In [None]:
display(correlation_sv_distance.groupby(["Dataset", "VarianceSV"]).mean(numeric_only=True))

In [None]:
display(correlation_sv_distance.groupby(["Dataset", "TypePartition", "Distance"]).mean(numeric_only=True))

In [None]:
# feature_skew = spearman_rank_sv_distance[spearman_rank_sv_distance["Evaluator"] == "Centralized"]

# display(feature_skew)
# display(feature_skew[feature_skew["Evaluator"] == "Centralized"])

In [None]:
# display(spearman_rank_sv_distance.groupby(["Dataset"]).mean(numeric_only=True))
# display(spearman_rank_sv_distance.groupby(["Dataset", "Distance", "Evaluator"]).mean(numeric_only=True))
# display(spearman_rank_sv_distance.groupby(["Dataset", "AdditionalParameter", "Distance", "Evaluator"]).mean(numeric_only=True))
# display(spearman_rank_sv_distance.groupby(["Dataset", "TypePartition"]).mean(numeric_only=True))
# display(spearman_rank_sv_distance.groupby(["Metric"]).mean(numeric_only=True))
# display(spearman_rank_sv_distance.groupby(["Distance"]).mean(numeric_only=True))
# display(spearman_rank_sv_distance.groupby(["Dataset", "Distance"]).mean(numeric_only=True))
# display(spearman_rank_sv_distance.groupby("Dataset").mean())

In [None]:
training_configurations_mavericks = [
    # ("adult", "manual", "Adult_FeatureSkew_Education_Doctorate", "Maverick_Education_Doctorate", 5, 4),
    # ("adult", "manual", "Adult_FeatureSkew_Education_Masters", "Maverick_Education_Masters", 5, 4),
    # ("adult", "manual", "Adult_FeatureSkew_Occupation_ExecManagerial", "Maverick_Occupation", 5, 4),
    ("har", "manual", "HAR_1_Maverick_1_label_skew", "WalkingUpstairsNoBalance", 6, 5),
    ("har", "manual", "HAR_1_Maverick_Laying", "LayingNoBalance", 6, 0),
    ("har", "manual", "HAR_1_Maverick_1_LessLabels", "OneClient4Classes", 6, 5),
    ("har", "manual", "HAR_1_Maverick_1_MissingTwoLabels", "OneClient3Classes", 6, 5),
    ("har", "manual", "HAR_1_Maverick_1_HellingerTrap", "HellingerTrap", 6, 5),
    ("har", "manual", "HAR_1_Maverick_Balanced_Laying_1", "Laying", 6, 0),
    ("har", "manual", "HAR_1_Maverick_Balanced_Laying_2", "Laying", 6, 0),
    ("har", "manual", "HAR_1_Maverick_Balanced_Laying_3", "Laying", 6, 0),
    ("har", "manual", "HAR_1_Maverick_Balanced_Laying_4", "Laying", 6, 0),
    ("har", "manual", "HAR_1_Maverick_Balanced_Laying_5", "Laying", 6, 0),
    ("har", "manual", "HAR_1_Maverick_1_Balanced_WalkingUpstairs", "WalkingUpstairs", 6, 5),
    ("har", "manual", "HAR_1_Maverick_1_Balanced_WalkingUpstairs_2", "WalkingUpstairs", 6, 5),
    ("har", "manual", "HAR_1_Maverick_1_Balanced_WalkingUpstairs_3", "WalkingUpstairs", 6, 5),
    ("har", "manual", "HAR_1_Maverick_1_Balanced_WalkingUpstairs_4", "WalkingUpstairs", 6, 5),
    ("har", "manual", "HAR_1_Maverick_1_Balanced_WalkingUpstairs_5", "WalkingUpstairs", 6, 5),
    # ("covertype", "manual", "Covertype_Maverick_LeastNumerousClass", "LeastNumerous", 4, 3),
    # ("covertype", "manual", "Covertype_Maverick_MostNumerousClass", "MostNumerous", 4, 3),
    ("edge-iot-coreset", "manual", "EdgeIIOT_Maverick_LeastAttack", "LeastAttack", 4, 3),
    ("edge-iot-coreset", "manual", "EdgeIIOT_Maverick_OnlyNormal", "OnlyNormal", 4, 3),
    ("electric-consumption", "manual", "ElectricConsumption_FeatureSkew_MaverickMultifamilyUncategorized", "MultiFamily", 5, 4),
    ("electric-consumption", "manual", "ElectricConsumption_FeatureSkew_MaverickGroceryStore", "GroceryStore", 5, 4)
]

In [None]:
def ranking_Mavericks(dataset_name, type_of_partition, additional_parameter, group_partitions, num_clients, maverick):
    # distances = ["wasserstein.csv", "gaussian_mmd.csv", "negativeConditionalEntropy.csv", "hellinger.csv", "performanceDegradation.csv", "yShiftDataframe.csv", "relevance.csv", "diversity.csv"]
    # singleValue = ["volume.csv"]
    maverick_ranked = pd.DataFrame()

    path_to_result_dataframes = get_results_from_route(dataset_name, type_of_partition, additional_parameter)

    # spearman_rank_distance_sv = pd.DataFrame(columns=["Dataset", "TypePartition", "AdditionalParameter", "Metric", "Evaluator", "Distance", "SpearmanRank", "p-value"])
    ranked_sv_maverick = []
    clients_sv = {k: [] for k in range(num_clients)}
    sv_metric = []
    sv_metric_c_a = []
    # spearman_sv_distance_spearman_values = []
    # spearman_sv_distance_p_values = []

    for sv_file in os.listdir(path_to_result_dataframes + os.sep + "Shapley_Value"):
        results_dataframe = pd.read_csv(path_to_result_dataframes + os.sep + "Shapley_Value" + os.sep + sv_file)
        sv_metric_name = sv_file.split("_")[1]

        if sv_file == "SV_F1Score":
            # For now, this one will not be used
            grouped_sv = results_dataframe.groupby(["Evaluator", "Classes"]).sum().reindex(sorted(results_dataframe.columns), axis=1).drop(["Evaluator", "Classes", "Round"], axis=1)

        elif sv_file == "SV_CosineSimilarity":
            grouped_sv = results_dataframe.groupby(["Evaluator"]).sum().reindex(sorted(results_dataframe.columns), axis=1).drop(["Evaluator", "Round"], axis=1)
            sv_metric.append(sv_metric_name)
            sv_metric_c_a.append("C")
            for client in clients_sv.keys():
                clients_sv[client].append(-1 * grouped_sv.loc["Centralized", str(client)])
        else:
            grouped_sv = results_dataframe.groupby(["Evaluator"]).sum().reindex(sorted(results_dataframe.columns), axis=1).drop(["Evaluator", "Round"], axis=1)
            sv_metric.append(sv_metric_name)
            sv_metric.append(sv_metric_name)
            sv_metric_c_a.append("C")
            sv_metric_c_a.append("A")
            for client in clients_sv.keys():
                # The values are multiplied for -1 to prepare the data for a posterior sort.
                clients_sv[client].append(-1 * grouped_sv.loc["Centralized", str(client)])
                clients_sv[client].append(-1 * grouped_sv.loc["Aggregated", str(client)])

    maverick_ranked["Dataset"] = [dataset_name for _ in range(len(sv_metric))]
    maverick_ranked["GroupPartitions"] = [group_partitions for _ in range(len(sv_metric))]
    maverick_ranked["Metric"] = sv_metric
    maverick_ranked["C/A"] = sv_metric_c_a

    for client in clients_sv.keys():
        maverick_ranked[client] = clients_sv[client]

    for index, row in maverick_ranked.iterrows():
        sorted_clients = np.argsort(maverick_ranked.loc[index, [k for k in range(num_clients)]])
        for client, position in zip(sorted_clients, range(1, 7)):
            maverick_ranked.loc[index, client] = position

    return maverick_ranked

In [None]:
maverick_ranked = pd.DataFrame()

for dataset_name, type_of_partition, additional_parameter, group_partitions, num_clients, maverick in training_configurations_mavericks:
    maverick_ranked = pd.concat([maverick_ranked, ranking_Mavericks(dataset_name, type_of_partition, additional_parameter, group_partitions, num_clients, maverick)], ignore_index=True)

In [None]:
display(maverick_ranked[maverick_ranked["GroupPartitions"] == "GroceryStore"])

In [None]:
display(maverick_ranked[maverick_ranked["Dataset"] == "edge-iot-coreset"])

In [None]:
display(maverick_ranked[(maverick_ranked["Dataset"] == "electric-consumption") & (maverick_ranked["C/A"] == "C")].groupby(["Dataset", "GroupPartitions", "Metric"]).mean(numeric_only=True))

# Trying code

In [None]:
training_configurations_mavericks = [
    ("adult", "manual", "Adult_FeatureSkew_Education_Doctorate", "Maverick_Education_Doctorate", 5, 4, "CrossEntropyLoss"),
    ("adult", "manual", "Adult_FeatureSkew_Education_Masters", "Maverick_Education_Masters", 5, 4, "CrossEntropyLoss"),
    ("adult", "manual", "Adult_FeatureSkew_Occupation_ExecManagerial", "Maverick_Occupation", 5, 4, "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_1_label_skew", "WalkingUpstairsNoBalance", 6, 5, "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_Laying", "LayingNoBalance", 6, 0, "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_1_LessLabels", "OneClient4Classes", 6, 5, "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_1_MissingTwoLabels", "OneClient3Classes", 6, 5, "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_1_HellingerTrap", "HellingerTrap", 6, 5, "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_Balanced_Laying_1", "Laying_1", 6, 0, "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_Balanced_Laying_2", "Laying_2", 6, 0, "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_Balanced_Laying_3", "Laying_3", 6, 0, "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_Balanced_Laying_4", "Laying_4", 6, 0, "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_Balanced_Laying_5", "Laying_5", 6, 0, "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_1_Balanced_WalkingUpstairs", "WalkingUpstairs_1", 6, 5, "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_1_Balanced_WalkingUpstairs_2", "WalkingUpstairs_2", 6, 5, "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_1_Balanced_WalkingUpstairs_3", "WalkingUpstairs_3", 6, 5, "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_1_Balanced_WalkingUpstairs_4", "WalkingUpstairs_4", 6, 5, "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_1_Balanced_WalkingUpstairs_5", "WalkingUpstairs_5", 6, 5, "CrossEntropyLoss"),
    # ("covertype", "manual", "Covertype_Maverick_LeastNumerousClass", "LeastNumerous", 4, 3),
    # ("covertype", "manual", "Covertype_Maverick_MostNumerousClass", "MostNumerous", 4, 3),
    ("edge-iot-coreset", "manual", "EdgeIIOT_Maverick_LeastAttack", "LeastAttack", 4, 3, "CrossEntropyLoss"),
    ("edge-iot-coreset", "manual", "EdgeIIOT_Maverick_OnlyNormal", "OnlyNormal", 4, 3, "CrossEntropyLoss"),
    ("electric-consumption", "manual", "ElectricConsumption_FeatureSkew_MaverickMultifamilyUncategorized", "MultiFamily", 5, 4, "MAE"),
    ("electric-consumption", "manual", "ElectricConsumption_FeatureSkew_MaverickGroceryStore", "GroceryStore", 5, 4, "MAE")
]

In [None]:
def maverick_vs_distance(dataset_name, type_of_partition, additional_parameter, group_partitions, num_clients, maverick):
    value_distance_dataframe = pd.DataFrame()

    path_to_result_dataframes = get_results_from_route(dataset_name, type_of_partition, additional_parameter)
    path_to_distance_dataframes = get_distances_from_route(dataset_name, type_of_partition, additional_parameter)

    sv_value_centralized = []
    clients_sv = {k: [] for k in range(num_clients)}
    sv_metric = []

    for sv_file in os.listdir(path_to_result_dataframes + os.sep + "Shapley_Value"):
        results_dataframe = pd.read_csv(path_to_result_dataframes + os.sep + "Shapley_Value" + os.sep + sv_file)
        sv_metric_name = sv_file.split("_")[1]
        if sv_file == "SV_CosineSimilarity":
            grouped_sv = results_dataframe.groupby(["Evaluator"]).sum().reindex(sorted(results_dataframe.columns), axis=1).drop(["Evaluator", "Round"], axis=1)
            sv_metric.append(sv_metric_name)
            # sv_value_centralized.append(grouped_sv.loc["Centralized"])
            for client in clients_sv.keys():
                clients_sv[client].append(grouped_sv.loc["Centralized", str(client)])
        elif sv_file == "SV_CrossEntropyLoss":
            grouped_sv = results_dataframe.groupby(["Evaluator"]).sum().reindex(sorted(results_dataframe.columns), axis=1).drop(["Evaluator", "Round"], axis=1)
            sv_metric.append(sv_metric_name)
            for client in clients_sv.keys():
                clients_sv[client].append(grouped_sv.loc["Centralized", str(client)])
        elif sv_file == "SV_RMSE":
            grouped_sv = results_dataframe.groupby(["Evaluator"]).sum().reindex(sorted(results_dataframe.columns), axis=1).drop(["Evaluator", "Round"], axis=1)
            sv_metric.append(sv_metric_name)
            for client in clients_sv.keys():
                clients_sv[client].append(grouped_sv.loc["Centralized", str(client)])
    
    for distance_file in os.listdir(path_to_distance_dataframes):
        distance_dataframe = pd.read_csv(path_to_distance_dataframes + os.sep + distance_file, index_col=0)
        distance_metric_name = distance_file.split(".")[0]
        sv_metric.append(distance_metric_name)
        
        if distance_file == "volume.csv":
            for client in clients_sv.keys():
                clients_sv[client].append(distance_dataframe.loc[str(client)].iloc[0])
        
        else:
            for client in clients_sv.keys():
                clients_sv[client].append(distance_dataframe.T.loc["Global", client])

    value_distance_dataframe["Dataset"] = [dataset_name for _ in range(len(sv_metric))]
    value_distance_dataframe["GroupPartitions"] = [group_partitions for _ in range(len(sv_metric))]
    value_distance_dataframe["Metric"] = sv_metric
    value_distance_dataframe["Maverick"] = maverick
    for client in clients_sv.keys():
        value_distance_dataframe[str(client)] = clients_sv[client]

    return value_distance_dataframe

In [None]:
def ranking_distance_maverick(dataset_name, type_of_partition, additional_parameter, group_partitions, num_clients, maverick, type_of_loss):
    value_distance_dataframe = pd.DataFrame()

    path_to_result_dataframes = get_results_from_route(dataset_name, type_of_partition, additional_parameter)

    if type_of_loss == "MAE":
        results_dataframe = pd.read_csv(path_to_result_dataframes + os.sep + "Shapley_Value" + os.sep + "SV_MAE")
    else:
        results_dataframe = pd.read_csv(path_to_result_dataframes + os.sep + "Shapley_Value" + os.sep + "SV_CrossEntropyLoss")

    grouped_sv = results_dataframe.groupby(["Evaluator"]).sum().reindex(sorted(results_dataframe.columns), axis=1).drop(["Evaluator", "Round"], axis=1)

    only_centralized_results = grouped_sv.loc["Centralized"]
    best_valued_index = int(only_centralized_results[only_centralized_results == max(only_centralized_results)].index[0])

    path_to_distance_dataframes = get_distances_from_route(dataset_name, type_of_partition, additional_parameter)

    distance_dictionary = {
        "performanceDegradation": "I",
        "yShiftDataframe": "I",
        "relevance": "D",
        "diversity": "D",
        "hellinger": "I",
        "wasserstein": "I",
        "gaussian_mmd": "I",
        "negativeConditionalEntropy": "D"
        # "volume": "D"
    }

    distance = []
    distance_theoretical_correct_rank = []
    distance_sv_correct_rank = []
    sv_correct_rank = []

    # if best_valued_index == maverick:
    #     sv_correct_rank.append(1)
    # else:
    #     sv_correct_rank.append(0)
    
    for distance_file in os.listdir(path_to_distance_dataframes):
        distance_dataframe = pd.read_csv(path_to_distance_dataframes + os.sep + distance_file, index_col=0)
        distance_metric_name = distance_file.split(".")[0]
        distance.append(distance_metric_name)
        
        if distance_file == "volume.csv":
            volumes = distance_dataframe.iloc[:, 0]
            volumes.pop("Global")
            volumes.reset_index(inplace=True, drop=True)
            if volumes[volumes == max(volumes)].index[0] == maverick:
                distance_theoretical_correct_rank.append(1)
            else:
                distance_theoretical_correct_rank.append(0)
            
            if volumes[volumes == max(volumes)].index[0] == best_valued_index:
                distance_sv_correct_rank.append(1)
            else:
                distance_sv_correct_rank.append(0)
        
        else:
            distance_values = distance_dataframe.loc[:, "Global"]
            if distance_dictionary[distance_metric_name] == "D":
                best_distance_index = int(distance_values[distance_values == max(distance_values)].index[0])
                if best_distance_index == maverick:
                    distance_theoretical_correct_rank.append(1)
                else:
                    distance_theoretical_correct_rank.append(0)

                if best_distance_index == best_valued_index:
                    distance_sv_correct_rank.append(1)
                else:
                    distance_sv_correct_rank.append(0)
            
            else:
                best_distance_index = int(distance_values[distance_values == min(distance_values)].index[0])
                if best_distance_index == maverick:
                    distance_theoretical_correct_rank.append(1)
                else:
                    distance_theoretical_correct_rank.append(0)
                
                if best_distance_index == best_valued_index:
                    distance_sv_correct_rank.append(1)
                else:
                    distance_sv_correct_rank.append(0)

    value_distance_dataframe["Dataset"] = [dataset_name for _ in range(len(distance))]
    value_distance_dataframe["GroupPartitions"] = [group_partitions for _ in range(len(distance))]
    value_distance_dataframe["Distance"] = distance
    value_distance_dataframe["Maverick"] = maverick
    value_distance_dataframe["Correct_Theoretical"] = distance_theoretical_correct_rank
    value_distance_dataframe["Correct_SV"] = distance_sv_correct_rank

    return value_distance_dataframe

In [None]:
distance_and_rank = pd.DataFrame()

for dataset_name, type_of_partition, additional_parameter, group_partitions, num_clients, maverick, type_of_loss in training_configurations_mavericks:
    distance_and_rank = pd.concat([distance_and_rank, ranking_distance_maverick(dataset_name, type_of_partition, additional_parameter, group_partitions, num_clients, maverick, type_of_loss)], ignore_index=True)

In [None]:
display(distance_and_rank[distance_and_rank["Distance"] == "gaussian_mmd"])
display(distance_and_rank[distance_and_rank["Distance"] == "wasserstein"])
display(distance_and_rank[distance_and_rank["Distance"] == "hellinger"])
display(distance_and_rank[distance_and_rank["Distance"] == "volume"])

In [None]:
display(distance_and_rank.groupby(["Distance"]).sum())
display(distance_and_rank.groupby(["Distance"]).count())

In [None]:
type_of_loss = "MAE"
# type_of_loss = "CrossEntropyLoss"

path_to_results_dataframes = get_results_from_route("electric-consumption", "manual", "ElectricConsumption_FeatureSkew_MaverickGroceryStore")

if type_of_loss == "MAE":
    results_dataframe = pd.read_csv(path_to_results_dataframes + os.sep + "Shapley_Value" + os.sep + "SV_MAE")
else:
    results_dataframe = pd.read_csv(path_to_results_dataframes + os.sep + "Shapley_Value" + os.sep + "SV_CrossEntropyLoss")

display(results_dataframe.groupby("Evaluator").sum())

In [None]:
import plotly.express as px
path_to_dataframes = get_data_from_route("electric-consumption", "manual", "ElectricConsumption_FeatureSkew_MaverickGroceryStore")

for source_client in range(5):
    src_train_x = pd.read_csv(path_to_dataframes + os.sep + "client_" + str(source_client) + "_X_training.csv", index_col=0)
    src_train_y = pd.read_csv(path_to_dataframes + os.sep + "client_" + str(source_client) + "_y_training.csv", index_col=0)
    src_test_x = pd.read_csv(path_to_dataframes + os.sep + "client_" + str(source_client) + "_X_test.csv", index_col=0)
    src_test_y = pd.read_csv(path_to_dataframes + os.sep + "client_" + str(source_client) + "_y_test.csv", index_col=0)

    column_names_of_interest:list = [column for column in src_train_x.columns if "facility_type" in column]

    sliced_df_train = src_train_x.loc[:, column_names_of_interest]
    extract_categories = np.argmax(sliced_df_train, axis=1).astype(int)
    facilities = [column_names_of_interest[ind] for ind in extract_categories]

    fig = px.histogram(facilities, title=f"Train y client {source_client}")
    fig.show()

    sliced_df_test = src_test_x.loc[:, column_names_of_interest]
    extract_categories = np.argmax(sliced_df_test, axis=1).astype(int)
    facilities = [column_names_of_interest[ind] for ind in extract_categories]

    fig = px.histogram(facilities, title=f"Test y client {source_client}")
    fig.show()

In [None]:
sliced_dataframe = value_distance.loc[:, "0":"5"]
sliced_dataframe = sliced_dataframe.apply(lambda row: (row - row.mean()) / row.std(), axis=1)
display(sliced_dataframe)

In [None]:
value_distance.loc[:, "0":"5"] = value_distance.loc[:, "0":"5"].apply(lambda row: (row - row.mean()) / row.std(), axis=1)

In [None]:
import plotly.graph_objects as go

# Create random data with numpy
import numpy as np

partitions = np.unique(value_distance["GroupPartitions"])

for partition in partitions:
    fig = go.Figure()
    sliced_dataframe = value_distance[value_distance["GroupPartitions"] == partition]
    metrics = np.unique(sliced_dataframe["Metric"])
    for metric in metrics:
        metric_row = sliced_dataframe[sliced_dataframe["Metric"] == metric]
        fig.add_trace(go.Scatter(y=metric_row.loc[:, "0":"5"].columns, x=metric_row.loc[:, "0":"5"].values[0], name=metric, mode="markers"))
    
    fig.update_layout(title="Values for metric " + partition)
    fig.show()


## Plotly Figures

In [None]:
import plotly.graph_objects as go

# Create random data with numpy
import numpy as np

# path_to_results = distance_values_results + os.sep + "wine" + os.sep + "manual" + os.sep + "Wine_Maverick"
path_to_results = ".." + os.sep + ".." + os.sep + distance_values_results + os.sep + "adult" + os.sep + "manual" + os.sep + "Adult_FeatureSkew_Occupation"

# Wasserstein distance
wasserstein = pd.read_csv(path_to_results + os.sep + "wasserstein.csv", index_col=0)
fig = go.Figure(data=[go.Bar(name="0", x=['0', '1', '2'], y=wasserstein.loc[:, "0"]),
                      go.Bar(name="1" , x=['0', '1', '2'], y=wasserstein.loc[:, "1"]),
                      go.Bar(name="2" , x=['0', '1', '2'], y=wasserstein.loc[:, "2"]),
                      go.Bar(name="Global" , x=['0', '1', '2'], y=wasserstein.loc[:, "Global"])
                      ])

fig.update_layout(
    title=dict(text='Wasserstein Distance among datasets'))

fig.show()


# Conditional Entropy
conditionalEntropy = pd.read_csv(path_to_results + os.sep + "conditionalEntropy.csv", index_col=0)
fig = go.Figure(data=[go.Bar(name="0", x=['0', '1', '2'], y=conditionalEntropy.loc[:, "0"]),
                      go.Bar(name="1" , x=['0', '1', '2'], y=conditionalEntropy.loc[:, "1"]),
                      go.Bar(name="2" , x=['0', '1', '2'], y=conditionalEntropy.loc[:, "2"]),
                      go.Bar(name="Global" , x=['0', '1', '2'], y=conditionalEntropy.loc[:, "Global"])
                      ])

fig.update_layout(
    title=dict(text='Conditional Entropy among datasets'))

fig.show()


# Volume
volume = pd.read_csv(path_to_results + os.sep + "volume.csv", index_col=0)
fig = go.Figure(data=go.Bar(x=["Global", "0", "1", "2"], y=volume.loc[["Global", "0", "1", "2"], "0"]))

fig.update_layout(
    title=dict(text='Volume of the datasets'))

fig.show()


## Shapley values visualization

In [None]:
dataset_name = "electric-consumption"
type_of_partition = "manual"
additional_parameter = "ElectricConsumption_FeatureSkew_MaverickMultifamilyUncategorized"

In [None]:
# hellinger = pd.read_csv("../../results/distances_values/new_adult/manual/Demographic_New_Adult/hellinger.csv", index_col=0)

# print("Hellinger")
# display(hellinger)

# # wasserstein = pd.read_csv("../../results/distances_values/adult/manual/Adult_FeatureSkew_Occupation/wasserstein.csv", index_col=0)

# # display(wasserstein)

# negativeConditionalEntropy = pd.read_csv("../../results/distances_values/new_adult/manual/Demographic_New_Adult/negativeConditionalEntropy.csv", index_col=0)

# print("Negative Conditional Entropy")
# display(negativeConditionalEntropy)

# yShiftDataframe = pd.read_csv("../../results/distances_values/new_adult/manual/Demographic_New_Adult/yShiftDataframe.csv", index_col=0)

# print("YShift")
# display(yShiftDataframe)

# performanceDegradation = pd.read_csv("../../results/distances_values/new_adult/manual/Demographic_New_Adult/performanceDegradation.csv", index_col=0)

# print("Performance Degradation")
# display(performanceDegradation)

# relevance = pd.read_csv("../../results/distances_values/new_adult/manual/Demographic_New_Adult/relevance.csv", index_col=0)

# print("Relevance")
# display(relevance)

# diversity = pd.read_csv("../../results/distances_values/new_adult/manual/Demographic_New_Adult/diversity.csv", index_col=0)

# print("Diversity")
# display(diversity)

# volume = pd.read_csv("../../results/distances_values/new_adult/manual/Demographic_New_Adult/volume.csv", index_col=0)

# print("Volume")
# display(volume)

# # sv_results = pd.read_csv("../../results/dataframes/FedAvg/adult/manual/Adult_FeatureSkew_Occupation/mlp/Shapley_Value/SV_CrossEntropyLoss", index_col=0)

# # display(sv_results.groupby(["Evaluator"]).sum())

# sv_results = pd.read_csv("../../results/dataframes/FedAvg/new_adult/manual/Demographic_New_Adult/mlp/Shapley_Value/SV_Accuracy", index_col=0)

# display(sv_results.groupby(["Evaluator"]).sum())

# sv_results = pd.read_csv("../../results/dataframes/FedAvg/new_adult/manual/Demographic_New_Adult/mlp/Shapley_Value/SV_F1Score", index_col=0)

# display(sv_results.groupby(["Evaluator", "Classes"]).sum())

In [None]:
hellinger = pd.read_csv("../../results/distances_values/new_adult/manual/Demographic_New_Adult/hellinger.csv", index_col=0)

print("Hellinger")
display(hellinger)

wasserstein = pd.read_csv("../../results/distances_values/adult/manual/Adult_FeatureSkew_Occupation/wasserstein.csv", index_col=0)

display(wasserstein)

negativeConditionalEntropy = pd.read_csv("../../results/distances_values/new_adult/manual/Demographic_New_Adult/negativeConditionalEntropy.csv", index_col=0)

print("Negative Conditional Entropy")
display(negativeConditionalEntropy)

yShiftDataframe = pd.read_csv("../../results/distances_values/new_adult/manual/Demographic_New_Adult/yShiftDataframe.csv", index_col=0)

print("YShift")
display(yShiftDataframe)

performanceDegradation = pd.read_csv("../../results/distances_values/new_adult/manual/Demographic_New_Adult/performanceDegradation.csv", index_col=0)

print("Performance Degradation")
display(performanceDegradation)

relevance = pd.read_csv("../../results/distances_values/new_adult/manual/Demographic_New_Adult/relevance.csv", index_col=0)

print("Relevance")
display(relevance)

diversity = pd.read_csv("../../results/distances_values/new_adult/manual/Demographic_New_Adult/diversity.csv", index_col=0)

print("Diversity")
display(diversity)

volume = pd.read_csv("../../results/distances_values/new_adult/manual/Demographic_New_Adult/volume.csv", index_col=0)

print("Volume")
display(volume)

# sv_results = pd.read_csv("../../results/dataframes/FedAvg/adult/manual/Adult_FeatureSkew_Occupation/mlp/Shapley_Value/SV_CrossEntropyLoss", index_col=0)

# display(sv_results.groupby(["Evaluator"]).sum())

sv_results = pd.read_csv("../../results/dataframes/FedAvg/new_adult/manual/Demographic_New_Adult/mlp/Shapley_Value/SV_Accuracy", index_col=0)

display(sv_results.groupby(["Evaluator"]).sum())

sv_results = pd.read_csv("../../results/dataframes/FedAvg/new_adult/manual/Demographic_New_Adult/mlp/Shapley_Value/SV_F1Score", index_col=0)

display(sv_results.groupby(["Evaluator", "Classes"]).sum())


In [None]:
# sv_results = pd.read_csv("../../results/dataframes/FedAvg/har/manual/HAR_1_Maverick_1_label_skew/mlp/Shapley_Value/SV_CrossEntropyLoss", index_col=0)

# display(sv_results.groupby(["Evaluator"]).sum())

# sv_results = pd.read_csv("../../results/dataframes/FedAvg/har/manual/HAR_1_Maverick_1_label_skew/mlp/Shapley_Value/SV_Accuracy", index_col=0)

# display(sv_results.groupby(["Evaluator"]).sum())

# sv_results = pd.read_csv("../../results/dataframes/FedAvg/har/manual/HAR_1_Maverick_1_label_skew/mlp/Shapley_Value/SV_F1Score", index_col=0)

# display(sv_results.groupby(["Evaluator", "Classes"]).sum())


# sv_results = pd.read_csv("../../results/dataframes/FedAvg/wine/manual/Wine_Maverick/mlp/Shapley_Value/SV_CrossEntropyLoss", index_col=0)

# display(sv_results.groupby(["Evaluator"]).sum())

# sv_results = pd.read_csv("../../results/dataframes/FedAvg/wine/manual/Wine_Maverick/mlp/Shapley_Value/SV_Accuracy", index_col=0)

# display(sv_results.groupby(["Evaluator"]).sum())

# sv_results = pd.read_csv("../../results/dataframes/FedAvg/wine/manual/Wine_Maverick/mlp/Shapley_Value/SV_F1Score", index_col=0)

# display(sv_results.groupby(["Evaluator", "Classes"]).sum())


import scipy.stats


sv_results = pd.read_csv("../../results/dataframes/FedAvg/adult/manual/Adult_FeatureSkew_Occupation/mlp/Shapley_Value/SV_CrossEntropyLoss", index_col=0)

display((sv_results.groupby(["Evaluator"]).sum()).reindex(sorted(sv_results.columns), axis=1))

sv_results = pd.read_csv("../../results/old_results/manual/Adult_FeatureSkew_Occupation/mlp/Shapley_Value/SV_CrossEntropyLoss", index_col=0)

display((sv_results.groupby(["Evaluator"]).sum()).reindex(sorted(sv_results.columns), axis=1))

sv_results = pd.read_csv("../../results/dataframes/FedAvg/adult/manual/Adult_FeatureSkew_Occupation/mlp/Shapley_Value/SV_Accuracy", index_col=0)

display((sv_results.groupby(["Evaluator"]).sum()).reindex(sorted(sv_results.columns), axis=1))

# sv_results = pd.read_csv("../../results/dataframes/FedAvg/adult/manual/Adult_FeatureSkew_Occupation/mlp/Shapley_Value/SV_F1Score", index_col=0)

# display(sv_results.groupby(["Evaluator", "Classes"]).sum())




sv_results = pd.read_csv("../../results/old_results/manual/Adult_FeatureSkew_Occupation/mlp/Shapley_Value/SV_Accuracy", index_col=0)

grouped_sv = sv_results.groupby(["Evaluator"]).sum().reindex(sorted(sv_results.columns), axis=1).drop(["Evaluator"], axis=1)

display(grouped_sv)

display(scipy.stats.spearmanr(grouped_sv.loc["Centralized"], grouped_sv.loc["Aggregated"]))

# sv_results = pd.read_csv("../../results/old_results/manual/Adult_FeatureSkew_Occupation/mlp/Shapley_Value/SV_F1Score", index_col=0)

# display(sv_results.groupby(["Evaluator", "Classes"]).sum())


# sv_results = pd.read_csv("../../results/dataframes/FedAvg/new_adult/manual/Demographic_New_Adult/mlp/Shapley_Value/SV_CrossEntropyLoss", index_col=0)

# display(sv_results.groupby(["Evaluator"]).sum())

# sv_results = pd.read_csv("../../results/dataframes/FedAvg/new_adult/manual/Demographic_New_Adult/mlp/Shapley_Value/SV_Accuracy", index_col=0)

# display(sv_results.groupby(["Evaluator"]).sum())

# sv_results = pd.read_csv("../../results/dataframes/FedAvg/new_adult/manual/Demographic_New_Adult/mlp/Shapley_Value/SV_F1Score", index_col=0)

# display(sv_results.groupby(["Evaluator", "Classes"]).sum())

# fig = go.Figure(data=[go.Bar(name="0", x=['0', '1', '2'], y=sv_results.loc[:, "0"]),
#                       go.Bar(name="1" , x=['0', '1', '2'], y=sv_results.loc[:, "1"]),
#                       go.Bar(name="2" , x=['0', '1', '2'], y=sv_results.loc[:, "2"]),
#                       go.Bar(name="Global" , x=['0', '1', '2'], y=sv_results.loc[:, "Global"])
#                       ])

# fig.update_layout(
#     title=dict(text='Wasserstein Distance among datasets'))

# fig.show()



## H1 Score Function

In [None]:
URI_partitioned_data = "\\data\\partitioned_training_data" + \
                        "\\manually_partitioned" + \
                        "\\HAR_1_Maverick_1_MissingTwoLabels"

source_dataset_X = pd.read_csv(URI_partitioned_data + "\\" + "client_" + str(0) + "_X" + ".csv", index_col=0).to_numpy()
display(source_dataset_X.shape)

In [None]:
def get_h1_score(source_dataset, target_labels):
    dim_red = False
    n_components = 8
    not_normalized = False
    if dim_red:
        dataset = PCA()


### HAR_1_Maverick_Less_Labels

In [None]:
pd.options.display.float_format = '{:,.2f}'.format
dataframe = pd.DataFrame()

for client_source_number in range(6):
    for client_target_number in range(6):
        source_dataset_X = pd.read_csv(URI_partitioned_data + "\\" + "client_" + str(client_source_number) + "_X" + ".csv", index_col=0).to_numpy()
        target_dataset_y = pd.read_csv(URI_partitioned_data + "\\" + "client_" + str(client_target_number) + "_y" + ".csv", index_col=0).to_numpy()
        dataframe.loc["Client " + str(client_source_number), "Client " + str(client_target_number)] = get_h1_score(source_dataset_X, target_dataset_y)
    
display(dataframe)

In [None]:
pd.options.display.float_format = '{:,.2f}'.format
dataframe = pd.DataFrame()

for client_source_number in range(6):
    for client_target_number in range(6):
        source_dataset_X = pd.read_csv(URI_partitioned_data + "\\" + "client_" + str(client_source_number) + "_X" + ".csv", index_col=0).to_numpy()
        target_dataset_y = pd.read_csv(URI_partitioned_data + "\\" + "client_" + str(client_target_number) + "_y" + ".csv", index_col=0).to_numpy()
        dataframe.loc["Client " + str(client_source_number), "Client " + str(client_target_number)] = get_h1_score(source_dataset_X, target_dataset_y)
    
display(dataframe)

In [None]:
for client_source_number in range(6):
    source_dataset_y = pd.read_csv(URI_partitioned_data + "\\" + "client_" + str(client_source_number) + "_y" + ".csv", index_col=0).to_numpy()
    source_dataset_y = np.array(np.argmax(np.asarray(source_dataset_y), axis=1))
    if client_source_number == 0:
        source_dataset_y[source_dataset_y==2] = 3
    for client_target_number in range(6):
        target_dataset_y = pd.read_csv(URI_partitioned_data + "\\" + "client_" + str(client_target_number) + "_y" + ".csv", index_col=0).to_numpy()
        target_dataset_y = np.array(np.argmax(np.asarray(target_dataset_y), axis=1))
        h_score_value = negative_conditional_entropy(source_dataset_y, target_dataset_y)
        h_score_value *= 1e18
        dataframe.loc["Client " + str(client_source_number), "Client " + str(client_target_number)] = h_score_value
    
display(dataframe)

In [None]:
# from tllib.ranking.logme import log_maximum_evidence
# pd.options.display.float_format = '{:,.2f}'.format
dataframe = pd.DataFrame()

for client_source_number in range(1, 6):
    for client_target_number in range(1, 6):
        source_dataset_X = pd.read_csv(URI_partitioned_data + "\\" + "client_" + str(client_source_number) + "_X" + ".csv", index_col=0).to_numpy()
        print(len(source_dataset_X))
        target_dataset_y = pd.read_csv(URI_partitioned_data + "\\" + "client_" + str(client_target_number) + "_y" + ".csv", index_col=0).to_numpy()
        print(len(target_dataset_y))
        target_dataset_y = target_dataset_y[:, np.any(target_dataset_y > 0, axis=0)]
        log_me = log_maximum_evidence(source_dataset_X, target_dataset_y, regression=True)
        display(log_me)
        dataframe.loc["Client " + str(client_source_number), "Client " + str(client_target_number)] = log_me
    
display(dataframe)