# Transfer Learning

This notebook contains the code of the paper related to the measures and how they relate to the Shapley Value (SV) computation. The code is divided into two main parts:

- Computation of the measures in the different datasets.
- Computation of the correlation between the measures and the SV computation in Federated Learning.

## Measure Computation (1st Part)

### Imports

In [None]:
import os
import sys
# Add the directory two levels above the current working directory to the import path,
# so that packages located there can be imported by the cells below.
module_path = os.path.abspath(os.path.join('..' + os.sep + '..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [None]:
from sklearn.covariance import LedoitWolf
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import math
import xgboost as xgb
import prince
import scipy
import plotly

In [None]:
import os
# Hide every CUDA device from this process (presumably to force CPU execution — confirm).
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

In [None]:
from experiment_parameters.model_builder.Model import XGBoostModel
from experiment_parameters.TrainerFactory import dataset_model_dictionary

### Auxiliary functions

These are mainly functions to read data or results.

In [None]:
# Relative prefix pointing two directories above the working directory (it ends with a
# path separator); the data and result paths built by the helpers below start with it.
CONST_ROUTE_MAIN_DIR = ".." + os.sep + ".." + os.sep

In [None]:
def get_data_from_route(dataset_name, type_of_partition, additional_parameter):
    """Build the path of the folder that holds the partitioned client datasets.

    Args:
        dataset_name: Name of the dataset; only used for non-"manual" partitions.
        type_of_partition: Partition strategy. "manual" partitions are stored directly
            under their experiment name; any other value is stored under
            ``dataset_<name>/alpha_<parameter>``.
        additional_parameter: Experiment name ("manual") or alpha value (otherwise),
            given as a string.

    Returns:
        The relative path (string) to the folder with the client CSV files.
    """
    partition_root = (
        CONST_ROUTE_MAIN_DIR + "data" + os.sep + "partitioned_training_data" + os.sep
        + type_of_partition + os.sep
    )
    if type_of_partition != "manual":
        return partition_root + "dataset_" + dataset_name + os.sep + "alpha_" + additional_parameter
    return partition_root + additional_parameter

In [None]:
def get_results_from_route(dataset_name, type_of_partition, additional_parameter):
    """Build the path of the folder that holds the FedAvg result dataframes (MLP model).

    Args:
        dataset_name: Name of the dataset.
        type_of_partition: Partition strategy; for "dirichlet" the experiment folder is
            prefixed with ``alpha_``.
        additional_parameter: Experiment name or alpha value, given as a string.

    Returns:
        The relative path (string) ``results/dataframes/FedAvg/<dataset>/<partition>/<experiment>/mlp``.
    """
    experiment_folder = additional_parameter
    if type_of_partition == "dirichlet":
        experiment_folder = "alpha_" + additional_parameter

    path_components = [
        "results", "dataframes", "FedAvg",
        dataset_name, type_of_partition, experiment_folder, "mlp",
    ]
    return CONST_ROUTE_MAIN_DIR + os.sep.join(path_components)

In [None]:
def get_distances_from_route(dataset_name, type_of_partition, additional_parameter):
    """Build the path of the folder where the computed measures of an experiment are stored.

    Returns:
        The relative path (string)
        ``results/distances_values/<dataset>/<partition>/<additional_parameter>``.
    """
    path_components = [
        "results", "distances_values",
        dataset_name, type_of_partition, additional_parameter,
    ]
    return CONST_ROUTE_MAIN_DIR + os.sep.join(path_components)

In [None]:
from itertools import combinations_with_replacement

def number_of_clients_and_all_combinations(path_to_train_datasets):
    """Count the clients stored in a folder and enumerate every ordered client pair.

    The number of clients is the number of directory entries divided by four
    (each client is expected to contribute four files).

    Args:
        path_to_train_datasets: Folder containing the client files.

    Returns:
        A tuple ``(num_clients, all_combinations)`` where ``all_combinations`` is the
        sorted list of all ``(i, j)`` pairs with ``i`` and ``j`` in ``range(num_clients)``.
        The list is also printed.
    """
    files_per_client = 4
    num_clients = int(len(os.listdir(path_to_train_datasets)) / files_per_client)

    ascending_ids = list(range(num_clients))
    descending_ids = ascending_ids[::-1]

    # Ascending ids yield the pairs with i <= j, descending ids the pairs with i >= j.
    unique_pairs = set(combinations_with_replacement(ascending_ids, 2))
    unique_pairs.update(combinations_with_replacement(descending_ids, 2))

    all_combinations = sorted(unique_pairs)
    print(all_combinations)
    return num_clients, all_combinations

In [None]:
def downcast_types(dataframe):
    """Reduce the memory footprint of the numeric columns of ``dataframe`` in place.

    Integer columns are converted to ``int16`` only when all of their values fit in
    the int16 range; columns with larger values are left untouched, because a blind
    cast would silently wrap them around. Float columns are converted to ``float32``.

    Args:
        dataframe: The pandas DataFrame to downcast. It is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    int16_limits = np.iinfo("int16")

    for column in dataframe.select_dtypes("int"):
        values = dataframe[column]
        raw = values.to_numpy()
        # Only narrow the dtype when no value would overflow.
        if raw.size == 0 or (raw.min() >= int16_limits.min and raw.max() <= int16_limits.max):
            dataframe[column] = values.astype("int16")

    for column in dataframe.select_dtypes("float"):
        dataframe[column] = dataframe[column].astype("float32")

    return dataframe

### Measuring functions

The following code contains the functions for distribution distances, similarity and volume computations.

In [None]:
from metrics.Distances import compute_coupling, compute_CE
from experiment_parameters.TrainerFactory import dataset_model_dictionary
import torch

# True when PyTorch can see a CUDA device; selects the tensor type used below.
use_cuda = torch.cuda.is_available()
# Tensor type the inputs of the distance functions are cast to (CUDA or CPU float tensor).
dtype = torch.cuda.FloatTensor if use_cuda else torch.FloatTensor

Function for the Wasserstein Distance. We use the geomloss library to execute the code.

Paper of reference: 

In [None]:
from geomloss import SamplesLoss

def wasserstein_distance(src_x, tar_x):
    """Compute a Sinkhorn (approximate Wasserstein) divergence between two samples.

    Args:
        src_x: Source samples; an object exposing ``.values`` (e.g. a DataFrame).
        tar_x: Target samples; an object exposing ``.values``.

    Returns:
        Twice the value of the geomloss Sinkhorn loss, as a Python float.
    """
    source_points = torch.tensor(src_x.values).type(dtype)
    target_points = torch.tensor(tar_x.values).type(dtype)

    # Sinkhorn loss with p=2. No square root is applied to the result.
    sinkhorn = SamplesLoss(loss="sinkhorn", p=2, blur=0.01, scaling=0.8)

    # No weights are passed, so geomloss falls back to its default weighting
    # (constant weights = 1 / number of samples).
    divergence = sinkhorn(source_points, target_points)
    if use_cuda:
        torch.cuda.synchronize()

    # NOTE(review): the factor 2 presumably undoes the 1/2 in geomloss' p=2 cost
    # (||x - y||^2 / 2) — confirm against the geomloss documentation.
    return 2 * divergence.item()

Function for the Maximum Mean Discrepancy (MMD), using the Gaussian Kernel. We use the geomloss library to execute the code.

Paper of reference:

In [None]:
def gaussian_mmd_distance(src_x, tar_x):
    """Compute the Gaussian-kernel MMD loss between two samples with geomloss.

    Args:
        src_x: Source samples; an object exposing ``.values`` (e.g. a DataFrame).
        tar_x: Target samples; an object exposing ``.values``.

    Returns:
        The value of the geomloss "gaussian" loss, as a Python float.
    """
    source_points = torch.tensor(src_x.values).type(dtype)
    target_points = torch.tensor(tar_x.values).type(dtype)

    # Gaussian kernel loss; `blur` is passed to geomloss unchanged.
    gaussian_mmd = SamplesLoss(loss="gaussian", blur=0.05)

    # No weights are passed, so geomloss falls back to its default weighting
    # (constant weights = 1 / number of samples).
    discrepancy = gaussian_mmd(source_points, target_points)
    if use_cuda:
        torch.cuda.synchronize()
    return discrepancy.item()

Function for negative conditional entropy.

Paper of reference: Negative Conditional Entropy, from *Transferability and Hardness of Supervised Classification Tasks* (ICCV 2019). [Paper on arXiv](https://arxiv.org/pdf/1908.08142v1.pdf)

In [None]:
def negative_conditional_entropy(source_labels: np.ndarray, target_labels: np.ndarray):
    r"""
    Negative Conditional Entropy in `Transferability and Hardness of Supervised
    Classification Tasks (ICCV 2019) <https://arxiv.org/pdf/1908.08142v1.pdf>`_.

    The returned value is the negative of the conditional entropy of the target
    labels given the source labels:

    .. math::
        \mathrm{NCE} = \sum_{y \in \mathcal{C}_t} \sum_{z \in \mathcal{C}_s} \hat{P}(y, z) \log \frac{\hat{P}(y, z)}{\hat{P}(z)}

    where :math:`\hat{P}(z)` is the empirical distribution of the source labels and
    :math:`\hat{P}(y, z)` is the empirical joint distribution estimated from the
    paired source and target labels.

    Source classes that never occur are excluded before dividing by
    :math:`\hat{P}(z)`, so no division by zero takes place.

    Args:
        source_labels (np.ndarray): source labels (integer class indices).
        target_labels (np.ndarray): target labels (integer class indices).

    Shape:
        - source_labels: (N, ) elements in [0, :math:`C_s`), with source class number :math:`C_s`.
        - target_labels: (N, ) elements in [0, :math:`C_t`), with target class number :math:`C_t`.

    Returns:
        The negative conditional entropy (a value <= 0, up to rounding).

    Note:
        Labels are paired position by position. If the arrays differ in length, only
        the first ``min(len(source_labels), len(target_labels))`` pairs are counted,
        while each pair is still weighted by ``1 / len(source_labels)``.
        NOTE(review): the measure assumes paired labels of equal length — confirm
        that callers passing arrays of different sizes intend this.
    """
    source_labels = np.asarray(source_labels)
    target_labels = np.asarray(target_labels)

    C_t = int(np.max(target_labels) + 1)
    C_s = int(np.max(source_labels) + 1)
    N = len(source_labels)

    # Pair the labels position by position (the shorter array bounds the pairing).
    n_pairs = min(len(source_labels), len(target_labels))
    source_idx = source_labels[:n_pairs].astype(int)
    target_idx = target_labels[:n_pairs].astype(int)

    # Empirical joint distribution, shape [C_t, C_s]. np.add.at accumulates repeated
    # (target, source) pairs, unlike plain fancy-index assignment.
    joint = np.zeros((C_t, C_s), dtype=float)
    np.add.at(joint, (target_idx, source_idx), 1.0 / N)

    p_z = joint.sum(axis=0)  # marginal of the source labels, shape [C_s]
    mask = p_z != 0  # source classes that actually occur

    # P(y | z) restricted to the observed source classes, shape [valid C_s, C_t];
    # 1e-20 avoids log(0).
    p_target_given_source = (joint[:, mask] / p_z[mask]).T + 1e-20
    entropy_y_given_z = np.sum(-p_target_given_source * np.log(p_target_given_source), axis=1)
    conditional_entropy = np.sum(entropy_y_given_z * p_z[mask])

    return -conditional_entropy

Set of functions for the Y|X-shift measure.

Paper of reference:

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from metrics.Evaluator import evaluator
from xgboost import XGBClassifier

def accuracy(y_test, y_pred):
    """Accuracy between 2-D ground-truth and prediction arrays.

    Both inputs are reduced to class indices with ``argmax`` over axis 1 before
    being compared with ``accuracy_score``.

    Args:
        y_test: Ground truth, one row per sample.
        y_pred: Predictions, one row per sample.

    Returns:
        The fraction of samples whose arg-max classes coincide.
    """
    true_classes = np.argmax(y_test, axis=1)
    predicted_classes = np.argmax(y_pred, axis=1)
    return accuracy_score(true_classes, predicted_classes)

def degradation_decomp(source_X, source_y, other_X_raw, other_y_raw, best_method, column_names, data_sum=20000, K=8, domain_classifier=None, draw_calibration=False, save_calibration_png='calibration.png'):
    """Compute plain and domain-reweighted accuracies of a model on two datasets.

    A domain classifier (source = 0, other = 1) is cross-fitted with ``K`` folds to
    obtain, for every sample, the predicted probability of belonging to the "other"
    dataset. These probabilities are turned into per-sample weights that are used to
    compute a weighted accuracy of ``best_method`` on each dataset.

    Args:
        source_X: Source features (2-D array, indexable by row index arrays).
        source_y: Source labels; 2-D, reduced with ``argmax`` over axis 1.
        other_X_raw: Features of the other dataset before sub-sampling.
        other_y_raw: Labels of the other dataset before sub-sampling.
        best_method: Model exposing ``predict_proba``; it receives DataFrames built
            with ``column_names``.
        column_names: Column names used to wrap the feature arrays in DataFrames.
        data_sum: Maximum number of (randomly permuted) rows kept from the other dataset.
        K: Number of folds used to cross-fit the domain classifier.
        domain_classifier: Optional estimator with ``fit``/``predict_proba``; the same
            instance is refit on every fold. Defaults to a fresh ``XGBClassifier`` per fold.
        draw_calibration: Unused (only referenced by commented-out code).
        save_calibration_png: Unused (only referenced by commented-out code).

    Returns:
        ``(accuracyA, accuracyB, sx_A, sx_B)``: accuracy on the source data, accuracy
        on the (sub-sampled) other data, and the corresponding weighted accuracies.

    Note:
        Uses the global NumPy random state (``np.random.permutation``), so results
        vary between runs unless the caller seeds it.
    """
    # Randomly sub-sample at most `data_sum` rows of the other dataset.
    perm1 = np.random.permutation(other_X_raw.shape[0])
    other_X = other_X_raw[perm1[:data_sum],:]
    other_y = other_y_raw[perm1[:data_sum]]

    # piA / piB: out-of-fold probability of class 1 ("other") for every sample.
    piA = np.zeros(source_X.shape[0])
    piB = np.zeros(other_X.shape[0])
    # KFold is not shuffled; these permutations randomize which rows land in each fold.
    permA = np.random.permutation(piA.shape[0])
    permB = np.random.permutation(piB.shape[0])

    kf = KFold(n_splits=K, shuffle=False)
    A_train_index_list = []
    A_test_index_list = []
    B_train_index_list = []
    B_test_index_list = []
    for i, (train_index, test_index) in enumerate(kf.split(source_X)):
        A_train_index_list.append(train_index)
        A_test_index_list.append(test_index)
    for i, (train_index, test_index) in enumerate(kf.split(other_X)):
        B_train_index_list.append(train_index)
        B_test_index_list.append(test_index)

    for i in range(K):
        # Training set of the domain classifier: source rows first (label 0),
        # then other rows (label 1).
        trainX = np.concatenate([source_X[permA[A_train_index_list[i]]],other_X[permB[B_train_index_list[i]]]], axis=0)
        trainT = np.zeros(trainX.shape[0])
        trainT[len(A_train_index_list[i]):] = 1.0

        if domain_classifier is None:
            model = XGBClassifier(random_state=0).fit(trainX, trainT)
        else:
            model = domain_classifier.fit(trainX, trainT)

        # Store the predicted probability of class 1 for the held-out rows of this fold.
        piA[permA[A_test_index_list[i]]] = model.predict_proba(source_X[permA[A_test_index_list[i]]])[:,1]
        piB[permB[B_test_index_list[i]]] = model.predict_proba(other_X[permB[B_test_index_list[i]]])[:,1]

    # if draw_calibration:
    #     plot_calibration(piA, piB, save_dir=save_calibration_png)

    # alpha: share of "other" samples in the pooled data.
    alpha = (other_X.shape[0])/ (source_X.shape[0]+other_X.shape[0])
    # Per-sample weights derived from the domain probabilities (normalized below).
    wA = piA / ((1-alpha)*piA + alpha * (1-piA))
    wB = (1-piB) / ((1-alpha)*piB + alpha * (1-piB))
    # Changing to support the model type of XGBoost.
    # accuracyA = best_method.score(source_X, source_y)
    # accuracyB = best_method.score(other_X, other_y)
    pd_source_X = pd.DataFrame(source_X, columns=column_names)
    pd_other_X = pd.DataFrame(other_X, columns=column_names)
    accuracyA = accuracy(source_y, best_method.predict_proba(pd_source_X))
    accuracyB = accuracy(other_y, best_method.predict_proba(pd_other_X))
    # Normalize the weights so that each set sums to one.
    wA = wA / np.sum(wA)
    wB = wB / np.sum(wB)
    # predA = (best_method.predict(source_X) == source_y)
    # predB = (best_method.predict(other_X) == other_y)
    # Boolean vectors marking the correctly classified samples.
    predA = (np.argmax(best_method.predict_proba(pd_source_X), axis=1) == np.argmax(source_y, axis=1))
    predB = (np.argmax(best_method.predict_proba(pd_other_X), axis=1) == np.argmax(other_y, axis=1))
    # Weighted accuracies on each dataset.
    sx_A = np.dot(wA, predA)
    sx_B = np.dot(wB, predB)
    return accuracyA, accuracyB, sx_A, sx_B

def y_shift(src_x, src_y, tar_x, tar_y, tree_model, column_names):
    """Compute the performance degradation between two datasets and its Y|X-shift share.

    Args:
        src_x, src_y: Source features and labels.
        tar_x, tar_y: Target features and labels.
        tree_model: Model evaluated on both datasets (passed to ``degradation_decomp``).
        column_names: Feature column names (passed to ``degradation_decomp``).

    Returns:
        A tuple ``(perf_degradation, proportion_yshift)``: the accuracy on the source
        minus the accuracy on the target, and the difference of the reweighted
        accuracies divided by that degradation.

    Note:
        When both accuracies are equal the denominator is zero, so the proportion is
        not a finite number.
    """
    accuracy_src, accuracy_tar, reweighted_src, reweighted_tar = degradation_decomp(
        src_x, src_y, tar_x, tar_y, tree_model, column_names,
        data_sum=20000, K=8, draw_calibration=False, save_calibration_png='calibration.png',
    )
    perf_degradation = accuracy_src - accuracy_tar
    proportion_yshift = (reweighted_src - reweighted_tar) / perf_degradation
    return perf_degradation, proportion_yshift


Set of functions for the Similarity and diversity measure.

Paper of reference:

In [None]:
def task_agnostic_data_valuation(src_x, tar_x):
    """Compute the relevance and diversity of a target dataset with respect to a source.

    For each dataset the matrix ``(C.T @ C) / n`` is built from its covariance matrix
    ``C`` and its number of rows ``n``. The eigenvectors of the source matrix are the
    reference directions: the source eigenvalue of each direction is compared with the
    norm of the target matrix applied to that same direction.

    Args:
        src_x: Source samples as a pandas DataFrame (rows = samples).
        tar_x: Target samples as a pandas DataFrame with the same number of columns.

    Returns:
        A tuple ``(relevance, diversity)``: the geometric means over all directions of
        ``min(src, tar) / max(src, tar)`` and ``|src - tar| / max(src, tar)`` respectively.
    """
    cov_src = src_x.cov().to_numpy()
    cov_tar = tar_x.cov().to_numpy()
    scaled_src = (cov_src.T @ cov_src) * (1 / len(src_x))
    scaled_tar = (cov_tar.T @ cov_tar) * (1 / len(tar_x))

    # The matrix is symmetric, so eigh is used: it guarantees real eigenvalues and
    # orthonormal eigenvectors. Eigenvector i is the COLUMN src_eig_vecs[:, i].
    src_eig_vals, src_eig_vecs = np.linalg.eigh(scaled_src)
    # The matrix is positive semi-definite; clip tiny negative values caused by rounding.
    src_eig_vals = np.clip(src_eig_vals, 0.0, None)

    # Column i of (scaled_tar @ src_eig_vecs) is the target matrix applied to source
    # eigenvector i; its norm is compared with the source eigenvalue i.
    tar_eig_vals = np.linalg.norm(scaled_tar @ src_eig_vecs, axis=0)

    exponent = 1 / len(src_eig_vals)
    diversity, relevance = 1, 1
    for src_eig, tar_eig in zip(src_eig_vals, tar_eig_vals):
        larger = max(src_eig, tar_eig)
        diversity *= np.power(abs(src_eig - tar_eig) / larger, exponent)
        relevance *= np.power(min(src_eig, tar_eig) / larger, exponent)
    return relevance, diversity

Set of functions for the Robust Volume measure.

Paper of reference:

In [None]:
from math import ceil, floor
from collections import defaultdict, Counter

import torch
import numpy as np
from torch import stack, cat, zeros_like, pinverse


def compute_volumes(datasets, d=1):
    """Compute the volume of each dataset and of their concatenation.

    The volume of a matrix ``X`` is ``sqrt(det(X.T @ X) + 1e-8)``.

    Args:
        datasets: Sequence of 2-D arrays (or tensors) with the same number of columns.
            The sequence itself is not modified.
        d: Kept for backward compatibility; it is ignored and replaced by the number
            of columns of the first dataset.

    Returns:
        A tuple ``(volumes, volume_all)``: a NumPy array with one volume per dataset,
        and the volume of all datasets stacked row-wise, rounded to 3 decimals.
    """
    d = datasets[0].shape[1]
    # Work on reshaped views instead of overwriting the entries of the caller's list.
    reshaped = [dataset.reshape(-1, d) for dataset in datasets]

    X = np.concatenate(reshaped, axis=0).reshape(-1, d)
    volumes = np.zeros(len(reshaped))
    for i, dataset in enumerate(reshaped):
        volumes[i] = np.sqrt(np.linalg.det(dataset.T @ dataset) + 1e-8)

    volume_all = np.sqrt(np.linalg.det(X.T @ X) + 1e-8).round(3)
    return volumes, volume_all


def compute_X_tilde_and_counts(X, omega):
    """
    Compresses the original feature matrix X to X_tilde with the specified omega.

    Every row is assigned to a d-dimensional cube of side ``omega``, measured from
    the per-column minimum of ``X``. All rows that fall in the same cube are replaced
    by their mean.

    Args:
        X: 2-D torch tensor, one row per sample.
        omega: Side length of the cubes; must be positive.

    Returns:
       X_tilde: tensor with one row (the mean of the members) per non-empty cube,
           in order of first appearance of each cube.
       cubes: a Counter mapping each cube index tuple to the number of rows in it.

    Raises:
        ValueError: if ``omega`` is not positive.
    """
    if omega <= 0:
        raise ValueError("omega must be positive, got {}".format(omega))

    min_ds = torch.min(X, axis=0).values

    # Cube index of every coordinate, computed for the whole matrix at once.
    cube_indices = torch.floor((X - min_ds) / omega).to(torch.long).tolist()

    cubes = Counter()  # cube index tuple -> number of rows in the cube
    Omega = defaultdict(list)  # cube index tuple -> rows that fall in the cube

    for x, cube in zip(X, cube_indices):
        cube_key = tuple(cube)
        cubes[cube_key] += 1
        Omega[cube_key].append(x)

    # One representative row per cube: the mean of its members.
    X_tilde = stack([stack(members).mean(axis=0) for members in Omega.values()])

    return X_tilde, cubes

def compute_robust_volumes(X_tildes, dcube_collections):
    """Compute the robust volume of each compressed dataset.

    The plain volume of every ``X_tilde`` is multiplied by a correction factor
    ``(1 - alpha**(count + 1)) / (1 - alpha)`` for each of its cubes, where ``count``
    is the number of original rows in the cube and ``alpha = 1 / (10 * N)`` with ``N``
    the total number of rows over all ``X_tildes`` (i.e. beta = 10).

    Args:
        X_tildes: List of compressed feature matrices.
        dcube_collections: List of mappings (cube index -> row count), one per matrix.

    Returns:
        NumPy array with one robust volume per dataset, rounded to 3 decimals.
    """
    total_points = sum(len(X_tilde) for X_tilde in X_tildes)
    alpha = 1.0 / (10 * total_points)  # beta is fixed to 10

    volumes, _ = compute_volumes(X_tildes, d=X_tildes[0].shape[1])
    robust_volumes = np.zeros_like(volumes)

    for position, (volume, hypercubes) in enumerate(zip(volumes, dcube_collections)):
        correction = 1.0
        # Cubes holding a single row are included on purpose, which makes the volume
        # increase monotonically with omega.
        for freq_count in hypercubes.values():
            correction *= (1 - alpha**(freq_count + 1)) / (1 - alpha)
        robust_volumes[position] = (volume * correction).round(3)

    return robust_volumes


def robust_volume(Xs, omega=0.1):
    """Compute the robust volume of a single dataset.

    Args:
        Xs: 2-D dataset exposing ``.shape`` and ``.values`` (e.g. a DataFrame).
        omega: Cube side length used to compress the dataset.

    Returns:
        The robust volume of the dataset (rounded to 3 decimals).
    """
    n_features = Xs.shape[1]
    features = torch.tensor(Xs.values).reshape(-1, n_features)

    compressed, cube_counts = compute_X_tilde_and_counts(features, omega)

    # Only one dataset is passed, so the first (and only) entry is the result.
    per_dataset_volumes = compute_robust_volumes([compressed], [cube_counts])
    return per_dataset_volumes[0]

In [None]:
def get_xgb_tree(train_x, train_y, test_x, test_y):
    """Train an XGBoost model on one dataset and wrap it in ``XGBoostModel``.

    Labels are converted to class indices with ``argmax`` over axis 1. The objective
    depends on the number of label columns: binary logistic for two columns,
    multi-class soft-probabilities for more than two. For any other number of
    columns no objective or metric is set explicitly.

    Args:
        train_x, train_y: Training features and 2-D labels.
        test_x, test_y: Features and 2-D labels used as the validation set for early stopping.

    Returns:
        The trained booster wrapped in ``XGBoostModel``.
    """
    num_label_columns = train_y.shape[1]

    parameters_dict = {"batch_size": 64}
    if num_label_columns == 2:
        parameters_dict.update({
            "objective": "binary:logistic",
            "eval_metric": "logloss",
        })
    elif num_label_columns > 2:
        parameters_dict.update({
            "objective": "multi:softprob",
            'num_class': num_label_columns,
            "disable_default_eval_metric": 1,
            "eval_metric": "mlogloss",
        })

    training_matrix = xgb.DMatrix(train_x, label=np.argmax(train_y, axis=1))
    validation_matrix = xgb.DMatrix(test_x, label=np.argmax(test_y, axis=1))

    booster = xgb.train(
        parameters_dict,
        training_matrix,
        evals=[(training_matrix, "train"), (validation_matrix, "validate")],
        num_boost_round=500,
        early_stopping_rounds=10,
    )
    return XGBoostModel(booster)

Set of functions for the Hellinger Distance measure.

Paper of reference:

In [None]:
def hellinger_distance(src_y, tar_y, type_continuous=False):
    """Hellinger distance between the label distributions of two datasets.

    The label distribution of each dataset is obtained by summing its label matrix
    over the rows and normalizing the result to sum to one.

    Args:
        src_y: Source label matrix (rows = samples, columns = labels).
        tar_y: Target label matrix with the same columns.
        type_continuous: Unused.

    Returns:
        The Hellinger distance between both label distributions.
    """
    src_label_counts = np.sum(src_y, axis=0)
    tar_label_counts = np.sum(tar_y, axis=0)

    p = src_label_counts / np.sum(src_label_counts)
    q = tar_label_counts / np.sum(tar_label_counts)

    root_difference = np.sqrt(p) - np.sqrt(q)
    squared_sum = np.sum(np.square(root_difference))
    return (1 / math.sqrt(2)) * np.sqrt(squared_sum)

Set of functions for the performance degradation measure. Here, we train a model over one client (src) and then evaluate against another client (tar). We then compare this evaluation with a model trained and evaluated in _tar_ data.

In [None]:
from util import OptunaConnection
from experiment_parameters.model_builder.ModelBuilder import Director, get_training_configuration
from experiment_parameters.model_builder.Model import XGBoostModel, KerasModel
from sklearn.metrics import log_loss, mean_absolute_error
import gc

# Shared Director instance; get_mlp below uses it to create the MLP models.
director = Director()

def get_parameters(trial, model_type):
    """Return the training configuration of ``trial`` for the given model type.

    Thin wrapper around ``get_training_configuration``.
    """
    return get_training_configuration(trial=trial, model_type=model_type)

def get_mlp(input_dim, num_classes, parameters):
    """Create an MLP through the module-level ``director``.

    Args:
        input_dim: Forwarded as ``input_parameters``.
        num_classes: Forwarded as ``num_classes``.
        parameters: Forwarded as ``parameters``.

    Returns:
        Whatever ``director.create_mlp`` returns.
    """
    mlp_model = director.create_mlp(
        input_parameters=input_dim,
        num_classes=num_classes,
        parameters=parameters,
    )
    return mlp_model

def performance_degradation(train_src_x, train_src_y, train_tar_x, train_tar_y, test_tar_x, test_tar_y, dataset_name, epochs=30):
    """Measure how much worse a source-trained model is on the target test data.

    Two MLPs with the same configuration (the best trial of the Optuna study of the
    dataset) are trained, one on the source data and one on the target data. Both are
    evaluated on the target test set, and the difference of the two metric values is
    returned.

    Args:
        train_src_x, train_src_y: Training data of the source.
        train_tar_x, train_tar_y: Training data of the target.
        test_tar_x, test_tar_y: Test data of the target, used to evaluate both models.
        dataset_name: One of "har", "edge-iot-coreset" or "electric-consumption".
        epochs: Number of training epochs for both models.

    Returns:
        ``metric(source model) - metric(target model)`` on the target test set. The
        metric is "MAE" when the labels have a single column and "CrossEntropyLoss"
        otherwise.

    Raises:
        ValueError: if ``dataset_name`` has no associated Optuna study.
    """
    study_name_by_dataset = {
        "har": "mlp_har",
        "edge-iot-coreset": "mlp_edge_iiot_coreset",
        "electric-consumption": "mlp_electric_consumption",
    }
    if dataset_name not in study_name_by_dataset:
        raise ValueError(
            "Unsupported dataset '{}'; expected one of {}".format(dataset_name, sorted(study_name_by_dataset))
        )

    study = OptunaConnection.load_study(study_name_by_dataset[dataset_name])
    best_trial = study.best_trial
    parameters_dict = get_training_configuration(best_trial, "mlp")

    # Both models share the architecture, which is sized from the source data.
    input_dim = train_src_x.shape[1]
    output_dim = train_src_y.shape[1]

    model_src = get_mlp(input_dim, output_dim, parameters_dict)
    model_src.fit(train_src_x, train_src_y, epochs=epochs)

    model_tar = get_mlp(input_dim, output_dim, parameters_dict)
    model_tar.fit(train_tar_x, train_tar_y, epochs=epochs)

    # A single label column selects MAE; several columns select the cross-entropy loss.
    metric_name = "MAE" if output_dim == 1 else "CrossEntropyLoss"
    metric_list = [metric_name]
    evaluation_result_src = evaluator(test_tar_x, test_tar_y, model_src, metric_list=metric_list).get_value_of_metric(metric_name)
    evaluation_result_tar = evaluator(test_tar_x, test_tar_y, model_tar, metric_list=metric_list).get_value_of_metric(metric_name)

    return evaluation_result_src - evaluation_result_tar

This function then computes all measures for the experiment. Each experiment represents a federated training with an existing distribution among the clients. This function then computes the measures among those datasets and the global dataset, which is the concatenation of all the datasets of the participants. The functions were listed above.

__NOTE__: For the volume function, we first project the data with PCA using the [Prince](https://maxhalford.github.io/prince/) library, as the volume computation requires a full-rank matrix as input.

In [None]:
def _load_client_partition(path_to_train_datasets, client_id):
    """Read the four CSV files of one client; the feature frames are downcast."""
    prefix = path_to_train_datasets + os.sep + "client_" + str(client_id)
    train_x = downcast_types(pd.read_csv(prefix + "_X_training.csv", index_col=0))
    train_y = pd.read_csv(prefix + "_y_training.csv", index_col=0)
    test_x = downcast_types(pd.read_csv(prefix + "_X_test.csv", index_col=0))
    test_y = pd.read_csv(prefix + "_y_test.csv", index_col=0)
    return train_x, train_y, test_x, test_y


def _class_indices(label_frame):
    """Convert a 2-D label frame into a 1-D array of class indices (argmax per row)."""
    return np.argmax(label_frame.astype(int).to_numpy(), axis=1)


def compute_all_distances_and_volumes(dataset_name, path_to_train_datasets):
    """Compute every measure for one experiment (one partition of a dataset).

    For each source client the measures are computed against the global dataset
    (column "Global") and against every client, including itself (columns
    0..num_clients-1). Train and test splits are concatenated before measuring,
    except for the performance degradation, which trains on the training split and
    evaluates on the target test split.

    Args:
        dataset_name: Key of ``dataset_model_dictionary`` identifying the global dataset.
        path_to_train_datasets: Folder with the ``client_<i>_{X,y}_{training,test}.csv`` files.

    Returns:
        A tuple ``(wasserstein, gaussian_mmd, performance_degradation, y_shift,
        negative_conditional_entropy, relevance, diversity, volume, hellinger)``.
        All entries are DataFrames indexed by source client, except ``volume``, which
        is a Series indexed by "Global" and the client ids. The y-shift and negative
        conditional entropy tables stay empty when the labels are one-dimensional, and
        the Hellinger table stays empty for the "electric-consumption" dataset.
    """
    wassersteinDataframe = pd.DataFrame()
    gaussianMMDDataframe = pd.DataFrame()
    relevanceDataframe = pd.DataFrame()
    diversityDataframe = pd.DataFrame()
    # Explicit dtype: the default dtype of an empty Series differs between pandas versions.
    volumeDataframe = pd.Series(dtype=float)

    performanceDegradation = pd.DataFrame()
    negativeConditionalEntropyDataframe = pd.DataFrame()
    yShiftDataframe = pd.DataFrame()
    hellingerDistanceDataframe = pd.DataFrame()

    global_X_training, global_y_training = dataset_model_dictionary[dataset_name]().get_dataset().get_training_data()
    global_X_test, global_y_test = dataset_model_dictionary[dataset_name]().get_dataset().get_test_data()
    column_names = global_X_training.columns.values.tolist()

    # Labels with more than one dimension are treated as a classification problem.
    classification = len(global_y_training.shape) > 1
    compute_hellinger = dataset_name != "electric-consumption"

    global_X_training = downcast_types(global_X_training)
    global_X_test = downcast_types(global_X_test)
    global_all_x = pd.concat([global_X_training, global_X_test])
    global_all_y = pd.concat([global_y_training, global_y_test])
    global_projection = prince.PCA(n_components=10).fit_transform(global_all_x)

    num_clients, _ = number_of_clients_and_all_combinations(path_to_train_datasets)

    volumeDataframe.loc["Global"] = robust_volume(global_projection)

    # Load every client once instead of re-reading the target files for every source.
    clients = []
    for client_id in range(num_clients):
        train_x, train_y, test_x, test_y = _load_client_partition(path_to_train_datasets, client_id)
        all_x = pd.concat([train_x, test_x])
        clients.append({
            "train_x": train_x,
            "train_y": train_y,
            "test_x": test_x,
            "test_y": test_y,
            "all_x": all_x,
            "all_y": pd.concat([train_y, test_y]),
            "projection": prince.PCA(n_components=10).fit_transform(all_x),
        })

    for source_client, src in enumerate(clients):
        tree_model = get_xgb_tree(src["train_x"], src["train_y"], src["test_x"], src["test_y"])

        # --- Source client vs. the global dataset ---
        wassersteinDataframe.loc[source_client, "Global"] = wasserstein_distance(src["all_x"], global_all_x)
        gaussianMMDDataframe.loc[source_client, "Global"] = gaussian_mmd_distance(src["all_x"], global_all_x)
        relevanceDataframe.loc[source_client, "Global"], diversityDataframe.loc[source_client, "Global"] = task_agnostic_data_valuation(
            src["projection"],
            global_projection
        )
        volumeDataframe.loc[source_client] = robust_volume(src["projection"])

        if classification:
            _, yShiftDataframe.loc[source_client, "Global"] = y_shift(src["all_x"].to_numpy(),
                                                                    src["all_y"].to_numpy(),
                                                                    global_all_x.to_numpy(),
                                                                    global_all_y.to_numpy(),
                                                                    tree_model,
                                                                    column_names)
            negativeConditionalEntropyDataframe.loc[source_client, "Global"] = negative_conditional_entropy(
                _class_indices(src["all_y"]), _class_indices(global_all_y)
            )

        performanceDegradation.loc[source_client, "Global"] = performance_degradation(
            src["train_x"], src["train_y"], global_X_training, global_y_training, global_X_test, global_y_test,
            dataset_name=dataset_name
        )
        if compute_hellinger:
            hellingerDistanceDataframe.loc[source_client, "Global"] = hellinger_distance(
                src["all_y"].to_numpy(), global_all_y.to_numpy()
            )

        # --- Source client vs. every client ---
        for target_client, tar in enumerate(clients):
            performanceDegradation.loc[source_client, target_client] = performance_degradation(
                src["train_x"], src["train_y"], tar["train_x"], tar["train_y"], tar["test_x"], tar["test_y"],
                dataset_name=dataset_name
            )

            wassersteinDataframe.loc[source_client, target_client] = wasserstein_distance(src["all_x"], tar["all_x"])
            gaussianMMDDataframe.loc[source_client, target_client] = gaussian_mmd_distance(src["all_x"], tar["all_x"])
            relevanceDataframe.loc[source_client, target_client], diversityDataframe.loc[source_client, target_client] = task_agnostic_data_valuation(
                src["projection"],
                tar["projection"]
            )

            if classification:
                _, yShiftDataframe.loc[source_client, target_client] = y_shift(src["all_x"].to_numpy(),
                                                                            src["all_y"].to_numpy(),
                                                                            tar["all_x"].to_numpy(),
                                                                            tar["all_y"].to_numpy(),
                                                                            tree_model,
                                                                            column_names)
                negativeConditionalEntropyDataframe.loc[source_client, target_client] = negative_conditional_entropy(
                    _class_indices(src["all_y"]), _class_indices(tar["all_y"])
                )
            if compute_hellinger:
                hellingerDistanceDataframe.loc[source_client, target_client] = hellinger_distance(
                    src["all_y"].to_numpy(), tar["all_y"].to_numpy()
                )

    return wassersteinDataframe, gaussianMMDDataframe, performanceDegradation, yShiftDataframe, negativeConditionalEntropyDataframe, relevanceDataframe, diversityDataframe, volumeDataframe, hellingerDistanceDataframe

This function computes the measures for a single experiment and writes every non-empty result table as a CSV file. The files can be found in the folder /results/distances_values/{dataset_name}/{type_of_partition}/{name_of_experiment}

In [None]:
def compute_distances_and_values(dataset_name, type_of_partition, additional_parameter):
    """Compute all measures of one experiment and store them as CSV files.

    The output folder is created if needed. A table is only written when it has at
    least one row.

    Args:
        dataset_name: Name of the dataset.
        type_of_partition: Partition strategy of the experiment.
        additional_parameter: Experiment name or alpha value, given as a string.
    """
    output_dir = get_distances_from_route(dataset_name, type_of_partition, additional_parameter)
    input_dir = get_data_from_route(dataset_name, type_of_partition, additional_parameter)

    (
        wasserstein, gaussian_mmd, performance, y_shift_values,
        nce, relevance, diversity, volume, hellinger,
    ) = compute_all_distances_and_volumes(dataset_name, input_dir)

    os.makedirs(output_dir, exist_ok=True)

    tables_by_file_name = (
        ("yShiftDataframe.csv", y_shift_values),
        ("negativeConditionalEntropy.csv", nce),
        ("performanceDegradation.csv", performance),
        ("hellinger.csv", hellinger),
        ("wasserstein.csv", wasserstein),
        ("gaussian_mmd.csv", gaussian_mmd),
        ("relevance.csv", relevance),
        ("diversity.csv", diversity),
        ("volume.csv", volume),
    )
    for file_name, table in tables_by_file_name:
        if table.shape[0] > 0:
            table.to_csv(output_dir + os.sep + file_name)
    

All the experiments for which we will compute the measures.

In [None]:
# Experiments to process, as (dataset_name, type_of_partition, additional_parameter) tuples.
# The third element is passed as `additional_parameter` to the route helpers: for
# "dirichlet" partitions they prefix it with "alpha_", for "manual" partitions it is
# used as the folder name.
training_configurations_for_distances = [
    ("har", "dirichlet", "1"),
    ("har", "dirichlet", "10"),
    ("har", "dirichlet", "100"),
    ("har", "manual", "HAR_1_Maverick_1_label_skew"),
    ("har", "manual", "HAR_1_Maverick_1_LessLabels"),
    ("har", "manual", "HAR_1_Maverick_1_MissingTwoLabels"),
    ("har", "manual", "HAR_1_Maverick_Laying"),
    ("har", "manual", "HAR_1_Maverick_Balanced_Laying_1"),
    ("har", "manual", "HAR_1_Maverick_Balanced_Laying_2"),
    ("har", "manual", "HAR_1_Maverick_Balanced_Laying_3"),
    ("har", "manual", "HAR_1_Maverick_Balanced_Laying_4"),
    ("har", "manual", "HAR_1_Maverick_Balanced_Laying_5"),
    ("har", "manual", "HAR_1_Maverick_1_Balanced_WalkingUpstairs"),
    ("har", "manual", "HAR_1_Maverick_1_Balanced_WalkingUpstairs_2"),
    ("har", "manual", "HAR_1_Maverick_1_Balanced_WalkingUpstairs_3"),
    ("har", "manual", "HAR_1_Maverick_1_Balanced_WalkingUpstairs_4"),
    ("har", "manual", "HAR_1_Maverick_1_Balanced_WalkingUpstairs_5"),
    ("edge-iot-coreset", "dirichlet", "10"),
    ("edge-iot-coreset", "dirichlet", "100"),
    ("edge-iot-coreset", "dirichlet", "1000"),
    ("edge-iot-coreset", "dirichlet", "10000"),
    ("edge-iot-coreset", "manual", "EdgeIIOT_Maverick_LeastAttack"),
    ("edge-iot-coreset", "manual", "EdgeIIOT_Maverick_OnlyNormal"),
    ("electric-consumption", "manual", "Electric_Consumption_Random_Sampling"),
    ("electric-consumption", "manual", "ElectricConsumptionFacilityType"),
    ("electric-consumption", "manual", "ElectricConsumptionNoniid"),
    ("electric-consumption", "manual", "ElectricConsumptionStateFactor"),
    ("electric-consumption", "manual", "ElectricConsumption_FeatureSkew_MaverickMultifamilyUncategorized"),
    ("electric-consumption", "manual", "ElectricConsumption_FeatureSkew_MaverickGroceryStore")
]

In [None]:
# Compute and store the measures of every configured experiment, one after another.
for dataset_name, type_of_partition, additional_parameter in training_configurations_for_distances:
    compute_distances_and_values(dataset_name, type_of_partition, additional_parameter)

This function reads the evaluation results of the federated training of one experiment and collects the final "Global" value of every metric in a single row.

In [None]:
def get_results_of_training(dataset_name, type_of_partition, additional_parameter):
    """Summarise the final evaluation values of one training configuration.

    The returned Series starts with the three identifying entries ("Dataset",
    "Type_of_partition", "dirichlet/NamePartition"). For every file in the
    ``Evaluation`` folder of the configuration, except ``Evaluation_F1Score``,
    it then holds the value of the "Global" column in the last row of that
    file, keyed by the second "_"-separated token of the file name.
    """
    evaluation_dir = get_results_from_route(dataset_name, type_of_partition, additional_parameter) + os.sep + "Evaluation"

    summary = pd.Series()
    summary.loc["Dataset"] = dataset_name
    summary.loc["Type_of_partition"] = type_of_partition
    summary.loc["dirichlet/NamePartition"] = additional_parameter

    for file_name in os.listdir(evaluation_dir):
        metric_name = file_name.split("_")[1]

        # The F1 score file is deliberately left out of the summary.
        if file_name == "Evaluation_F1Score":
            continue

        evaluation = pd.read_csv(evaluation_dir + os.sep + file_name)
        last_row_label = evaluation.index[-1]
        summary.loc[metric_name] = evaluation.loc[last_row_label, "Global"]

    return summary

In [None]:
# Build one single-row frame per configuration and concatenate them once.
# Growing the frame with pd.concat inside the loop re-copies all previous rows
# on every iteration.
training_result_rows = [
    get_results_of_training(dataset_name, type_of_partition, additional_parameter).to_frame().T
    for dataset_name, type_of_partition, additional_parameter in training_configurations_for_distances
]

# pd.concat raises on an empty list, so fall back to an empty frame in that case.
results_dataframe = pd.concat(training_result_rows, ignore_index=True) if training_result_rows else pd.DataFrame()

In [None]:
# Render the collected results table (rich display instead of print).
display(results_dataframe)

## Computing Correlations between values and distances (2nd Part)

### Computing distances and value for the different experiments

In [None]:
# (dataset_name, type_of_partition, additional_parameter) triples. For "dirichlet"
# partitions the third entry is the alpha value (as a string); for "manual"
# partitions it is the name of the partition folder.
training_configurations = [
    ("har", "dirichlet", "1"),
    ("har", "dirichlet", "10"),
    ("har", "dirichlet", "100"),
    ("har", "manual", "HAR_1_Maverick_1_label_skew"),
    ("har", "manual", "HAR_1_Maverick_1_LessLabels"),
    ("har", "manual", "HAR_1_Maverick_1_MissingTwoLabels"),
    ("har", "manual", "HAR_1_Maverick_Laying"),
    ("har", "manual", "HAR_1_Maverick_Balanced_Laying_1"),
    ("har", "manual", "HAR_1_Maverick_Balanced_Laying_2"),
    ("har", "manual", "HAR_1_Maverick_Balanced_Laying_3"),
    ("har", "manual", "HAR_1_Maverick_Balanced_Laying_4"),
    ("har", "manual", "HAR_1_Maverick_Balanced_Laying_5"),
    ("har", "manual", "HAR_1_Maverick_1_Balanced_WalkingUpstairs"),
    ("har", "manual", "HAR_1_Maverick_1_Balanced_WalkingUpstairs_2"),
    ("har", "manual", "HAR_1_Maverick_1_Balanced_WalkingUpstairs_3"),
    ("har", "manual", "HAR_1_Maverick_1_Balanced_WalkingUpstairs_4"),
    ("har", "manual", "HAR_1_Maverick_1_Balanced_WalkingUpstairs_5"),
    ("edge-iot-coreset", "dirichlet", "10"),
    ("edge-iot-coreset", "dirichlet", "100"),
    ("edge-iot-coreset", "dirichlet", "1000"),
    ("edge-iot-coreset", "dirichlet", "10000"),
    ("edge-iot-coreset", "manual", "EdgeIIOT_Maverick_LeastAttack"),
    ("edge-iot-coreset", "manual", "EdgeIIOT_Maverick_OnlyNormal"),
    ("electric-consumption", "manual", "Electric_Consumption_Random_Sampling"),
    ("electric-consumption", "manual", "ElectricConsumptionFacilityType"),
    ("electric-consumption", "manual", "ElectricConsumptionNoniid"),
    ("electric-consumption", "manual", "ElectricConsumptionStateFactor"),
    ("electric-consumption", "manual", "ElectricConsumption_FeatureSkew_MaverickMultifamilyUncategorized"),
    ("electric-consumption", "manual", "ElectricConsumption_FeatureSkew_MaverickGroceryStore")
]

The following functions compute the Spearman and Pearson correlations for the SV. In this case, we compare the SV computed in a centralized setting with the SV computed in a distributed (aggregated) setting.

In [None]:
def compute_spearman_correlation_sv(dataset_name, type_of_partition, additional_parameter):
    """Spearman rank correlation between centralized and aggregated Shapley values.

    Every file in the ``Shapley_Value`` folder of the configuration is read and
    summed per evaluator; the "Centralized" row is then compared with the
    "Aggregated" row. The p-value is obtained with a permutation test over
    pairings. ``SV_F1Score`` is not correlated and receives a placeholder
    coefficient and p-value of 0.

    Returns a DataFrame with one row per file and the columns Dataset,
    TypePartition, AdditionalParameter, Metric, SpearmanRank and p-value.
    """
    shapley_value_dir = get_results_from_route(dataset_name, type_of_partition, additional_parameter) + os.sep + "Shapley_Value"

    metric_names = []
    rank_coefficients = []
    rank_p_values = []

    for file_name in os.listdir(shapley_value_dir):
        sv_per_round = pd.read_csv(shapley_value_dir + os.sep + file_name)
        metric_names.append(file_name.split('_')[1])

        if file_name == "SV_F1Score":
            # The grouped frame is not used afterwards; only placeholders are stored.
            sv_per_round.groupby(["Evaluator", "Classes"]).sum().reindex(sorted(sv_per_round.columns), axis=1).drop(["Evaluator", "Classes", "Round"], axis=1)
            rank_coefficients.append(0)
            rank_p_values.append(0)
            continue

        grouped_sv = sv_per_round.groupby(["Evaluator"]).sum().reindex(sorted(sv_per_round.columns), axis=1).drop(["Evaluator", "Round"], axis=1)

        def statistic(x):  # only `x` is permuted
            return scipy.stats.spearmanr(x, grouped_sv.loc["Aggregated"]).statistic

        permutation_result = scipy.stats.permutation_test(
            (grouped_sv.loc["Centralized"],),
            statistic,
            permutation_type='pairings',
        )
        rank_coefficients.append(permutation_result.statistic)
        rank_p_values.append(permutation_result.pvalue)

    row_count = len(metric_names)
    summary = pd.DataFrame(columns=["Dataset", "TypePartition", "AdditionalParameter", "Metric", "SpearmanRank", "p-value"])
    column_values = {
        "Dataset": [dataset_name] * row_count,
        "TypePartition": [type_of_partition] * row_count,
        "AdditionalParameter": [additional_parameter] * row_count,
        "Metric": metric_names,
        "SpearmanRank": rank_coefficients,
        "p-value": rank_p_values,
    }
    for column_name, values in column_values.items():
        summary[column_name] = values

    return summary

In [None]:
def compute_pearson_correlation_sv(dataset_name, type_of_partition, additional_parameter):
    """Pearson correlation between centralized and aggregated Shapley values.

    Every file in the ``Shapley_Value`` folder of the configuration is read and
    summed per evaluator; the "Centralized" row is then correlated with the
    "Aggregated" row. ``SV_F1Score`` and ``SV_CosineSimilarity`` are not
    correlated and receive a placeholder coefficient and p-value of 0.

    Returns a DataFrame with one row per file and the columns Dataset,
    TypePartition, AdditionalParameter, Metric, PearsonCorrelation and p-value.
    """
    shapley_value_dir = get_results_from_route(dataset_name, type_of_partition, additional_parameter) + os.sep + "Shapley_Value"

    metric_names = []
    coefficients = []
    p_values = []

    for file_name in os.listdir(shapley_value_dir):
        sv_per_round = pd.read_csv(shapley_value_dir + os.sep + file_name)
        metric_names.append(file_name.split('_')[1])

        # The F1 score file is additionally grouped by class.
        group_keys = ["Evaluator", "Classes"] if file_name == "SV_F1Score" else ["Evaluator"]
        grouped_sv = sv_per_round.groupby(group_keys).sum().reindex(sorted(sv_per_round.columns), axis=1).drop(group_keys + ["Round"], axis=1)

        if file_name in ("SV_F1Score", "SV_CosineSimilarity"):
            coefficients.append(0)
            p_values.append(0)
        else:
            pearson_result = scipy.stats.pearsonr(grouped_sv.loc["Centralized"], grouped_sv.loc["Aggregated"])
            coefficients.append(pearson_result.statistic)
            p_values.append(pearson_result.pvalue)

    row_count = len(metric_names)
    summary = pd.DataFrame(columns=["Dataset", "TypePartition", "AdditionalParameter", "Metric", "PearsonCorrelation", "p-value"])
    column_values = {
        "Dataset": [dataset_name] * row_count,
        "TypePartition": [type_of_partition] * row_count,
        "AdditionalParameter": [additional_parameter] * row_count,
        "Metric": metric_names,
        "PearsonCorrelation": coefficients,
        "p-value": p_values,
    }
    for column_name, values in column_values.items():
        summary[column_name] = values

    return summary

In [None]:
# Collect the per-configuration frames first and concatenate once at the end.
# Concatenating inside the loop re-copies all previous rows on every iteration,
# and seeding the concat with an empty frame can trigger pandas' FutureWarning
# about empty entries in concatenation.
spearman_sv_frames = []
pearson_sv_frames = []

for dataset_name, type_of_partition, additional_parameter in training_configurations:
    spearman_sv_frames.append(compute_spearman_correlation_sv(dataset_name, type_of_partition, additional_parameter))
    pearson_sv_frames.append(compute_pearson_correlation_sv(dataset_name, type_of_partition, additional_parameter))

# pd.concat raises on an empty list; keep the empty, correctly-labelled frame in that case.
spearman_rank_sv_metric_centralized_decentralized = (
    pd.concat(spearman_sv_frames, ignore_index=True)
    if spearman_sv_frames
    else pd.DataFrame(columns=["Dataset", "TypePartition", "AdditionalParameter", "Metric", "SpearmanRank", "p-value"])
)
pearson_correlation_sv_metric_centralized_decentralized = (
    pd.concat(pearson_sv_frames, ignore_index=True)
    if pearson_sv_frames
    else pd.DataFrame(columns=["Dataset", "TypePartition", "AdditionalParameter", "Metric", "PearsonCorrelation", "p-value"])
)

In [None]:
# spearman_rank_sv_metric_centralized_decentralized_no_f1 = spearman_rank_sv_metric_centralized_decentralized[(spearman_rank_sv_metric_centralized_decentralized["Metric"] != "F1Score") & (spearman_rank_sv_metric_centralized_decentralized["Metric"] != "CosineSimilarity")]

# spearman_rank_sv_metric_centralized_decentralized_no_f1["SpearmanRank"] = spearman_rank_sv_metric_centralized_decentralized_no_f1["SpearmanRank"].astype('float')
# spearman_rank_sv_metric_centralized_decentralized_no_f1["p-value"] = spearman_rank_sv_metric_centralized_decentralized_no_f1["p-value"].astype('float')

# pearson_correlation_sv_metric_centralized_decentralized_no_f1 = pearson_correlation_sv_metric_centralized_decentralized[(pearson_correlation_sv_metric_centralized_decentralized["Metric"] != "F1Score") & (spearman_rank_sv_metric_centralized_decentralized["Metric"] != "CosineSimilarity")]

# pearson_correlation_sv_metric_centralized_decentralized_no_f1["PearsonCorrelation"] = pearson_correlation_sv_metric_centralized_decentralized_no_f1["PearsonCorrelation"].astype('float')
# pearson_correlation_sv_metric_centralized_decentralized_no_f1["p-value"] = pearson_correlation_sv_metric_centralized_decentralized_no_f1["p-value"].astype('float')

# display(spearman_rank_sv_metric_centralized_decentralized_no_f1.groupby(["Dataset"]).mean(numeric_only=True))

# display(spearman_rank_sv_metric_centralized_decentralized_no_f1.groupby(["Dataset", "Metric"]).mean(numeric_only=True))

# display(spearman_rank_sv_metric_centralized_decentralized_no_f1.groupby(["Metric"]).mean(numeric_only=True))

# display(spearman_rank_sv_metric_centralized_decentralized_no_f1[spearman_rank_sv_metric_centralized_decentralized_no_f1["SpearmanRank"] != 1.0])

In [None]:
# display(spearman_rank_sv_metric_centralized_decentralized_no_f1)

In [None]:
# display(pearson_correlation_sv_metric_centralized_decentralized_no_f1.groupby(["Dataset"]).mean(numeric_only=True))

# display(pearson_correlation_sv_metric_centralized_decentralized_no_f1.groupby(["Dataset", "Metric"]).mean(numeric_only=True))

# display(pearson_correlation_sv_metric_centralized_decentralized_no_f1[pearson_correlation_sv_metric_centralized_decentralized_no_f1["PearsonCorrelation"] != 1.0])

In [None]:
# Load the cross-entropy Shapley values of a single HAR configuration and sum
# them per evaluator.
path_to_results_dataframes = get_results_from_route("har", "manual", "HAR_1_Maverick_Balanced_Laying_1")

results_dataframe = pd.read_csv(os.sep.join([path_to_results_dataframes, "Shapley_Value", "SV_CrossEntropyLoss"]))
grouped_sv = (
    results_dataframe
    .groupby(["Evaluator"])
    .sum()
    .reindex(sorted(results_dataframe.columns), axis=1)
    .drop(["Evaluator", "Round"], axis=1)
)

The next two functions compute the Spearman and Pearson correlation coefficients between the computed distances and the centralized SV.

In [None]:
def compute_spearman_correlation_distance(dataset_name, type_of_partition, additional_parameter, type_of_loss="CrossEntropyLoss"):
    """Spearman correlation between every distance file and the Shapley values.

    Parameters
    ----------
    dataset_name, type_of_partition, additional_parameter
        Identify the configuration; they are forwarded to
        ``get_distances_from_route`` and ``get_results_from_route``.
    type_of_loss
        "MAE" reads ``SV_MAE``; any other value reads ``SV_CrossEntropyLoss``.
        Defaults to "CrossEntropyLoss", matching
        ``compute_pearson_correlation_distance``.

    Returns
    -------
    pandas.DataFrame
        One row per file in the distance folder with the columns Dataset,
        TypePartition, AdditionalParameter, Distance, SpearmanCorrelation and
        p-value-spearman.

    Notes
    -----
    * ``volume.csv`` is correlated with the "Centralized" Shapley values only,
      with a permutation-test p-value.
    * For every other file one correlation is computed per evaluator row of
      the summed Shapley values ("Aggregated" excluded). The coefficients are
      averaged through the Fisher transformation and the p-values are averaged
      arithmetically.
    """
    path_to_distance_dataframes = get_distances_from_route(dataset_name, type_of_partition, additional_parameter)
    path_to_results_dataframes = get_results_from_route(dataset_name, type_of_partition, additional_parameter)

    print(f"Dataset name: {dataset_name}")
    print(f"Type_of_partition name: {type_of_partition}")
    print(f"Additional_parameter name: {additional_parameter}")

    sv_file_name = "SV_MAE" if type_of_loss == "MAE" else "SV_CrossEntropyLoss"
    results_dataframe = pd.read_csv(os.path.join(path_to_results_dataframes, "Shapley_Value", sv_file_name))

    grouped_sv = results_dataframe.groupby(["Evaluator"]).sum().reindex(sorted(results_dataframe.columns), axis=1).drop(["Evaluator", "Round"], axis=1)
    grouped_sv = grouped_sv.drop("Aggregated")

    distances = []
    spearman_values = []
    spearman_p_values = []

    for file in os.listdir(path_to_distance_dataframes):
        distances.append(file.split('.')[0])
        distance_dataframe = pd.read_csv(os.path.join(path_to_distance_dataframes, file), index_col=0)

        if file == "volume.csv":
            volume_frame = distance_dataframe.drop("Global")

            def statistic(x):  # permute only `x`
                return scipy.stats.spearmanr(x, grouped_sv.loc["Centralized"]).statistic

            res_exact = scipy.stats.permutation_test((volume_frame,), statistic,
                permutation_type='pairings')
            # The sample is passed as a 2-D frame, so the result fields are
            # indexed with [0] to obtain scalars.
            spearman_values.append(res_exact.statistic[0])
            spearman_p_values.append(res_exact.pvalue[0])
            continue

        # NOTE(review): unlike compute_pearson_correlation_distance, inf/NaN
        # entries of yShiftDataframe.csv are not replaced here - confirm whether
        # both functions should clean the data the same way.
        spearman_list = []
        pvalue_list = []
        for evaluator in grouped_sv.index:
            # The centralized evaluator is compared against the "Global" column,
            # every other evaluator against the column carrying its own name.
            distance_column = "Global" if evaluator == "Centralized" else str(evaluator)
            res_exact = scipy.stats.spearmanr(grouped_sv.loc[evaluator], distance_dataframe.loc[:, distance_column])
            spearman_list.append(res_exact.statistic)
            pvalue_list.append(res_exact.pvalue)

        # Fisher transformation for averaging correlations.
        spearman_values.append(np.tanh(np.mean(np.arctanh(spearman_list))))
        spearman_p_values.append(np.mean(pvalue_list))

    return pd.DataFrame({
        "Dataset": [dataset_name] * len(distances),
        "TypePartition": [type_of_partition] * len(distances),
        "AdditionalParameter": [additional_parameter] * len(distances),
        "Distance": distances,
        "SpearmanCorrelation": spearman_values,
        "p-value-spearman": spearman_p_values,
    })

In [None]:
def compute_pearson_correlation_distance(dataset_name, type_of_partition, additional_parameter, type_of_loss="CrossEntropyLoss"):
    """Pearson correlation between every distance file and the Shapley values.

    Parameters
    ----------
    dataset_name, type_of_partition, additional_parameter
        Identify the configuration; they are forwarded to
        ``get_distances_from_route`` and ``get_results_from_route``.
    type_of_loss
        "MAE" reads ``SV_MAE``; any other value reads ``SV_CrossEntropyLoss``.

    Returns
    -------
    pandas.DataFrame
        One row per file in the distance folder with the columns Dataset,
        TypePartition, AdditionalParameter, Distance, PearsonCorrelation and
        p-value-pearson.

    Notes
    -----
    * ``volume.csv`` is correlated with the "Centralized" Shapley values only.
    * For every other file one correlation is computed per evaluator row of
      the summed Shapley values ("Aggregated" excluded). The coefficients are
      averaged through the Fisher transformation and the p-values are averaged
      arithmetically.
    * Infinite and missing entries of ``yShiftDataframe.csv`` are replaced by 0
      before correlating.
    """
    path_to_distance_dataframes = get_distances_from_route(dataset_name, type_of_partition, additional_parameter)
    path_to_results_dataframes = get_results_from_route(dataset_name, type_of_partition, additional_parameter)

    print(f"Dataset name: {dataset_name}")
    print(f"Type_of_partition name: {type_of_partition}")
    print(f"Additional_parameter name: {additional_parameter}")

    sv_file_name = "SV_MAE" if type_of_loss == "MAE" else "SV_CrossEntropyLoss"
    results_dataframe = pd.read_csv(os.path.join(path_to_results_dataframes, "Shapley_Value", sv_file_name))

    grouped_sv = results_dataframe.groupby(["Evaluator"]).sum().reindex(sorted(results_dataframe.columns), axis=1).drop(["Evaluator", "Round"], axis=1)
    grouped_sv = grouped_sv.drop("Aggregated")

    distances = []
    pearson_values = []
    pearson_p_values = []

    for file in os.listdir(path_to_distance_dataframes):
        distances.append(file.split('.')[0])
        distance_dataframe = pd.read_csv(os.path.join(path_to_distance_dataframes, file), index_col=0)

        if file == "volume.csv":
            # Drop the "Global" row and flatten the remaining values to 1-D.
            volume_values = distance_dataframe.drop("Global").to_numpy().reshape(-1)
            res_exact = scipy.stats.pearsonr(grouped_sv.loc["Centralized"], volume_values)
            pearson_values.append(res_exact.statistic)
            pearson_p_values.append(res_exact.pvalue)
            continue

        if file == "yShiftDataframe.csv":
            # np.nan instead of the legacy alias np.NAN, which is not
            # available in NumPy 2.x.
            distance_dataframe = distance_dataframe.replace([np.inf, -np.inf], 0).replace([np.nan], 0)

        pearsonr_list = []
        pvalue_list = []
        for evaluator in grouped_sv.index:
            # The centralized evaluator is compared against the "Global" column,
            # every other evaluator against the column carrying its own name.
            distance_column = "Global" if evaluator == "Centralized" else str(evaluator)
            res_exact = scipy.stats.pearsonr(grouped_sv.loc[evaluator], distance_dataframe.loc[:, distance_column])
            pearsonr_list.append(res_exact.statistic)
            pvalue_list.append(res_exact.pvalue)

        # Fisher transformation for averaging correlations.
        pearson_values.append(np.tanh(np.mean(np.arctanh(pearsonr_list))))
        pearson_p_values.append(np.mean(pvalue_list))

    return pd.DataFrame({
        "Dataset": [dataset_name] * len(distances),
        "TypePartition": [type_of_partition] * len(distances),
        "AdditionalParameter": [additional_parameter] * len(distances),
        "Distance": distances,
        "PearsonCorrelation": pearson_values,
        "p-value-pearson": pearson_p_values,
    })

In [None]:
# (dataset_name, type_of_partition, additional_parameter, type_of_loss) tuples.
# The loop that consumes this list unpacks four values per entry, so every
# configuration must carry its loss type: "MAE" for the electric-consumption
# dataset and "CrossEntropyLoss" for the others, as in the maverick-only list.
distances_to_compare = [
    ("har", "dirichlet", "1", "CrossEntropyLoss"),
    ("har", "dirichlet", "10", "CrossEntropyLoss"),
    ("har", "dirichlet", "100", "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_1_label_skew", "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_1_LessLabels", "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_1_MissingTwoLabels", "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_Laying", "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_Balanced_Laying_1", "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_Balanced_Laying_2", "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_Balanced_Laying_3", "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_Balanced_Laying_4", "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_Balanced_Laying_5", "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_1_Balanced_WalkingUpstairs", "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_1_Balanced_WalkingUpstairs_2", "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_1_Balanced_WalkingUpstairs_3", "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_1_Balanced_WalkingUpstairs_4", "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_1_Balanced_WalkingUpstairs_5", "CrossEntropyLoss"),
    ("edge-iot-coreset", "dirichlet", "10", "CrossEntropyLoss"),
    ("edge-iot-coreset", "dirichlet", "100", "CrossEntropyLoss"),
    ("edge-iot-coreset", "dirichlet", "1000", "CrossEntropyLoss"),
    ("edge-iot-coreset", "dirichlet", "10000", "CrossEntropyLoss"),
    ("edge-iot-coreset", "manual", "EdgeIIOT_Maverick_LeastAttack", "CrossEntropyLoss"),
    ("edge-iot-coreset", "manual", "EdgeIIOT_Maverick_OnlyNormal", "CrossEntropyLoss"),
    ("electric-consumption", "manual", "Electric_Consumption_Random_Sampling", "MAE"),
    ("electric-consumption", "manual", "ElectricConsumptionFacilityType", "MAE"),
    ("electric-consumption", "manual", "ElectricConsumptionNoniid", "MAE"),
    ("electric-consumption", "manual", "ElectricConsumptionStateFactor", "MAE"),
    ("electric-consumption", "manual", "ElectricConsumption_FeatureSkew_MaverickMultifamilyUncategorized", "MAE"),
    ("electric-consumption", "manual", "ElectricConsumption_FeatureSkew_MaverickGroceryStore", "MAE")
]

In [None]:
# Collect the per-configuration frames first and concatenate once at the end;
# concatenating inside the loop re-copies all previous rows on every iteration.
# Each entry of distances_to_compare must be a
# (dataset_name, type_of_partition, additional_parameter, type_of_loss) tuple.
spearman_distance_frames = []
pearson_distance_frames = []

for dataset_name, type_of_partition, additional_parameter, type_of_loss in distances_to_compare:
    spearman_distance_frames.append(compute_spearman_correlation_distance(dataset_name, type_of_partition, additional_parameter, type_of_loss))
    pearson_distance_frames.append(compute_pearson_correlation_distance(dataset_name, type_of_partition, additional_parameter, type_of_loss))

# pd.concat raises on an empty list, so fall back to an empty frame in that case.
spearman_rank_sv_distance = pd.concat(spearman_distance_frames, ignore_index=True) if spearman_distance_frames else pd.DataFrame()
pearson_rank_sv_distance = pd.concat(pearson_distance_frames, ignore_index=True) if pearson_distance_frames else pd.DataFrame()

# Both frames list the distance files in the same order, so the Pearson columns
# can be attached row by row through the shared integer index.
correlation_sv_distance = spearman_rank_sv_distance.copy()
correlation_sv_distance["PearsonCorrelation"] = pearson_rank_sv_distance["PearsonCorrelation"]
correlation_sv_distance["p-value-pearson"] = pearson_rank_sv_distance["p-value-pearson"]
# correlation_sv_distance["VarianceSV"] = pearson_rank_sv_distance["VarianceSV"]

We now proceed to create the box plot with the Pearson correlation coefficient for all cases.

In [None]:
import plotly.graph_objects as go
import plotly.express as px
import numpy as np
np.random.seed(1)

# Throw-away figure exported before the real one.
# NOTE(review): presumably the usual workaround for the "Loading [MathJax]"
# box that can appear in the first PDF exported by plotly - TODO confirm.
fig = px.scatter(x=[0, 1, 2, 3, 4], y=[0, 1, 4, 9, 16])
fig.show()
fig.write_image("random.pdf")


# Maps the distance file stem (the "Distance" column) to the label shown in the plot.
distance_dictionary = {
    "performanceDegradation": "Performance Degradation (I)",
    "yShiftDataframe": "X->Y Shift (I)",
    "relevance": "Relevance (D)",
    "diversity": "Diversity (D)",
    "hellinger": "Hellinger Distance (I)",
    "wasserstein": "Wasserstein Distance (I)",
    "gaussian_mmd": "MMD - Gaussian Kernel (I)",
    "volume": "Volume of data (D)"
}

# One box per distance, built from the Pearson coefficients of all configurations.
fig = go.Figure()
for metric, name_of_metric in distance_dictionary.items():
    sliced_df = correlation_sv_distance[correlation_sv_distance["Distance"] == metric]
    fig.add_trace(go.Box(y=sliced_df["PearsonCorrelation"], name=name_of_metric))


fig.update_layout(title_text="Pearson Correlation between Distance and Value", showlegend=False)

fig.show()

# write_image does not create missing folders, so make sure the target exists.
os.makedirs("images", exist_ok=True)
fig.write_image("images/distance_value_pearson_correlation.pdf")

We now proceed to create the box plot with the Pearson correlation coefficient for only the cases with Mavericks.

In [None]:
# Maverick-only configurations, each given as a
# (dataset_name, type_of_partition, additional_parameter, type_of_loss) tuple.
distances_only_mavericks_to_compare = [
    ("har", "manual", "HAR_1_Maverick_1_label_skew", "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_1_LessLabels", "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_1_MissingTwoLabels", "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_Laying", "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_Balanced_Laying_1", "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_Balanced_Laying_2", "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_Balanced_Laying_3", "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_Balanced_Laying_4", "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_Balanced_Laying_5", "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_1_Balanced_WalkingUpstairs", "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_1_Balanced_WalkingUpstairs_2", "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_1_Balanced_WalkingUpstairs_3", "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_1_Balanced_WalkingUpstairs_4", "CrossEntropyLoss"),
    ("har", "manual", "HAR_1_Maverick_1_Balanced_WalkingUpstairs_5", "CrossEntropyLoss"),
    ("edge-iot-coreset", "manual", "EdgeIIOT_Maverick_LeastAttack", "CrossEntropyLoss"),
    ("edge-iot-coreset", "manual", "EdgeIIOT_Maverick_OnlyNormal", "CrossEntropyLoss"),
    ("electric-consumption", "manual", "ElectricConsumption_FeatureSkew_MaverickMultifamilyUncategorized", "MAE"),
    ("electric-consumption", "manual", "ElectricConsumption_FeatureSkew_MaverickGroceryStore", "MAE")
]

In [None]:
# Collect the per-configuration frames first and concatenate once at the end;
# concatenating inside the loop re-copies all previous rows on every iteration.
spearman_maverick_frames = []
pearson_maverick_frames = []

for dataset_name, type_of_partition, additional_parameter, type_of_loss in distances_only_mavericks_to_compare:
    spearman_maverick_frames.append(compute_spearman_correlation_distance(dataset_name, type_of_partition, additional_parameter, type_of_loss))
    pearson_maverick_frames.append(compute_pearson_correlation_distance(dataset_name, type_of_partition, additional_parameter, type_of_loss))

# pd.concat raises on an empty list, so fall back to an empty frame in that case.
spearman_rank_sv_distance = pd.concat(spearman_maverick_frames, ignore_index=True) if spearman_maverick_frames else pd.DataFrame()
pearson_rank_sv_distance = pd.concat(pearson_maverick_frames, ignore_index=True) if pearson_maverick_frames else pd.DataFrame()

# Both frames list the distance files in the same order, so the Pearson columns
# can be attached row by row through the shared integer index.
correlation_sv_distance_only_mavericks = spearman_rank_sv_distance.copy()
correlation_sv_distance_only_mavericks["PearsonCorrelation"] = pearson_rank_sv_distance["PearsonCorrelation"]
correlation_sv_distance_only_mavericks["p-value-pearson"] = pearson_rank_sv_distance["p-value-pearson"]

In [None]:
import plotly.graph_objects as go
import numpy as np
np.random.seed(1)

# Maps the distance file stem (the "Distance" column) to the label shown in the plot.
distance_dictionary = {
    "performanceDegradation": "Performance Degradation (I)",
    "yShiftDataframe": "X->Y Shift (I)",
    "relevance": "Relevance (D)",
    "diversity": "Diversity (D)",
    "hellinger": "Hellinger Distance (I)",
    "wasserstein": "Wasserstein Distance (I)",
    "gaussian_mmd": "MMD - Gaussian Kernel (I)",
    "volume": "Volume of data (D)"
}

# One box per distance, built from the Pearson coefficients of the maverick-only configurations.
fig = go.Figure()
for metric, name_of_metric in distance_dictionary.items():
    sliced_df = correlation_sv_distance_only_mavericks[correlation_sv_distance_only_mavericks["Distance"] == metric]
    fig.add_trace(go.Box(y=sliced_df["PearsonCorrelation"], name=name_of_metric))


fig.update_layout(title_text="Pearson Correlation between Distance and Value only Mavericks", showlegend=False)

fig.show()

# write_image does not create missing folders, so make sure the target exists.
os.makedirs("images", exist_ok=True)
fig.write_image("images/distance_value_only_mavericks_pearson_correlation.pdf")