In [3]:
# === Modules import === # 
!pip install cumulator
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
import pickle
from torchvision import datasets, transforms
from tqdm import tqdm
from pathlib import Path
from cumulator import base

ModuleNotFoundError: No module named 'torchvision'

In [16]:
# Define the CNN we are using for our task
class Net(nn.Module):
    """Small CNN classifier: three stride-2 convolutions followed by two linear layers.

    ``forward`` returns log-probabilities over 10 classes. ``fc1`` expects the
    flattened convolutional output to have 2048 features (e.g. a 1x28x28 input
    is reduced to 128 channels of 4x4).
    """

    def __init__(self):
        super().__init__()
        # Layers are created in the same order and under the same attribute
        # names, so state-dict keys and random initialisation are unaffected.
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=2, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=2, padding=1)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1)
        self.fc1 = nn.Linear(2048, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        """Return the per-class log-probabilities (``log_softmax`` over dim 1) for batch ``x``."""
        # Convolutional feature extractor, ReLU after every convolution.
        for conv in (self.conv1, self.conv2, self.conv3):
            x = F.relu(conv(x))
        # Keep the batch dimension, flatten everything else for the classifier head.
        hidden = F.relu(self.fc1(torch.flatten(x, 1)))
        return F.log_softmax(self.fc2(hidden), dim=1)


# This class stores the accumulated gradient of one NN model, keeping the gradient of each parameter separately.
# It also lets us easily compute the relative distance between the gradients of two agents for Weight Erosion.
class GradientStocker:
    """Accumulates, per parameter name, the gradients observed on one model.

    Each name passed to the constructor becomes an instance attribute that
    starts at ``0`` and becomes a CPU tensor after the first ``add_gradient``.
    """

    def __init__(self, model_names):
        for parameter_name in model_names:
            setattr(self, parameter_name, 0)

    def get_attributes(self):
        """Return the live ``{parameter name: accumulated gradient}`` mapping."""
        return vars(self)

    def add_gradient(self, model):
        """Add the current ``.grad`` of every parameter of ``model`` to its running total."""
        for name, param in model.named_parameters():
            running_total = getattr(self, name)
            setattr(self, name, running_total + param.grad.data.cpu())

    def euclidian_distance(self, grad_current_agent):
        """Relative euclidean distance between this gradient and ``grad_current_agent``.

        Both gradients are flattened to a single vector; the L2 distance between
        them is divided by the L2 norm of this (reference) gradient.
        """
        reference = self.flatten(list(self.get_attributes().values()))
        candidate = self.flatten(list(grad_current_agent.get_attributes().values()))
        gap = torch.dist(reference, candidate, 2)
        return gap / torch.norm(reference, 2)

    def flatten(self, gradient_list):
        """Concatenate all tensors of ``gradient_list`` into one 1-D tensor."""
        return torch.cat([torch.flatten(g) for g in gradient_list], 0)


def client_update(client_model, optimizer, train_loader, epoch=5):
    """Train ``client_model`` on ``train_loader`` and accumulate the gradients it produced.

    Args:
        client_model: the model to train (updated in place by ``optimizer``).
        optimizer: optimizer bound to ``client_model``'s parameters.
        train_loader: iterable of ``(data, target)`` batches.
        epoch: number of passes over ``train_loader``.

    Returns:
        ``(last_batch_loss, gradient_stocker)`` where ``last_batch_loss`` is the
        loss of the final processed batch as a Python float and
        ``gradient_stocker`` holds the sum of every batch gradient, per parameter.

    Raises:
        ValueError: if no batch was processed (empty loader or ``epoch < 1``).
    """
    parameter_names = [name for name, _ in client_model.named_parameters()]
    gradient_stocker = GradientStocker(parameter_names)

    # Batches follow the model's device instead of a hard-coded .cuda(), so a
    # CUDA model behaves as before and a CPU model now works too.
    first_param = next(client_model.parameters(), None)
    device = first_param.device if first_param is not None else None

    # evaluate() leaves a model in eval mode; make sure we really train.
    client_model.train()

    loss = None
    for _ in range(epoch):
        for data, target in train_loader:
            if device is not None:
                data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = client_model(data)
            loss = F.nll_loss(output, target)
            loss.backward()
            optimizer.step()
            # The gradients are still populated after the step; accumulate them.
            gradient_stocker.add_gradient(client_model)

    if loss is None:
        raise ValueError("client_update processed no batch: train_loader is empty or epoch < 1")
    return loss.item(), gradient_stocker


def weighted_average_gradients(gradients, weights):
    """Return ``{parameter name: weighted mean gradient}`` over all agents.

    The parameter names are taken from the first entry of ``gradients``; each
    mean is delegated to ``weighted_average_from_key``.
    """
    parameter_names = gradients[0].get_attributes().keys()
    return {
        name: weighted_average_from_key(name, gradients, weights)
        for name in parameter_names
    }

def weighted_average_from_key(key, gradients, weights):
    """Weighted mean of the ``key`` entry across ``gradients``.

    ``gradients[i].get_attributes()[key]`` is weighted by ``weights[i]``; the
    weighted sum is divided by the sum of the weights that were used.
    """
    positions = range(len(gradients))
    weighted_sum = sum(gradients[i].get_attributes()[key] * weights[i] for i in positions)
    total_weight = sum(weights[i] for i in positions)
    return weighted_sum / total_weight

def compute_weight(alpha_prev, round, relative_distance, data_size, batch_size, distance_penalty, size_penalty):
    """Return the eroded weight alpha for round ``round``, clamped at 0.

    alpha = alpha_prev
            - (1 + size_penalty * floor((round - 1) * batch_size / data_size))
              * (distance_penalty * relative_distance)
    """
    size_steps = math.floor(((round - 1) * batch_size) / data_size)
    erosion = (1 + size_penalty * size_steps) * (distance_penalty * relative_distance)
    alpha = alpha_prev - erosion
    # A weight never becomes negative.
    return alpha if alpha > 0 else 0

def update_grad(model, gradient, alpha):
    """Apply one gradient step to every parameter of ``model``, in place.

    For each named parameter ``p`` the update is ``p -= gradient[name] * alpha``.

    Args:
        model: the model whose parameters are updated.
        gradient: mapping from parameter name to the gradient tensor to apply.
        alpha: step size multiplying every gradient.

    Returns:
        The same ``model`` object, for call-chaining.
    """
    for name, param in model.named_parameters():
        # Move the gradient to wherever the parameter lives rather than
        # assuming CUDA; for a CUDA model this is the same as .cuda().
        param.data -= gradient[name].to(param.device) * alpha
    return model

def share_weight_erosion_model(shared_model, client_models):
    """Copy the state of ``shared_model`` into every model of ``client_models``."""
    reference_state = shared_model.state_dict()
    for client in client_models:
        client.load_state_dict(reference_state)

def evaluate(global_model, data_loader):
    """Compute the loss and accuracy of ``global_model`` on ``data_loader``.

    The model is switched to eval mode (and left in it) and run without
    gradient tracking.

    Args:
        global_model: model returning per-class log-probabilities.
        data_loader: loader of ``(data, target)`` batches; ``data_loader.dataset``
            must support ``len``.

    Returns:
        ``(loss, acc)``: the summed NLL loss divided by the dataset size, and the
        fraction of samples whose arg-max prediction equals the target.
    """
    global_model.eval()

    # Evaluate on the model's own device instead of a hard-coded .cuda(); a
    # parameter-free model leaves the batches where they are.
    first_param = next(global_model.parameters(), None)
    device = first_param.device if first_param is not None else None

    loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in data_loader:
            if device is not None:
                data, target = data.to(device), target.to(device)
            output = global_model(data)
            loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    num_samples = len(data_loader.dataset)
    loss /= num_samples
    acc = correct / num_samples

    return loss, acc

In [None]:
# === Run our model training using the Weight Erosion aggregation scheme === # 

def run_weight_erosion(train_loader, test_loader, num_clients, batch_size,
                       selected_agent_index, num_rounds, epochs, distribution, distribution_name='distribution'):
    """Train with the Weight Erosion aggregation scheme and record per-round metrics.

    Every round, each client trains locally, its weight is eroded according to
    the relative distance between its gradient and the selected agent's
    gradient, the weighted mean gradient is applied to the shared model, and the
    shared model is copied back to all clients.

    Args:
        train_loader: one training loader per client.
        test_loader: loader used to evaluate the shared model.
        num_clients: number of clients.
        batch_size: batch size, forwarded to ``compute_weight``.
        selected_agent_index: index of the agent the model is personalised for.
        num_rounds: number of communication rounds.
        epochs: local epochs per round.
        distribution: unused here; kept for a signature parallel to the other runners.
        distribution_name: label used in the output pickle file name.

    Returns:
        ``[acc_best, round_best, weight_best]`` for the round with the best test accuracy.

    Side effects:
        Writes the per-round metrics to
        ``./generated/pickles/weight_erosion_{num_clients}_{distribution_name}.pickle``.
    """
    distance_penalty = 0.1/num_clients
    size_penalty = 2
    dataPickle = []

    # Create the output directory up front so a missing folder cannot discard
    # the results at the very end of training.
    output_dir = Path.cwd()/'generated'/'pickles'
    output_dir.mkdir(parents=True, exist_ok=True)

    print("=== Weight Erosion ===")
    np.set_printoptions(precision=3)
    acc_best = 0
    round_best = 0
    weight_best = [0.1,0,0,0,0,0,0,0,0,0]

    # Instantiate models and optimizers; every client starts from the shared model.
    shared_model = Net().cuda()
    client_models = [Net().cuda() for _ in range(num_clients)]
    for model in client_models:
        model.load_state_dict(shared_model.state_dict())

    opt = [optim.SGD(model.parameters(), lr=0.1) for model in client_models]

    grad_vector = [None for _ in range(num_clients)]
    weight_vector = np.ones(num_clients)

    for r in range(num_rounds):

        print('%d-th round' % r)

        # Client update: train every client first ...
        loss = np.zeros(num_clients)
        for i in range(num_clients):
            loss[i], grad_vector[i] = client_update(client_models[i], opt[i], train_loader[i], epoch=epochs)

        # ... and only then erode the weights, so the reference gradient is the
        # selected agent's gradient of THIS round for every client (it used to
        # be None or one round stale for clients with i < selected_agent_index).
        reference_gradient = grad_vector[selected_agent_index]
        for i in range(num_clients):
            d_rel = reference_gradient.euclidian_distance(grad_vector[i])
            # NOTE(review): len(train_loader[i]) is the number of batches, not
            # the number of samples - confirm this is the intended data_size.
            weight_vector[i] = compute_weight(weight_vector[i], r + 1, d_rel, len(train_loader[i]), batch_size, distance_penalty, size_penalty)

        # Weight Erosion Scheme
        weighted_mean_gradient = weighted_average_gradients(grad_vector, weight_vector)
        shared_model = update_grad(shared_model, weighted_mean_gradient, 0.1)

        # Share model to all agents
        share_weight_erosion_model(shared_model, client_models)

        # Evaluate on the agent's test set
        test_loss, acc = evaluate(shared_model, test_loader)

        print(f"Weight : {weight_vector}")
        print(f"Loss   : {loss}")
        print('Test loss %0.3g | Test acc: %0.3f \n' % (test_loss, acc))

        # Keep the accuracy for each round
        dataPickle.append([acc,test_loss,loss[selected_agent_index], sum(weight_vector)/num_clients])

        # Update the best accuracy
        if acc > acc_best:
            acc_best = acc
            round_best = r+1
            # Snapshot: weight_vector is mutated in place in later rounds.
            weight_best = weight_vector.copy()

    with open(output_dir/f'weight_erosion_{num_clients}_{distribution_name}.pickle', 'wb') as f:
        pickle.dump(dataPickle, f)

    return [acc_best, round_best, weight_best]


In [None]:
# === Run our model training using the Federated Average aggregation scheme === # 

def run_federated(train_loader, test_loader, num_clients,batch_size,
                  selected_agent_index, num_rounds, epochs, distribution, distribution_name='distribution'):
    """Train with Federated Averaging (equal client weights) and record per-round metrics.

    Every round, each client trains locally, the equally weighted mean gradient
    is applied to the shared model, and the shared model is copied back to all
    clients.

    Args:
        train_loader: one training loader per client.
        test_loader: loader used to evaluate the shared model.
        num_clients: number of clients.
        batch_size: unused here; kept for a signature parallel to the other runners.
        selected_agent_index: index of the agent whose training loss is recorded.
        num_rounds: number of communication rounds.
        epochs: local epochs per round.
        distribution: unused here; kept for a signature parallel to the other runners.
        distribution_name: label used in the output pickle file name.

    Returns:
        ``[acc_best, round_best, weight_best]`` for the round with the best test accuracy.

    Side effects:
        Writes the per-round metrics to
        ``./generated/pickles/federated_{num_clients}_{distribution_name}.pickle``.
    """
    print("=== Federated ===")
    np.set_printoptions(precision=3)
    acc_best = 0
    round_best = 0
    weight_best = [0.1,0,0,0,0,0,0,0,0,0]
    dataPickle = []

    # Create the output directory up front so a missing folder cannot discard
    # the results at the very end of training.
    output_dir = Path.cwd()/'generated'/'pickles'
    output_dir.mkdir(parents=True, exist_ok=True)

    # Instantiate models and optimizers; every client starts from the shared model.
    shared_model = Net().cuda()
    client_models = [Net().cuda() for _ in range(num_clients)]
    for model in client_models:
        model.load_state_dict(shared_model.state_dict())

    opt = [optim.SGD(model.parameters(), lr=0.1) for model in client_models]

    grad_vector = [None for _ in range(num_clients)]
    weight_vector = np.ones(num_clients)

    for r in range(num_rounds):

        print('%d-th round' % r)

        # Client update
        loss = np.zeros(num_clients)
        for i in range(num_clients):
            loss_tmp, grad_vector[i] = client_update(client_models[i], opt[i], train_loader[i], epoch=epochs)
            loss[i] = loss_tmp
            weight_vector[i] = 1/num_clients

        # Federated Averaging: every client contributes with the same weight
        weighted_mean_gradient = weighted_average_gradients(grad_vector, weight_vector)
        shared_model = update_grad(shared_model, weighted_mean_gradient, 0.1)

        # Share model to all agents
        share_weight_erosion_model(shared_model, client_models)

        # Evaluate on the agent's test set
        test_loss, acc = evaluate(shared_model, test_loader)

        print(f"Loss   : {loss}")
        print('Test loss %0.3g | Test acc: %0.3f\n' % (test_loss, acc))

        # Keep the accuracy for each round
        dataPickle.append([acc,test_loss,loss[selected_agent_index]])

        # Update the best accuracy
        if acc > acc_best:
            acc_best = acc
            round_best = r+1
            # Snapshot rather than alias the array that is rewritten every round.
            weight_best = weight_vector.copy()

    with open(output_dir/f'federated_{num_clients}_{distribution_name}.pickle', 'wb') as f:
        pickle.dump(dataPickle, f)

    return [acc_best, round_best, weight_best]


In [None]:
# === Run our model training Locally === # 

def run_local(train_loader, test_loader, num_clients, batch_size,
              selected_agent_index, num_rounds, epochs, distribution, distribution_name='distribution'):
    """Train only the selected agent's model on its own data and record per-round metrics.

    Args:
        train_loader: one training loader per client; only the selected agent's is used.
        test_loader: loader used to evaluate the selected agent's model.
        num_clients: number of client models instantiated (also used in the file name).
        batch_size: unused here; kept for a signature parallel to the other runners.
        selected_agent_index: index of the agent that is trained and evaluated.
        num_rounds: number of training rounds.
        epochs: local epochs per round.
        distribution: unused here; kept for a signature parallel to the other runners.
        distribution_name: label used in the output pickle file name.

    Returns:
        ``[acc_best, round_best, weight_best]``; ``weight_best`` is the constant
        ``[1,0,0,0,0,0,0,0,0,0]``.

    Side effects:
        Writes the per-round metrics to
        ``./generated/pickles/local_{num_clients}_{distribution_name}.pickle``.
    """
    print("=== Local ===")
    np.set_printoptions(precision=3)
    dataPickle = []

    # Create the output directory up front so a missing folder cannot discard
    # the results at the very end of training.
    output_dir = Path.cwd()/'generated'/'pickles'
    output_dir.mkdir(parents=True, exist_ok=True)

    # Instantiate models and optimizers; every client starts from the shared model.
    shared_model = Net().cuda()
    client_models = [Net().cuda() for _ in range(num_clients)]
    for model in client_models:
        model.load_state_dict(shared_model.state_dict())

    opt = [optim.SGD(model.parameters(), lr=0.1) for model in client_models]

    acc_best = 0
    round_best = 0
    weight_best = [1,0,0,0,0,0,0,0,0,0]

    print('%d-th Client' % selected_agent_index)
    for r in range(num_rounds):

        print('%d-th round' % r)

        # Client update (the accumulated gradient is not needed for local training)
        loss, _ = client_update(client_models[selected_agent_index], opt[selected_agent_index], train_loader[selected_agent_index], epoch=epochs)

        # Evaluate on the selected agent's test set
        test_loss, acc = evaluate(client_models[selected_agent_index], test_loader)

        # Print the results
        print(f"Loss   : {loss}")
        print('Test loss %0.3g | Test acc: %0.3f\n' % (test_loss, acc))

        # Keep the accuracy for each round
        dataPickle.append([acc,test_loss,loss])

        # Update the best accuracy
        if acc > acc_best:
            acc_best = acc
            round_best = r+1

    with open(output_dir/f'local_{num_clients}_{distribution_name}.pickle', 'wb') as f:
        pickle.dump(dataPickle, f)

    return [acc_best, round_best, weight_best]

In [None]:
def get_non_iid_loader_distribution(num_clients,batch_size,distribution,selected_agent_index, validation_size=0.1):
    """Build per-client MNIST training loaders following a per-digit distribution.

    The training indices of each digit are cut into chunks; every client is
    handed a number of chunks per digit derived from ``distribution``, which is
    rotated one position to the left after each client. The test set is built
    from the test indices of the digits the selected agent receives.

    Args:
        num_clients: number of clients (one training loader each).
        batch_size: batch size of every returned loader.
        distribution: list of 10 values, indexed by digit.
        selected_agent_index: client whose digits define the test set.
        validation_size: not used by the active code (only by the commented-out
            validation split below).

    Returns:
        ``(train_loader, test_loader)``: a list with one shuffled training loader
        per client, and a single shuffled test loader.
    """
    traindata = datasets.MNIST('./data', train=True, download=True,transform=transforms.Compose([transforms.ToTensor(),transforms.Normalize((0.1307,), (0.3081,))]))
    testdata = datasets.MNIST('./data', train=False, download=True,transform=transforms.Compose([transforms.ToTensor(),transforms.Normalize((0.1307,), (0.3081,))]))

    # One boolean mask per digit over the train / test targets.
    target_labels = torch.stack([traindata.targets == i for i in range(10)])
    target_labels_test = torch.stack([testdata.targets == i for i in range(10)])
    target_labels_split = []
    target_labels_split_test = []

    # Divide the training indices of each digit into small chunks.
    target_label_division = 100 # need to check that with this number we get len(target_labels_split) = 10 * target_label_division
    for i in range(10):
        target_labels_data =torch.where(target_labels[i])[0]

        # Train: chunks of int(len / (target_label_division - 1)) indices.
        # Test: the split size equals the full length, i.e. one entry per digit.
        target_labels_split += torch.split(target_labels_data, int((len(target_labels_data)) / (target_label_division-1)))
        target_labels_split_test += torch.split(torch.where(target_labels_test[i])[0], int((len(torch.where(target_labels_test[i])[0]))))

        target_labels_split = target_labels_split[:target_label_division*(i+1)] # drop the extra chunk when the split yields target_label_division + 1 chunks instead of target_label_division

    # Merge the selected chunks into each client.
    savedDistribution = distribution
    # Scale the distribution into a (possibly fractional) number of chunks per digit per client.
    distribution = [target_label_division * x / (max(num_clients,10)/10) for x in distribution]
    samples_used = [0,0,0,0,0,0,0,0,0,0]
    next_samples_used = [0,0,0,0,0,0,0,0,0,0]
    split_client = []
    test_data = torch.tensor([],dtype=torch.long)

    for i in range(num_clients):
        split_client.append(torch.tensor([],dtype=torch.long))
        # Per-digit chunk counter this client should reach.
        for n in range(10):
            next_samples_used[n] = samples_used[n] + distribution[n]
        distribution = distribution[1:] + distribution[:1] # shift to left
        
        for number in range(10):

            # Add data to the test set: for the selected agent, take the leading
            # share of this digit's test indices.
            # NOTE(review): the share uses savedDistribution (never rotated), while
            # the chunk targets above use the rotated distribution - these agree
            # for client 0; confirm the intent for other selected agents.
            if i == selected_agent_index and samples_used[number] < next_samples_used[number]:
                sizeDataTest = int(savedDistribution[number] * len(target_labels_split_test[number]))
                sizeDataTestLeft = len(target_labels_split_test[number]) - sizeDataTest
                t1, t2 = torch.split(target_labels_split_test[number], [sizeDataTest,sizeDataTestLeft])

                test_data = torch.cat((test_data, t1),0)

            # Hand out chunks of this digit until the target counter is reached.
            while samples_used[number] < next_samples_used[number]:
                split_client[i] = torch.cat((split_client[i], target_labels_split[number*target_label_division+samples_used[number]]),0)
                samples_used[number] += 1

            # The counter overshot a fractional target: step back one so the
            # next client starts again from that same chunk.
            if samples_used[number] > next_samples_used[number]:
                samples_used[number] -= 1



    traindata_split = [torch.utils.data.Subset(traindata, tl) for tl in split_client]
    testdata_split = torch.utils.data.Subset(testdata, test_data)
    train_loader = [torch.utils.data.DataLoader(x, batch_size=batch_size, shuffle=True) for x in traindata_split]

    #x_size = len(testdata_split)
    #size_train = int(math.ceil(x_size * (1 - validation_size)))
    #size_validation = int(math.floor(x_size * validation_size))
    #test_set, validation_set = torch.utils.data.random_split(testdata_split, [size_train, size_validation])

    test_loader = torch.utils.data.DataLoader(testdata_split, batch_size=batch_size, shuffle=True)
    #validation_loader = torch.utils.data.DataLoader(validation_set, batch_size=batch_size, shuffle=True)

    return train_loader, test_loader

def get_iid_loader(num_clients,batch_size):
    """Split the MNIST training set randomly across ``num_clients`` clients.

    Args:
        num_clients: number of clients (one training loader each).
        batch_size: batch size of every returned loader.

    Returns:
        ``(train_loader, test_loader)``: a list with one shuffled training loader
        per client, and a single shuffled loader over the full MNIST test set.
    """
    mnist_transform = transforms.Compose([transforms.ToTensor(),transforms.Normalize((0.1307,), (0.3081,))])
    traindata = datasets.MNIST('./data', train=True, download=True,transform=mnist_transform)

    # random_split requires the lengths to sum to the dataset size. Spread the
    # remainder over the first clients so any num_clients works; when the size
    # is divisible by num_clients the shards are exactly as before.
    shard_size, remainder = divmod(traindata.data.shape[0], num_clients)
    shard_lengths = [shard_size + (1 if i < remainder else 0) for i in range(num_clients)]
    traindata_split = torch.utils.data.random_split(traindata, shard_lengths)

    train_loader = [torch.utils.data.DataLoader(x, batch_size=batch_size, shuffle=True) for x in traindata_split]
    test_loader = torch.utils.data.DataLoader(datasets.MNIST('./data', train=False, transform=mnist_transform), batch_size=batch_size, shuffle=True)

    return train_loader, test_loader

def get_iid_loader_with_validation(num_clients, batch_size, validation_size=0.1):
    """Split MNIST randomly across clients, with a per-client validation split.

    Args:
        num_clients: number of clients.
        batch_size: batch size of every returned loader.
        validation_size: fraction of each client's training shard held out for
            validation.

    Returns:
        ``(train_loader, validation_loader, test_loader)``: three lists holding
        one shuffled loader per client.
    """
    mnist_transform = transforms.Compose([transforms.ToTensor(),transforms.Normalize((0.1307,), (0.3081,))])
    traindata = datasets.MNIST('./data', train=True, download=True,transform=mnist_transform)
    testdata = datasets.MNIST('./data', train=False, transform=mnist_transform)

    def _shard_lengths(total):
        # random_split needs lengths summing to `total`; spread the remainder
        # over the first shards (a no-op when total is divisible by num_clients).
        base, remainder = divmod(total, num_clients)
        return [base + (1 if i < remainder else 0) for i in range(num_clients)]

    traindata_split = torch.utils.data.random_split(traindata, _shard_lengths(traindata.data.shape[0]))
    testdata_split = torch.utils.data.random_split(testdata, _shard_lengths(testdata.data.shape[0]))

    train_loader = []
    validation_loader = []

    for x in traindata_split:
        x_size = len(x)
        size_validation = int(math.floor(x_size * validation_size))
        # Derive the training size from the validation size so the two always
        # sum to x_size (ceil + floor of float products can be off by one).
        size_train = x_size - size_validation
        train_set, validation_set = torch.utils.data.random_split(x, [size_train, size_validation])

        train_loader.append(torch.utils.data.DataLoader(train_set, batch_size=batch_size, shuffle=True))
        validation_loader.append(torch.utils.data.DataLoader(validation_set, batch_size=batch_size, shuffle=True))

    test_loader = [torch.utils.data.DataLoader(x, batch_size=batch_size, shuffle=True) for x in testdata_split]

    return train_loader, validation_loader, test_loader


def get_non_IID_loader_digit_pairs(num_clients,batch_size):
    """Give client ``i`` the MNIST samples of digits ``2*i`` and ``2*i + 1``.

    Each client receives at most ``len(dataset) // num_clients`` samples, taken
    in dataset order from the samples of its two digits.

    Args:
        num_clients: number of clients; at most 5, since there are 10 digits.
        batch_size: batch size of every returned loader.

    Returns:
        ``(train_loader, test_loader)``: two lists holding one shuffled loader
        per client.

    Raises:
        ValueError: if ``num_clients`` is not in ``1..5`` (further clients would
            have no digit pair left and end up with empty datasets).
    """
    max_clients = 10 // 2
    if not 1 <= num_clients <= max_clients:
        raise ValueError(f"num_clients must be between 1 and {max_clients} for digit pairs, got {num_clients}")

    traindata = datasets.MNIST('./data', train=True, download=True,transform=transforms.Compose([transforms.ToTensor(),transforms.Normalize((0.1307,), (0.3081,))]))
    testdata = datasets.MNIST('./data', train=False, transform=transforms.Compose([transforms.ToTensor(),transforms.Normalize((0.1307,), (0.3081,))]))

    # One boolean mask per digit over the train / test targets.
    train_target_labels = torch.stack([traindata.targets == i for i in range(10)])
    test_target_labels = torch.stack([testdata.targets == i for i in range(10)])

    # Per-client cap, derived from the actual dataset sizes (60000 / 10000 for MNIST).
    train_split_size = len(traindata) // num_clients
    test_split_size = len(testdata) // num_clients

    train_target_labels_split = []
    test_target_labels_split = []

    for i in range(num_clients):
        # Summing the two digit masks marks every sample of either digit.
        train_target_labels_split += torch.split(torch.where(train_target_labels[(2 * i):(2 * (i + 1))].sum(0))[0][:train_split_size], train_split_size)
        test_target_labels_split += torch.split(torch.where(test_target_labels[(2 * i):(2 * (i + 1))].sum(0))[0][:test_split_size], test_split_size)

    traindata_split = [torch.utils.data.Subset(traindata, tl) for tl in train_target_labels_split]
    testdata_split = [torch.utils.data.Subset(testdata, tl) for tl in test_target_labels_split]

    train_loader = [torch.utils.data.DataLoader(x, batch_size=batch_size, shuffle=True) for x in traindata_split]
    test_loader = [torch.utils.data.DataLoader(x, batch_size=batch_size, shuffle=True) for x in testdata_split]

    return train_loader, test_loader

# We want to give each agent 3 different digits.
# We also want every digit to be assigned to at least one agent.
def generate_permutations(nb_agents=5, sample_size=3):
    """Draw ``sample_size`` distinct digits per agent such that all 10 digits are covered.

    Assignments are drawn with ``np.random.choice`` (global NumPy RNG) and
    redrawn until every digit 0-9 appears for at least one agent.

    Args:
        nb_agents: number of agents.
        sample_size: number of distinct digits given to each agent.

    Returns:
        dict mapping the agent index ``0..nb_agents-1`` to a NumPy array of its digits.

    Raises:
        ValueError: if ``nb_agents * sample_size`` is smaller than the number of
            digits, in which case no assignment can cover every digit (the
            rejection loop would otherwise never terminate).
    """
    available_labels = np.array([0,1,2,3,4,5,6,7,8,9])

    if nb_agents * sample_size < len(available_labels):
        raise ValueError(
            f"{nb_agents} agents with {sample_size} digits each cannot cover all "
            f"{len(available_labels)} digits"
        )

    while True:
        triplets = {
            agent: np.random.choice(available_labels, sample_size, replace=False)
            for agent in range(nb_agents)
        }
        drawn_digits = np.concatenate(list(triplets.values()))
        # Accept only an assignment in which every digit appears at least once.
        if len(np.unique(drawn_digits)) == len(available_labels):
            return triplets
 
    
def get_non_IID_loader_digit_trios(num_clients,batch_size):
    """Give every client the MNIST samples of the digits drawn by ``generate_permutations``.

    Each client receives at most ``int(60000 / num_clients)`` training and
    ``int(10000 / num_clients)`` test samples, taken in dataset order from the
    samples of its digits. The assignment of every agent is printed.

    Returns:
        ``(train_loader, test_loader)``: two lists holding one shuffled loader
        per client.
    """
    def _normalised():
        # Same preprocessing for both splits.
        return transforms.Compose([transforms.ToTensor(),transforms.Normalize((0.1307,), (0.3081,))])

    traindata = datasets.MNIST('./data', train=True, download=True,transform=_normalised())
    testdata = datasets.MNIST('./data', train=False, transform=_normalised())

    # One boolean mask per digit over the train / test targets.
    train_masks = torch.stack([traindata.targets == digit for digit in range(10)])
    test_masks = torch.stack([testdata.targets == digit for digit in range(10)])

    train_quota = int(60000 / num_clients)
    test_quota = int(10000 / num_clients)

    triplets = generate_permutations(num_clients)

    train_index_sets = []
    test_index_sets = []
    for i in range(num_clients):
        i_labels = triplets[i]
        print(f"Agent {i} is assigned labels {i_labels}")
        # Summing the selected digit masks marks every sample of those digits.
        train_hits = torch.where(train_masks[i_labels].sum(0))[0]
        test_hits = torch.where(test_masks[i_labels].sum(0))[0]
        train_index_sets.extend(torch.split(train_hits[:train_quota], train_quota))
        test_index_sets.extend(torch.split(test_hits[:test_quota], test_quota))

    train_subsets = [torch.utils.data.Subset(traindata, indices) for indices in train_index_sets]
    test_subsets = [torch.utils.data.Subset(testdata, indices) for indices in test_index_sets]

    train_loader = [torch.utils.data.DataLoader(subset, batch_size=batch_size, shuffle=True) for subset in train_subsets]
    test_loader = [torch.utils.data.DataLoader(subset, batch_size=batch_size, shuffle=True) for subset in test_subsets]

    return train_loader, test_loader

In [13]:
# Plots the accuracy of each algorithm w.r.t. the number of rounds

def plot(nb_round, WE_result, FedAvg_result, local_result, weight_avg, title):
    """Plot the test accuracy of WE, FedAvg and local training, plus the average WE weight.

    Args:
        nb_round: x values (communication rounds).
        WE_result: Weight Erosion accuracy per round.
        FedAvg_result: Federated Averaging accuracy per round.
        local_result: local-training accuracy per round.
        weight_avg: average agent weight per round.
        title: figure title, also used as the output file name.

    Side effects:
        Saves the figure to ``./generated/{title}.png`` and shows it.
    """
    # savefig does not create missing directories; make sure the target exists.
    output_dir = Path.cwd()/'generated'
    output_dir.mkdir(parents=True, exist_ok=True)

    plt.figure(num=None, figsize=(9, 7), facecolor='w', edgecolor='k')
    plt.plot(nb_round, WE_result, '-xb', label='WE')
    plt.plot(nb_round, FedAvg_result, '-og', label="FedAvg")
    plt.plot(nb_round, local_result, '-+k', label="local")
    plt.plot(nb_round, weight_avg, color='r', linestyle='dashed', label='Average weight')
    plt.title(title)
    ax = plt.gca()
    ax.legend(loc=3)
    ax.set_xlabel("Communication round", fontsize=12)
    ax.set_ylabel("Test accuracy / Agents Average weight", fontsize=12)
    plt.savefig(output_dir/f'{title}.png')
    plt.show()

In [None]:
# === Run the Benchmarking === #  

# === parameters for the aggregation Schemes === #

selected_agent_index = 0
num_rounds = 30
epochs = 1

# === parameters for training and testing === #

batch_size = 32


# === Benchmarking Parameters === #

# These are all distributions and number of clients on which we are running our algorithms
distributions = {'A' : [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
                 'B' : [0, 0, 0, 0, 0.2, 0.6, 0.2, 0, 0, 0],
                 'C' : [0.25, 0.25, 0.25, 0.25, 0, 0, 0, 0, 0, 0],
                 'D' : [0, 0, 0, 0.4, 0.1, 0, 0.1, 0.4, 0, 0],
                 'E' : [0, 0, 0, 0.1, 0.2, 0.4, 0.2, 0.1, 0, 0],
                 'F' : [0, 0, 0.1, 0.2, 0.2, 0.2, 0.1, 0.1, 0.1, 0],
                 'G' : [0.91, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01 ]}

clients = [10, 20, 50, 100]

# All results are pickled under ./generated/pickles; create it before the
# (long) benchmark starts so a missing folder cannot discard any result.
pickle_dir = Path.cwd()/'generated'/'pickles'
pickle_dir.mkdir(parents=True, exist_ok=True)

# We keep track of our computation's carbon footprint
cumulator = base.Cumulator()
cumulator.on()

try:
    for name, distribution in distributions.items():
        for num_clients in clients:
            print(' - Number Client %0.3g, distribution %s: %s' % (num_clients, name, distribution))
            train_loader, test_loader = get_non_iid_loader_distribution(num_clients,batch_size,distribution,selected_agent_index)

            dataPickle = []

            # === Run with Weight Erosion aggregation Scheme
            dataPickle.append(run_weight_erosion(train_loader, test_loader, num_clients, batch_size, selected_agent_index, num_rounds, epochs, distribution, distribution_name=name))

            # === Run Federated Learning aggregation Scheme
            dataPickle.append(run_federated(train_loader, test_loader, num_clients, batch_size, selected_agent_index, num_rounds, epochs, distribution, distribution_name=name))

            # === Run Local Training
            dataPickle.append(run_local(train_loader, test_loader, num_clients,batch_size, selected_agent_index, num_rounds, epochs, distribution, distribution_name=name))

            # === Store the results as pickles
            with open(pickle_dir/f'{num_clients}_{name}.pickle', 'wb') as f:
                pickle.dump(dataPickle, f)
finally:
    # Stop the measurement even if one of the runs fails.
    cumulator.off()

print(f'The total carbon footprint generated by this benchmark is : {cumulator.total_carbon_footprint()} gCO2eq')
        

In [None]:
#=== Non-IID case: every agent only holds the samples of three digits
#=== parameters for Schemes
selected_agent_index = 0
num_rounds = 20
epochs = 1

#=== parameters for training and testing
num_clients = 5
batch_size = 32
homogeneity = False  # only referenced by the commented-out loader call below

#train_loader, test_loader = get_specific_non_IID_loader(num_clients,batch_size,homogeneity) 
# Both returned values are lists holding one loader per client.
train_loader, test_loader = get_non_IID_loader_digit_trios(num_clients,batch_size)
# Track the carbon footprint of this run (cumulator integration not finished yet)
cumulator = base.Cumulator()
cumulator.on()

# NOTE(review): run_weight_erosion_non_IID is not defined in the visible cells of
# this notebook, so this cell presumably fails on a fresh kernel - confirm where
# it comes from. The visible run_weight_erosion takes an extra `distribution`
# argument and a single test loader, so it is not a drop-in replacement.
run_weight_erosion_non_IID(train_loader,test_loader,num_clients,batch_size,selected_agent_index,num_rounds,epochs)

cumulator.off()
print('The total carbon footprint for these computations is : ',cumulator.total_carbon_footprint())