In [4]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
import numpy as np
from tqdm import tqdm


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=2, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=2, padding=1)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1)
        self.fc1 = nn.Linear(2048, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = self.conv3(x)
        x = F.relu(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output


class GradientStocker:
    def __init__(self, model_names):
        for item in model_names:
            setattr(self, item, 0)

    def get_attributes(self):
        return self.__dict__

    def add_gradient(self, model):
        for name, param in model.named_parameters():
            setattr(self, name, getattr(self, name) + param.grad.data.cpu())

    def euclidian_distance(self, grad_current_agent):
        """Computes the relative euclidean distance of the flattened tensor between the current model and the global model"""
        flattened_grad_selected = self.flatten(list(self.get_attributes().values()))
        flattened_grad_current = self.flatten(list(grad_current_agent.get_attributes().values()))
        return torch.dist(flattened_grad_selected, flattened_grad_current, 2) / torch.norm(flattened_grad_selected, 2)

    def flatten(self, gradient_list):
        """Returns an aggregated tensor of all the gradients for one model"""
        gradients = list(map(lambda g : torch.flatten(g), gradient_list))
        return torch.cat(gradients, 0)


def client_update(client_model, optimizer, train_loader, epoch=5):
    """Train a client_model on the train_loder data."""
    model_names = []
    for name, param in model.named_parameters():
        model_names.append(name)
    gradient_stocker = GradientStocker(model_names)
    
    for e in range(epoch):
        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.cuda(), target.cuda()
            optimizer.zero_grad()
            output = client_model(data)
            loss = F.nll_loss(output, target)
            loss.backward()
            optimizer.step()
            gradient_stocker.add_gradient(client_model)
    return loss.item(), gradient_stocker


def weighted_average_gradients(gradients, weights):
    """Compute the weighted average gradient."""
    weighted_averages = {}
    for key in gradients[0].get_attributes().keys():
        weighted_averages[key] = weighted_average_from_key(key, gradients, weights)
    return weighted_averages

def weighted_average_from_key(key, gradients, weights):
    n = 0
    d = 0 
    for idx, g_dict in enumerate(gradients) :
        n += g_dict.get_attributes()[key] * weights[idx]
        d += weights[idx]
    return n / d

def compute_weight(alpha_prev, round, relative_distance, data_size, batch_size, distance_penalty, size_penalty):
    """Computes the weight alpha for round r"""
    size_factor = (1 + size_penalty * math.floor(((round - 1) * batch_size) / data_size)) 
    distance_factor = distance_penalty * relative_distance
    alpha = alpha_prev - size_factor * distance_factor 
    return max(0,alpha)

def update_grad(model, gradient, alpha): 
    for name, param in model.named_parameters():
        param.data -= gradient[name].cuda() * alpha
    return model 

def share_weight_erosion_model(shared_model, client_models):
    for model in client_models:
        model.load_state_dict(shared_model.state_dict())

def evaluate(global_model, data_loader):
    """Compute loss and accuracy of a model on a data_loader."""
    global_model.eval()
    loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in data_loader:
            data, target = data.cuda(), target.cuda()
            output = global_model(data)
            loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    loss /= len(data_loader.dataset)
    acc = correct / len(data_loader.dataset)

    return loss, acc

ModuleNotFoundError: No module named 'torchvision'

In [18]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [26]:
# IID case: all the clients have images of all the classes

# Hyperparameters

num_clients = 5
num_rounds = 10
epochs = 1
batch_size = 32
distance_penalty = 0.05
size_penalty = 2
selected_agent_index = 0

# weight_vector

weight_vector = np.ones(num_clients)

# Creating decentralized datasets

traindata = datasets.MNIST('./data', train=True, download=True,
                       transform=transforms.Compose([transforms.ToTensor(),transforms.Normalize((0.1307,), (0.3081,))])
                       )
traindata_split = torch.utils.data.random_split(traindata, [int(traindata.data.shape[0] / num_clients) for _ in range(num_clients)])
train_loader = [torch.utils.data.DataLoader(x, batch_size=batch_size, shuffle=True) for x in traindata_split]
test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('./data', train=False, transform=transforms.Compose([transforms.ToTensor(),transforms.Normalize((0.1307,), (0.3081,))])
        ), batch_size=batch_size, shuffle=True)
# Instantiate models and optimizers

to_share_model = Net().cuda()
client_models = [Net().cuda() for _ in range(num_clients)]
for model in client_models:
    model.load_state_dict(to_share_model.state_dict())

opt = [optim.SGD(model.parameters(), lr=0.1) for model in client_models]

grad_vector = [None for _ in range(num_clients)]
# Runnining Weight Erosion 

for r in range(num_rounds):
    # client update
    loss = 0
    for i in range(num_clients):
        loss_tmp, grad_vector[i] = client_update(client_models[i], opt[i], train_loader[i], epoch=epochs)
        loss += loss_tmp
        d_rel = relative_distance_vector(grad_vector[selected_agent_index], grad_vector[i])
        weight_vector[i] = compute_weight(weight_vector[i], r + 1, d_rel, len(train_loader[i]), batch_size, distance_penalty, size_penalty)
        print(f"Weight alpha for agent {i} : {weight_vector[i]}")
    
    # Weight Erosion Scheme 
    weighted_mean_gradient = weighted_average_gradients(grad_vector, weight_vector)
    to_share_model = update_grad(to_share_model, weighted_mean_gradient, 0.1)
    
    # Share model to all agents 
    share_weight_erosion_model(to_share_model, client_models)
    
    # Evalutate on the global test set (for now)
    test_loss, acc = evaluate(to_share_model, test_loader)

    print('%d-th round' % r)
    print('average train loss %0.3g | test loss %0.3g | test acc: %0.3f' % (loss / num_clients, test_loss, acc))

Weight alpha for agent 0 : 1.0
Weight alpha for agent 1 : 0.9751015901565552
Weight alpha for agent 2 : 0.9749845266342163
Weight alpha for agent 3 : 0.9743555784225464
Weight alpha for agent 4 : 0.9722981452941895
0-th round
average train loss 0.326 | test loss 0.158 | test acc: 0.954
Weight alpha for agent 0 : 1.0
Weight alpha for agent 1 : 0.9280137419700623
Weight alpha for agent 2 : 0.9273066520690918
Weight alpha for agent 3 : 0.9273982048034668
Weight alpha for agent 4 : 0.9252562522888184
1-th round
average train loss 0.137 | test loss 0.0893 | test acc: 0.972
Weight alpha for agent 0 : 1.0
Weight alpha for agent 1 : 0.8709380626678467
Weight alpha for agent 2 : 0.8714126944541931
Weight alpha for agent 3 : 0.8689270615577698
Weight alpha for agent 4 : 0.8669538497924805
2-th round
average train loss 0.0488 | test loss 0.0652 | test acc: 0.978
Weight alpha for agent 0 : 1.0
Weight alpha for agent 1 : 0.8078835606575012
Weight alpha for agent 2 : 0.8085888028144836
Weight alpha 

In [27]:
# NON-IID case: every client has images of two categories chosen from [0, 1], [2, 3], [4, 5], [6, 7], or [8, 9].

# Hyperparameters

num_clients = 5
num_rounds = 10
epochs = 1
batch_size = 32
distance_penalty = 0.05
size_penalty = 2
selected_agent_index = 0

# weight_vector

weight_vector = np.ones(num_clients)

# Creating decentralized datasets

traindata = datasets.MNIST('./data', train=True, download=True,
                       transform=transforms.Compose([transforms.ToTensor(),transforms.Normalize((0.1307,), (0.3081,))])
                       )
target_labels = torch.stack([traindata.targets == i for i in range(10)])
target_labels_split = []
for i in range(5):
    target_labels_split += torch.split(torch.where(target_labels[(2 * i):(2 * (i + 1))].sum(0))[0], int(60000 / num_clients))
traindata_split = [torch.utils.data.Subset(traindata, tl) for tl in target_labels_split]
train_loader = [torch.utils.data.DataLoader(x, batch_size=batch_size, shuffle=True) for x in traindata_split]

test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('./data', train=False, transform=transforms.Compose([transforms.ToTensor(),transforms.Normalize((0.1307,), (0.3081,))])
        ), batch_size=batch_size, shuffle=True)

# Instantiate models and optimizers

to_share_model = Net().cuda()
client_models = [Net().cuda() for _ in range(num_clients)]
for model in client_models:
    model.load_state_dict(to_share_model.state_dict())

opt = [optim.SGD(model.parameters(), lr=0.1) for model in client_models]


for r in range(num_rounds):
    # client update
    loss = 0
    for i in range(num_clients):
        loss_tmp, grad_vector[i] = client_update(client_models[i], opt[i], train_loader[i], epoch=epochs)
        loss += loss_tmp
        d_rel = relative_distance_vector(grad_vector[selected_agent_index], grad_vector[i])
        weight_vector[i] = compute_weight(weight_vector[i], r + 1, d_rel, len(train_loader[i]), batch_size, distance_penalty, size_penalty)
        print(f"Weight alpha for agent {i} : {weight_vector[i]}")
    
    # Weight Erosion Scheme 
    weighted_mean_gradient = weighted_average_gradients(grad_vector, weight_vector)
    to_share_model = update_grad(to_share_model, weighted_mean_gradient, 0.1)
    
    # Share model to all agents 
    share_weight_erosion_model(to_share_model, client_models)
    
    # Evalutate on the global test set (for now)
    test_loss, acc = evaluate(to_share_model, test_loader)

    print('%d-th round' % r)
    print('average train loss %0.3g | test loss %0.3g | test acc: %0.3f' % (loss / num_clients, test_loss, acc))

Weight alpha for agent 0 : 1.0
Weight alpha for agent 1 : 0.9775323271751404
Weight alpha for agent 2 : 0.917000949382782
Weight alpha for agent 3 : 0.9496455192565918
Weight alpha for agent 4 : 0.9249451756477356
0-th round
average train loss 0.432 | test loss 2.32 | test acc: 0.212
Weight alpha for agent 0 : 1.0
Weight alpha for agent 1 : 0.9514561891555786
Weight alpha for agent 2 : 0.7761341333389282
Weight alpha for agent 3 : 0.8420314788818359
Weight alpha for agent 4 : 0.8196696639060974
1-th round
average train loss 0.179 | test loss 2.41 | test acc: 0.334
Weight alpha for agent 0 : 1.0
Weight alpha for agent 1 : 0.9206491708755493
Weight alpha for agent 2 : 0.6402342915534973
Weight alpha for agent 3 : 0.7097185850143433
Weight alpha for agent 4 : 0.7099452018737793
2-th round
average train loss 0.083 | test loss 2.84 | test acc: 0.368
Weight alpha for agent 0 : 1.0
Weight alpha for agent 1 : 0.8791031241416931
Weight alpha for agent 2 : 0.4820711016654968
Weight alpha for age

In [None]:
# IID case: all the clients have images of all the classes

# Hyperparameters

num_clients = 5
num_rounds = 5
epochs = 1
batch_size = 32

# Communication matrix

comm_matrix = np.ones((num_clients, num_clients)) / num_clients
# comm_matrix = np.eye(num_clients)

# Creating decentralized datasets

traindata = datasets.MNIST('./data', train=True, download=True,
                       transform=transforms.Compose([transforms.ToTensor(),transforms.Normalize((0.1307,), (0.3081,))])
                       )
traindata_split = torch.utils.data.random_split(traindata, [int(traindata.data.shape[0] / num_clients) for _ in range(num_clients)])
train_loader = [torch.utils.data.DataLoader(x, batch_size=batch_size, shuffle=True) for x in traindata_split]

test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('./data', train=False, transform=transforms.Compose([transforms.ToTensor(),transforms.Normalize((0.1307,), (0.3081,))])
        ), batch_size=batch_size, shuffle=True)

# Instantiate models and optimizers

global_model = Net().cuda()
client_models = [Net().cuda() for _ in range(num_clients)]
for model in client_models:
    model.load_state_dict(global_model.state_dict())

opt = [optim.SGD(model.parameters(), lr=0.1) for model in client_models]

# Runnining Decentralized training

for r in range(num_rounds):
    # client update
    loss = 0
    for i in range(num_clients):
        loss += client_update(client_models[i], opt[i], train_loader[i], epoch=epochs)
    
    # diffuse params
    diffuse_params(client_models, comm_matrix)

    average_models(global_model, client_models)
    test_loss, acc = evaluate(global_model, test_loader)
    
    print('%d-th round' % r)
    print('average train loss %0.3g | test loss %0.3g | test acc: %0.3f' % (loss / num_clients, test_loss, acc))