In [None]:
import torch
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, random_split, Subset, SubsetRandomSampler
import numpy as np

def cifar_iid(dataset, num_clients, train_fraction=0.8):
    """Split ``dataset`` into class-balanced (IID) client shards plus a validation set.

    The indices of every class are shuffled independently with
    ``np.random.shuffle``; each client then receives the same number of
    samples from every class. Whatever remains of a class once all clients
    have been served goes into a single shared validation set.

    Args:
        dataset: Object exposing ``classes`` (its length is taken as the
            number of classes), ``targets`` (one integer class label per
            sample, in sample order) and ``__len__``.
        num_clients: Number of client shards to create; must be positive.
        train_fraction: Share of each client's per-class quota
            (``len(dataset) // (num_clients * num_classes)``) that is handed
            to the client; the result is truncated to an integer. Must lie in
            ``(0, 1]``. Defaults to ``0.8``.

    Returns:
        tuple: ``(client_subsets, valset)`` where ``client_subsets`` is a list
        of ``num_clients`` ``Subset`` objects and ``valset`` is one ``Subset``
        built from the leftover indices of every class.

    Raises:
        ValueError: If ``num_clients`` is not positive, ``train_fraction`` is
            outside ``(0, 1]`` or the dataset declares no classes.
    """
    if num_clients <= 0:
        raise ValueError(f"num_clients must be positive, got {num_clients}")
    if not 0 < train_fraction <= 1:
        raise ValueError(f"train_fraction must be in (0, 1], got {train_fraction}")

    num_classes = len(dataset.classes)
    if num_classes == 0:
        raise ValueError("dataset must declare at least one class")

    # Bucket the sample indices by class label.
    class_indices = [[] for _ in range(num_classes)]
    for idx, target in enumerate(dataset.targets):
        # int() lets 0-dim tensor / numpy labels be used as list indices too.
        class_indices[int(target)].append(idx)

    # Shuffle inside each class so the slices taken below are random draws.
    for indices in class_indices:
        np.random.shuffle(indices)

    # Per-class quota of one client, and the part of it used for training.
    # Both are loop invariants, so they are computed once up front.
    samples_per_client_per_class = len(dataset) // (num_clients * num_classes)
    train_per_client_per_class = int(samples_per_client_per_class * train_fraction)
    # First index of a class that no client consumed.
    train_end = num_clients * train_per_client_per_class

    train_shards_indices = [[] for _ in range(num_clients)]
    val_shards_indices = []

    for indices in class_indices:
        # Consecutive, non-overlapping windows: one window per client.
        for client_idx in range(num_clients):
            start = client_idx * train_per_client_per_class
            stop = start + train_per_client_per_class
            train_shards_indices[client_idx].extend(indices[start:stop])
        # Everything past the last client's window becomes validation data.
        val_shards_indices.extend(indices[train_end:])

    client_subsets = [Subset(dataset, shard) for shard in train_shards_indices]
    valset = Subset(dataset, val_shards_indices)
    return client_subsets, valset

# Example usage: load CIFAR-100 and shard its training split IID across clients.
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ]
)

num_clients = 100
trainset = datasets.CIFAR100(root='./data', train=True, download=True, transform=transform)
client_subsets, valset = cifar_iid(trainset, num_clients)


# Sanity check: report how many samples of each class every client received.
from collections import Counter
for i, subset in enumerate(client_subsets):
    targets = [trainset.targets[idx] for idx in subset.indices]
    class_count = Counter(targets)
    print(f"Client {i} class distribution: {class_count}")


# Build one shuffled training loader per client and a single, unshuffled
# validation loader over the shared validation subset.
trainloaders = []
for client_subset in client_subsets:
    trainloaders.append(
        DataLoader(client_subset, batch_size=4, shuffle=True, num_workers=2)
    )
valloader = DataLoader(valset, batch_size=4, shuffle=False, num_workers=2)

# Helper that reports which classes a DataLoader serves
def check_classes(dataloader):
    """Print the label histogram of ``dataloader`` and the number of distinct labels.

    Iterates over ``dataloader`` once. Every item must unpack into an
    ``(images, labels)`` pair whose ``labels`` offers ``.numpy()``; the images
    are ignored.
    """
    class_counts = Counter()

    # Tally labels batch by batch rather than collecting them in a list first.
    for _, batch_labels in dataloader:
        class_counts.update(batch_labels.numpy())

    print(f"Class distribution: {class_counts}")
    print(len(class_counts))

# Smoke test: tally the labels served by the first client's training loader.
check_classes(trainloaders[0])


Files already downloaded and verified
Client 0 class distribution: Counter({0: 4, 1: 4, 2: 4, 3: 4, 4: 4, 5: 4, 6: 4, 7: 4, 8: 4, 9: 4, 10: 4, 11: 4, 12: 4, 13: 4, 14: 4, 15: 4, 16: 4, 17: 4, 18: 4, 19: 4, 20: 4, 21: 4, 22: 4, 23: 4, 24: 4, 25: 4, 26: 4, 27: 4, 28: 4, 29: 4, 30: 4, 31: 4, 32: 4, 33: 4, 34: 4, 35: 4, 36: 4, 37: 4, 38: 4, 39: 4, 40: 4, 41: 4, 42: 4, 43: 4, 44: 4, 45: 4, 46: 4, 47: 4, 48: 4, 49: 4, 50: 4, 51: 4, 52: 4, 53: 4, 54: 4, 55: 4, 56: 4, 57: 4, 58: 4, 59: 4, 60: 4, 61: 4, 62: 4, 63: 4, 64: 4, 65: 4, 66: 4, 67: 4, 68: 4, 69: 4, 70: 4, 71: 4, 72: 4, 73: 4, 74: 4, 75: 4, 76: 4, 77: 4, 78: 4, 79: 4, 80: 4, 81: 4, 82: 4, 83: 4, 84: 4, 85: 4, 86: 4, 87: 4, 88: 4, 89: 4, 90: 4, 91: 4, 92: 4, 93: 4, 94: 4, 95: 4, 96: 4, 97: 4, 98: 4, 99: 4})
Client 1 class distribution: Counter({0: 4, 1: 4, 2: 4, 3: 4, 4: 4, 5: 4, 6: 4, 7: 4, 8: 4, 9: 4, 10: 4, 11: 4, 12: 4, 13: 4, 14: 4, 15: 4, 16: 4, 17: 4, 18: 4, 19: 4, 20: 4, 21: 4, 22: 4, 23: 4, 24: 4, 25: 4, 26: 4, 27: 4, 28: 4, 29

  self.pid = os.fork()


Class distribution: Counter({5: 4, 3: 4, 0: 4, 36: 4, 67: 4, 39: 4, 59: 4, 55: 4, 94: 4, 8: 4, 6: 4, 82: 4, 71: 4, 47: 4, 17: 4, 72: 4, 29: 4, 22: 4, 89: 4, 1: 4, 88: 4, 9: 4, 65: 4, 54: 4, 81: 4, 96: 4, 52: 4, 11: 4, 73: 4, 25: 4, 98: 4, 79: 4, 41: 4, 75: 4, 40: 4, 62: 4, 42: 4, 31: 4, 74: 4, 13: 4, 37: 4, 21: 4, 16: 4, 48: 4, 20: 4, 18: 4, 60: 4, 69: 4, 26: 4, 93: 4, 15: 4, 34: 4, 77: 4, 49: 4, 92: 4, 2: 4, 4: 4, 66: 4, 84: 4, 56: 4, 27: 4, 85: 4, 45: 4, 33: 4, 86: 4, 53: 4, 23: 4, 50: 4, 83: 4, 32: 4, 38: 4, 19: 4, 57: 4, 90: 4, 95: 4, 78: 4, 51: 4, 99: 4, 14: 4, 76: 4, 63: 4, 7: 4, 64: 4, 87: 4, 91: 4, 58: 4, 28: 4, 12: 4, 97: 4, 30: 4, 61: 4, 43: 4, 35: 4, 46: 4, 10: 4, 70: 4, 44: 4, 24: 4, 80: 4, 68: 4})
100


In [89]:
import torch
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, random_split, Subset, SubsetRandomSampler
import numpy as np
# Define function for sharding with Nc
import random
def cifar_noniid(dataset, num_clients, Nc, train_fraction=0.8, max_attempts=1000, verbose=True):
    """Split ``dataset`` into non-IID client shards where every class lives on ``Nc`` clients.

    Each class is first assigned to ``Nc`` distinct clients by a randomized
    greedy procedure, which is retried when it runs into a dead end. The
    shuffled samples of the class are then cut into ``Nc`` equally sized
    windows, one per assigned client. Whatever remains of the class goes into
    a single shared validation set.

    Args:
        dataset: Object exposing ``classes`` (its length is taken as the
            number of classes), ``targets`` (one integer class label per
            sample, in sample order) and ``__len__``.
        num_clients: Number of client shards to create; must be positive.
        Nc: Number of distinct clients that hold samples of each class. Must
            satisfy ``1 <= Nc <= num_clients``.
        train_fraction: Share of the per-client, per-class quota
            (``len(dataset) // (Nc * num_classes)``) handed to a client; the
            result is truncated to an integer. Must lie in ``(0, 1]``.
            Defaults to ``0.8``.
        max_attempts: Upper bound on the number of randomized assignment
            attempts before giving up. Defaults to ``1000``.
        verbose: When true (the default), print the client set of every class.

    Returns:
        tuple: ``(client_subsets, valset)`` where ``client_subsets`` is a list
        of ``num_clients`` ``Subset`` objects and ``valset`` is one ``Subset``
        built from the leftover indices of every class.

    Raises:
        ValueError: If an argument is outside the ranges given above or the
            dataset declares no classes.
        RuntimeError: If no valid class-to-client assignment was found within
            ``max_attempts`` attempts.
    """
    def class_clients_sharding(num_classes, Nc):
        """Run one randomized assignment attempt.

        Returns a dict mapping each class index to a set of ``Nc`` client
        indices, or ``None`` if the attempt reached a state in which some
        class has no eligible client left.
        """
        # Round one: a shuffled list seeds every class with one client. When
        # there are as many clients as classes this is a permutation, so each
        # client starts with exactly one class.
        first_clients = [class_idx % num_clients for class_idx in range(num_classes)]
        random.shuffle(first_clients)
        class_clients = {
            class_idx: {first_clients[class_idx]} for class_idx in range(num_classes)
        }

        # Remaining rounds draw from a pool in which every client owns the
        # same number of slots, just enough to cover all outstanding demand
        # (ceil division). Tracking a counter per client instead of a flat
        # list of slots keeps each draw linear in the number of clients.
        extra_per_class = Nc - 1
        slots_per_client = -(-(num_classes * extra_per_class) // num_clients)
        remaining = [slots_per_client] * num_clients

        for _ in range(extra_per_class):
            # Visit the classes in a fresh random order every round.
            for class_idx in random.sample(range(num_classes), num_classes):
                assigned = class_clients[class_idx]
                candidates = [
                    client
                    for client in range(num_clients)
                    if remaining[client] > 0 and client not in assigned
                ]
                if not candidates:
                    # Dead end: every client with free slots already holds this class.
                    return None
                # Weighting by free slots equals a uniform draw over the slots.
                weights = [remaining[client] for client in candidates]
                chosen = random.choices(candidates, weights=weights, k=1)[0]
                assigned.add(chosen)
                remaining[chosen] -= 1

        return class_clients

    if num_clients <= 0:
        raise ValueError(f"num_clients must be positive, got {num_clients}")
    if not 1 <= Nc <= num_clients:
        # A class cannot be spread over more distinct clients than exist.
        raise ValueError(f"Nc must satisfy 1 <= Nc <= num_clients, got Nc={Nc}")
    if not 0 < train_fraction <= 1:
        raise ValueError(f"train_fraction must be in (0, 1], got {train_fraction}")
    if max_attempts < 1:
        raise ValueError(f"max_attempts must be at least 1, got {max_attempts}")

    num_classes = len(dataset.classes)
    if num_classes == 0:
        raise ValueError("dataset must declare at least one class")

    # The greedy assignment can paint itself into a corner; retry a bounded
    # number of times instead of looping forever on every kind of error.
    class_clients = None
    for _ in range(max_attempts):
        class_clients = class_clients_sharding(num_classes, Nc)
        if class_clients is not None:
            break
    if class_clients is None:
        raise RuntimeError(
            f"could not assign {Nc} clients to each of {num_classes} classes "
            f"within {max_attempts} attempts"
        )

    if verbose:
        for i in range(num_classes):
            print(class_clients[i])

    # Bucket the sample indices by class label.
    class_indices = [[] for _ in range(num_classes)]
    for idx, target in enumerate(dataset.targets):
        # int() lets 0-dim tensor / numpy labels be used as list indices too.
        class_indices[int(target)].append(idx)

    # Shuffle inside each class so the slices taken below are random draws.
    for indices in class_indices:
        np.random.shuffle(indices)

    train_shards_indices = [[] for _ in range(num_clients)]
    val_shards_indices = []
    samples_per_client_per_class = len(dataset) // (Nc * num_classes)
    train_per_client_per_class = int(samples_per_client_per_class * train_fraction)
    # First index of a class that no client consumed.
    train_end = Nc * train_per_client_per_class

    for class_idx, indices in enumerate(class_indices):
        # Hand the windows to the class's clients in random order; sorting
        # first gives random.sample the sequence it requires.
        ordered_clients = random.sample(sorted(class_clients[class_idx]), Nc)
        for position, client in enumerate(ordered_clients):
            start = position * train_per_client_per_class
            stop = start + train_per_client_per_class
            train_shards_indices[client].extend(indices[start:stop])
        # Everything past the last window becomes validation data.
        val_shards_indices.extend(indices[train_end:])

    client_subsets = [Subset(dataset, shard) for shard in train_shards_indices]
    valset = Subset(dataset, val_shards_indices)
    return client_subsets, valset

# Example usage: shard CIFAR-100 so that every class is held by Nc clients.
Nc = 50
num_clients = 100


transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ]
)
trainset = datasets.CIFAR100(root='./data', train=True, download=True, transform=transform)

# Non-IID sharding: one training subset per client plus a shared validation subset.
client_subsets, valset = cifar_noniid(trainset, num_clients, Nc)

# Sanity check: report how many samples of each class every client received.
from collections import Counter
for i, subset in enumerate(client_subsets):
    targets = [trainset.targets[idx] for idx in subset.indices]
    class_count = Counter(targets)
    print(f"Client {i} class distribution: {class_count}")


# Build one shuffled training loader per client and a single, unshuffled
# validation loader over the shared validation subset.
trainloaders = []
for client_subset in client_subsets:
    trainloaders.append(
        DataLoader(client_subset, batch_size=4, shuffle=True, num_workers=2)
    )
valloader = DataLoader(valset, batch_size=4, shuffle=False, num_workers=2)

# Helper that reports which classes a DataLoader serves.
# Same name as the helper in the earlier cell; defining it here lets this cell run on its own.
def check_classes(dataloader):
    """Print the label histogram of ``dataloader`` and the number of distinct labels.

    Iterates over ``dataloader`` once. Every item must unpack into an
    ``(images, labels)`` pair whose ``labels`` offers ``.numpy()``; the images
    are ignored.
    """
    class_counts = Counter()

    # Tally labels batch by batch rather than collecting them in a list first.
    for _, batch_labels in dataloader:
        class_counts.update(batch_labels.numpy())

    print(f"Class distribution: {class_counts}")
    print(len(class_counts))

# Smoke test: tally the labels served by the first client's training loader.
check_classes(trainloaders[0])

Files already downloaded and verified
{0, 2, 5, 7, 9, 11, 13, 16, 19, 20, 22, 23, 25, 26, 27, 32, 34, 35, 36, 38, 41, 43, 45, 47, 48, 49, 50, 51, 53, 54, 56, 57, 59, 61, 64, 72, 73, 75, 77, 78, 80, 82, 86, 90, 91, 92, 93, 95, 97, 99}
{4, 5, 6, 8, 9, 10, 12, 15, 17, 18, 19, 20, 21, 22, 28, 31, 33, 34, 36, 38, 40, 41, 46, 47, 48, 51, 52, 53, 55, 57, 59, 60, 62, 63, 66, 68, 69, 70, 73, 75, 80, 81, 84, 85, 88, 89, 91, 92, 95, 99}
{0, 1, 5, 8, 12, 13, 14, 16, 20, 21, 25, 28, 31, 33, 34, 35, 38, 40, 41, 44, 46, 48, 50, 51, 52, 53, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 68, 69, 70, 71, 72, 76, 77, 80, 85, 87, 90, 93, 98, 99}
{1, 2, 3, 4, 5, 7, 9, 10, 11, 13, 16, 17, 18, 21, 22, 25, 28, 29, 31, 33, 34, 36, 37, 38, 40, 41, 43, 44, 45, 46, 52, 53, 54, 55, 62, 64, 67, 68, 70, 73, 76, 81, 82, 84, 87, 89, 91, 92, 95, 98}
{3, 4, 6, 7, 9, 10, 12, 14, 15, 19, 20, 23, 24, 25, 26, 29, 31, 32, 35, 38, 39, 42, 47, 48, 49, 50, 51, 52, 57, 58, 60, 61, 62, 65, 67, 69, 70, 71, 77, 79, 81, 82, 83, 86, 87, 88,