In [1]:
import sys
import os
import argparse

# 添加环境
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), "../../MyExpr")))
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), "../../FedML")))
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), "../../")))

print(sys.path)

['/home/guest/Fed_Expr', '/home/guest/Fed_Expr/FedML', '/home/guest/Fed_Expr/MyExpr', '/home/guest/Fed_Expr/MyExpr/notebook', '/home/guest/miniconda/envs/fedml/lib/python37.zip', '/home/guest/miniconda/envs/fedml/lib/python3.7', '/home/guest/miniconda/envs/fedml/lib/python3.7/lib-dynload', '', '/home/guest/miniconda/envs/fedml/lib/python3.7/site-packages', '/home/guest/miniconda/envs/fedml/lib/python3.7/site-packages/IPython/extensions', '/home/guest/.ipython']


In [None]:
import numpy as np
import torch

shard_per_user = 2
num_users = 100
num_classes = 10
shard_per_class = int(shard_per_user * num_users / num_classes)

# 随机给每个client分配 num_classes*shard_per_class 个 shard
rand_set_all = list(range(num_classes)) * shard_per_class
np.random.shuffle(rand_set_all)
rand_set_all = np.array(rand_set_all).reshape((num_users, -1))
# print(rand_set_all)

# for _ in range(10):
#     print(np.random.choice(10, replace=False))

# d = {1:1, 2:3, 3:4}
# print(d)
# d.pop(1)
# print(d)
# for key, value in d.items():
#     print(key, value)

In [None]:
import argparse

parser = argparse.ArgumentParser(description='Federated Learning')
parser.add_argument('--data_seed', default=0, type=int, help="Random seed for initializing data")
args = parser.parse_known_args()[0]


In [None]:
def client_noniid(dataset, num_users, shard_per_user, rand_set_all=[], seed=args.data_seed):
    """
    Sample non-IID client data from dataset in pathological manner - from LG-FedAvg implementation
    :param dataset:
    :param num_users:
    :return: (dictionary, where keys = client_id / index, and values are dataset indices), rand_set_all (all classes)

    shard_per_user should be a factor of the dataset size
    """
    dict_users = {i: np.array([], dtype='int64') for i in range(num_users)}

    idxs_dict = {}

    # 将dataset中的数据放入字典
    for i in range(len(dataset)):
        label = torch.tensor(dataset.targets[i]).item()
        if label not in idxs_dict.keys():
            idxs_dict[label] = []
        idxs_dict[label].append(i)

    # 统计标签种类
    num_classes = len(np.unique(dataset.targets))
    # 计算每个类要分出多少个shard
    shard_per_class = int(shard_per_user * num_users / num_classes)
    
    # 生成 label -> shards的字典
    for label in idxs_dict.keys():
        x = idxs_dict[label]
        # 计算分shard后多余的数据量
        num_leftover = len(x) % shard_per_class
        # 记录余数
        leftover = x[-num_leftover:] if num_leftover > 0 else []
        # 裁剪多余数据
        x = np.array(x[:-num_leftover]) if num_leftover > 0 else np.array(x)
        # reshape成[shard_num, shard_size]
        x = x.reshape((shard_per_class, -1))
        x = list(x)

        # 多余数据有限填充到x[i]
        for i, idx in enumerate(leftover):
            x[i] = np.concatenate([x[i], [idx]])
        # idxs_dict: label -> [shard_num, shard_zie]
        idxs_dict[label] = x

    # 总共num_classes*shard_per_class个shard， 随机给每个client分配 
    np.random.seed(seed)
    if len(rand_set_all) == 0:
        rand_set_all = list(range(num_classes)) * shard_per_class
        np.random.shuffle(rand_set_all)
        rand_set_all = np.array(rand_set_all).reshape((num_users, -1))

    # 按照上述shard划分，为每个user分配对应的shard
    # Divide and assign
    np.random.seed(seed)
    for i in range(num_users):
        rand_set_label = rand_set_all[i]
        rand_set = []
        for label in rand_set_label:
            # replace是指抽取不放回，但是这里并没有指定size，每次都独立取出一个idx。
            idx = np.random.choice(len(idxs_dict[label]), replace=False)
            # 弹出idx，防止重复取
            rand_set.append(idxs_dict[label].pop(idx))
        # print(rand_set)
        # 将rand_set拼接成一条数据，放入dic_user[i]中
        dict_users[i] = np.concatenate(rand_set)


    test = []
    for key, value in dict_users.items():
        x = np.unique(torch.tensor(dataset.targets)[value])
        assert(len(x)) <= shard_per_user
        test.append(value)
    test = np.concatenate(test)
    assert(len(test) == len(dataset))
    assert(len(set(list(test))) == len(dataset))

    return dict_users, rand_set_all

In [None]:
from collections import Counter

def compute_emd(targets_1, targets_2):
    """Calculates Earth Mover's Distance between two array-like objects (dataset labels)"""
    total_targets = []
    total_targets.extend(list(np.unique(targets_1)))
    total_targets.extend(list(np.unique(targets_2)))

    emd = 0

    counts_1 = Counter(targets_1)
    counts_2 = Counter(targets_2)

    size_1 = len(targets_1)
    size_2 = len(targets_2)

    for t in counts_1:
        count_2 = counts_2[t] if t in counts_2 else 0
        emd += np.abs((counts_1[t] / size_1) - (count_2 / size_2))

    for t in counts_2:
        count_1 = counts_1[t] if t in counts_1 else 0
        emd += np.abs((counts_2[t] / size_2) - (count_1 / size_1))

    return emd

In [None]:
from torchvision import transforms
from torchvision.datasets import utils, MNIST, CIFAR10

tra_trans = transforms.Compose([
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
        ])
val_trans = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])
# len(trainset) = 50000 len(testset) = 10000
trainset = CIFAR10(root="./../data", train=True, download=True, transform=tra_trans)
testset = CIFAR10(root="./../data", train=False, download=True, transform=val_trans)

In [None]:
d_u, rsa = client_noniid(trainset, 100, 5, seed=0)
print(len(d_u))
print(len(d_u[0]))
print(len(np.unique(d_u[0])))

# for key, value in d_u.items():
#     # 抽出当前client包含的class
#     # 有可能会给client放入同类
#     x = np.unique(torch.tensor(trainset.targets)[value])
#     # print(torch.tensor(trainset.targets)[value])
#     print("x=", x)
#     # print("value=", value)
#     # break
#     assert(len(x)) <= shard_per_user

emd = []
for ix in d_u:
    client_targets = [trainset.targets[x] for x in d_u[ix]]
    emd.append(compute_emd(client_targets, trainset.targets))
print(emd)
average_emd = np.mean([compute_emd([trainset.targets[x] for x in d_u[ix]], trainset.targets) for ix in d_u], axis=0)
# average_emd = np.mean(emd)
print(average_emd)

In [None]:
dic = {1:['a','b'], 3:['q','s'], 4:['g','u'], 5:['n','v']}
for i in dic:
    print(i)

In [28]:
import argparse

parser = argparse.ArgumentParser(description='Federated Learning')
parser.add_argument('--data_seed', default=0, type=int, help="Random seed for initializing data")
args = parser.parse_known_args()[0]


In [29]:
def client_noniid(dataset, num_users, shard_per_user, rand_set_all=[], seed=args.data_seed):
    """
    Sample non-IID client data from dataset in pathological manner - from LG-FedAvg implementation
    :param dataset:
    :param num_users:
    :return: (dictionary, where keys = client_id / index, and values are dataset indices), rand_set_all (all classes)

    shard_per_user should be a factor of the dataset size
    """
    dict_users = {i: np.array([], dtype='int64') for i in range(num_users)}

    idxs_dict = {}

    # 将dataset中的数据放入字典
    for i in range(len(dataset)):
        label = torch.tensor(dataset.targets[i]).item()
        if label not in idxs_dict.keys():
            idxs_dict[label] = []
        idxs_dict[label].append(i)

    # 统计标签种类
    num_classes = len(np.unique(dataset.targets))
    # 计算每个类要分出多少个shard
    shard_per_class = int(shard_per_user * num_users / num_classes)
    
    # 生成 label -> shards的字典
    for label in idxs_dict.keys():
        x = idxs_dict[label]
        # 计算分shard后多余的数据量
        num_leftover = len(x) % shard_per_class
        # 记录余数
        leftover = x[-num_leftover:] if num_leftover > 0 else []
        # 裁剪多余数据
        x = np.array(x[:-num_leftover]) if num_leftover > 0 else np.array(x)
        # reshape成[shard_num, shard_size]
        x = x.reshape((shard_per_class, -1))
        x = list(x)

        # 多余数据有限填充到x[i]
        for i, idx in enumerate(leftover):
            x[i] = np.concatenate([x[i], [idx]])
        # idxs_dict: label -> [shard_num, shard_zie]
        idxs_dict[label] = x

    # 总共num_classes*shard_per_class个shard， 随机给每个client分配 
    np.random.seed(seed)
    if len(rand_set_all) == 0:
        rand_set_all = list(range(num_classes)) * shard_per_class
        np.random.shuffle(rand_set_all)
        rand_set_all = np.array(rand_set_all).reshape((num_users, -1))

    # 按照上述shard划分，为每个user分配对应的shard
    # Divide and assign
    np.random.seed(seed)
    for i in range(num_users):
        rand_set_label = rand_set_all[i]
        rand_set = []
        for label in rand_set_label:
            # replace是指抽取不放回，但是这里并没有指定size，每次都独立取出一个idx。
            idx = np.random.choice(len(idxs_dict[label]), replace=False)
            # 弹出idx，防止重复取
            rand_set.append(idxs_dict[label].pop(idx))
        # print(rand_set)
        # 将rand_set拼接成一条数据，放入dic_user[i]中
        dict_users[i] = np.concatenate(rand_set)


    test = []
    for key, value in dict_users.items():
        x = np.unique(torch.tensor(dataset.targets)[value])
        assert(len(x)) <= shard_per_user
        test.append(value)
    test = np.concatenate(test)
    assert(len(test) == len(dataset))
    assert(len(set(list(test))) == len(dataset))

    return dict_users, rand_set_all

In [44]:
from collections import Counter

def compute_emd(targets_1, targets_2):
    """Calculates Earth Mover's Distance between two array-like objects (dataset labels)"""
    total_targets = []
    total_targets.extend(list(np.unique(targets_1)))
    total_targets.extend(list(np.unique(targets_2)))

    emd = 0

    counts_1 = Counter(targets_1)
    counts_2 = Counter(targets_2)

    size_1 = len(targets_1)
    size_2 = len(targets_2)

    for t in counts_1:
        count_2 = counts_2[t] if t in counts_2 else 0
        emd += np.abs((counts_1[t] / size_1) - (count_2 / size_2))

    for t in counts_2:
        count_1 = counts_1[t] if t in counts_1 else 0
        emd += np.abs((counts_2[t] / size_2) - (count_1 / size_1))

    return emd

In [30]:
from torchvision import transforms
from torchvision.datasets import utils, MNIST, CIFAR10

tra_trans = transforms.Compose([
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
        ])
val_trans = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])
# len(trainset) = 50000 len(testset) = 10000
trainset = CIFAR10(root="./../data", train=True, download=True, transform=tra_trans)
testset = CIFAR10(root="./../data", train=False, download=True, transform=val_trans)

Files already downloaded and verified
Files already downloaded and verified


In [56]:
d_u, rsa = client_noniid(trainset, 100, 5, seed=0)
print(len(d_u))
print(len(d_u[0]))
print(len(np.unique(d_u[0])))

# for key, value in d_u.items():
#     # 抽出当前client包含的class
#     # 有可能会给client放入同类
#     x = np.unique(torch.tensor(trainset.targets)[value])
#     # print(torch.tensor(trainset.targets)[value])
#     print("x=", x)
#     # print("value=", value)
#     # break
#     assert(len(x)) <= shard_per_user

emd = []
for ix in d_u:
    client_targets = [trainset.targets[x] for x in d_u[ix]]
    emd.append(compute_emd(client_targets, trainset.targets))
print(emd)
average_emd = np.mean([compute_emd([trainset.targets[x] for x in d_u[ix]], trainset.targets) for ix in d_u], axis=0)
# average_emd = np.mean(emd)
print(average_emd)

100
500
500
[1.5000000000000002, 1.8000000000000007, 2.1000000000000005, 1.5000000000000002, 1.8000000000000003, 1.5000000000000002, 2.1000000000000005, 1.8000000000000005, 1.8000000000000007, 1.5000000000000002, 1.8000000000000003, 1.8000000000000003, 1.5000000000000002, 1.5000000000000002, 1.8000000000000003, 1.5000000000000002, 2.1000000000000005, 1.5000000000000002, 2.1000000000000005, 1.8000000000000003, 1.8000000000000007, 1.5000000000000002, 1.8000000000000003, 1.5000000000000002, 1.5000000000000002, 2.1000000000000005, 1.8000000000000003, 1.5000000000000002, 1.8000000000000007, 1.8000000000000005, 1.8000000000000003, 1.8000000000000003, 1.8000000000000007, 2.1000000000000005, 1.5000000000000002, 1.8000000000000007, 1.5000000000000002, 1.8000000000000007, 1.5000000000000002, 2.1, 1.5000000000000002, 1.5000000000000002, 1.8000000000000003, 1.5000000000000002, 1.5000000000000002, 2.1000000000000005, 1.8000000000000003, 1.5000000000000002, 1.8000000000000005, 2.1, 1.800000000000000

In [57]:
dic = {1:['a','b'], 3:['q','s'], 4:['g','u'], 5:['n','v']}
for i in dic:
    print(i)

1
3
4
5
