# This notebook contains
### Code for splitting the data among clients in both an IID and a non-IID manner

### Import Library

In [None]:
from IPython.display import HTML, display

# Inject a CSS override so the notebook container uses 90% of the browser width.
wide_container_css = "<style>.container { width:90% !important; }</style>"
display(HTML(wide_container_css))

In [None]:
from __future__ import print_function
import argparse, os, sys, csv, shutil, time, random, operator, pickle, ast, math
import numpy as np
import pandas as pd
from torch.optim import Optimizer
import torch.nn.functional as F
import torch
import pickle
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.optim as optim
import torch.utils.data as data
import torch.multiprocessing as mp

### Get cifar10 data

In [7]:
import torchvision.transforms as transforms
import torchvision.datasets as datasets

# Dataset root. Defaults to the original cluster path; set CIFAR10_DATA_DIR to
# run the notebook on another machine without editing this cell.
data_loc = os.environ.get('CIFAR10_DATA_DIR', '/mnt/nfs/work1/amir/vshejwalkar/cifar10_data/')

# Tensor conversion + per-channel normalisation; the same transform is used for
# both the train and the test portion because they are merged below.
train_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ])

cifar10_train = datasets.CIFAR10(root=data_loc, train=True, download=True, transform=train_transform)

cifar10_test = datasets.CIFAR10(root=data_loc, train=False, download=True, transform=train_transform)


def _collect_samples(dataset):
    """Return (images, labels) lists with every image converted to a numpy array.

    Each sample is indexed exactly once: every ``dataset[idx]`` lookup re-runs
    the transform, so fetching image and label separately doubles the work.
    """
    images, labels = [], []
    for idx in range(len(dataset)):
        image, label = dataset[idx]
        images.append(image.numpy())
        labels.append(label)
    return images, labels


# Merge train and test portions into one pool; the notebook re-splits it later.
train_images, train_labels = _collect_samples(cifar10_train)
test_images, test_labels = _collect_samples(cifar10_test)

X = np.array(train_images + test_images)
Y = np.array(train_labels + test_labels)

print('total data len: ',len(X))

# Load the cached permutation if present so every run sees the same shuffle;
# otherwise create it once and persist it.
shuffle_path = './cifar10_shuffle.pkl'
if not os.path.isfile(shuffle_path):
    all_indices = np.arange(len(X))
    np.random.shuffle(all_indices)
    with open(shuffle_path, 'wb') as shuffle_file:
        pickle.dump(all_indices, shuffle_file)
else:
    # NOTE(security): pickle.load can execute arbitrary code; only use a cache
    # file that this notebook produced itself.
    with open(shuffle_path, 'rb') as shuffle_file:
        all_indices = pickle.load(shuffle_file)

# A stale cache (e.g. created for a different dataset size) would otherwise
# silently subsample the data or fail with an opaque IndexError.
if len(all_indices) != len(X):
    raise ValueError(
        'cached shuffle in %s has %d entries but the dataset has %d samples; '
        'delete the file to regenerate it' % (shuffle_path, len(all_indices), len(X)))

X=X[all_indices]
Y=Y[all_indices]

print(X.shape)
print(Y.shape)

Files already downloaded and verified
Files already downloaded and verified
total data len:  60000
(60000, 3, 32, 32)
(60000,)


In [8]:
# Federated-split configuration: number of clients and samples per client.
nusers=50
user_tr_len=1000

# Derived / auxiliary sizes.
total_tr_len=user_tr_len*nusers
val_len=5000
te_len=5000

# data loading
print('total data len: ',len(X))

# Same load-or-create logic as the data-loading cell: reuse the cached
# permutation when it exists, otherwise create and persist one. File handles
# are opened with `with` so they are closed deterministically.
shuffle_path = './cifar10_shuffle.pkl'
if not os.path.isfile(shuffle_path):
    all_indices = np.arange(len(X))
    np.random.shuffle(all_indices)
    with open(shuffle_path, 'wb') as shuffle_file:
        pickle.dump(all_indices, shuffle_file)
else:
    # NOTE(security): pickle.load can execute arbitrary code; only use a cache
    # file that this notebook produced itself.
    with open(shuffle_path, 'rb') as shuffle_file:
        all_indices = pickle.load(shuffle_file)


total data len:  60000


### Non-IID split (fixed dominant-class skew; the cell below does not sample from a Dirichlet distribution)

### Divide cifar10 data among 50 clients in Non-IID fashion (Christy)

In [9]:
user_tr_data_tensors = []
user_tr_label_tensors = []

# Group sample indices by class label (assumes labels are 0..num_classes-1).
num_classes = len(np.unique(Y))
grouped_indices = [np.where(Y == i)[0] for i in range(num_classes)]

# Stratified split per class: the first 5000 samples go to training, the next
# 500 to testing and whatever remains to validation (with 6000 samples per
# class, as in the merged CIFAR-10 pool, that is another 500).
tr_indices = []
te_indices = []
val_indices = []

for class_indices in grouped_indices:
    tr_indices.extend(class_indices[:5000])
    te_indices.extend(class_indices[5000:5500])
    val_indices.extend(class_indices[5500:])

total_tr_data=X[tr_indices]
total_tr_label=Y[tr_indices]

# Each split is built from its own index list (previously the validation
# arrays were built from te_indices and the test arrays from val_indices).
val_data=X[val_indices]
val_label=Y[val_indices]

te_data=X[te_indices]
te_label=Y[te_indices]

total_tr_data_tensor=torch.from_numpy(total_tr_data).type(torch.FloatTensor)
total_tr_label_tensor=torch.from_numpy(total_tr_label).type(torch.LongTensor)


# users have data with a skewed distribution of two dominant classes

# Re-group by label, this time with positions relative to the training arrays.
grouped_indices = [np.where(total_tr_label == i)[0] for i in range(num_classes)]

dominant_classes_num = 2
labels = np.unique(Y)
# Every label is repeated often enough to give each user `dominant_classes_num`
# picks. The count is computed with integer (ceiling) arithmetic: np.repeat
# expects integer repeats, and rounding up plus trimming keeps one full pair per
# user even when the total is not a multiple of the number of labels.
needed_picks = dominant_classes_num * nusers
repetitions = (needed_picks + len(labels) - 1) // len(labels)
number_pool = np.repeat(labels, repetitions)
np.random.shuffle(number_pool)
number_pool = number_pool[:needed_picks]
user_selections = [number_pool[i:i+dominant_classes_num]
                   for i in range(0, len(number_pool), dominant_classes_num)]

dominent_num = 300
other_num = 50
for i in range(nusers):
    dominant_classes = user_selections[i]
    indices = []

    for j in range(num_classes):
        # Dominant classes contribute `dominent_num` samples, all others
        # `other_num`; consumed indices are removed from the class pool so no
        # sample is handed to two users.
        take = dominent_num if j in dominant_classes else other_num
        indices.extend(grouped_indices[j][:take])
        grouped_indices[j] = grouped_indices[j][take:]

    user_tr_data_tensor=torch.from_numpy(total_tr_data[indices]).type(torch.FloatTensor)
    user_tr_label_tensor=torch.from_numpy(total_tr_label[indices]).type(torch.LongTensor)

    user_tr_data_tensors.append(user_tr_data_tensor)
    user_tr_label_tensors.append(user_tr_label_tensor)

# dominant class=[i,i]
# A user whose two dominant picks are the same class ends up with fewer than
# `user_tr_len` samples; top such users up from the indices left in the pools.
final_user_tr_data_tensors = []
final_user_tr_label_tensors = []
leftover_indices = [item for sublist in grouped_indices for item in sublist]
# The loop variables are deliberately not called `data`: that name is the
# `torch.utils.data` alias created in the import cell and must not be clobbered.
for user_data, user_label in zip(user_tr_data_tensors, user_tr_label_tensors):
    shortfall = user_tr_len - len(user_data)
    if shortfall > 0:
        fill_indices = leftover_indices[:shortfall]
        leftover_indices = leftover_indices[shortfall:]
        new_data_tensor = torch.from_numpy(total_tr_data[fill_indices]).type(torch.FloatTensor)
        new_label_tensor = torch.from_numpy(total_tr_label[fill_indices]).type(torch.LongTensor)
        user_data = torch.cat((user_data, new_data_tensor), dim=0)
        user_label = torch.cat((user_label, new_label_tensor), dim=0)
    final_user_tr_data_tensors.append(user_data)
    final_user_tr_label_tensors.append(user_label)


In [10]:
# Sanity check on the first client's shard: tensor shapes, then how many
# samples of each class it received.
first_user_data = final_user_tr_data_tensors[0]
first_user_labels = final_user_tr_label_tensors[0]
print(first_user_data.shape)
print(first_user_labels.shape)

tensor_int = first_user_labels.to(torch.int64)
unique_numbers, counts = torch.unique(tensor_int, return_counts=True)

# Convert to plain Python ints up front instead of casting inside the f-string.
for number, count in zip(unique_numbers.tolist(), counts.tolist()):
    print(f"Number {number} occurs {count} times")

torch.Size([1000, 3, 32, 32])
torch.Size([1000])
Number 0 occurs 50 times
Number 1 occurs 50 times
Number 2 occurs 50 times
Number 3 occurs 50 times
Number 4 occurs 300 times
Number 5 occurs 50 times
Number 6 occurs 50 times
Number 7 occurs 50 times
Number 8 occurs 300 times
Number 9 occurs 50 times


### Divide cifar10 data among 50 clients in IID fashion (Linxin)

In [11]:
user_tr_data_tensors = []
user_tr_label_tensors = []

# Each client receives `user_tr_len` training samples drawn uniformly at random
# without replacement, and no sample is shared between clients.
if nusers * user_tr_len > len(total_tr_data):
    raise ValueError('cannot give %d users %d samples each: only %d training samples available'
                     % (nusers, user_tr_len, len(total_tr_data)))

# One permutation sliced into disjoint chunks yields the same kind of random
# partition as repeatedly sampling and deleting, without re-copying the whole
# image array on every iteration.
shuffled_indices = np.random.permutation(len(total_tr_data))

for i in range(nusers):
    user_indices = shuffled_indices[i * user_tr_len:(i + 1) * user_tr_len]

    user_tr_data_tensor = torch.from_numpy(total_tr_data[user_indices]).type(torch.FloatTensor)
    # Labels are class indices, so they are stored as LongTensor, matching the
    # dominant-class split earlier in the notebook.
    user_tr_label_tensor = torch.from_numpy(total_tr_label[user_indices]).type(torch.LongTensor)

    user_tr_data_tensors.append(user_tr_data_tensor)
    user_tr_label_tensors.append(user_tr_label_tensor)
    print('user %d tr len %d, user_tr_data_tensor shape %s, user_tr_label_tensor shape %s'%(i,len(user_tr_data_tensor), user_tr_data_tensor.shape, user_tr_label_tensor.shape))



user 0 tr len 1000, user_tr_data_tensor shape torch.Size([1000, 3, 32, 32]), user_tr_label_tensor shape torch.Size([1000])
user 1 tr len 1000, user_tr_data_tensor shape torch.Size([1000, 3, 32, 32]), user_tr_label_tensor shape torch.Size([1000])
user 2 tr len 1000, user_tr_data_tensor shape torch.Size([1000, 3, 32, 32]), user_tr_label_tensor shape torch.Size([1000])
user 3 tr len 1000, user_tr_data_tensor shape torch.Size([1000, 3, 32, 32]), user_tr_label_tensor shape torch.Size([1000])
user 4 tr len 1000, user_tr_data_tensor shape torch.Size([1000, 3, 32, 32]), user_tr_label_tensor shape torch.Size([1000])
user 5 tr len 1000, user_tr_data_tensor shape torch.Size([1000, 3, 32, 32]), user_tr_label_tensor shape torch.Size([1000])
user 6 tr len 1000, user_tr_data_tensor shape torch.Size([1000, 3, 32, 32]), user_tr_label_tensor shape torch.Size([1000])
user 7 tr len 1000, user_tr_data_tensor shape torch.Size([1000, 3, 32, 32]), user_tr_label_tensor shape torch.Size([1000])
user 8 tr len 10

In [12]:
# Shapes are reported for client 0, the class histogram for client 35.
shape_user_data = user_tr_data_tensors[0]
shape_user_labels = user_tr_label_tensors[0]
print(shape_user_data.shape)
print(shape_user_labels.shape)

# Cast to int64 so the counts are reported for integer class ids whatever
# dtype the labels are stored in.
tensor_int = user_tr_label_tensors[35].to(torch.int64)
unique_numbers, counts = torch.unique(tensor_int, return_counts=True)

# Convert to plain Python ints up front instead of casting inside the f-string.
for number, count in zip(unique_numbers.tolist(), counts.tolist()):
    print(f"Number {number} occurs {count} times")

torch.Size([1000, 3, 32, 32])
torch.Size([1000])
Number 0 occurs 92 times
Number 1 occurs 113 times
Number 2 occurs 89 times
Number 3 occurs 99 times
Number 4 occurs 108 times
Number 5 occurs 106 times
Number 6 occurs 113 times
Number 7 occurs 86 times
Number 8 occurs 115 times
Number 9 occurs 79 times
