In [1]:
# First we need to mount the Google drive
import os
from google.colab import drive
drive.mount('/content/Hadrive')

Mounted at /content/Hadrive


In [2]:

configs = dict({
"1": { "lr_initial": 0.1, "decay": 0.9, "sigma": 0.00000001, "const_C": 1000}
})


for index, config in configs.items():
  print(f"index: {index}")
  print(f"config: {config}")
  for key,value in config.items():
    print(f"key: {key}, value: {value}")



index: 1
config: {'lr_initial': 0.1, 'decay': 0.9, 'sigma': 1e-08, 'const_C': 1000}
key: lr_initial, value: 0.1
key: decay, value: 0.9
key: sigma, value: 1e-08
key: const_C, value: 1000


In [None]:
#!mkdir /content/Hadrive/MyDrive/Test1
#!mkdir /content/Hadrive/MyDrive/Test1/Tutorial1/

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler
# import torch.optim.lr_scheduler.StepLR as StepLR
# import torch.optim.lr_scheduler.LinearLR as LinearLR



from torchvision import datasets
from torchvision import transforms
import matplotlib.pyplot as plt

In [4]:
data_path = '/content/Hadrive/MyDrive/Test1/Tutorial1/'

transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4915, 0.4823, 0.4468), (0.2470, 0.2435, 0.2616))
])

# cifar10 = datasets.CIFAR10(data_path, train=True, download=True, transform=transform)
# cifar10_val = datasets.CIFAR10(data_path, train=False, download=True, transform=transform)

nbsamples = 100000

#https://stackoverflow.com/questions/47432168/taking-subsets-of-a-pytorch-dataset
cifar10_org = datasets.CIFAR10(data_path, train=True, download=False, transform=transform)
#subset_org = list(range(0, len(cifar10_org), len(cifar10_org)//100))
subset_org = list(range(0, nbsamples))
#cifar10 = torch.utils.data.Subset(cifar10_org, subset_org)
cifar10 = cifar10_org

cifar10_val_org = datasets.CIFAR10(data_path, train=False, download=False, transform=transform)
#subset_org = list(range(0, len(cifar10_val_org), len(cifar10_val_org)//100))
subset_org = list(range(0, nbsamples))
#cifar10_val = torch.utils.data.Subset(cifar10_val_org, subset_org)
cifar10_val = cifar10_val_org

print(f"lencifar10: {len(cifar10)}")
print(f"lencifar10_val: {len(cifar10_val)}")

lencifar10: 50000
lencifar10_val: 10000


In [5]:
train_loader = torch.utils.data.DataLoader(cifar10, batch_size=64,
                                           shuffle=True)
val_loader = torch.utils.data.DataLoader(cifar10_val, batch_size=64,
                                           shuffle=True)

In [6]:
# model
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(16, 8, kernel_size=3, padding=1)
        self.fc1 = nn.Linear(8 * 8 * 8, 32)
        # bài toán phân loại 10 lớp nên output ra 10 nodes
        self.fc2 = nn.Linear(32, 10)

    def forward(self, x):
        out = F.max_pool2d(torch.tanh(self.conv1(x)), 2)
        out = F.max_pool2d(torch.tanh(self.conv2(out)), 2)
        # flatten về dạng vector để cho vào neural network
        out = out.view(-1, 8 * 8 * 8)
        out = torch.tanh(self.fc1(out))
        out = self.fc2(out)
        return out


In [7]:
import pickle

import torch
import torchvision
import torch.nn as nn
import numpy as np
import torchvision.transforms as transforms
from collections import OrderedDict
from collections import defaultdict
from torch.func import functional_call, vmap, grad

def compute_loss(params, buffers, model, loss_fn, sample, target):
    batch = sample.unsqueeze(0)
    targets = target.unsqueeze(0)

    predictions = functional_call(model, (params, buffers), (batch,))
    loss = loss_fn(predictions, targets)
    return loss

def compute_gradients(model,loss_fn,samples,targets):
    '''
        We want to follow the tutorial in here to compute multiple grads in parallel:
                #https://pytorch.org/tutorials/intermediate/per_sample_grads.html?utm_source=whats_new_tutorials&utm_medium=per_sample_grads
                #https://pytorch.org/docs/stable/generated/torch.func.grad.html
                #https://towardsdatascience.com/introduction-to-functional-pytorch-b5bf739e1e6e
                #https://pytorch.org/functorch/stable/notebooks/per_sample_grads.html [PAY ATTENTION TO]
            Typically, we generate all gradients gis of samples sis in parallel in the helper function: compute_gradients
            The output of compute_gradients is an array called samples_grads
                sample s[0]: samples_grads[0][layer_1], samples_grads[0][layer_2], .... //g0
                    ...............
                sample s[L-1]: samples_grads[L-1][layer_1], samples_grads[L-1][layer_2], ....//g[L-1]
                where L is the number of samples in the mini-batch

            The compute_gradients call another helper function called compute_loss. This is used for computing the gradients in parallel
    '''

    ft_compute_grad = grad(compute_loss)
    ft_compute_sample_grad = vmap(ft_compute_grad, in_dims=(None, None, None, None, 0, 0))

    '''
    The ft_compute_grad function computes the gradient for a single (sample, target) pair.
    We can use vmap to get it to compute the gradient over an entire batch of samples and targets.
    Note that in_dims=(None, None, 0, 0) because we wish to
    map ft_compute_grad over the 0th dimension of the data and targets, and use the same params and buffers for each.
    '''

    params = {k: v.detach() for k, v in model.named_parameters()}
    buffers = {k: v.detach() for k, v in model.named_buffers()}

    ft_per_sample_grads = ft_compute_sample_grad(params, buffers,model, loss_fn,samples,targets)

    '''
    ft_per_sample_grads contains the STACKED gradients per layer.
    For example, we have two samples s0 and s1 and we have only two layers "bias" and "weight"
        s0 = ("weight": 1, "layer": 2)
        s1 = ("weight": 3, "layer": 4)
    Stacked gradients per layer means  = ("weight": [1,3], "bias":[2,4])
    Therefore, we have to unstack this stacked gradients to get back the gradient for each sample
    '''

    #get back per_sample_grad
    num_samples = len(samples)
    samples_grads = dict()

    for i in range(num_samples):
      samples_grads[i] = OrderedDict()

    '''
    1. Going through each layer in ft_per_sample_grads: key, value in ft_per_sample_grads.items()
    2. unstack the stacked of len(x) layers: unstacked_grads = torch.unbind(value, dim=0)
    3. redistribute the unstacked sample_layer_grad, i.e., samples_grads[i][key]

    Each sample has its own grad now but saved in the form of dictionary
    '''

    for key,value in ft_per_sample_grads.items():
        #unstack the grads for each layer
        unstacked_grads = torch.unbind(value, dim=0)
        i = 0
        for layer_grad in unstacked_grads:
            samples_grads[i][key] = layer_grad
            i += 1


    return samples_grads


def generate_private_grad(model,loss_fn,samples,targets,sigma,const_C):
    '''
        We generate private grad given a batch of samples (samples,targets) as introduced here https://arxiv.org/pdf/1607.00133.pdf
        The implementation flow is as follows:
            1. sample xi
            2. ===> gradient gi
            3. ===> clipped gradient gci
            4. ===> noisy aggregated (sum gci + noise)
            5. ===> normalized 1/B (sum gci + noise)

        We want to follow the tutorial in here to compute multiple grads in parallel:
            #https://pytorch.org/tutorials/intermediate/per_sample_grads.html?utm_source=whats_new_tutorials&utm_medium=per_sample_grads
            #https://pytorch.org/docs/stable/generated/torch.func.grad.html
            #https://towardsdatascience.com/introduction-to-functional-pytorch-b5bf739e1e6e
            #https://pytorch.org/functorch/stable/notebooks/per_sample_grads.html [PAY ATTENTION TO]
        Typically, we generate all gradients gis of samples sis in parallel in the helper function: compute_gradients
        The output of compute_gradients is an array called samples_grads
            sample s[0]: samples_grads[0][layer_1], samples_grads[0][layer_2], .... //g0
                ...............
            sample s[L-1]: samples_grads[L-1][layer_1], samples_grads[L-1][layer_2], ....//g[L-1]
            where L is the number of samples in the mini-batch

        The compute_gradients call another helper function called compute_loss. This is used for computing the gradients in parallel

        After that we compute the clipped gradients gci for each gi. In this case we use the following approach proposed here
            #https://www.tutorialspoint.com/python-pytorch-clamp-method
        To do it, we need to create a new field called whole_grad which containing all gradients of layers for a given sample si
        whole_grad allows us to compute the total_norm of sample si and then we can do the clipping

        After computing all clipped gradients, we need to aggregate all the clipped gradient per layer. This step helps us
        to compute the sum (clipped gradient gi) and then we add noise to each entry in the sum (clipped gradient gi)

        Finally, we normalize the private gradient and update the model.grad. This step allows optimizer update the model
    '''

    samples_grads = compute_gradients(model,loss_fn,samples,targets)

    #compute the size of batch for normalizing the private grad as done next steps
    num_samples = len(samples)

    #clipping the per_sample_grad
    for i in range(num_samples):
        norm_type = 2.0
        max_norm = const_C #This is clipping constant C
        '''
          This is naive layer wise clipping, i.e., compute the norm of the layer and clipping the layer grad based on that norm and const_C
          It means, we do not need to compute the norm of the gradiets of the whole model !!!!!
          step 1. For each sample, we loop through all the layer
          step 2. For each layer, we compute its norm and clip its gradient based on that norm and const_C
          #https://discuss.pytorch.org/t/how-to-clip-grad-norm-grads-from-torch-autograd-grad/137816/2
        '''
        for layer, grad in samples_grads[i].items():
          total_norm = torch.norm(grad, norm_type)
          clip_coef = max_norm / (total_norm + 1e-6)
          #https://www.tutorialspoint.com/python-pytorch-clamp-method
          #clamp(tensor,min,max)
          clip_coef_clamped = torch.clamp(clip_coef, max=1.0)
          grad.detach().mul_(clip_coef_clamped)

    #Aggregate clipped grads
    '''
        aggregated_grad_dict looks like as follows if we have two samples s0 and s1 as described above.
            aggreated_grad_dict[key=weight]= {1, 3}
            aggreated_grad_dict[key=bias]= {2, 4}
        To get it, we have to loop through all samples and for each sample, we loop through each layer (key) to get it grad (value)
    '''

    aggregated_grad_dict = defaultdict(list)

    for sample in samples_grads.values():
        for layer, grad in sample.items():
            aggregated_grad_dict[layer].append(grad)

    #generate private grad per layer
    mean = 0
    std = sigma*const_C
    batch_size = num_samples
    for layer, list_grad in aggregated_grad_dict.items():
        #compute the sum of clipped gradients gi
        aggregated_grad_dict[layer] = np.sum(list_grad)
        #generate the noise ~ N(0,(C\sigma)^2I)
        #std -- is C\sigma as explain this in wikipage https://en.wikipedia.org/wiki/Normal_distribution N(mu,\sigma^2) and sigma is std
        noise = torch.normal(mean=mean, std=std, size=aggregated_grad_dict[layer].shape)
        #generate private gradient per layer
        aggregated_grad_dict[layer] = (aggregated_grad_dict[layer] + noise)/batch_size

    #update the model's grads
    '''
        Because we do not use loss_fn.backward() function to generate model.grad, model.grad is NONE
        We need to update the model.grad to make sure that optim.step() can operate normally
    '''

    for layer, param in model.named_parameters():
        param.grad =  aggregated_grad_dict[layer]

    return 0

def training_loop(n_epochs, optimizer, model, loss_fn, sigma, const_C, train_loader, val_loader, data_path):
    for epoch in range(1, n_epochs + 1):
        loss_train = 0.0

        for imgs, labels in train_loader:

          outputs = model(imgs)
          loss = loss_fn(outputs, labels)
          loss_train += loss.item()

          optimizer.zero_grad()
          '''
            generate_private_grad(model,loss_fn,imgs,labels,sigma,const_C)
              1. Compute the grad per sample
              2. Clipping the grad per sample
              3. Aggregate the clipped grads and add noise to sum of clipped grads
              4. Update the model.grad. This helps optimizer.step works as normal.
          '''
          #loss.backward()
          generate_private_grad(model,loss_fn,imgs,labels,sigma,const_C)
          optimizer.step()

        correct = 0
        with torch.no_grad():
            for data in val_loader:
                images, labels = data
                outputs = model(images)
                _, predicted = torch.max(outputs, 1)
                c = (predicted == labels).squeeze()
                correct += c.sum()
        if epoch == 1 or epoch % 1 == 0:
            print('Epoch {}, Training loss {}, Val accuracy {}'.format(
                epoch,
                loss_train / len(train_loader),
                correct / len(cifar10_val)))

        before_lr = optimizer.param_groups[0]["lr"]
        scheduler.step()
        after_lr = optimizer.param_groups[0]["lr"]
        print("Epoch %d: SGD lr %.4f -> %.4f" % (epoch, before_lr, after_lr))


        #save the model config
        model_state = model.state_dict()
        optimizer_state = optimizer.state_dict()
        scheduler_state = scheduler.state_dict()
        dict_state = dict()
        dict_state["epoch"] = epoch
        dict_state["sigma"] = sigma
        dict_state["const_C"] = const_C
        dict_state["model_state"] = model_state
        dict_state["optimizer_state"] = optimizer_state
        dict_state["scheduler_state"] = scheduler_state
        dict_state["train_loss"] = loss_train / len(train_loader)
        dict_state["val_acc"] = correct / len(cifar10_val)

        try:
            geeky_file = open(data_path + "epoch_" + str(epoch), 'wb')
            pickle.dump(dict_state, geeky_file)
            geeky_file.close()

        except:
            print("Something went wrong")

        #print(f"scheduler state: {scheduler_state}")

In [8]:
for index, config in configs.items():
  print(f"index: {index}")
  data_path_index = data_path + "config_" + str(index) + "_"
  model = Net()
  optimizer = optim.SGD(model.parameters(), lr=config["lr_initial"])
  loss_fn = nn.CrossEntropyLoss()
  '''
    LinearLR =>> new LR = initial LR - nb_epochs*(start_factor-end_factor)/total_iters
    example, initialLR = 0.1, start = 1.0, end_factor = 0.5, total_iters = 20
    (start_factor-end_factor)/total_iters = 0.025.
    ===> epoch 1: 0.1 - 1*0.025 = 0.0975
    ===> epoch 2: 0.1 - 2*0.025 = 0.0950....
  '''
  #scheduler = lr_scheduler.LinearLR(optimizer, start_factor=1.0, end_factor=0.5, total_iters=20)
  '''
   StepLR =>>> new LR = old LR * gamma
  '''
  scheduler = lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)
  '''
    LambdaLR: new LR = initialLR * f(epoch)
    For example: f(epoch) = 1/t
  '''
  # lambda1 = lambda epoch: 1/(epoch+1)
  # scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=[lambda1])

  training_loop(
      n_epochs = 20,
      optimizer = optimizer,
      model = model,
      sigma = config["sigma"],
      const_C = config["const_C"],
      loss_fn = loss_fn,
      train_loader = train_loader,
      val_loader = val_loader,
      data_path = data_path_index
  )


index: 1


  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


Epoch 1, Training loss 1.5926964417138063, Val accuracy 0.38440001010894775
Epoch 1: SGD lr 0.1000 -> 0.0900
Epoch 2, Training loss 1.2679662905690614, Val accuracy 0.5392000079154968
Epoch 2: SGD lr 0.0900 -> 0.0810
Epoch 3, Training loss 1.1451899649389565, Val accuracy 0.5745000243186951
Epoch 3: SGD lr 0.0810 -> 0.0729
Epoch 4, Training loss 1.0653740172953252, Val accuracy 0.5764999985694885
Epoch 4: SGD lr 0.0729 -> 0.0656
Epoch 5, Training loss 1.0145335215741715, Val accuracy 0.5806999802589417
Epoch 5: SGD lr 0.0656 -> 0.0590
Epoch 6, Training loss 0.9744223578811606, Val accuracy 0.6075000166893005
Epoch 6: SGD lr 0.0590 -> 0.0531
Epoch 7, Training loss 0.942285154481678, Val accuracy 0.6039999723434448
Epoch 7: SGD lr 0.0531 -> 0.0478
Epoch 8, Training loss 0.9159496396856235, Val accuracy 0.6105999946594238
Epoch 8: SGD lr 0.0478 -> 0.0430
Epoch 9, Training loss 0.8887649205944422, Val accuracy 0.6162999868392944
Epoch 9: SGD lr 0.0430 -> 0.0387
Epoch 10, Training loss 0.86

In [None]:
import pandas as pd
epoch = 1
path = data_path + "epoch_" + str(epoch)
obj = pd.read_pickle(path)
print(obj.keys())

dict_keys(['epoch', 'model_state', 'optimizer_state'])
