In [1]:
# First we need to mount the Google drive
import os
from google.colab import drive
drive.mount('/content/Hadrive')

Mounted at /content/Hadrive


In [2]:

configs = dict({
"1": { "lr_initial": 0.1, "decay": 0.9, "sigma": 0.00000001, "const_C": 1000}
})


for index, config in configs.items():
  print(f"index: {index}")
  print(f"config: {config}")
  for key,value in config.items():
    print(f"key: {key}, value: {value}")



index: 1
config: {'lr_initial': 0.1, 'decay': 0.9, 'sigma': 1e-08, 'const_C': 1000}
key: lr_initial, value: 0.1
key: decay, value: 0.9
key: sigma, value: 1e-08
key: const_C, value: 1000


In [None]:
#!mkdir /content/Hadrive/MyDrive/Test1
#!mkdir /content/Hadrive/MyDrive/Test1/Tutorial1/

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler
# import torch.optim.lr_scheduler.StepLR as StepLR
# import torch.optim.lr_scheduler.LinearLR as LinearLR



from torchvision import datasets
from torchvision import transforms
import matplotlib.pyplot as plt

In [4]:
data_path = '/content/Hadrive/MyDrive/Test1/Tutorial1/'

transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4915, 0.4823, 0.4468), (0.2470, 0.2435, 0.2616))
])

# cifar10 = datasets.CIFAR10(data_path, train=True, download=True, transform=transform)
# cifar10_val = datasets.CIFAR10(data_path, train=False, download=True, transform=transform)

nbsamples = 10

#https://stackoverflow.com/questions/47432168/taking-subsets-of-a-pytorch-dataset
cifar10_org = datasets.CIFAR10(data_path, train=True, download=False, transform=transform)
#subset_org = list(range(0, len(cifar10_org), len(cifar10_org)//100))
subset_org = list(range(0, nbsamples))
#cifar10 = torch.utils.data.Subset(cifar10_org, subset_org)
cifar10 = cifar10_org

cifar10_val_org = datasets.CIFAR10(data_path, train=False, download=False, transform=transform)
#subset_org = list(range(0, len(cifar10_val_org), len(cifar10_val_org)//100))
subset_org = list(range(0, nbsamples))
#cifar10_val = torch.utils.data.Subset(cifar10_val_org, subset_org)
cifar10_val = cifar10_val_org

#A subset from test dataset for computing the layer_wise_constant Ci
cifar10_surrogate = torch.utils.data.Subset(cifar10_val_org, list(range(0, nbsamples)))

print(f"lencifar10: {len(cifar10)}")
print(f"lencifar10_val: {len(cifar10_val)}")
print(f"lencifar10_surrogate: {len(cifar10_surrogate)}")

lencifar10: 50000
lencifar10_val: 10000
lencifar10_surrogate: 10


In [5]:
train_loader = torch.utils.data.DataLoader(cifar10, batch_size=64,
                                           shuffle=True)
val_loader = torch.utils.data.DataLoader(cifar10_val, batch_size=64,
                                           shuffle=True)
data_surrogate_loader = torch.utils.data.DataLoader(cifar10_surrogate, batch_size=nbsamples,
                                           shuffle=True)

In [6]:
# model
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(16, 8, kernel_size=3, padding=1)
        self.fc1 = nn.Linear(8 * 8 * 8, 32)
        # bài toán phân loại 10 lớp nên output ra 10 nodes
        self.fc2 = nn.Linear(32, 10)

    def forward(self, x):
        out = F.max_pool2d(torch.tanh(self.conv1(x)), 2)
        out = F.max_pool2d(torch.tanh(self.conv2(out)), 2)
        # flatten về dạng vector để cho vào neural network
        out = out.view(-1, 8 * 8 * 8)
        out = torch.tanh(self.fc1(out))
        out = self.fc2(out)
        return out


In [7]:
import pickle

import torch
import torchvision
import torch.nn as nn
import numpy as np
import torchvision.transforms as transforms
from collections import OrderedDict
from collections import defaultdict
from torch.func import functional_call, vmap, grad

def generate_private_grad(model,loss_fn,samples,targets,sigma,dict_const_Ci):
    '''
        We generate private grad given a batch of samples (samples,targets) as introduced here https://arxiv.org/pdf/1607.00133.pdf
        The implementation flow is as follows:
            1. sample xi
            2. ===> gradient gi
            3. ===> clipped gradient gci
            4. ===> noisy aggregated (sum gci + noise)
            5. ===> normalized 1/B (sum gci + noise)

        We want to follow the tutorial in here to compute multiple grads in parallel:
            #https://pytorch.org/tutorials/intermediate/per_sample_grads.html?utm_source=whats_new_tutorials&utm_medium=per_sample_grads
            #https://pytorch.org/docs/stable/generated/torch.func.grad.html
            #https://towardsdatascience.com/introduction-to-functional-pytorch-b5bf739e1e6e
            #https://pytorch.org/functorch/stable/notebooks/per_sample_grads.html [PAY ATTENTION TO]
        Typically, we generate all gradients gis of samples sis in parallel in the helper function: compute_gradients
        The output of compute_gradients is an array called samples_grads
            sample s[0]: samples_grads[0][layer_1], samples_grads[0][layer_2], .... //g0
                ...............
            sample s[L-1]: samples_grads[L-1][layer_1], samples_grads[L-1][layer_2], ....//g[L-1]
            where L is the number of samples in the mini-batch

        The compute_gradients call another helper function called compute_loss. This is used for computing the gradients in parallel

        After that we compute the clipped gradients gci for each gi. In this case we use the following approach proposed here
            #https://www.tutorialspoint.com/python-pytorch-clamp-method

        We apply dynamic clipping methods, i.e., each layer has its own clipping constant Ci which is computed based on an known dataset (called surrogate).
        Typically, we compute the gradients of this known dataset surrogate dataset. We compute the average of the norm of each layer_i (Ci) and then we define
        the dynamic clipping constant of layer_i Ci = C*(Ci/max Ci). Note that master constant C can be diminised every epoch.

        After computing all clipped gradients, we need to aggregate all the clipped gradient per layer. This step helps us
        to compute the sum (clipped gradient gi) and then we add noise to each entry in the sum (clipped gradient gi)

        Finally, we normalize the private gradient and update the model.grad. This step allows optimizer update the model
    '''

    #copute the gradient of the whole batch
    outputs = model(samples)
    loss = loss_fn(outputs, targets)
    model.zero_grad()
    loss.backward()

    #generate private grad per layer
    mean = 0
    batch_size = len(samples)
    norm_type = 2.0

    for layer, param in model.named_parameters():
        #clipping the gradient
        #https://discuss.pytorch.org/t/how-to-clip-grad-norm-grads-from-torch-autograd-grad/137816/2
        max_norm = dict_const_Ci[layer] #This is clipping constant Ci for layer_i
        grad = param.grad
        total_norm = torch.norm(grad, norm_type)
        clip_coef = max_norm / (total_norm + 1e-6)
        #https://www.tutorialspoint.com/python-pytorch-clamp-method
        #clamp(tensor,min,max)
        clip_coef_clamped = torch.clamp(clip_coef, max=1.0)
        param.grad.detach().mul_(clip_coef_clamped)

        #generate the noise and add it to the clipped gradients
        grad = param.grad
        std = sigma*dict_const_Ci[layer]
        #generate the noise ~ N(0,(C\sigma)^2I)
        #std -- is C\sigma as explain this in wikipage https://en.wikipedia.org/wiki/Normal_distribution N(mu,\sigma^2) and sigma is std
        noise = torch.normal(mean=mean, std=std, size=grad.shape)
        #generate private gradient per layer
        param.grad = (grad*batch_size + noise)/batch_size

    return 0

def generate_layerwise_clipping_constants(model,optimizer,loss_fn,data_surrogate_loader,const_C):
    '''
      We compute the layerwise clipping constant Ci based on data_surrogate
      Step 1. We compute the layer norm Ci
      Step 2. We redefine Ci = Const_C * (Ci/(max_i Ci))
    '''
    for imgs, labels in data_surrogate_loader:
        outputs = model(imgs)
        loss = loss_fn(outputs, labels)
        optimizer.zero_grad()
        loss.backward()

    dict_clipping_const = dict()
    norm_type = 2.0
    maxC = 0.0
    for layer, param in model.named_parameters():
        grad = param.grad
        dict_clipping_const[layer] = torch.norm(grad, norm_type)
        if(dict_clipping_const[layer] > maxC):
          maxC = dict_clipping_const[layer]

    #delete the information in the model.param.grad
    optimizer.zero_grad()

    #normalize the clipping constant Ci
    for layer in dict_clipping_const.keys():
      dict_clipping_const[layer] = const_C*(dict_clipping_const[layer]/maxC)

    return dict_clipping_const



def training_loop(n_epochs, optimizer, model, loss_fn, sigma, const_C, train_loader, val_loader, data_surrogate_loader, data_path):
    for epoch in range(1, n_epochs + 1):
        #generate the layerwise clipping constant Ci
        dict_const_Ci = generate_layerwise_clipping_constants(model,optimizer,loss_fn,data_surrogate_loader,const_C)

        loss_train = 0.0

        for imgs, labels in train_loader:

          outputs = model(imgs)
          loss = loss_fn(outputs, labels)
          loss_train += loss.item()

          optimizer.zero_grad()
          '''
            generate_private_grad(model,loss_fn,imgs,labels,sigma,const_C)
              1. Compute the grad per sample
              2. Clipping the grad per sample
              3. Aggregate the clipped grads and add noise to sum of clipped grads
              4. Update the model.grad. This helps optimizer.step works as normal.
          '''
          #loss.backward()
          generate_private_grad(model,loss_fn,imgs,labels,sigma,dict_const_Ci)
          optimizer.step()

        correct = 0
        with torch.no_grad():
            for data in val_loader:
                images, labels = data
                outputs = model(images)
                _, predicted = torch.max(outputs, 1)
                c = (predicted == labels).squeeze()
                correct += c.sum()
        if epoch == 1 or epoch % 1 == 0:
            print('Epoch {}, Training loss {}, Val accuracy {}'.format(
                epoch,
                loss_train / len(train_loader),
                correct / len(cifar10_val)))

        before_lr = optimizer.param_groups[0]["lr"]
        scheduler.step()
        after_lr = optimizer.param_groups[0]["lr"]
        print("Epoch %d: SGD lr %.4f -> %.4f" % (epoch, before_lr, after_lr))


        #save the model config
        model_state = model.state_dict()
        optimizer_state = optimizer.state_dict()
        scheduler_state = scheduler.state_dict()
        dict_state = dict()
        dict_state["epoch"] = epoch
        dict_state["sigma"] = sigma
        dict_state["const_C"] = const_C
        dict_state["dic_const_Ci"] = dict_const_Ci
        dict_state["model_state"] = model_state
        dict_state["optimizer_state"] = optimizer_state
        dict_state["scheduler_state"] = scheduler_state
        dict_state["train_loss"] = loss_train / len(train_loader)
        dict_state["val_acc"] = correct / len(cifar10_val)

        try:
            geeky_file = open(data_path + "epoch_" + str(epoch), 'wb')
            pickle.dump(dict_state, geeky_file)
            geeky_file.close()

        except:
            print("Something went wrong")

        #print(f"scheduler state: {scheduler_state}")

In [8]:
for index, config in configs.items():
  print(f"index: {index}")
  data_path_index = data_path + "config_" + str(index) + "_"
  model = Net()
  optimizer = optim.SGD(model.parameters(), lr=config["lr_initial"])
  loss_fn = nn.CrossEntropyLoss()
  '''
    LinearLR =>> new LR = initial LR - nb_epochs*(start_factor-end_factor)/total_iters
    example, initialLR = 0.1, start = 1.0, end_factor = 0.5, total_iters = 20
    (start_factor-end_factor)/total_iters = 0.025.
    ===> epoch 1: 0.1 - 1*0.025 = 0.0975
    ===> epoch 2: 0.1 - 2*0.025 = 0.0950....
  '''
  #scheduler = lr_scheduler.LinearLR(optimizer, start_factor=1.0, end_factor=0.5, total_iters=20)
  '''
   StepLR =>>> new LR = old LR * gamma
  '''
  scheduler = lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)
  '''
    LambdaLR: new LR = initialLR * f(epoch)
    For example: f(epoch) = 1/t
  '''
  # lambda1 = lambda epoch: 1/(epoch+1)
  # scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=[lambda1])

  training_loop(
      n_epochs = 20,
      optimizer = optimizer,
      model = model,
      sigma = config["sigma"],
      const_C = config["const_C"],
      loss_fn = loss_fn,
      train_loader = train_loader,
      val_loader = val_loader,
      data_surrogate_loader = data_surrogate_loader,
      data_path = data_path_index
  )


index: 1
Epoch 1, Training loss 1.5772466089414514, Val accuracy 0.424699991941452
Epoch 1: SGD lr 0.1000 -> 0.0900
Epoch 2, Training loss 1.2488595805966947, Val accuracy 0.5325999855995178
Epoch 2: SGD lr 0.0900 -> 0.0810
Epoch 3, Training loss 1.1349233345454917, Val accuracy 0.5580999851226807
Epoch 3: SGD lr 0.0810 -> 0.0729
Epoch 4, Training loss 1.0674584114643009, Val accuracy 0.5428000092506409
Epoch 4: SGD lr 0.0729 -> 0.0656
Epoch 5, Training loss 1.0214700245338937, Val accuracy 0.5730999708175659
Epoch 5: SGD lr 0.0656 -> 0.0590
Epoch 6, Training loss 0.9808162244232109, Val accuracy 0.6226999759674072
Epoch 6: SGD lr 0.0590 -> 0.0531
Epoch 7, Training loss 0.9489028581877803, Val accuracy 0.6092000007629395
Epoch 7: SGD lr 0.0531 -> 0.0478
Epoch 8, Training loss 0.9235048895449285, Val accuracy 0.5909000039100647
Epoch 8: SGD lr 0.0478 -> 0.0430
Epoch 9, Training loss 0.8997492524973877, Val accuracy 0.6121000051498413
Epoch 9: SGD lr 0.0430 -> 0.0387
Epoch 10, Training l

In [None]:
import pandas as pd
epoch = 1
path = data_path + "epoch_" + str(epoch)
obj = pd.read_pickle(path)
print(obj.keys())

dict_keys(['epoch', 'model_state', 'optimizer_state'])
