In [18]:
# First we need to mount the Google drive
import os
from google.colab import drive
drive.mount('/content/Hadrive')

Drive already mounted at /content/Hadrive; to attempt to forcibly remount, call drive.mount("/content/Hadrive", force_remount=True).


In [29]:

configs = dict({
"1": {"outer_n_epochs": 10, "outer_batch_size": 5000, "lr_outer_initial": 0.5,
      "inner_n_epochs": 2, "inner_batch_size": 64, "lr_inner_initial": 0.01, "decay": 0.9,
      "sigma": 0.00000001, "const_C": 1000}
})


for index, config in configs.items():
  print(f"index: {index}")
  print(f"config: {config}")
  for key,value in config.items():
    print(f"key: {key}, value: {value}")

index: 1
config: {'outer_n_epochs': 10, 'outer_batch_size': 5000, 'lr_outer_initial': 0.5, 'inner_n_epochs': 2, 'inner_batch_size': 64, 'lr_inner_initial': 0.01, 'decay': 0.9, 'sigma': 1e-08, 'const_C': 1000}
key: outer_n_epochs, value: 10
key: outer_batch_size, value: 5000
key: lr_outer_initial, value: 0.5
key: inner_n_epochs, value: 2
key: inner_batch_size, value: 64
key: lr_inner_initial, value: 0.01
key: decay, value: 0.9
key: sigma, value: 1e-08
key: const_C, value: 1000


In [None]:
#!mkdir /content/Hadrive/MyDrive/Test1
#!mkdir /content/Hadrive/MyDrive/Test1/Tutorial1/

In [30]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler
import torchvision
import torchvision.transforms as transforms
from torchvision import datasets
from torchvision import transforms
from torch.func import functional_call, vmap, grad
from torch.utils.data import TensorDataset, DataLoader


import pickle
import numpy as np
import matplotlib.pyplot as plt
from collections import OrderedDict
from collections import defaultdict


In [31]:
data_path = '/content/Hadrive/MyDrive/Test1/Tutorial1/'

transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4915, 0.4823, 0.4468), (0.2470, 0.2435, 0.2616))
])

# cifar10 = datasets.CIFAR10(data_path, train=True, download=True, transform=transform)
# cifar10_val = datasets.CIFAR10(data_path, train=False, download=True, transform=transform)

nbsamples = 100

#https://stackoverflow.com/questions/47432168/taking-subsets-of-a-pytorch-dataset
cifar10_org = datasets.CIFAR10(data_path, train=True, download=False, transform=transform)
#cifar10 = torch.utils.data.Subset(cifar10_org, list(range(0, nbsamples)))
cifar10 = cifar10_org

cifar10_val_org = datasets.CIFAR10(data_path, train=False, download=False, transform=transform)
#cifar10_val = torch.utils.data.Subset(cifar10_val_org, list(range(0, nbsamples)))
cifar10_val = cifar10_val_org

print(f"lencifar10: {len(cifar10)}")
print(f"lencifar10_val: {len(cifar10_val)}")

lencifar10: 50000
lencifar10_val: 10000


In [32]:
# model
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(16, 8, kernel_size=3, padding=1)
        self.fc1 = nn.Linear(8 * 8 * 8, 32)
        # bài toán phân loại 10 lớp nên output ra 10 nodes
        self.fc2 = nn.Linear(32, 10)

    def forward(self, x):
        out = F.max_pool2d(torch.tanh(self.conv1(x)), 2)
        out = F.max_pool2d(torch.tanh(self.conv2(out)), 2)
        # flatten về dạng vector để cho vào neural network
        out = out.view(-1, 8 * 8 * 8)
        out = torch.tanh(self.fc1(out))
        out = self.fc2(out)
        return out


In [33]:
import pickle

import torch
import torchvision
import torch.nn as nn
import numpy as np
import torchvision.transforms as transforms
from collections import OrderedDict
from collections import defaultdict
from torch.func import functional_call, vmap, grad

def generate_private_grad(model,loss_fn,samples,targets,inner_n_epochs,inner_batch_size,sigma,const_C,val_loader):
    '''
        We generate private grad given a batch of samples (samples,targets) in batchclipping mode for classical mini-batch SGD
    '''

    #prepare a new dataloader based on given mini-batch
    mini_dataset = TensorDataset(samples,targets)
    mini_dataloader = DataLoader(mini_dataset,inner_batch_size,shuffle=True)

    #save the starting model state for compute the sum of gradients in final step
    model_state_start = model.state_dict()


    #training the model with given sub-dataset
    for epoch in range(1, inner_n_epochs + 1):
      for inputs,labels in mini_dataloader:
        #copute the gradient of the whole batch
        outputs = model(inputs)
        loss = loss_fn(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

      # #print the test accuracy
      # correct = 0
      # with torch.no_grad():
      #     for data in val_loader:
      #         images, labelsx = data
      #         outputsx = model(images)
      #         _, predicted = torch.max(outputsx, 1)
      #         c = (predicted == labelsx).squeeze()
      #         correct += c.sum()

      # if epoch == 1 or epoch % 1 == 0:
      #     print('Inner Epoch {}, Val accuracy {}'.format(epoch, correct / len(cifar10_val)))

    #extract the sum of gradients, i.e., sum_grads = model.state_dict_last - model.state_dict_start
    # sum_grads contains tensor
    model_state_last = model.state_dict()

    sum_grads = OrderedDict()
    for layer in model_state_start.keys():
         sum_grads[layer] = model_state_last[layer] - model_state_start[layer]


    #generate private grad per layer
    mean = 0
    std = sigma*const_C
    norm_type = 2.0
    #clipping the gradient
    #https://discuss.pytorch.org/t/how-to-clip-grad-norm-grads-from-torch-autograd-grad/137816/2
    for layer, grad in sum_grads.items():
        #clip the gradients
        max_norm = const_C #clipping constant C
        total_norm = torch.norm(grad.detach(), norm_type)
        clip_coef = max_norm / (total_norm + 1e-6)
        clip_coef_clamped = torch.clamp(clip_coef, max=1.0)
        grad.detach().mul_(clip_coef_clamped)
        #generate the noise and add it to the clipped grads
        #generate the noise ~ N(0,(C\sigma)^2I)
        #std -- is C\sigma as explain this in wikipage https://en.wikipedia.org/wiki/Normal_distribution N(mu,\sigma^2) and sigma is std
        noise = torch.normal(mean=mean, std=std, size=grad.shape)
        #generate private gradient per layer
        grad = grad + noise

    #reset the model
    model.load_state_dict(model_state_start)
    #update the model.param.grad with noisy grads
    for layer, param in model.named_parameters():
        param.grad = sum_grads[layer]

    return 0


def training_loop(outer_n_epochs, optimizer, model, loss_fn, inner_n_epochs, inner_batch_size, lr_outer, sigma, const_C, train_loader, val_loader, data_path):
    '''
        Outer phrase: model = model - lr_outer*private_grad
        Inner phrase: compute private_grad using batch_clipping and running classical SGD
    '''
    #Outer phrase
    for epoch in range(1, outer_n_epochs + 1):
        loss_train = 0.0

        #extract mini_batch from train_loader and input it to inner phrase
        for imgs, labels in train_loader:

          outputs = model(imgs)
          loss = loss_fn(outputs, labels)
          loss_train += loss.item()

          optimizer.zero_grad()
          '''
            generate_private_grad(model,loss_fn,imgs,labels,inner_n_epochs,inner_batch_size,sigma,const_C,val_loader)
              1. Compute the grad for whole batch of samples
              2. Clip the gradient of the batch of samples
              3. Add noise to the clipped grad of the whole batch of samples
              4. Update the model.grad. This helps optimizer.step works as normal.
          '''
          #loss.backward()
          generate_private_grad(model,loss_fn,imgs,labels,inner_n_epochs,inner_batch_size,sigma,const_C,val_loader)

          #update the model
          for param in model.parameters():
              param.data = param.data - lr_outer*param.grad

        correct = 0
        with torch.no_grad():
            for data in val_loader:
                images, labels = data
                outputs = model(images)
                _, predicted = torch.max(outputs, 1)
                c = (predicted == labels).squeeze()
                correct += c.sum()
        if epoch == 1 or epoch % 1 == 0:
            print('Outer Epoch {}, Training loss {}, Val accuracy {}'.format(
                epoch,
                loss_train / len(train_loader),
                correct / len(cifar10_val)))

        # before_lr = optimizer.param_groups[0]["lr"]
        # scheduler.step()
        # after_lr = optimizer.param_groups[0]["lr"]
        # print("Epoch %d: SGD lr %.4f -> %.4f" % (epoch, before_lr, after_lr))


        #save the model config
        model_state = model.state_dict()
        optimizer_state = optimizer.state_dict()
        scheduler_state = scheduler.state_dict()
        dict_state = dict()
        dict_state["epoch"] = epoch
        dict_state["sigma"] = sigma
        dict_state["const_C"] = const_C
        dict_state["model_state"] = model_state
        dict_state["optimizer_state"] = optimizer_state
        dict_state["scheduler_state"] = scheduler_state
        dict_state["train_loss"] = loss_train / len(train_loader)
        dict_state["val_acc"] = correct / len(cifar10_val)

        try:
            geeky_file = open(data_path + "epoch_" + str(epoch), 'wb')
            pickle.dump(dict_state, geeky_file)
            geeky_file.close()

        except:
            print("Something went wrong")

        #print(f"scheduler state: {scheduler_state}")

In [34]:
'''
    = This is the implementation of the idea generalization of batch clipping.
    = In this setup has two phrases
        Outer phrase: model = model - lr_outer*private_grad
        Inner phrase: compute private_grad using batch_clipping and running classical SGD

      Pseudo_code:
          #Outer_phrase
          for epoch in range(1,nb_outer_epoches+1,1):
              A. for mini_batch in data_loader:
                  #Inner_phrase:
                  a. generate_private_gradient:
                    1. model_state_begin = model
                    2. for epoch' in range(1,nb_inner_epoches+1,1):
                          for inner_mini_batch in DataLoader(mini_batch):
                              = compute gradient of inner_mini_batch given model
                              = update the model with computed gradient from inner_mini_batch

                    3. model_state_last = model
                    4. compute the sum of all gradients = model_state_last - model_state_begin
                    5. compute clipped naive layerwise gradients = clipping(model_state_last - model_state_begin)
                    6. add noise to clipped naive layerwise to create private_gradient
                  b. Update the model, i.e., model = model - lr_outer*private_gradient
              B. Update lr_inner


    = Note: there are two learning rate schemes lr_inner and lr_outer
            Optimizer uses lr_inner in Inner_phrase
            Outer phrase updates its own learning rate lr_outer by itself. We may set lr_outer = 1/2.
'''

for index, config in configs.items():
  print(f"index: {index}")
  data_path_index = data_path + "config_" + str(index) + "_"
  model = Net()
  loss_fn = nn.CrossEntropyLoss()

  #inner setup
  optimizer = optim.SGD(model.parameters(), lr=config["lr_inner_initial"])
  scheduler = lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)

  train_loader = torch.utils.data.DataLoader(cifar10, batch_size=config["outer_batch_size"],shuffle=True)
  val_loader = torch.utils.data.DataLoader(cifar10_val, batch_size=config["outer_batch_size"],shuffle=True)



  training_loop(
      outer_n_epochs = config["outer_n_epochs"],
      optimizer = optimizer,
      model = model,
      loss_fn = loss_fn,
      inner_n_epochs = config["inner_n_epochs"],
      inner_batch_size = config["inner_batch_size"],
      lr_outer = config["lr_outer_initial"],
      sigma = config["sigma"],
      const_C = config["const_C"],
      train_loader = train_loader,
      val_loader = val_loader,
      data_path = data_path_index
  )

index: 1
Outer Epoch 1, Training loss 1.9300193905830383, Val accuracy 0.4250999987125397
Outer Epoch 2, Training loss 1.55693861246109, Val accuracy 0.461899995803833
Outer Epoch 3, Training loss 1.4203070282936097, Val accuracy 0.5023999810218811
Outer Epoch 4, Training loss 1.337641751766205, Val accuracy 0.5314000248908997
Outer Epoch 5, Training loss 1.2576587915420532, Val accuracy 0.5608000159263611
Outer Epoch 6, Training loss 1.2193223476409911, Val accuracy 0.5917999744415283
Outer Epoch 7, Training loss 1.2036351680755615, Val accuracy 0.5934000015258789
Outer Epoch 8, Training loss 1.1454653143882751, Val accuracy 0.6080999970436096
Outer Epoch 9, Training loss 1.2665251255035401, Val accuracy 0.5752000212669373
Outer Epoch 10, Training loss 1.0798550724983216, Val accuracy 0.6168000102043152


In [None]:
import pandas as pd
epoch = 1
path = data_path + "epoch_" + str(epoch)
obj = pd.read_pickle(path)
print(obj.keys())

dict_keys(['epoch', 'model_state', 'optimizer_state'])
