# Tutorial: How to Run This Project (CIFAR10 - convnet)

## Loading Modules

In [17]:
import argparse

import torch
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import TensorDataset

import numpy as np
import copy

from models.convnet_model import convnet
from datasets.dataset_preprocessing import dataset_preprocessing
from utils.utils import compute_layerwise_C, compute_layerwise_C_average_norm

import validate_model
# import train_model

## Functions

In [18]:
def get_optimizer(opt_name,model,lr):
    """
    Build a torch optimizer over every parameter of ``model``.

    Parameters:
        opt_name (str): "SGD", "RMSprop" or "Adam" (compared case-sensitively)
        model: object whose ``parameters()`` yields the tensors to optimize
        lr (float): learning rate handed to the optimizer constructor

    Returns:
        torch.optim.Optimizer: the freshly constructed optimizer

    Raises:
        NotImplementedError: if ``opt_name`` is not one of the names above
    """
    supported = (
        ("SGD", optim.SGD),
        ("RMSprop", optim.RMSprop),
        ("Adam", optim.Adam),
    )
    for known_name, optimizer_cls in supported:
        if opt_name == known_name:
            return optimizer_cls(model.parameters(), lr=lr)
    raise NotImplementedError("Optimizer not recognized. Please check spelling")

def accuracy(preds, labels):
    """
    Calculate the accuracy of predictions against labels.

    Parameters:
        preds (torch.Tensor | np.ndarray | sequence): predicted class ids
        labels (torch.Tensor | np.ndarray | sequence): ground-truth class ids

    Returns:
        float: fraction of positions where ``preds`` equals ``labels``
    """
    def _as_numpy(values):
        # Tensors may live on an accelerator and/or carry autograd history;
        # bring them to host memory before comparing.
        if isinstance(values, torch.Tensor):
            return values.detach().cpu().numpy()
        return np.asarray(values)

    # Comparing torch tensors yields a bool tensor, and ``Tensor.mean()`` is
    # not defined for bool dtype, so the comparison is done in numpy where the
    # mean of a boolean array is well defined.
    matches = _as_numpy(preds) == _as_numpy(labels)
    return float(np.mean(matches))

def DP_train(args, model, device, train_loader,optimizer):
    """
    Run one pass over ``train_loader`` with clipped, noised microbatch gradients.

    Each batch is split into microbatches. For every microbatch the gradient is
    computed on a fresh copy of ``model``, clipped, and added to a running sum.
    The sum is copied onto ``model``, optionally perturbed with Gaussian noise,
    divided by the number of microbatches, and applied with ``optimizer``.

    Args:
        args: dict of hyper-parameters. Keys read here:
            "microbatch_size", "loss_multi", "clipping" ("layerwise" or "all"),
            "each_layer_C" (indexed by parameter position; "layerwise" only),
            "max_grad_norm" ("all" only), "noise_multiplier", "log_interval".
        model: The model to be trained (updated in place).
        device: The device to use for training.
        train_loader: The data loader for the training data.
        optimizer: The optimizer to use; must be bound to ``model``'s parameters.

    Returns:
        The mean, over batches, of the top-1 accuracy measured on each batch
        right after the update for that batch.

    Raises:
        ValueError: if ``args["clipping"]`` is neither "layerwise" nor "all".
    """
    model.train()
    print("Training using %s optimizer" % optimizer.__class__.__name__)

    clipping_mode = args["clipping"]
    if clipping_mode not in ("layerwise", "all"):
        raise ValueError("Invalid clipping mode, available options: all, layerwise")

    criterion = nn.CrossEntropyLoss()  # stateless, so one instance serves every microbatch
    losses = []
    top1_acc = []

    for batch_idx, (batch_data,batch_target) in enumerate(train_loader): # Batch loop
        optimizer.zero_grad()

        # Work on a copy so microbatch gradients never touch ``model`` directly.
        model_clone = convnet(num_classes=10).to(device) # get new instance
        model_clone.load_state_dict(model.state_dict()) # copy state
        clone_params = list(model_clone.parameters())
        # Running sum of clipped gradients, one slot per parameter tensor.
        grad_sums = [None] * len(clone_params)

        batch = TensorDataset(batch_data,batch_target)
        micro_train_loader = torch.utils.data.DataLoader(batch, batch_size=args["microbatch_size"],
                                                         shuffle=True)

        for data, target in micro_train_loader:
            # Clear the clone's own gradients. Zeroing through an optimizer that
            # is bound to ``model`` would leave them accumulating across
            # microbatches, so clipping would no longer act on a single
            # microbatch gradient.
            model_clone.zero_grad()
            data, target = data.to(device), target.to(device)
            output = model_clone(data)
            loss = criterion(output, target)
            loss = torch.mul(loss,args["loss_multi"]) # Adjust losses
            losses.append(loss.item())
            loss.backward()

            # Clip this microbatch's gradient on the clone that produced it.
            if clipping_mode == "layerwise":
                for layer_idx, param in enumerate(clone_params):
                    if param.grad is None:
                        continue
                    torch.nn.utils.clip_grad_norm_(param, max_norm=args["each_layer_C"][layer_idx])
            else:
                torch.nn.utils.clip_grad_norm_(clone_params, args["max_grad_norm"])

            # Accumulate. The first contribution is cloned so the running sum
            # never shares storage with ``param.grad``.
            for layer_idx, param in enumerate(clone_params):
                if param.grad is None:
                    continue
                if grad_sums[layer_idx] is None:
                    grad_sums[layer_idx] = param.grad.detach().clone()
                else:
                    grad_sums[layer_idx].add_(param.grad)

        # Copy sum of clipped grad to the model gradient
        for param, grad_sum in zip(model.parameters(), grad_sums):
            param.grad = grad_sum

        num_microbatches = len(micro_train_loader)
        add_noise = args["noise_multiplier"] > 0
        for layer_idx, param in enumerate(model.parameters()):
            if param.grad is None:
                continue
            if add_noise:
                # Noise std is 2 * C * noise_multiplier, with C the clipping
                # bound that was applied to this parameter.
                # TODO: IC => N(0,C^2sigma^2), BC  => N(0, 4C^2sigma^2)
                if clipping_mode == "layerwise":
                    dist = torch.distributions.normal.Normal(torch.tensor(0.0),
                                                         torch.tensor((2 * args["each_layer_C"][layer_idx] *  args["noise_multiplier"])))
                else:
                    dist = torch.distributions.normal.Normal(torch.tensor(0.0),
                    torch.tensor((2 * args["max_grad_norm"] * args["noise_multiplier"])))
                noise = dist.rsample(param.grad.shape).to(device=device)
                param.grad = param.grad + noise
            # Average over microbatches whether or not noise was added, so the
            # step size does not depend on ``noise_multiplier`` being non-zero.
            param.grad = param.grad.div(num_microbatches)

        # Update model with the (noisy) averaged grad
        optimizer.step()

        # Top-1 accuracy on the batch just trained on; no autograd graph needed.
        batch_data, batch_target = batch_data.to(device), batch_target.to(device)
        with torch.no_grad():
            output = model(batch_data)
        preds = np.argmax(output.detach().cpu().numpy(), axis=1)
        labels = batch_target.detach().cpu().numpy()
        acc1 = accuracy(preds, labels)
        top1_acc.append(acc1)
        # NOTE: the period is log_interval * len(train_loader), and batch_idx is
        # always smaller than len(train_loader), so for log_interval >= 1 this
        # fires only at batch_idx == 0 (one line per call).
        if batch_idx % (args["log_interval"]*len(train_loader)) == 0:
            print(
                f"Loss: {np.mean(losses):.6f} "
                f"Acc@1: {np.mean(top1_acc):.6f} "
            )
    return np.mean(top1_acc)

## Data Preprocessing

In [19]:
dataset_name = 'CIFAR10'
train_batchsize = 64
test_batchsize = 1000
train_kwargs = {'batch_size': train_batchsize,  'shuffle': True}
test_kwargs = {'batch_size': test_batchsize, 'shuffle': False}
mode = "subsampling"
########## CUDA ############
# Use the GPU only when one is actually present, so the notebook also runs
# (more slowly) on CPU-only machines instead of failing on the first `.to(device)`.
use_cuda = torch.cuda.is_available()
if use_cuda:
    # Loader options that are only applied when running on CUDA.
    cuda_kwargs = {'num_workers': 2,
                    'pin_memory': True,
                    }
    train_kwargs.update(cuda_kwargs)
    test_kwargs.update(cuda_kwargs)

device = torch.device("cuda" if use_cuda else "cpu")
######### Get the dataset loader ##########
C_dataset_loader, train_loader, test_loader, dataset_size = dataset_preprocessing(dataset_name, train_kwargs, test_kwargs, mode)

Processing Cifar10 dataset
Sampling mode: subsampling
Files already downloaded and verified
Files already downloaded and verified
Finished normalizing dataset.

Training Set:
Image batch dimensions: torch.Size([64, 3, 32, 32])
Image label dimensions: torch.Size([64])
tensor([3, 3, 5, 9, 9, 1, 1, 6, 2, 7])

Testing Set:
Image batch dimensions: torch.Size([1000, 3, 32, 32])
Image label dimensions: torch.Size([1000])
tensor([3, 8, 8, 0, 6, 6, 1, 6, 3, 1])


## Model

In [20]:
# Build the 10-class convnet, then move it onto the selected device.
model = convnet(num_classes=10)
model = model.to(device)

## Optimizer

In [21]:
# Optimizer that DP_train steps on `model`.
learning_rate = 0.01
optimizer_name = "SGD"
optimizer = get_optimizer(opt_name=optimizer_name, model=model, lr=learning_rate)

## Arguments

In [22]:
const_C = 10
lr_initial = 0.1
# Get layerwise clipping constants on a throwaway copy of the model.
dummy_model = convnet(num_classes=10).to(device) # get new instance
dummy_model.load_state_dict(model.state_dict()) # copy state
# The optimizer must be bound to the copy's parameters: an optimizer built over
# `model` could never update `dummy_model`.
dummy_optimizer = get_optimizer("SGD", dummy_model, lr =lr_initial)
# NOTE(review): loading the state dict also restores the param-group settings
# of `optimizer`, so its learning rate replaces `lr_initial` here - confirm
# this is intended.
dummy_optimizer.load_state_dict(optimizer.state_dict())

at_epoch = 1
each_layer_C = compute_layerwise_C(C_dataset_loader, dummy_model, at_epoch, device,
                                                        dummy_optimizer, const_C ,True)
# Hyper-parameter sets keyed by experiment id; the training cell uses args["1"].
args = {
"1": { "lr_initial": lr_initial,
      "microbatch_size": train_batchsize,
      "loss_multi": 1,
      "clipping": "layerwise",
      "decay": 0.9,
      "noise_multiplier": 0.001,
      "max_grad_norm": const_C,
      "each_layer_C": each_layer_C,
      "log_interval": 1}
}

Generating layerwise C values


## Training

In [25]:
# Train for a fixed number of epochs, recording train and test accuracy per epoch.
epochs = 10
train_accuracy, test_accuracy = [], []
for epoch in range(epochs):
    print("epoch:", epoch)
    epoch_train_acc = DP_train(args["1"], model, device, train_loader, optimizer)
    train_accuracy.append(epoch_train_acc)
    epoch_test_acc = validate_model.test(model, device, test_loader)
    test_accuracy.append(epoch_test_acc)

epoch: 0
Training using SGD optimizer
Loss: 2.296213 Acc@1: 0.156250 
	Testing accuracy:	Loss: 2.149885 Acc@1: 0.186500 
epoch: 1
Training using SGD optimizer
Loss: 2.191972 Acc@1: 0.140625 
	Testing accuracy:	Loss: 2.135762 Acc@1: 0.185800 
epoch: 2
Training using SGD optimizer
Loss: 2.138676 Acc@1: 0.281250 
	Testing accuracy:	Loss: 1.950133 Acc@1: 0.249000 
epoch: 3
Training using SGD optimizer
Loss: 2.053456 Acc@1: 0.281250 
	Testing accuracy:	Loss: 1.897092 Acc@1: 0.282300 
epoch: 4
Training using SGD optimizer
Loss: 1.870499 Acc@1: 0.296875 
	Testing accuracy:	Loss: 1.870864 Acc@1: 0.305700 
epoch: 5
Training using SGD optimizer
Loss: 1.755528 Acc@1: 0.328125 
	Testing accuracy:	Loss: 1.973768 Acc@1: 0.272300 
epoch: 6
Training using SGD optimizer
Loss: 1.742267 Acc@1: 0.312500 
	Testing accuracy:	Loss: 1.791249 Acc@1: 0.316300 
epoch: 7
Training using SGD optimizer
Loss: 1.758538 Acc@1: 0.406250 
	Testing accuracy:	Loss: 1.740989 Acc@1: 0.339500 
epoch: 8
Training using SGD opti