**Name: Javad Hezareh**

**Student Number: 98101074**

In [3]:
import torch
import torchvision
import torchvision.transforms as transforms
from torchvision.models import resnet18
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils import clip_grad_norm_
import matplotlib.pyplot as plt
import numpy as np
import torch.optim as optim
from torch.utils.data import DataLoader

## Initialization

In [4]:
cifar_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    transforms.CenterCrop(28)
])

mnist_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
])

L = 32 # group size

cifar_trainset = torchvision.datasets.CIFAR10(root='./cifar', train=True,
                                              download=True, transform=cifar_transform)
cifar_testset = torchvision.datasets.CIFAR10(root='./cifar', train=False,
                                             download=True, transform=cifar_transform)

mnist_trainset = torchvision.datasets.MNIST(root='./mnist', train=True,
                                            download=True, transform=mnist_transform)
mnist_testset = torchvision.datasets.MNIST(root='./mnist', train=False,
                                           download=True, transform=mnist_transform)

#################################### To Do (2 pts) #############################
# Define trainloader and testloader with given batch size.
# Set shuffle parameter to True for trainloader and False for testloader.
################################################################################
cifar_train_loader = DataLoader(cifar_trainset, batch_size=L, shuffle=True)
cifar_test_loader = DataLoader(cifar_testset, batch_size=L, shuffle=False)

mnist_train_loader = DataLoader(mnist_trainset, batch_size=L, shuffle=True)
mnist_test_loader = DataLoader(mnist_testset, batch_size=L, shuffle=False)
##################################### End ######################################

device = torch.device(torch.cuda.current_device()) if torch.cuda.is_available() else torch.device('cpu')
device

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./cifar/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:03<00:00, 49190593.07it/s]


Extracting ./cifar/cifar-10-python.tar.gz to ./cifar
Files already downloaded and verified
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./mnist/MNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 9912422/9912422 [00:00<00:00, 139143670.27it/s]


Extracting ./mnist/MNIST/raw/train-images-idx3-ubyte.gz to ./mnist/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./mnist/MNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 28881/28881 [00:00<00:00, 95008387.31it/s]

Extracting ./mnist/MNIST/raw/train-labels-idx1-ubyte.gz to ./mnist/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./mnist/MNIST/raw/t10k-images-idx3-ubyte.gz



100%|██████████| 1648877/1648877 [00:00<00:00, 48428238.09it/s]


Extracting ./mnist/MNIST/raw/t10k-images-idx3-ubyte.gz to ./mnist/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ./mnist/MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 4542/4542 [00:00<00:00, 11441759.02it/s]


Extracting ./mnist/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./mnist/MNIST/raw



device(type='cuda', index=0)

In [5]:
class Net(nn.Module):
    def __init__(self, in_channel):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(in_channel, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        output = torch.softmax(x, dim=1)
        return output

## differentially private SGD

In [6]:
criterion = nn.CrossEntropyLoss()
lr=0.001

In [7]:
def DP_SGD(model, trainloader, criterion, optimizer, sigma, C):
    """
    sigma: noise scale
    C: gradient norm bound
    """
    #################################### To Do (16 pts) ##########################
    model.train()
    for batch in trainloader:
        for param in model.parameters():
            param.accumulated_grads = []

        imgs, labels = batch
        imgs = imgs.to(device)
        labels = labels.to(device)

        # 1. For each sample, compute gradients of loss w.r.t. parameters (5 pts)
        for i in range(len(imgs)):
            img = imgs[i].unsqueeze(0)
            label = labels[i].unsqueeze(0)

            prob = model(img)
            loss = criterion(prob, label)
            loss.backward()

        # 2. Clip each parameter's per-sample gradient (4 pts)
            for param in model.parameters():
                grad = param.grad.detach().clone()
                clip_grad_norm_(grad, max_norm=C)
                param.accumulated_grads.append(grad)
                param.grad = None

        # 3. Add noise and aggregate back accumulated grads with torch.stack (4 pts)
        for param in model.parameters():
            accumulated_grad = torch.stack(param.accumulated_grads, dim=0)
            accumulated_grad += torch.randn_like(accumulated_grad) * sigma * C
            param.grad = accumulated_grad.sum(dim=0) / L
#             param.grad += torch.randn_like(param.grad) * sigma * C

        # 4. Update model's weights and reset grad for the next iteration (3 pts)
        for param in model.parameters():
            optimizer.step()
#             new_param = param - lr * param.grad
#             param = param.detach()
#             param.copy_(new_param)
            optimizer.zero_grad()
#             param.grad = None
    ##################################### End ####################################

## Evaluation
Now, we want to test this algorithm on CIFAR10 and MNIST datasets.

### Effect of $\sigma$ (and $\epsilon$) (6 pts)
Plot accuracy vs. epoch for various noise scale $\sigma = 2, 4, 8$ for both datasets. Plot both training and testing accuracy.

Change the code above if needed.

In [8]:
def cal_accuracy(model, dataloader):
    model.eval()
    with torch.no_grad():
        acc = 0
        N = 0
        for (imgs, labels) in dataloader:
            imgs = imgs.to(device)
            labels = labels.to(device)

            probs = model(imgs)
            preds = torch.argmax(probs, dim=1)
            acc += (preds == labels).sum().item()
            N += len(imgs)

    return acc / N

In [9]:
def train_DP(model, loader, epoch_n, sigma, C):
    train_loader = loader[0]
    test_loader = loader[1]
    
    train_accs = []
    test_accs = []
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    for epoch in range(epoch_n):
        DP_SGD(model, train_loader, criterion, optimizer, sigma, C)
        
        train_acc = cal_accuracy(model, train_loader)
        test_acc = cal_accuracy(model, test_loader)
        train_accs.append(train_acc)
        test_accs.append(test_acc)
        
        print(f'Epoch: {epoch}\ttrain acc: {train_acc:.3f}\ttest acc: {test_acc:.3f}')
        print('-'*50)
    
    return train_accs, test_accs

In [30]:
sigmas = [2, 4, 8]
epochs = [6, 6, 6]
C = 4

results = dict()
datasets = [(mnist_train_loader, mnist_test_loader), (cifar_train_loader, cifar_test_loader)]

for i, loader in enumerate(datasets):
    dataset_name = 'MNIST' if i==0 else 'CIFAR10'
    in_channel = 1 if i==0 else 3
    results[dataset_name] = {'train': dict(), 'test': dict()}

    for epoch_n, sigma in zip(epochs, sigmas):
        print(f"Dataset: {dataset_name}\tC: {C}\tSigma: {sigma}")
        print('-'*50)
        
        model = Net(in_channel).to(device)
        train_accs, test_accs = train_DP(model, loader, epoch_n, sigma, C)
        
        results[dataset_name]['train'][sigma] = train_accs
        results[dataset_name]['test'][sigma] = test_accs

Dataset: MNIST	C: 4	Sigma: 2
--------------------------------------------------
Epoch: 0	train acc: 0.458	test acc: 0.466
--------------------------------------------------
Epoch: 1	train acc: 0.527	test acc: 0.532
--------------------------------------------------
Epoch: 2	train acc: 0.530	test acc: 0.533
--------------------------------------------------
Epoch: 3	train acc: 0.550	test acc: 0.551
--------------------------------------------------
Epoch: 4	train acc: 0.586	test acc: 0.589
--------------------------------------------------
Epoch: 5	train acc: 0.572	test acc: 0.577
--------------------------------------------------
Dataset: MNIST	C: 4	Sigma: 4
--------------------------------------------------
Epoch: 0	train acc: 0.311	test acc: 0.314
--------------------------------------------------
Epoch: 1	train acc: 0.308	test acc: 0.312
--------------------------------------------------
Epoch: 2	train acc: 0.282	test acc: 0.289
--------------------------------------------------
Epo

In [None]:
for d_name in ['MNIST', 'CIFAR10']:
    for sigma in sigmas:
        plt.plot(results[d_name]['train'][sigma], label=f'{d_name};train;{sigma}')
        plt.plot(results[d_name]['test'][sigma], label=f'{d_name};test;{sigma}')
plt.legend();

### Effect of $\epsilon, \delta$ (6 pts)
Plot accuracy vs. $\epsilon$ (according to formulas in the lectures) in range $(10^{-1}, 10)$ for $\delta=10^{-i}, i\in\{2, 3, 4, 5\}$ for both datasets.

In [None]:
deltas = [10**(-i) for i in range(2, 6)]
epsilons = np.linspace(0.1, 10, num=5)
C_values = 4
epoch_n = 8

results = dict()
datasets = [(mnist_train_loader, mnist_test_loader), (cifar_train_loader, cifar_test_loader)]

for i, loader in enumerate(datasets):
    dataset_name = 'MNIST' if i==0 else 'CIFAR10'
    in_channel = 1 if i==0 else 3
    results[dataset_name] = {'train': dict(), 'test': dict()}

    for epsilon in epsilons:
        results[dataset_name]['train'][epsilon] = []
        results[dataset_name]['test'][epsilon] = []
        for delta in deltas:
            sigma = np.sqrt(2*np.log(1.25/delta))/epsilon
            print(f"Dataset: {dataset_name}\tC: {C}\tSigma: {sigma}")
            print('-'*50)
            
            model = Net(in_channel).to(device)
            train_accs, test_accs = train_DP(model, loader, epoch_n, sigma, C)
            
            results[dataset_name]['train'][epsilon].append(np.max(train_accs))
            results[dataset_name]['test'][epsilon].append(np.max(test_accs))

### Effect of clipping bound (3 pts)
Plot train and test accuracy vs. $C=1, 2, ..., 10$ for both datasets.

In [None]:
sigma = 4
C_values = range(1, 11)
epoch_n = 10

results = dict()
datasets = [(mnist_train_loader, mnist_test_loader), (cifar_train_loader, cifar_test_loader)]

for i, loader in enumerate(datasets):
    dataset_name = 'MNIST' if i==0 else 'CIFAR10'
    in_channel = 1 if i==0 else 3
    results[dataset_name] = {'train': [], 'test': []}

    for C in C_values:
        print(f"Dataset: {dataset_name}\tC: {C}\tSigma: {sigma}")
        print('-'*50)
        
        model = Net(in_channel).to(device)
        train_accs, test_accs = train_DP(model, loader, epoch_n, sigma, C)
        
        results[dataset_name]['train'].append(np.max(train_accs))
        results[dataset_name]['test'].append(np.max(test_accs))

Dataset: MNIST	C: 1	Sigma: 4
--------------------------------------------------
Epoch: 0	train acc: 0.350	test acc: 0.358
--------------------------------------------------
Epoch: 1	train acc: 0.711	test acc: 0.718
--------------------------------------------------
Epoch: 2	train acc: 0.779	test acc: 0.785
--------------------------------------------------
Epoch: 3	train acc: 0.793	test acc: 0.803
--------------------------------------------------
Epoch: 4	train acc: 0.801	test acc: 0.809
--------------------------------------------------
Epoch: 5	train acc: 0.801	test acc: 0.807
--------------------------------------------------
Epoch: 6	train acc: 0.797	test acc: 0.801
--------------------------------------------------
Epoch: 8	train acc: 0.806	test acc: 0.813
--------------------------------------------------
Epoch: 9	train acc: 0.811	test acc: 0.823
--------------------------------------------------
Dataset: MNIST	C: 2	Sigma: 4
--------------------------------------------------
Epo