In [111]:
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data

import torchvision
import torchvision.transforms as transforms

import opacus
from opacus import PrivacyEngine

# Random Seeding
torch.manual_seed(0)
np.random.seed(0)

# Device Configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [97]:
# Spatial size of training images. All images will be resized to this
#   size using a transformer.
image_size = 28

class Discriminator(nn.Module):
    def __init__(self, hidden_1=64, hidden_2=16):
        super(Discriminator, self).__init__()
        self.fc1 = nn.Linear(image_size**2, hidden_1, bias=False)
        self.fc2 = nn.Linear(hidden_1, hidden_2, bias=False)
        self.fc3 = nn.Linear(hidden_2, 1, bias=False)

    def forward(self, x):
        x = x.view(-1, image_size**2)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Generator similar to DCGAN
class Generator(nn.Module):
    def __init__(self, nz=100, ngf=32, nc=1):
        super(Generator, self).__init__()
        self.main = nn.Sequential(
            # input is Z, going into a convolution
            nn.ConvTranspose2d( nz, ngf * 8, 4, 1, 0, bias=False),
            nn.BatchNorm2d(ngf * 8),
            nn.ReLU(True),
            # state size. (ngf*8) x 4 x 4
            nn.ConvTranspose2d(ngf * 8, ngf * 4, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ngf * 4),
            nn.ReLU(True),
            # state size. (ngf*4) x 8 x 8
            nn.ConvTranspose2d( ngf * 4, ngf * 2, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ngf * 2),
            nn.ReLU(True),
            # state size. (ngf*2) x 16 x 16
            nn.ConvTranspose2d( ngf * 2, ngf, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ngf),
            nn.ReLU(True),
            # state size. (ngf) x 32 x 32
            nn.ConvTranspose2d( ngf, nc, 4, 2, 1, bias=False),
            nn.Sigmoid()
            # state size. (nc) x 64 x 64
        )

    def forward(self, input):
        return self.main(input)

# Setup Generator Weight Initialization
def weights_init(m):
    classname = m.__class__.__name__
    if classname.find('Conv') != -1:
        nn.init.normal_(m.weight.data, 0.0, 0.02)
    elif classname.find('BatchNorm') != -1:
        nn.init.normal_(m.weight.data, 1.0, 0.02)
        nn.init.constant_(m.bias.data, 0)

# Learning rate for optimizers
lr = 0.0002
# Beta1 hyperparam for Adam optimizers
beta1 = 0.5

# Setup model and optimizer
hidden_1 = 64
hidden_2 = 16
model = Discriminator(hidden_1, hidden_2).to(device)

# Initialize weights
netG.apply(weights_init)


optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
data_loader = train_loader
loss_fn = nn.BCEWithLogitsLoss()

In [98]:
sample = next(iter(data_loader))[0].to(device)
print(sample.shape)
output = model(sample)
print(output.shape)
target = torch.ones((sample.shape[0], 1)).to(device)
print(target.shape)
loss = loss_fn(output.flatten(), target.flatten())
print(loss.item())

torch.Size([64, 1, 28, 28])
torch.Size([64, 1])
torch.Size([64, 1])
0.703366756439209


In [102]:
# Given parameter clip bounds c_p, compute maximal ReLU activation bounds B_sigma
def compute_ReLU_bounds(model, c_p, input_size=(784,), input_bounds=1.0):
    sample = torch.ones(input_size).to(device) * input_bounds
    max_val = 0.0
    sum_mk_mkp1 = 0
    skip_first = True

    for layer in model.modules():
        if isinstance(layer, nn.Linear):
            W = torch.ones_like(layer.weight) * c_p
            sample = W @ sample
            
            max_val = max(max_val, sample.max().detach().item())
            
            if skip_first:
                skip_first = False
            else:
                sum_mk_mkp1 += W.shape[0] * W.shape[1]
                # sum_mk_mkp1 += (W.shape[0] + 1) * (W.shape[1] + 1)
            print(layer.weight.shape, max_val, sample.max().detach().item(), sum_mk_mkp1, W.shape[0], W.shape[1])

    return max_val, sum_mk_mkp1

# Setup parameters for Gradient Clip Calculation
c_p = 0.01
B_sigma_p = 1.0
B_sigma, sum_mk_mkp1 = compute_ReLU_bounds(model, c_p)

c_g = 2 * c_p * B_sigma * (B_sigma_p ** 2) * sum_mk_mkp1
c_g

torch.Size([64, 784]) 7.840001106262207 7.840001106262207 0 64 784
torch.Size([16, 64]) 7.840001106262207 5.0176005363464355 1024 16 64
torch.Size([1, 16]) 7.840001106262207 0.8028160929679871 1040 1 16


163.0720230102539

In [103]:
def param_grad_norm(model):
    gradient_norm = 0
    for param in model.parameters():
        gradient_norm += torch.sum(param.grad ** 2)
    gradient_norm = gradient_norm ** 0.5
    return gradient_norm

def param_grad_l1(model):
    gradient_norm = 0
    for param in model.parameters():
        gradient_norm += param.grad.abs().sum().item()
    return gradient_norm

In [104]:
max_norm = 0
first = True
epsilon = 1e-7
for idx in range(1000):
    if first:
        fill_val = c_p
        model.fc1.weight.data.fill_(fill_val)
        model.fc2.weight.data.fill_(fill_val)
        model.fc3.weight.data.fill_(fill_val)
        first = False
    else:
        # Randomize model weights (clip to c_p)
        model.fc1.weight.data = torch.clamp(torch.randn_like(model.fc1.weight), -c_p, c_p)
        model.fc2.weight.data = torch.clamp(torch.randn_like(model.fc2.weight), -c_p, c_p)
        model.fc3.weight.data = torch.clamp(torch.randn_like(model.fc3.weight), -c_p, c_p)
    
    # Print min max of model weights
    assert model.fc1.weight.min().item() >= -c_p - epsilon
    assert model.fc1.weight.max().item() <= c_p + epsilon
    assert model.fc2.weight.min().item() >= -c_p - epsilon
    assert model.fc2.weight.max().item() <= c_p + epsilon
    assert model.fc3.weight.min().item() >= -c_p - epsilon
    assert model.fc3.weight.max().item() <= c_p + epsilon
    
    for c in range(2):
        optimizer.zero_grad()

        # random sample
        sample = (torch.rand(784) > 0.1).to(torch.float32).to(device)
        # sample = torch.ones(784).to(device)
        assert sample.min().item() >= 0.0
        assert sample.max().item() <= 1.0

        sample_out = model(sample)

        # Assert all activations are below B_sigma
        activated_1 = F.relu(model.fc1(sample))
        activated_2 = F.relu(model.fc2(activated_1))
        activated_3 = torch.sigmoid(model.fc3(activated_2))
        assert activated_1.max().item() <= B_sigma
        assert activated_2.max().item() <= B_sigma
        assert activated_3.max().item() <= B_sigma
        
        target = torch.ones((1, 1)).to(device) * c

        loss = loss_fn(sample_out, target)
        loss.backward()

        grad_norm = param_grad_norm(model)
        # l1_norm = param_grad_l1(model)
        if grad_norm > max_norm:
            max_norm = grad_norm
            # print("New L1 Norm:", l1_norm, idx, c)
            print("New Max Norm:", max_norm, idx, c)
    # break
    if max_norm > c_g:
        print("Max Norm Exceeded")
        break

print("Max Norm:", max_norm)

New Max Norm: tensor(12.1363, device='cuda:0') 0 0
Max Norm: tensor(12.1363, device='cuda:0')


In [107]:
# Logical and physical batch sizes
BATCH_SIZE = 64
MAX_PHYSICAL_BATCH_SIZE = 64

# Privacy parameters
EPSILON = 50.0
DELTA = 1e-5
MAX_GRAD_NORM = 1.2
c_p = 0.01


# Training parameters
n_d = 5 # number of discriminator updates per generator update
n_g = 5e5 # number of generator updates
LR = 5e-5




In [112]:
# Setup MNIST dataset
transform = transforms.ToTensor()
train_set = torchvision.datasets.MNIST(root='./data', train=True, download=True, transform=transform)
train_loader = torch.utils.data.DataLoader(train_set, batch_size=64, shuffle=True)

test_set = torchvision.datasets.MNIST(root='./data', train=False, download=True, transform=transform)
test_loader = torch.utils.data.DataLoader(test_set, batch_size=1000, shuffle=False)

In [114]:
M = len(train_set)

sample_prob = BATCH_SIZE / M
print("Sample Probability:", sample_prob)


Sample Probability: 0.008533333333333334


In [108]:
privacy_engine = PrivacyEngine()

print(
    f"Before make_private(). "
    f"Model:{type(model)}, \nOptimizer:{type(optimizer)}, \nDataLoader:{type(data_loader)}"
)

model, optimizer, data_loader = privacy_engine.make_private(
    module=model,
    optimizer=optimizer,
    data_loader=data_loader,
    max_grad_norm=MAX_GRAD_NORM,
    noise_multiplier=1.0,
)

print("="*20)

print(
    f"After make_private(). "
    f"Model:{type(model)}, \nOptimizer:{type(optimizer)}, \nDataLoader:{type(data_loader)}"
)

Before make_private(). Model:<class 'opacus.grad_sample.grad_sample_module.GradSampleModule'>, 
Optimizer:<class 'opacus.optimizers.optimizer.DPOptimizer'>, 
DataLoader:<class 'opacus.data_loader.DPDataLoader'>
After make_private(). Model:<class 'opacus.grad_sample.grad_sample_module.GradSampleModule'>, 
Optimizer:<class 'opacus.optimizers.optimizer.DPOptimizer'>, 
DataLoader:<class 'opacus.data_loader.DPDataLoader'>


In [None]:
# Train model
def train(model, optimizer, loss_fn, data_loader, epochs=10):
    model.train()
    for epoch in range(epochs):
        for batch_idx, (data, target) in enumerate(data_loader):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            print(output.shape, target.shape)
            break
            loss = loss_fn(output, target)
            loss.backward()
            optimizer.step()

            # Print max gradient
            max_grad = 0.0
            for param in model.parameters():
                if param.grad is not None:
                    max_grad = max(max_grad, param.grad.max().detach().item())
            print(max_grad)
            if max_grad > c_g:
                print("Gradient clipping required")
                break

            if batch_idx % 100 == 0:
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    epoch, batch_idx * len(data), len(data_loader.dataset),
                    100. * batch_idx / len(data_loader), loss.item()))
    
    print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
        epoch, len(data_loader.dataset), len(data_loader.dataset),
        100. * len(data_loader.dataset) / len(data_loader.dataset), loss.item()))
        
train(model, optimizer, loss_fn, data_loader, epochs=100)

torch.Size([64, 10]) torch.Size([64])
torch.Size([64, 10]) torch.Size([64])
torch.Size([64, 10]) torch.Size([64])
torch.Size([64, 10]) torch.Size([64])
torch.Size([64, 10]) torch.Size([64])
torch.Size([64, 10]) torch.Size([64])
torch.Size([64, 10]) torch.Size([64])
torch.Size([64, 10]) torch.Size([64])
torch.Size([64, 10]) torch.Size([64])
torch.Size([64, 10]) torch.Size([64])
torch.Size([64, 10]) torch.Size([64])
torch.Size([64, 10]) torch.Size([64])
torch.Size([64, 10]) torch.Size([64])
torch.Size([64, 10]) torch.Size([64])
torch.Size([64, 10]) torch.Size([64])
torch.Size([64, 10]) torch.Size([64])
torch.Size([64, 10]) torch.Size([64])
torch.Size([64, 10]) torch.Size([64])
torch.Size([64, 10]) torch.Size([64])
torch.Size([64, 10]) torch.Size([64])
torch.Size([64, 10]) torch.Size([64])
torch.Size([64, 10]) torch.Size([64])
torch.Size([64, 10]) torch.Size([64])
torch.Size([64, 10]) torch.Size([64])
torch.Size([64, 10]) torch.Size([64])
torch.Size([64, 10]) torch.Size([64])
torch.Size([

UnboundLocalError: local variable 'loss' referenced before assignment

In [None]:
for i, layer in enumerate(model.modules()):
    if isinstance(layer, nn.Linear):
        print("Linear layer: ", layer.weight.shape, layer.bias.shape)

Linear layer:  torch.Size([64, 784]) torch.Size([64])
Linear layer:  torch.Size([10, 64]) torch.Size([10])


Before make_private(). Model:<class '__main__.MLP'>, 
Optimizer:<class 'torch.optim.sgd.SGD'>, 
DataLoader:<class 'torch.utils.data.dataloader.DataLoader'>
After make_private(). Model:<class 'opacus.grad_sample.grad_sample_module.GradSampleModule'>, 
Optimizer:<class 'opacus.optimizers.optimizer.DPOptimizer'>, 
DataLoader:<class 'opacus.data_loader.DPDataLoader'>


