In [18]:
# --- Hardware / model selection -------------------------------------------
DEVICE = 'cuda:0'
MODEL = 'vit_base_patch16_224'
DIMENSION = 224  # target size handed to the Resize transform
# Alternative starting point: './checkpoints/nonDP_model_epoch_2.pth'
MODEL_PATH = './checkpoints/DP_model_begin.pth'

# --- Optimisation schedule ------------------------------------------------
EPOCHS = 10
LR = 1e-4
MINI_BS = 100  # physical batch size fed to the DataLoader
BS = 1000      # logical batch size handed to the privacy engine

n_acc_steps = BS // MINI_BS  # gradient accumulation steps per logical batch

# --- Differential-privacy settings ----------------------------------------
EPSILON = 2
DELTA = 1e-5
CLIPPING_STYLE = 'all-layer'
CLIPPING_MODE = 'MixOpt'

In [19]:
import torch

# Prefer the configured accelerator (cuda:0 by default); run on the CPU when
# CUDA is not available.
if torch.cuda.is_available():
    device = torch.device(DEVICE)
else:
    device = torch.device("cpu")
print("device:", device)

device: cuda:0


In [20]:
print('==> Preparing data..')

import torchvision

# Pre-processing shared by both splits: resize to DIMENSION, convert to a
# tensor, then normalise each of the three channels with mean 0.5 / std 0.5.
_preprocess_steps = [
    torchvision.transforms.Resize(DIMENSION),
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
]
transformation = torchvision.transforms.Compose(_preprocess_steps)

# CIFAR-100 train / test splits, downloaded into data/ when missing.
_cifar_kwargs = dict(root='data/', download=True, transform=transformation)
trainset = torchvision.datasets.CIFAR100(train=True, **_cifar_kwargs)
testset = torchvision.datasets.CIFAR100(train=False, **_cifar_kwargs)

# Training mini-batches are shuffled; evaluation batches keep dataset order.
trainloader = torch.utils.data.DataLoader(
    dataset=trainset, batch_size=MINI_BS, shuffle=True, num_workers=4)

testloader = torch.utils.data.DataLoader(
    dataset=testset, batch_size=100, shuffle=False, num_workers=4)


==> Preparing data..
Files already downloaded and verified
Files already downloaded and verified


In [21]:
import timm
from opacus.validators import ModuleValidator
import torch.nn as nn
import torch.optim as optim

# Deserialize onto the CPU first: without map_location a checkpoint written on
# a CUDA device cannot be restored when `device` fell back to "cpu". The
# tensors are moved to `device` further down.
checkpoint = torch.load(MODEL_PATH, map_location='cpu')

num_classes = 100
print('==> Building model..', MODEL, '; BatchNorm is replaced by GroupNorm. Mode: ', CLIPPING_MODE)
# pretrained=False: the strict load_state_dict() below overwrites every weight,
# so fetching the pretrained weights first would be wasted work.
net = timm.create_model(MODEL, pretrained=False, num_classes=num_classes)
net = ModuleValidator.fix(net)  # make the architecture usable for DP training
net.load_state_dict(checkpoint['model_state_dict'])
net = net.to(device)

print('Number of total parameters: ', sum(p.numel() for p in net.parameters()))
print('Number of trainable parameters: ', sum(p.numel() for p in net.parameters() if p.requires_grad))

criterion = nn.CrossEntropyLoss()

# Build the optimizer after the model has been moved, then restore its state
# from the same checkpoint.
optimizer = optim.Adam(net.parameters(), lr=LR)
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])



==> Building model.. vit_base_patch16_224 ; BatchNorm is replaced by GroupNorm. Mode:  MixOpt
Number of total parameters:  85875556
Number of trainable parameters:  85875556


In [22]:
from opacus.accountants.utils import get_noise_multiplier
from fastDP import PrivacyEngine

# Noise level sigma that reaches a total budget of (EPSILON, DELTA) at the end
# of EPOCHS epochs, given the sampling rate of one logical batch.
sample_rate = BS / len(trainset)
sigma = get_noise_multiplier(
    target_epsilon=EPSILON,
    target_delta=DELTA,
    sample_rate=sample_rate,
    epochs=EPOCHS,
)

# Wrap the model with the fastDP engine and hook it into the optimizer.
privacy_engine = PrivacyEngine(
    net,
    batch_size=BS,
    sample_size=len(trainset),
    noise_multiplier=sigma,
    max_grad_norm=1,
    epochs=EPOCHS,
    clipping_mode=CLIPPING_MODE,
    clipping_style=CLIPPING_STYLE,
    origin_params=['patch_embed.proj.bias'],
)
privacy_engine.attach(optimizer)



Using origin parameters for the ghost differentiation trick......
Number of trainable components:  150 ; Number of trainable layers:  75
>>>>>>>>>>>>>>>>> Applying  automatic  per-sample gradient clipping.
>>>>>>>>>>>>>>>>> Block heads for per-sample gradient clipping are defined as: ['patch_embed.proj']


In [35]:
# Train first: store the model after every logical batch and collect the
# true_gradient values.

import os

from tqdm.notebook import tqdm
import numpy as np
SHOW_STEPS = 100

STAGE = 'begin'
NUM = 0
# torch.save does not create missing parent directories, so make sure the
# checkpoint folder exists before writing the step-0 snapshot.
os.makedirs('./DP_checkpoints', exist_ok=True)
model_path = f'./DP_checkpoints/{STAGE}_{NUM}.pth'
torch.save({
    'model_state_dict': net.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
}, model_path)
print(f'Model saved to {model_path}')

# NOTE: here only the patch_embed.proj.bias parameter receives a gradient through backpropagation (a 768-dimensional grad vector)

# def get_gradient(net):
#     current_gradient = []
#     for param in net.parameters():
#         current_gradient.append(param.grad.view(-1).detach().cpu().numpy())
#     # print(len(current_gradient))
#     # print([i.size for i in current_gradient])
#     gradient=np.concatenate(current_gradient)
#     return gradient

def get_gradient(net, param_names=('patch_embed.proj.bias',)):
    """Return the current gradients of selected parameters as one flat array.

    Args:
        net: module whose ``named_parameters()`` are inspected.
        param_names: name, or iterable of names, of the parameters to read.
            Defaults to ``'patch_embed.proj.bias'``, the only parameter this
            notebook collects gradients for.

    Returns:
        1-D numpy array holding the flattened ``.grad`` of every selected
        parameter that currently has a gradient, concatenated in
        ``named_parameters()`` order.

    Raises:
        ValueError: if none of the selected parameters has a gradient.
    """
    if isinstance(param_names, str):
        param_names = (param_names,)
    wanted = set(param_names)

    # reshape(-1) rather than view(-1): it also handles non-contiguous grads.
    pieces = [
        param.grad.detach().reshape(-1).cpu().numpy()
        for name, param in net.named_parameters()
        if name in wanted and param.grad is not None
    ]
    if not pieces:
        raise ValueError(
            'no gradient available for parameters: ' + ', '.join(sorted(wanted)))
    # np.concatenate returns a fresh array, so the result stays valid after the
    # gradients are zeroed.
    return np.concatenate(pieces)


import os

# torch.save / np.save do not create missing directories.
os.makedirs('./DP_checkpoints', exist_ok=True)
os.makedirs('./gradients', exist_ok=True)

true_gradient = []

net.train()
train_loss = 0
correct = 0
total = 0

num_batches = len(trainloader)

for batch_idx, (inputs, targets) in enumerate(tqdm(trainloader)):  # batch_idx counts mini-batches
    inputs, targets = inputs.to(device), targets.to(device)
    outputs = net(inputs)
    loss = criterion(outputs, targets)  # cross-entropy as the loss function
    loss.backward()

    # A logical batch is complete after n_acc_steps mini-batches, or when the
    # loader is exhausted; one "true gradient" is recorded per logical batch.
    end_of_logical_batch = ((batch_idx + 1) % n_acc_steps == 0) or ((batch_idx + 1) == num_batches)
    if end_of_logical_batch:
        # Read the gradient before the optimizer consumes and clears it.
        present_true_gradient = get_gradient(net)
        true_gradient.append(present_true_gradient)
        print(len(true_gradient))
        optimizer.step()  # one parameter update per logical batch
        optimizer.zero_grad()

        # Save the model after every logical batch. Number the file by the
        # count of completed updates: (batch_idx + 1) // n_acc_steps would
        # repeat the previous index for a trailing partial logical batch and
        # overwrite that checkpoint.
        NUM = len(true_gradient)
        model_path = f'./DP_checkpoints/{STAGE}_{NUM}.pth'
        torch.save({
            'model_state_dict': net.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
        }, model_path)
        print(f'Model saved to {model_path}')

    # Running statistics over all mini-batches seen so far.
    train_loss += loss.item()
    _, predicted = outputs.max(1)
    total += targets.size(0)
    correct += predicted.eq(targets).sum().item()

    # Periodic progress log.
    if ((batch_idx + 1) % SHOW_STEPS == 0) or ((batch_idx + 1) == num_batches):
        tqdm.write("----------------------------------------------------------------------------------------")
        tqdm.write('Epoch: {}, step: {}, Train Loss: {:.3f} | Acc: {:.3f}% ({}/{})'.format(
            'none', batch_idx + 1, train_loss / (batch_idx + 1), 100. * correct / total, correct, total))

# One row per logical batch.
np.save(f'./gradients/DP_{STAGE}_true_gradients.npy', true_gradient)


print('Epoch: ', 'none', "total: ", len(trainloader), 'Train Loss: %.3f | Acc: %.3f%% (%d/%d)'
                    % (train_loss/(batch_idx+1), 100.*correct/total, correct, total))



Model saved to ./DP_checkpoints/begin_0.pth


  0%|          | 0/500 [00:00<?, ?it/s]



768
1
Model saved to ./DP_checkpoints/begin_1.pth
768
2
Model saved to ./DP_checkpoints/begin_2.pth
768
3
Model saved to ./DP_checkpoints/begin_3.pth
768
4
Model saved to ./DP_checkpoints/begin_4.pth
768
5
Model saved to ./DP_checkpoints/begin_5.pth
768
6
Model saved to ./DP_checkpoints/begin_6.pth
768
7
Model saved to ./DP_checkpoints/begin_7.pth
768
8
Model saved to ./DP_checkpoints/begin_8.pth
768
9
Model saved to ./DP_checkpoints/begin_9.pth
768
10
Model saved to ./DP_checkpoints/begin_10.pth
----------------------------------------------------------------------------------------
Epoch: none, step: 100, Train Loss: 4.256 | Acc: 10.710% (1071/10000)
768
11
Model saved to ./DP_checkpoints/begin_11.pth
768
12
Model saved to ./DP_checkpoints/begin_12.pth
768
13
Model saved to ./DP_checkpoints/begin_13.pth
768
14
Model saved to ./DP_checkpoints/begin_14.pth
768
15
Model saved to ./DP_checkpoints/begin_15.pth
768
16
Model saved to ./DP_checkpoints/begin_16.pth
768
17
Model saved to ./DP_

In [36]:
# For each true gradient, compute the corresponding per-sample gradients:
# switch to single-example batches so each backward pass sees one sample.
BS = 1
MINI_BS = 1
n_acc_steps = BS // MINI_BS  # gradient accumulation steps

# Rebuild both loaders; the training loader is shuffled, the test loader is not.
trainloader = torch.utils.data.DataLoader(
    dataset=trainset, batch_size=MINI_BS, shuffle=True, num_workers=4)

testloader = torch.utils.data.DataLoader(
    dataset=testset, batch_size=100, shuffle=False, num_workers=4)

In [37]:
import timm
from opacus.validators import ModuleValidator
import torch.nn as nn
import torch.optim as optim

import os

from opacus.accountants.utils import get_noise_multiplier
from fastDP import PrivacyEngine

N_CHECKPOINTS = 50                 # checkpoints {STAGE}_0 ... {STAGE}_49 are replayed
MAX_SAMPLES_PER_CHECKPOINT = 1000  # single-sample steps run per checkpoint
RECORD_EVERY = 50                  # record the gradient of every 50th sample

# np.save does not create missing directories.
os.makedirs('./gradients', exist_ok=True)

total_per_sample_gradient = []

num_classes = 100
criterion = nn.CrossEntropyLoss()

# sigma depends only on notebook-level constants, so compute it once instead
# of once per checkpoint.
sigma = get_noise_multiplier(
    target_epsilon=EPSILON,
    target_delta=DELTA,
    sample_rate=BS / len(trainset),
    epochs=EPOCHS,
)

for i in range(N_CHECKPOINTS):
    print('i=', i)
    MODEL_PATH = f'DP_checkpoints/{STAGE}_{i}.pth'
    # Deserialize on the CPU so the checkpoint also loads when `device` is "cpu".
    checkpoint = torch.load(MODEL_PATH, map_location='cpu')
    print('==> Building model..', MODEL, '; BatchNorm is replaced by GroupNorm. Mode: ', CLIPPING_MODE)
    # pretrained=False: the strict load_state_dict() below overwrites every
    # weight, so loading the pretrained weights each iteration is wasted work.
    net = timm.create_model(MODEL, pretrained=False, num_classes=num_classes)
    net = ModuleValidator.fix(net)  # make the architecture usable for DP training
    net.load_state_dict(checkpoint['model_state_dict'])
    net = net.to(device)

    print('Number of total parameters: ', sum(p.numel() for p in net.parameters()))
    print('Number of trainable parameters: ', sum(p.numel() for p in net.parameters() if p.requires_grad))

    optimizer = optim.Adam(net.parameters(), lr=LR)
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    # A fresh engine is attached to every restored model/optimizer pair.
    privacy_engine = PrivacyEngine(
        net,
        batch_size=BS,
        sample_size=len(trainset),
        noise_multiplier=sigma,
        max_grad_norm=1,
        epochs=EPOCHS,
        clipping_mode=CLIPPING_MODE,
        clipping_style=CLIPPING_STYLE,
        origin_params=['patch_embed.proj.bias'],
    )
    privacy_engine.attach(optimizer)

    i_per_sample_gradient = []

    net.train()
    train_loss = 0
    correct = 0
    total = 0

    for batch_idx, (inputs, targets) in enumerate(tqdm(trainloader)):
        inputs, targets = inputs.to(device), targets.to(device)
        outputs = net(inputs)
        loss = criterion(outputs, targets)  # cross-entropy as the loss function
        loss.backward()

        # Keep the gradient of every RECORD_EVERY-th sample only.
        if (batch_idx + 1) % RECORD_EVERY == 0:
            present_per_sample_gradient = get_gradient(net)
            i_per_sample_gradient.append(present_per_sample_gradient)

        # NOTE(review): the optimizer steps after every sample, so the weights
        # move away from checkpoint i while gradients are being recorded.
        # Confirm this is intended rather than probing the fixed checkpoint.
        optimizer.step()
        optimizer.zero_grad()

        train_loss += loss.item()
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()

        if batch_idx + 1 == MAX_SAMPLES_PER_CHECKPOINT:
            break
    print(len(i_per_sample_gradient))
    np.save(f'./gradients/DP_{STAGE}_{i}_gradients.npy', i_per_sample_gradient)
    #total_per_sample_gradient.append(i_per_sample_gradient)
    #total_per_sample_gradient.append(i_per_sample_gradient)


i= 0
==> Building model.. vit_base_patch16_224 ; BatchNorm is replaced by GroupNorm. Mode:  MixOpt
Number of total parameters:  85875556
Number of trainable parameters:  85875556
Using origin parameters for the ghost differentiation trick......
Number of trainable components:  150 ; Number of trainable layers:  75
>>>>>>>>>>>>>>>>> Applying  automatic  per-sample gradient clipping.
>>>>>>>>>>>>>>>>> Block heads for per-sample gradient clipping are defined as: ['patch_embed.proj']




  0%|          | 0/50000 [00:00<?, ?it/s]

['cls_token', 'pos_embed'] are not supported by privacy engine; these parameters are not requiring gradient nor updated.
768
768
768
768
768
768
768
768
768
768
768
768
768
768
768
768
768
768
768
768
20
i= 1
==> Building model.. vit_base_patch16_224 ; BatchNorm is replaced by GroupNorm. Mode:  MixOpt
Number of total parameters:  85875556
Number of trainable parameters:  85875556
Using origin parameters for the ghost differentiation trick......
Number of trainable components:  150 ; Number of trainable layers:  75
>>>>>>>>>>>>>>>>> Applying  automatic  per-sample gradient clipping.
>>>>>>>>>>>>>>>>> Block heads for per-sample gradient clipping are defined as: ['patch_embed.proj']


  0%|          | 0/50000 [00:00<?, ?it/s]

['cls_token', 'pos_embed'] are not supported by privacy engine; these parameters are not requiring gradient nor updated.
768
768
768
768
768
768
768
768
768
768
768
768
768
768
768
768
768
768
768
768
20
i= 2
==> Building model.. vit_base_patch16_224 ; BatchNorm is replaced by GroupNorm. Mode:  MixOpt
Number of total parameters:  85875556
Number of trainable parameters:  85875556
Using origin parameters for the ghost differentiation trick......
Number of trainable components:  150 ; Number of trainable layers:  75
>>>>>>>>>>>>>>>>> Applying  automatic  per-sample gradient clipping.
>>>>>>>>>>>>>>>>> Block heads for per-sample gradient clipping are defined as: ['patch_embed.proj']


  0%|          | 0/50000 [00:00<?, ?it/s]

['cls_token', 'pos_embed'] are not supported by privacy engine; these parameters are not requiring gradient nor updated.
768
768
768
768
768
768
768
768
768
768
768
768
768
768
768
768
768
768
768
768
20
i= 3
==> Building model.. vit_base_patch16_224 ; BatchNorm is replaced by GroupNorm. Mode:  MixOpt
Number of total parameters:  85875556
Number of trainable parameters:  85875556
Using origin parameters for the ghost differentiation trick......
Number of trainable components:  150 ; Number of trainable layers:  75
>>>>>>>>>>>>>>>>> Applying  automatic  per-sample gradient clipping.
>>>>>>>>>>>>>>>>> Block heads for per-sample gradient clipping are defined as: ['patch_embed.proj']


  0%|          | 0/50000 [00:00<?, ?it/s]

['cls_token', 'pos_embed'] are not supported by privacy engine; these parameters are not requiring gradient nor updated.
768
768
768
768
768
768
768
768
768
768
768
768
768
768
768
768
768
768
768
768
20
i= 4
==> Building model.. vit_base_patch16_224 ; BatchNorm is replaced by GroupNorm. Mode:  MixOpt
Number of total parameters:  85875556
Number of trainable parameters:  85875556
Using origin parameters for the ghost differentiation trick......
Number of trainable components:  150 ; Number of trainable layers:  75
>>>>>>>>>>>>>>>>> Applying  automatic  per-sample gradient clipping.
>>>>>>>>>>>>>>>>> Block heads for per-sample gradient clipping are defined as: ['patch_embed.proj']


  0%|          | 0/50000 [00:00<?, ?it/s]

['cls_token', 'pos_embed'] are not supported by privacy engine; these parameters are not requiring gradient nor updated.
768
768
768
768
768
768
768
768
768
768
768
768
768
768
768
768
768
768
768
768
20
i= 5
==> Building model.. vit_base_patch16_224 ; BatchNorm is replaced by GroupNorm. Mode:  MixOpt
Number of total parameters:  85875556
Number of trainable parameters:  85875556
Using origin parameters for the ghost differentiation trick......
Number of trainable components:  150 ; Number of trainable layers:  75
>>>>>>>>>>>>>>>>> Applying  automatic  per-sample gradient clipping.
>>>>>>>>>>>>>>>>> Block heads for per-sample gradient clipping are defined as: ['patch_embed.proj']


  0%|          | 0/50000 [00:00<?, ?it/s]

['cls_token', 'pos_embed'] are not supported by privacy engine; these parameters are not requiring gradient nor updated.
768
768
768
768
768
768
768
768
768
768
768
768
768
768
768
768
768
768
768
768
20
i= 6
==> Building model.. vit_base_patch16_224 ; BatchNorm is replaced by GroupNorm. Mode:  MixOpt
Number of total parameters:  85875556
Number of trainable parameters:  85875556
Using origin parameters for the ghost differentiation trick......
Number of trainable components:  150 ; Number of trainable layers:  75
>>>>>>>>>>>>>>>>> Applying  automatic  per-sample gradient clipping.
>>>>>>>>>>>>>>>>> Block heads for per-sample gradient clipping are defined as: ['patch_embed.proj']


  0%|          | 0/50000 [00:00<?, ?it/s]

['cls_token', 'pos_embed'] are not supported by privacy engine; these parameters are not requiring gradient nor updated.
768
768
768
768
768
768
768
768
768
768
768
768
768
768
768
768
768
768
768
768
20
i= 7
==> Building model.. vit_base_patch16_224 ; BatchNorm is replaced by GroupNorm. Mode:  MixOpt
Number of total parameters:  85875556
Number of trainable parameters:  85875556
Using origin parameters for the ghost differentiation trick......
Number of trainable components:  150 ; Number of trainable layers:  75
>>>>>>>>>>>>>>>>> Applying  automatic  per-sample gradient clipping.
>>>>>>>>>>>>>>>>> Block heads for per-sample gradient clipping are defined as: ['patch_embed.proj']


  0%|          | 0/50000 [00:00<?, ?it/s]

['cls_token', 'pos_embed'] are not supported by privacy engine; these parameters are not requiring gradient nor updated.
768
768
768
768
768
768
768
768
768
768
768
768
768
768
768
768


KeyboardInterrupt: 

: 