In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import StepLR
from tqdm.notebook import tqdm




# Load the dataset
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))  # MNIST mean and std
])

train_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
test_dataset = datasets.MNIST(root='./data', train=False, transform=transform)

# Attack types

def fgsm(model, X, y, epsilon=0.1):
    """ Construct FGSM adversarial examples on the examples X"""
    delta = torch.zeros_like(X, requires_grad=True)
    loss = nn.CrossEntropyLoss()(model(X + delta), y)
    loss.backward()
    return epsilon * delta.grad.detach().sign()

def pgd_linf(model, X, y, epsilon=0.1, alpha=0.01, num_iter=10, randomize=False):
    """ Construct FGSM adversarial examples on the examples X"""
    if randomize:
        delta = torch.rand_like(X, requires_grad=True)
        delta.data = delta.data * 2 * epsilon - epsilon
    else:
        delta = torch.zeros_like(X, requires_grad=True)
        
    for t in range(num_iter):
        loss = nn.CrossEntropyLoss()(model(X + delta), y)
        loss.backward()
        delta.data = (delta + alpha*delta.grad.detach().sign()).clamp(-epsilon,epsilon)
        delta.grad.zero_()
    return delta.detach()

# Epoch types

def epoch_base(loader, model,device, opt=None,):
    total_loss, total_err = 0.,0.
    for X,y in loader:
        X,y = X.to(device), y.to(device)
        yp = model( X )
        loss = nn.CrossEntropyLoss()( yp, y )
        if opt:
            opt.zero_grad()
            loss.backward()
            opt.step()
        
        total_err += (yp.max(dim=1)[1] != y).sum().item()
        total_loss += loss.item() * X.shape[0]
    return total_err / len(loader.dataset), total_loss / len(loader.dataset)

def epoch_AT_vanilla(loader, model,device, opt=None,):
    total_loss, total_err = 0.,0.
    for X,y in loader:
        X,y = X.to(device), y.to(device)
        delta = pgd_linf(model, X, y)
        yp = model( X + delta )
        loss = nn.CrossEntropyLoss()( yp, y )
        if opt:
            opt.zero_grad()
            loss.backward()
            opt.step()
        
        total_err += (yp.max(dim=1)[1] != y).sum().item()
        total_loss += loss.item() * X.shape[0]
    return total_err / len(loader.dataset), total_loss / len(loader.dataset)


def epoch_fast_AT(loader, model, device, opt=None,):
    total_loss, total_err = 0.,0.
    for X,y in loader:
        X,y = X.to(device), y.to(device)
        delta = fgsm(model, X, y, epsilon=0.1) #pgd_linf(model, X, y)
        yp = model( X + delta )
        loss = nn.CrossEntropyLoss()( yp, y )
        if opt:
            opt.zero_grad()
            loss.backward()
            opt.step()
        
        total_err += (yp.max(dim=1)[1] != y).sum().item()
        total_loss += loss.item() * X.shape[0]
    return total_err / len(loader.dataset), total_loss / len(loader.dataset)

def epoch_free_AT(loader, model, device, opt=None,):

    num_repeats=10
    epsilon=0.1
    alpha=0.01

    total_loss, total_err = 0., 0.
    for X, y in loader:
        X, y = X.to(device), y.to(device)
        delta = torch.zeros_like(X, requires_grad=True)  # Initialize perturbation

        for _ in range(num_repeats):  # Update the adversarial example in-place
            yp = model(X + delta)  # Prediction on perturbed data
            loss = nn.CrossEntropyLoss()(yp, y)
            opt.zero_grad()
            loss.backward()  # Gradients w.r.t. delta and model parameters

            # Update delta within its allowable range and clamp
            delta.data = (delta + X.shape[0] * alpha * delta.grad.data).clamp(-epsilon, epsilon)
            delta.grad.zero_()

            # Update model parameters
            opt.step()

            total_err += (yp.max(dim=1)[1] != y).sum().item()
            total_loss += loss.item() * X.shape[0]

    return total_err / len(loader.dataset), total_loss / len(loader.dataset)

# Training loop

def launch_experiment(model, device, train_loader, test_loader, opt, epochs, epoch_fn):

    print(*("{}".format(i) for i in ("Train Err", "Test Err", "Adv Err")), sep="\t")

    for t in tqdm(range(epochs)):

        train_err, train_loss = epoch_fn(train_loader, model, device, opt)
        test_err, test_loss = epoch_base(test_loader, model,device)
        adv_err, adv_loss = epoch_AT_vanilla(test_loader, model,device, opt)
        if t == 4:
            for param_group in opt.param_groups:
                param_group["lr"] = 1e-2
        print(*("{:.6f}".format(i) for i in (train_err, test_err, adv_err)), sep="\t")



In [2]:
device = 'cuda'

model = LeNet().to(device)

optimizer = optim.Adam(model.parameters(), lr=0.001)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=1000, shuffle=False)

# launch_experiment(model, device, train_loader, test_loader, optimizer, 10, epoch_base)

# torch.save(model.state_dict(), './models/vanilla_model.pth')

model.load_state_dict(torch.load('./models/vanilla_model.pth'))


<All keys matched successfully>

In [3]:
import math
from functools import partial

import torch
import torch.nn.utils.parametrize as parametrize
from torch import nn

class LoRAParametrization(nn.Module):
    def __init__(self, fan_in, fan_out, rank, lora_alpha):
        super().__init__()
        # if weight is stored as (fan_out, fan_in), the memory layout of A & B follows (W + BA)x
        # otherwise, it's x(W + AB). This allows us to tie the weights between linear layers and embeddings
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.lora_A = nn.Parameter( torch.zeros( (rank, fan_in))  ).to(device)
        self.lora_B = nn.Parameter( torch.zeros( (fan_out, rank)) ).to(device)
        nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5))
        self.lora_alpha, self.rank = lora_alpha, rank
        self.scaling = lora_alpha / rank

    def forward(self, X):
        return X + torch.matmul(self.lora_B, self.lora_A) * self.scaling

    @classmethod
    def from_linear(cls, layer, rank=4, lora_alpha=1):
        fan_out, fan_in = layer.weight.shape
        return cls(  fan_in, fan_out, rank=rank, lora_alpha=lora_alpha  )

    @classmethod
    def from_conv2d(cls, layer, rank=4, lora_alpha=1):
        fan_out, fan_in = layer.weight.view(layer.weight.shape[0], -1).shape
        return cls(   fan_in, fan_out, rank=rank, lora_alpha=lora_alpha   )
    

def apply_lora(layer, lora_config=None):
    """add lora parametrization to a layer, designed to be used with model.apply"""
    print(type(layer))
    if type(layer) in lora_config:
        for attr_name, parametrization in lora_config[type(layer)].items():
            parametrize.register_parametrization(layer, attr_name, parametrization(layer))

def add_lora(model, lora_config):
    """add lora parametrization to all layers in a model. Calling it twice will add lora twice"""
    print('apply lora')
    model.apply(partial(apply_lora, lora_config=lora_config))

In [5]:
import torch
# import lora 
# import utils
from functools import partial


# Example model creation and wrapping
# model = LeNet().to(device)
# replace_layers(model, rank=10, lora_alpha=1)

r = 10
lora_config = {  
    nn.Linear: { "weight": partial(LoRAParametrization.from_linear, rank=r),  }, } 

add_lora(model, lora_config)

#nn.Conv2d: { "weight": partial(lora.LoRAParametrization.from_conv2d, rank=r),  },

# Usage example:




apply lora
<class 'torch.nn.modules.conv.Conv2d'>
<class 'torch.nn.modules.activation.ReLU'>
<class 'torch.nn.modules.pooling.MaxPool2d'>
<class 'torch.nn.modules.conv.Conv2d'>
<class 'torch.nn.modules.activation.ReLU'>
<class 'torch.nn.modules.pooling.MaxPool2d'>
<class 'torch.nn.modules.linear.Linear'>
<class 'torch.nn.modules.activation.ReLU'>
<class 'torch.nn.modules.linear.Linear'>
<class 'torch.nn.modules.activation.ReLU'>
<class 'torch.nn.modules.linear.Linear'>
<class '__main__.LeNet'>


In [15]:
for name, module in model.named_modules():
    if len(name.split(".")) >= 4:
        print( module.lora_A )

tensor([[ 0.0479,  0.0126, -0.0324,  ..., -0.0025,  0.0224, -0.0488],
        [-0.0359,  0.0033,  0.0326,  ...,  0.0180, -0.0336, -0.0192],
        [ 0.0363,  0.0093,  0.0244,  ...,  0.0119,  0.0117,  0.0301],
        ...,
        [ 0.0330, -0.0180, -0.0012,  ..., -0.0234,  0.0460, -0.0443],
        [ 0.0042, -0.0138, -0.0299,  ...,  0.0204,  0.0075, -0.0096],
        [ 0.0009,  0.0329, -0.0406,  ..., -0.0124,  0.0075, -0.0462]],
       device='cuda:0', grad_fn=<ToCopyBackward0>)
tensor([[ 7.0590e-02, -4.8569e-02, -3.9795e-02,  ...,  5.4550e-02,
         -6.0678e-02,  3.0068e-02],
        [ 4.7472e-02, -1.4940e-02,  8.3946e-02,  ..., -5.0750e-02,
         -2.1759e-02,  7.3230e-02],
        [ 5.9410e-02, -2.6483e-02,  4.5753e-03,  ...,  1.9894e-02,
         -4.6303e-02,  8.7604e-02],
        ...,
        [-7.0690e-02,  8.9681e-02,  1.9421e-02,  ...,  3.7469e-02,
         -7.4914e-02,  5.7802e-02],
        [ 2.6812e-02, -5.8574e-02, -7.8609e-02,  ...,  4.1899e-02,
         -5.8815e-02,  

In [6]:
for name, param in model.named_parameters():
    if 'mat' not in name:
        print(f'Freezing non-LoRA parameter {name}')
        param.requires_grad = False

for layer in [model.fc1, model.fc2, model.fc3]:
  layer.parametrizations["weight"][0].requires_grad = True

Freezing non-LoRA parameter conv1.weight
Freezing non-LoRA parameter conv1.bias
Freezing non-LoRA parameter conv2.weight
Freezing non-LoRA parameter conv2.bias
Freezing non-LoRA parameter fc1.bias
Freezing non-LoRA parameter fc1.parametrizations.weight.original
Freezing non-LoRA parameter fc2.bias
Freezing non-LoRA parameter fc2.parametrizations.weight.original
Freezing non-LoRA parameter fc3.bias
Freezing non-LoRA parameter fc3.parametrizations.weight.original


In [5]:
# Assuming `model` is your LeNet model instance
# def get_parametrized_parameters(model):
#     parametrized_params = []
#     # Iterate through all modules and their names
#     for name, module in model.named_modules():
#         print(name)
#         if len(name.split(".")) >= 4:
#             print(f"Found parametrized module: {name}")
#             # If it is, add its parameters to the list
#             # parametrized_params.extend(list(module.parameters()))
#             parametrized_params.append(module.lora_A)
#             parametrized_params.append(module.lora_B)
#     return parametrized_params

# def get_lora_parameters(model):
#     for name, param in model.named_parameters():
#         if "lora_A" in name or "lora_B" in name:
#             yield param

# # Create the optimizer with only LoRA parameters
# lora_parameters = list(get_lora_parameters(model))



# Extract only the parametrized parameters
# parametrized_params = get_parametrized_parameters(model)
# lora_optimizer = torch.optim.Adam(parametrized_params, lr=0.001)


# # Now you can pass these parameters to your optimizer
# optimizer = torch.optim.Adam(parametrized_params, lr=0.001)


# parameters = [
#     {"params": list(get_lora_params(model))},
# ]



In [9]:
for a in model.parameters():
    print(a)

Parameter containing:
tensor([[[[ 0.0587,  0.2342,  0.3621,  0.0996,  0.4510],
          [-0.5326, -0.2017,  0.0311,  0.1194,  0.2519],
          [-0.5429, -0.6312, -0.7025, -0.2250,  0.0957],
          [ 0.0416, -0.1242, -0.4433, -0.6349, -0.7583],
          [ 0.0354,  0.2414,  0.0525,  0.2043, -0.1693]]],


        [[[ 0.0912,  0.0358, -0.4404, -0.5290, -0.0712],
          [-0.0883, -0.1681, -0.5256,  0.0202,  0.3014],
          [-0.0254, -0.4263, -0.1254, -0.1304,  0.1608],
          [-0.2006, -0.3125, -0.0830, -0.0249,  0.2702],
          [-0.1989, -0.3673, -0.0072, -0.0538,  0.4465]]],


        [[[ 0.3955,  0.1718,  0.2610,  0.2601,  0.0172],
          [ 0.0756, -0.2035,  0.1144, -0.2000, -0.5734],
          [-0.0985, -0.0707, -0.6432, -0.7336, -0.3692],
          [-0.5537, -0.5153, -0.4343,  0.0858, -0.0300],
          [-0.4420, -0.0460, -0.1034,  0.3015, -0.0956]]],


        [[[ 0.0908, -0.2829,  0.1051,  0.0487, -0.2076],
          [-0.2205, -0.5628,  0.0378,  0.0342, -0.2413

In [7]:
# Step 2: Collect the parameters, pass them to the optimizer

# lora_parameters = [ model.fc1.parametrizations.weight[0].lora_A , model.fc1.parametrizations.weight[0].lora_B ] #utils.get_lora_params(model)  
# lora_optimizer = torch.optim.Adam( model.parameters(), lr=0.001)

# Step 3: Train the model
lora_optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
launch_experiment(model, 'cuda', train_loader, test_loader, lora_optimizer, 10, epoch_fast_AT) #epoch_free_AT

Train Err	Test Err	Adv Err


  0%|          | 0/10 [00:00<?, ?it/s]

0.105317	0.012300	0.155000
0.105317	0.012300	0.155000
0.105317	0.012300	0.155000


KeyboardInterrupt: 

In [36]:
print( model.fc1.weight )
print( model.fc1.parametrizations.weight[0].lora_A )
print( model.fc1.parametrizations.weight[0].lora_B )
print()
print( model.fc2.weight )
print( model.fc2.parametrizations.weight[0].lora_A )
print( model.fc2.parametrizations.weight[0].lora_B )
print()
print( model.fc3.weight )
print( model.fc3.parametrizations.weight[0].lora_A )
print( model.fc3.parametrizations.weight[0].lora_B )

tensor([[ 0.4101,  0.6101, -0.7784,  ...,  0.1910, -0.3317,  0.3116],
        [ 0.2397, -0.1602, -0.0614,  ..., -0.0139,  0.7553, -0.2833],
        [-0.2825, -0.0330, -0.0121,  ...,  0.6242,  0.4241,  0.2875],
        ...,
        [ 0.1932, -0.4889,  0.1380,  ..., -1.0026,  0.1322, -0.7923],
        [-0.1220, -0.6803, -0.4610,  ..., -1.7360, -2.0502, -0.9297],
        [ 0.1274,  0.1923, -0.3626,  ...,  0.2435, -0.0061, -0.6971]],
       device='cuda:0', grad_fn=<AddBackward0>)
tensor([[ 0.0045, -0.0224,  0.0418,  ..., -0.0270, -0.0161, -0.0293],
        [ 0.0445,  0.0337,  0.0454,  ...,  0.0097, -0.0300,  0.0135],
        [-0.0270,  0.0428, -0.0304,  ..., -0.0315,  0.0013,  0.0324],
        ...,
        [-0.0214,  0.0154, -0.0339,  ..., -0.0068, -0.0353,  0.0288],
        [-0.0398, -0.0296, -0.0346,  ..., -0.0406, -0.0293,  0.0298],
        [ 0.0302,  0.0198,  0.0044,  ...,  0.0285,  0.0111, -0.0377]],
       device='cuda:0', grad_fn=<ToCopyBackward0>)
tensor([[0., 0., 0.,  ..., 0., 0.

In [38]:
print( model.fc1.weight )
print( model.fc1.parametrizations.weight[0].lora_A )
print( model.fc1.parametrizations.weight[0].lora_B )
print()
print( model.fc2.weight )
print( model.fc2.parametrizations.weight[0].lora_A )
print( model.fc2.parametrizations.weight[0].lora_B )
print()
print( model.fc3.weight )
print( model.fc3.parametrizations.weight[0].lora_A )
print( model.fc3.parametrizations.weight[0].lora_B )

tensor([[ 0.0411,  0.6837, -0.7472,  ...,  0.1592, -0.2402, -0.0792],
        [-0.0900, -0.9739, -0.0067,  ...,  0.0168,  0.5582,  0.8106],
        [-0.1236, -0.2490, -0.0379,  ...,  0.6027,  0.9698,  0.4862],
        ...,
        [ 0.0659, -0.4628, -0.3113,  ..., -0.6657, -0.8887, -1.5158],
        [ 0.0145, -1.0332, -0.2172,  ..., -2.2406, -2.5094, -1.5576],
        [ 0.7025, -0.2122, -1.4907,  ..., -0.0513,  0.3789,  0.8024]],
       device='cuda:0', grad_fn=<AddBackward0>)
tensor([[ 0.0045, -0.0224,  0.0418,  ..., -0.0270, -0.0161, -0.0293],
        [ 0.0445,  0.0337,  0.0454,  ...,  0.0097, -0.0300,  0.0135],
        [-0.0270,  0.0428, -0.0304,  ..., -0.0315,  0.0013,  0.0324],
        ...,
        [-0.0214,  0.0154, -0.0339,  ..., -0.0068, -0.0353,  0.0288],
        [-0.0398, -0.0296, -0.0346,  ..., -0.0406, -0.0293,  0.0298],
        [ 0.0302,  0.0198,  0.0044,  ...,  0.0285,  0.0111, -0.0377]],
       device='cuda:0', grad_fn=<ToCopyBackward0>)
tensor([[0., 0., 0.,  ..., 0., 0.

In [25]:
import torch
import torch.nn as nn
import torch.nn.utils.parametrize as parametrize


# class LowRankParametrization(nn.Module):
#     def __init__(self, param, rank):
#         super().__init__()
#         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#         self.rank = rank
#         self.u = nn.Parameter(torch.randn(param.size(0), rank)).to(device)
#         self.v = nn.Parameter(torch.randn(rank, param.size(1))).to(device)

#     def forward(self, X):
#         return X+torch.matmul(self.u, self.v)

# rank = 5  # Desired rank

# # Ensure the model's weight is initialized if needed
# parametrization1 = LowRankParametrization(model.fc1.weight, rank)
# parametrize.register_parametrization(model.fc1, 'weight', parametrization1)

# parametrization2 = LowRankParametrization(model.fc2.weight, rank)
# parametrize.register_parametrization(model.fc2, 'weight', parametrization2)

# parametrization3 = LowRankParametrization(model.fc3.weight, rank)
# parametrize.register_parametrization(model.fc3, 'weight', parametrization3)

optimizer_lora = torch.optim.Adam(model.parameters(), lr=0.01)

launch_experiment(model, device, train_loader, test_loader, optimizer_lora, 2, epoch_base)

Train Err	Test Err	Adv Err


  0%|          | 0/2 [00:00<?, ?it/s]

0.894483	0.897200	0.000000
0.897900	0.886500	0.000000


In [9]:
model.fc1.parametrizations #.weight[0].lora_A

AttributeError: 'Linear' object has no attribute 'parametrizations'

In [7]:
print( model.fc1.parametrizations.weight[0].lora_A )
print( model.fc1.parametrizations.weight[0].lora_B )

tensor([[ 1.2010e-02,  3.3419e-02,  2.3072e-03,  ..., -2.7030e-02,
          6.6422e-03, -8.2613e-03],
        [ 2.5634e-02, -1.2746e-02, -4.7297e-02,  ..., -3.3771e-02,
          1.5986e-05, -7.8924e-03],
        [ 2.5030e-02, -7.2367e-03, -2.7895e-03,  ...,  2.4114e-02,
         -1.2479e-02, -2.7373e-02],
        ...,
        [-5.6186e-03,  3.6513e-02,  1.7150e-03,  ...,  4.4266e-02,
         -3.6977e-02,  4.3982e-02],
        [ 1.0757e-03, -3.6546e-02, -1.8330e-02,  ..., -1.4210e-02,
         -2.2639e-02, -3.1511e-02],
        [ 2.6687e-02, -3.8629e-02,  1.7767e-02,  ...,  4.4685e-02,
          2.0076e-02, -2.1628e-03]], device='cuda:0',
       grad_fn=<ToCopyBackward0>)
tensor([[1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        ...,
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.]], device='cuda:0',
       grad_fn=<ToCopyBackward0>)


In [11]:
device = 'cuda'

model = LeNet().to(device)

optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = StepLR(optimizer, step_size=1, gamma=0.7)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=1000, shuffle=False)

launch_experiment(model, device, train_loader, test_loader, optimizer, 10, epoch_AT_vanilla)

Train Err	Test Err	Adv Err


  0%|          | 0/10 [00:00<?, ?it/s]

0.091950	0.023900	0.034500
0.031017	0.012200	0.022500
0.022017	0.011000	0.019400
0.018900	0.011100	0.016300
0.016267	0.008800	0.017800
0.055883	0.041500	0.045100
0.036633	0.017700	0.031100
0.031867	0.015900	0.030600
0.030267	0.017800	0.028500
0.028067	0.016000	0.032700


In [12]:
device = 'cuda'

model = LeNet().to(device)

optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = StepLR(optimizer, step_size=1, gamma=0.7)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=1000, shuffle=False)

launch_experiment(model, device, train_loader, test_loader, optimizer, 10, epoch_AT_free)

Train Err	Test Err	Adv Err


  0%|          | 0/10 [00:00<?, ?it/s]

0.103967	0.015100	0.023700
0.037150	0.011400	0.018900
0.027267	0.008600	0.015800
0.022833	0.009400	0.014500
0.019883	0.009700	0.014400
0.059267	0.022300	0.029700
0.037867	0.014800	0.024800
0.036583	0.015700	0.020700
0.034833	0.016500	0.023600
0.031733	0.013200	0.020600
